# Content Analysis

In [1791]:
import numpy as np
import pandas as pd
from cmath import nan

In [1792]:
# Read the cleaned tweet export into the working DataFrame
tweets = pd.read_csv(filepath_or_buffer='DATA/Clean_Tweets_All_Info.csv')

In [1793]:
# Remove the leftover 'Unnamed: *' columns (presumably indexes written by earlier
# to_csv calls -- TODO confirm). Rebinding instead of dropping in place, together
# with errors='ignore', keeps this cell safe to re-run once the columns are gone.
tweets = tweets.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'], errors='ignore')

In [1794]:
# Preview the first two rows of the loaded tweets
tweets.head(2)

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,mentioned_users,hashtags,emojis,urls,cleaned_text,coords,geometry
0,1575493060024143874,6160,@_Hermano_7 PSL players would riot 😭,2022-09-29 14:29:51+00:00,0,1,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",['@_Hermano_7'],,['😭'],,psl player would riot,"[27.7518557, -26.5126489, 28.1843404, -26.0396...",POINT (27.968098050000002 -26.276138500000002)
1,1575489974316584961,1329,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa",,,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go,"[28.2722463, -26.2315204, 28.4449594, -26.0681...",POINT (28.358602849999997 -26.14985695)


### Convert String Representation of a List to an Actual List of Strings

In [1795]:
def split_string(string):
    """Turn the text form of a list (e.g. "['Riot', 'March']") into a list of strings.

    The input is lower-cased, every single quote is removed, the surrounding
    square brackets are stripped and the remainder is split on ', '.

    Parameters
    ----------
    string : str or NaN
        Stringified list, or a missing value.

    Returns
    -------
    list of str, or nan
        The lower-cased items. ``nan`` is returned when the input is NaN or
        when no non-blank item remains (for example for "[]").
    """
    # NaN is the only value that is not equal to itself
    if string != string:
        return nan

    string = string.lower()
    string = string.replace("'", "")
    # Discard blank items: "[]" would otherwise produce [''], and an empty
    # phrase is a substring of every text, so it would match every tweet.
    # Kept items are left untouched (no trimming of their whitespace).
    out = [item for item in string.strip('][').split(', ') if item.strip()]
    return out if out else nan

### Define String Extractor

In [1796]:
def string_substring(string, substring):
    """Report whether ``substring`` occurs inside ``string``, ignoring case.

    Returns True when the lower-cased ``substring`` is contained in the
    lower-cased ``string`` and False otherwise.
    """
    # Lower-casing both sides makes the vocabulary lookup case insensitive
    haystack = string.lower()
    needle = substring.lower()
    return needle in haystack

### Content Analysis method
Takes a vocabulary DataFrame as its argument and returns one entry per tweet: the list of concepts whose phrases match that tweet, or NaN when no concept matches

In [1797]:
def _conditions_met(text, conditionals):
    """Return True when at least one conditional phrase is satisfied by ``text``.

    ``text`` must already be lower-cased. A plain conditional phrase is
    satisfied when it occurs in the text. A phrase containing '%not%' is
    satisfied when the phrase with its first six characters removed (the
    marker plus one following character) does NOT occur in the text.
    """
    for con_phrase in conditionals:
        if con_phrase.lower() in text:
            return True
        # NOT operator: satisfied when the negated phrase is absent
        if '%not%' in con_phrase and con_phrase[6:].lower() not in text:
            return True
    return False


def content_analysis(vocabulary, texts=None):
    """Assign vocabulary concepts to each tweet text.

    A concept is assigned to a tweet when any of its 'Phrases' occurs in the
    text (case insensitive) and, if the concept has 'Conditional_Phrases', at
    least one of those conditions is satisfied as well.

    Parameters
    ----------
    vocabulary : pandas.DataFrame
        Must provide the columns 'Concept', 'Phrases' and
        'Conditional_Phrases'. The two phrase columns hold lists of strings;
        any non-list value (such as NaN) is treated as "no phrases" and
        "no conditions" respectively.
    texts : iterable of str, optional
        Texts to analyse. Defaults to the 'text' column of the module-level
        ``tweets`` DataFrame, which is what the original implementation used.

    Returns
    -------
    list
        One entry per text, in order: the list of matched concepts (each
        concept at most once), or ``nan`` when no concept matched.
    """
    if texts is None:
        texts = tweets['text']

    # Normalise the vocabulary once instead of once per tweet. Blank phrases
    # are dropped because an empty string is a substring of every text.
    rules = []
    for concept, phrases, conditionals in zip(vocabulary['Concept'],
                                              vocabulary['Phrases'],
                                              vocabulary['Conditional_Phrases']):
        if isinstance(phrases, (list, tuple)):
            phrases = [phrase.lower() for phrase in phrases if phrase.strip()]
        else:
            phrases = []
        if isinstance(conditionals, (list, tuple)):
            conditionals = [con for con in conditionals if con.strip()]
        else:
            conditionals = []
        rules.append((concept, phrases, conditionals))

    concepts_per_tweet = []
    for text in texts:
        # Lower-case each text once; a non-string (missing) text matches nothing
        text = text.lower() if isinstance(text, str) else ''
        matched = []
        for concept, phrases, conditionals in rules:
            if not any(phrase in text for phrase in phrases):
                continue
            # The conditions do not depend on which phrase matched, so they
            # are evaluated once per concept. This prevents the same concept
            # from being appended once for every matching phrase.
            if not conditionals or _conditions_met(text, conditionals):
                matched.append(concept)
        concepts_per_tweet.append(matched if matched else nan)
    return concepts_per_tweet

### Best attempt at using pandas functions as opposed to nested for loops
Just use the method above; if it is too slow, come back to this one and try to improve it

In [1798]:
# s = grievances.explode('Phrases').set_index('Phrases')['Concept']
# tweets['Grievances'] = (tweets['text'].str.split()
#                  .explode().map(s).dropna()
#                  .groupby(level=0).agg(set)
#                 )

In [1799]:
# Generic Code:
# s = df_2.explode('Words').set_index('Words')['Name']
# df_1['Names'] = (df_1['Paragraph'].str.split()
#                  .explode().map(s).dropna()
#                  .groupby(level=0).agg(set)
#                 )

### Load in All Vocabs
And ensure all vocab lists set up correctly

In [1800]:
# Load the grievance vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
grievances = pd.read_excel('DATA/Vocabularies/grievances.xlsx').drop(columns='Unnamed: 0')
grievances = grievances.assign(
    Phrases=grievances['Phrases'].apply(split_string),
    Conditional_Phrases=grievances['Conditional_Phrases'].apply(split_string),
)

In [1801]:
# Load the trigger vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
triggers = pd.read_excel('DATA/Vocabularies/trigger.xlsx').drop(columns='Unnamed: 0')
triggers = triggers.assign(
    Phrases=triggers['Phrases'].apply(split_string),
    Conditional_Phrases=triggers['Conditional_Phrases'].apply(split_string),
)

In [1802]:
# Load the tactic vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
tactics = pd.read_excel('DATA/Vocabularies/tactic.xlsx').drop(columns='Unnamed: 0')
tactics = tactics.assign(
    Phrases=tactics['Phrases'].apply(split_string),
    Conditional_Phrases=tactics['Conditional_Phrases'].apply(split_string),
)

In [1803]:
# Load the actor vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
actors = pd.read_excel('DATA/Vocabularies/actors.xlsx').drop(columns='Unnamed: 0')
actors = actors.assign(
    Phrases=actors['Phrases'].apply(split_string),
    Conditional_Phrases=actors['Conditional_Phrases'].apply(split_string),
)

In [1804]:
# Load the location vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
locations = pd.read_excel('DATA/Vocabularies/locations.xlsx').drop(columns='Unnamed: 0')
locations = locations.assign(
    Phrases=locations['Phrases'].apply(split_string),
    Conditional_Phrases=locations['Conditional_Phrases'].apply(split_string),
)

In [1805]:
# Load the weapon vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
weapons = pd.read_excel('DATA/Vocabularies/weapons.xlsx').drop(columns='Unnamed: 0')
weapons = weapons.assign(
    Phrases=weapons['Phrases'].apply(split_string),
    Conditional_Phrases=weapons['Conditional_Phrases'].apply(split_string),
)

In [1806]:
# Load the eventuality vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
eventualities = pd.read_excel('DATA/Vocabularies/eventuality.xlsx').drop(columns='Unnamed: 0')
eventualities = eventualities.assign(
    Phrases=eventualities['Phrases'].apply(split_string),
    Conditional_Phrases=eventualities['Conditional_Phrases'].apply(split_string),
)

In [1807]:
# Load the curiosity vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
curiosities = pd.read_excel('DATA/Vocabularies/curiosity.xlsx').drop(columns='Unnamed: 0')
curiosities = curiosities.assign(
    Phrases=curiosities['Phrases'].apply(split_string),
    Conditional_Phrases=curiosities['Conditional_Phrases'].apply(split_string),
)

In [1808]:
# Load the non-protest vocabulary without its 'Unnamed: 0' column, then turn the
# stringified phrase lists back into Python lists
non_protests = pd.read_excel('DATA/Vocabularies/non_protest.xlsx').drop(columns='Unnamed: 0')
non_protests = non_protests.assign(
    Phrases=non_protests['Phrases'].apply(split_string),
    Conditional_Phrases=non_protests['Conditional_Phrases'].apply(split_string),
)

###  Content Analysis for all vocabs:
* Grievances
* Trigger
* Tactic
* Actors
* Location
* Weapons
* Eventuality
* Curiosity
* Non-Protest

### NB: Find a way to remove the 'A value is trying to be set on a copy of a slice from a DataFrame.' warning

In [1809]:
# Temporarily silence pandas' chained-assignment warning (the default is 'warn')
pd.set_option('mode.chained_assignment', None)

In [1810]:
# Add the grievance concepts matched by each tweet as a new column
tweets = tweets.assign(grievances=content_analysis(grievances))

In [1811]:
# Add the trigger concepts matched by each tweet as a new column
tweets = tweets.assign(triggers=content_analysis(triggers))

In [1812]:
# Add the tactic concepts matched by each tweet as a new column
tweets = tweets.assign(tactics=content_analysis(tactics))

In [1813]:
# Add the actor concepts matched by each tweet as a new column
tweets = tweets.assign(actors=content_analysis(actors))

In [1814]:
# Add the location concepts matched by each tweet as a new column
tweets = tweets.assign(locations=content_analysis(locations))

In [1815]:
# Add the weapon concepts matched by each tweet as a new column
tweets = tweets.assign(weapons=content_analysis(weapons))

In [1816]:
# Add the eventuality concepts matched by each tweet as a new column
tweets = tweets.assign(eventualities=content_analysis(eventualities))

In [1817]:
# Add the curiosity concepts matched by each tweet as a new column
tweets = tweets.assign(curiosities=content_analysis(curiosities))

In [1818]:
# Add the non-protest concepts matched by each tweet as a new column
tweets = tweets.assign(non_protests=content_analysis(non_protests))

### Drop Tweets That Are Non-Protest
Unless the tweet also matches a grievance, trigger or tactic concept

In [1819]:
# Row and column counts before the non-protest filter is applied
tweets.shape

(19977, 26)

In [1820]:
# Keep a tweet when it has no non-protest concept, or when it also matched a
# grievance, trigger or tactic concept.
# .copy() makes the result an independent DataFrame rather than a slice of the
# original, so assigning columns to it later does not trigger
# SettingWithCopyWarning.
tweets = tweets[
    tweets['non_protests'].isna()
    | tweets['grievances'].notna()
    | tweets['triggers'].notna()
    | tweets['tactics'].notna()
].copy()

In [1821]:
# Row and column counts after the non-protest filter
tweets.shape

(18869, 26)

In [1822]:
# Preview the first three remaining rows, including the new concept columns
tweets.head(3)

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,...,geometry,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests
0,1575493060024143874,6160,@_Hermano_7 PSL players would riot 😭,2022-09-29 14:29:51+00:00,0,1,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",...,POINT (27.968098050000002 -26.276138500000002),,,[Disrupt],,,,,,[Football matches]
2,1575482837989474304,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa",...,POINT (28.184066 -25.75260515),[Services B],,[Disrupt],,,,,,
4,1575418000047472641,985,"Thunder ⛈ Fire Go Strike You, Fo De Ting You D...",2022-09-29 09:31:35+00:00,0,0,0,0,{'place_id': '2cef54f8b7d99a87'},"Kokosi, South Africa",...,POINT (27.46964785 -26.5042489),,,[Disrupt],,,,,,


In [1823]:
# Write the annotated tweets to disk; with the default settings the DataFrame
# index is written as the first column of the file
tweets.to_csv(path_or_buf='tweet_content.csv')