# Content Analysis

In [1]:
import ast
from cmath import nan
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Load the tweets CSV into `tweets`; later cells add one concept column per vocabulary to this frame.
tweets = pd.read_csv('DATA/Clean_Tweets_All_Info.csv')

In [3]:
# Remove the two 'Unnamed' columns (presumably index columns left over from
# earlier CSV round-trips -- confirm against the file that produced this CSV).
tweets = tweets.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [4]:
tweets.head(2)

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,mentioned_users,hashtags,emojis,urls,cleaned_text,coords,geometry
0,1575493060024143874,6160,@_Hermano_7 PSL players would riot 😭,2022-09-29 14:29:51+00:00,0,1,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",['@_Hermano_7'],,['😭'],,psl player would riot,"[27.7518557, -26.5126489, 28.1843404, -26.0396...",POINT (27.968098050000002 -26.276138500000002)
1,1575489974316584961,1329,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa",,,,https://t.co/hvAYVEs6NU,spx yesterday's rally\npoof go,"[28.2722463, -26.2315204, 28.4449594, -26.0681...",POINT (28.358602849999997 -26.14985695)


In [5]:
# tweets = tweets.head(100)

### Convert String Representation of a List to an Actual List of Strings

In [6]:
def split_string(string):
    """Turn the string form of a list (e.g. "['A', 'b']") into a list of lower-case strings.

    Parameters
    ----------
    string : str or missing value
        The raw cell content. Missing cells (None / NaN / NA) yield ``nan``.
        Other non-string scalars (e.g. a numeric cell) are converted with
        ``str`` before parsing instead of raising.

    Returns
    -------
    list of str, or nan
        The lower-cased items, or ``nan`` when the cell is missing.
    """
    if not isinstance(string, str):
        if string is None or pd.isna(string):
            return nan
        string = str(string)

    # Preferred path: the cell is a real Python literal, so let the parser
    # handle quoting. This keeps apostrophes ("don't") and commas inside a
    # phrase intact, which plain character stripping/splitting would mangle.
    try:
        parsed = ast.literal_eval(string.strip())
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None

    if (
        isinstance(parsed, (list, tuple))
        and parsed
        and all(isinstance(item, str) for item in parsed)
    ):
        return [item.lower() for item in parsed]

    # Fallback for hand-written cells that are not valid literals
    # (e.g. "water, lights" or "[a, b]"): the original lenient splitter.
    string = string.lower()
    string = string.replace("'", "")
    return string.strip('][').split(', ')

### Define String Extractor

In [7]:
def string_substring(string, substring):
    """Report whether `substring` occurs inside `string`, ignoring case.

    Both arguments are lower-cased before the containment test so that the
    vocabulary matches tweets regardless of capitalisation.
    """
    haystack = string.lower()
    needle = substring.lower()
    return needle in haystack

### Content Analysis method
Takes in a vocabulary as argument and outputs a list that corresponds to a list of concepts for each tweet

In [8]:
def _condition_satisfied(text, condition):
    """Evaluate one conditional phrase against an already lower-cased tweet text.

    A condition containing the '%not%' marker is satisfied when the phrase that
    follows the marker is absent from the text; any other condition is
    satisfied when it is present in the text.
    """
    marker = '%not%'
    if marker in condition:
        # Take whatever follows the marker instead of slicing a fixed number
        # of characters, so '%not%strike' and '%not% strike' behave the same.
        negated = condition.partition(marker)[2].lstrip()
        return negated not in text
    return condition in text


def content_analysis(vocabulary, texts=None):
    """Tag every tweet with the vocabulary concepts whose phrases it contains.

    A vocabulary row matches a tweet when at least one of its `Phrases` occurs
    in the tweet text (case-insensitive) and, if the row has
    `Conditional_Phrases`, at least one of those conditions is satisfied.

    Parameters
    ----------
    vocabulary : pandas.DataFrame
        Needs the columns 'Concept', 'Phrases' and 'Conditional_Phrases'.
        'Phrases' holds a list of strings; 'Conditional_Phrases' holds a list
        of strings or a missing value when the row is unconditional. Rows
        whose 'Phrases' cell is not a list are skipped.
    texts : iterable of str, optional
        The texts to analyse. Defaults to the 'text' column of the notebook's
        global `tweets` frame, which is what the original implementation used.

    Returns
    -------
    list
        One entry per text, in input order: the list of matched concepts, or
        ``nan`` when nothing matched or the text itself is missing. A concept
        is added at most once per vocabulary row.
    """
    if texts is None:
        texts = tweets['text']

    # Read the vocabulary once up front; it is identical for every tweet.
    rules = []
    for _, vocab in vocabulary.iterrows():
        phrases = vocab['Phrases']
        if not isinstance(phrases, (list, tuple)):
            continue
        conditions = vocab['Conditional_Phrases']
        if isinstance(conditions, (list, tuple)):
            conditions = [str(condition).lower() for condition in conditions]
        else:
            # Missing cell: the row has no extra condition.
            conditions = None
        rules.append(
            (vocab['Concept'], [str(phrase).lower() for phrase in phrases], conditions)
        )

    results = []
    for text in texts:
        # A missing text (NaN/None) cannot match anything.
        if not isinstance(text, str):
            results.append(nan)
            continue

        lowered = text.lower()
        matched = []
        for concept, phrases, conditions in rules:
            if not any(phrase in lowered for phrase in phrases):
                continue
            if conditions is None or any(
                _condition_satisfied(lowered, condition) for condition in conditions
            ):
                matched.append(concept)

        results.append(matched if matched else nan)
    return results

### Best attempt at using pandas functions as opposed to nested for loops
Use the method above; if it is too slow, come back to this one and try to improve it

In [9]:
# s = grievances.explode('Phrases').set_index('Phrases')['Concept']
# tweets['Grievances'] = (tweets['text'].str.split()
#                  .explode().map(s).dropna()
#                  .groupby(level=0).agg(set)
#                 )

In [10]:
# Generic Code:
# s = df_2.explode('Words').set_index('Words')['Name']
# df_1['Names'] = (df_1['Paragraph'].str.split()
#                  .explode().map(s).dropna()
#                  .groupby(level=0).agg(set)
#                 )

### Load in All Vocabs
And ensure all vocab lists set up correctly

In [11]:
# Read the grievances vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
grievances = (
    pd.read_excel('DATA/Vocabularies/grievances.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [12]:
# Read the trigger vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
triggers = (
    pd.read_excel('DATA/Vocabularies/trigger.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [13]:
# Read the tactic vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
tactics = (
    pd.read_excel('DATA/Vocabularies/tactic.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [14]:
# Read the actors vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
actors = (
    pd.read_excel('DATA/Vocabularies/actors.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [15]:
# Read the locations vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
locations = (
    pd.read_excel('DATA/Vocabularies/locations.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [16]:
# Read the weapons vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
weapons = (
    pd.read_excel('DATA/Vocabularies/weapons.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [17]:
# Read the eventuality vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
eventualities = (
    pd.read_excel('DATA/Vocabularies/eventuality.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [18]:
# Read the curiosity vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
curiosities = (
    pd.read_excel('DATA/Vocabularies/curiosity.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

In [19]:
# Read the non-protest vocabulary, drop the 'Unnamed: 0' column and parse both
# phrase columns from their string form into lists.
non_protests = (
    pd.read_excel('DATA/Vocabularies/non_protest.xlsx')
    .drop(columns='Unnamed: 0')
    .assign(
        Phrases=lambda frame: frame['Phrases'].apply(split_string),
        Conditional_Phrases=lambda frame: frame['Conditional_Phrases'].apply(split_string),
    )
)

###  Content Analysis for all vocabs:
* Grievances
* Trigger
* Tactic
* Actors
* Location
* Weapons
* Eventuality
* Curiosity
* Non-Protest

<br>
And Remove any duplicates

In [20]:
def unique_list(x):
    """Remove duplicate entries from a list of concepts, keeping first-seen order.

    Parameters
    ----------
    x : iterable of hashable items, or a missing value (None / NaN)

    Returns
    -------
    list or nan
        The distinct items in the order they first appear, or ``nan`` when
        `x` is missing.
    """
    if x is None or (isinstance(x, float) and x != x):
        return nan
    # dict keys keep insertion order, so the result is the same on every run;
    # going through set() would make the order depend on string hashing.
    return list(dict.fromkeys(x))

In [21]:
# Tag each tweet with its grievance concepts and de-duplicate them in one pass.
tweets['grievances'] = [unique_list(found) for found in content_analysis(grievances)]

In [22]:
# Tag each tweet with its trigger concepts and de-duplicate them in one pass.
tweets['triggers'] = [unique_list(found) for found in content_analysis(triggers)]

In [23]:
# Tag each tweet with its tactic concepts and de-duplicate them in one pass.
tweets['tactics'] = [unique_list(found) for found in content_analysis(tactics)]

In [24]:
# Tag each tweet with its actor concepts and de-duplicate them in one pass.
tweets['actors'] = [unique_list(found) for found in content_analysis(actors)]

In [25]:
# Tag each tweet with its location concepts and de-duplicate them in one pass.
tweets['locations'] = [unique_list(found) for found in content_analysis(locations)]

In [26]:
# Tag each tweet with its weapon concepts and de-duplicate them in one pass.
tweets['weapons'] = [unique_list(found) for found in content_analysis(weapons)]

In [27]:
# Tag each tweet with its eventuality concepts and de-duplicate them in one pass.
tweets['eventualities'] = [unique_list(found) for found in content_analysis(eventualities)]

In [28]:
# Tag each tweet with its curiosity concepts and de-duplicate them in one pass.
tweets['curiosities'] = [unique_list(found) for found in content_analysis(curiosities)]

In [29]:
# Tag each tweet with its non-protest concepts and de-duplicate them in one pass.
tweets['non_protests'] = [unique_list(found) for found in content_analysis(non_protests)]

### Drop Tweets That Are Non-Protest
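Unless the tweet also contains a grievance, trigger or tactic. Note: the filter in the cell below is currently commented out, so no tweets are dropped.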

In [30]:
tweets.shape

(19977, 26)

In [31]:
# tweets = tweets[(tweets['non_protests'].isna()) | 
#                  tweets['grievances'].notna() | 
#                  tweets['triggers'].notna() |
#                  tweets['tactics'].notna()]

In [32]:
tweets.shape

(19977, 26)

In [33]:
tweets.head(3)

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,...,geometry,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests
0,1575493060024143874,6160,@_Hermano_7 PSL players would riot 😭,2022-09-29 14:29:51+00:00,0,1,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",...,POINT (27.968098050000002 -26.276138500000002),,,[Disrupt],,,,,,[Football matches]
1,1575489974316584961,1329,Spx yesterday's rally\nPoof gone https://t.co/...,2022-09-29 14:17:35+00:00,0,0,1,0,{'place_id': 'a02e6c261fa62b42'},"Benoni, South Africa",...,POINT (28.358602849999997 -26.14985695),,,,[Political Party],,,,,[Election campaigns]
2,1575482837989474304,1436,@JohnPerlman A driverless car wouldn't work in...,2022-09-29 13:49:14+00:00,1,0,0,0,{'place_id': '0e587c59401d0a27'},"Pretoria, South Africa",...,POINT (28.184066 -25.75260515),[Services B],,[Disrupt],,,,,,


In [34]:
# Save the tweets together with the concept columns added above.
# to_csv writes the index by default, so the file gets a leading unnamed column.
tweets.to_csv('DATA/tweet_content.csv')

### Content Table and Frequency Analysis

In [35]:
# Build the concept table: keep everything except the rule/phrase columns and
# start every concept's 'Occurances' counter at zero.
content_table = (
    pd.read_excel('DATA/Protest is SA   SAS rules V2 - Separated.xlsx')
    .drop(columns=['Phrases', 'Conditional_Phrases', 'Afrikaans_Phrases', 'Rule'])
    .assign(Occurances=0)
)

In [36]:
content_table.head()

Unnamed: 0,Label,Concept,Occurances
0,Grievance,Capitalism,0
1,Grievance,Values,0
2,Grievance,Conditions,0
3,Grievance,Contract end,0
4,Grievance,Community recognition,0


In [37]:
def count_occurances(col_name, label):
    """Add the tweet counts of one vocabulary to the global `content_table`.

    For every row of `content_table` whose 'Label' equals `label`, the row's
    'Occurances' value is increased by the number of times its 'Concept'
    appears in the lists stored in ``tweets[col_name]``.

    Parameters
    ----------
    col_name : str
        Column of the global `tweets` frame holding a list of concepts per
        tweet (missing values mean the tweet has no concept).
    label : str
        Value of 'Label' that selects the `content_table` rows to update.

    Side effects
    ------------
    Rebinds the global `content_table` to a frame whose 'Occurances' column
    (placed last) holds the updated counts.
    """
    global content_table

    # Tally every concept once instead of re-scanning content_table per tweet.
    tally = Counter()
    for concepts in tweets[col_name]:
        # Missing values (NaN) mark tweets without any concept for this column.
        if isinstance(concepts, (list, tuple, set)):
            tally.update(concepts)

    occurances = list(content_table['Occurances'])
    # enumerate() yields positions, so the update stays correct even when
    # content_table does not have a default 0..n-1 index.
    rows = zip(content_table['Label'], content_table['Concept'])
    for position, (row_label, row_concept) in enumerate(rows):
        if row_label == label:
            occurances[position] += tally.get(row_concept, 0)

    content_table = content_table.drop(columns='Occurances').assign(Occurances=occurances)

In [38]:
# Count every vocabulary in turn: (tweets column, content_table label, progress message).
for col_name, label, message in (
    ('grievances', 'Grievance', 'Counted Grievances!'),
    ('triggers', 'Trigger', 'Counted Triggers!'),
    ('tactics', 'Tactic', 'Counted Tactics!'),
    ('actors', 'Actors', 'Counted Actors!'),
    ('locations', 'Location', 'Counted Locations!'),
    ('weapons', 'Weapons', 'Counted Weapons!'),
    ('eventualities', 'Eventuality', 'Counted Eventualities!'),
    ('curiosities', 'Curiosity', 'Counted Curiosities!'),
    ('non_protests', 'Non-protest', 'Counted Non-Protests!'),
):
    count_occurances(col_name, label)
    print(message)
print('Done!')

Counted Grievances!
Counted Triggers!
Counted Tactics!
Counted Actors!
Counted Locations!
Counted Weapons!
Counted Eventualities!
Counted Curiosities!
Counted Non-Protests!
Done!


In [39]:
content_table.head(10)

Unnamed: 0,Label,Concept,Occurances
0,Grievance,Capitalism,11
1,Grievance,Values,50
2,Grievance,Conditions,496
3,Grievance,Contract end,22
4,Grievance,Community recognition,11
5,Grievance,Demolitions,43
6,Grievance,Education,507
7,Grievance,Election outcome,14
8,Grievance,Electricity,639
9,Grievance,Labour related,558


In [40]:
# Save the concept table with its occurrence counts.
# to_csv writes the index by default, so the file gets a leading unnamed column.
content_table.to_csv('DATA/content_table.csv')