# Content Analysis

In [311]:
import numpy as np
import pandas as pd
from cmath import nan

In [312]:
# Load the cleaned tweets and discard the two 'Unnamed' columns (presumably
# indexes saved by earlier exports) in the same statement, so re-running the
# cell never fails with a KeyError from dropping the columns a second time.
tweets = (
    pd.read_csv('DATA/Clean_Tweets_All_Info.csv')
    .drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])
)

In [314]:
tweets.head(2)

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,cleaned_text,coords,geometry
0,923338250261008386,9188,Thatha celebrity https://t.co/aykbdCw1Pf,2017-10-25 23:59:28+00:00,0,1,0,0,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",thatha celebrity,"[27.7518557, -26.5126489, 28.1843404, -26.0396...",POINT (27.968098050000002 -26.276138500000002)
1,923338197563854848,598,@Baba_Bangude Shoda is a kakistðŸ˜‚,2017-10-25 23:59:15+00:00,0,0,0,0,{'place_id': '557dd67203c6e04c'},"Vereeniging, South Africa",shoda kakist,"[27.8525058, -26.7255165, 28.0249225, -26.5247...",POINT (27.93871415 -26.625155550000002)


In [315]:
# tweets = tweets.head(100)

### Convert String Representation of a List to an Actual List of Strings

In [316]:
def split_string(string):
    """Convert the text form of a list (e.g. "['a', 'b']") into a list of strings.

    The text is lower-cased, single quotes are removed, the surrounding
    square brackets are stripped and the remainder is split on ", ".

    Parameters
    ----------
    string : str, None or NaN
        Cell value to parse. ``None`` and NaN are treated as missing.

    Returns
    -------
    list of str or nan
        The lower-cased entries, or ``nan`` when the value is missing or
        contains no non-empty entry.
    """
    # NaN is the only value that is not equal to itself.
    if string is None or string != string:
        return nan

    string = string.lower()
    string = string.replace("'", "")
    # Drop empty entries: an empty phrase (as produced by "[]" or a trailing
    # separator) is a substring of every text and would match every tweet.
    out = [item for item in string.strip('][').split(', ') if item]
    return out if out else nan

### Define String Extractor

In [317]:
def string_substring(string, substring):
    """Return True when `substring` occurs in `string`, ignoring case."""
    # Compare lower-cased copies so the vocabulary lookup is case insensitive.
    haystack = string.lower()
    needle = substring.lower()
    return needle in haystack

### Content Analysis method
Takes a vocabulary as its argument and returns one entry per tweet: the list of concepts from that vocabulary that the tweet matches, or NaN when it matches none.

In [318]:
def _is_missing(value):
    """Return True for None or a float NaN (how empty cells are represented)."""
    return value is None or (isinstance(value, float) and value != value)


def _prepare_conditions(conditions):
    """Lower-case conditional phrases and split off the %not% operator.

    Returns a list of (phrase, negated_phrase) pairs; negated_phrase is None
    for a conditional phrase that does not contain '%not%'.
    """
    prepared = []
    for condition in conditions:
        # The negated phrase is everything after the first 6 characters, i.e.
        # a leading '%not%' followed by one separator character is assumed.
        negated = condition[6:].lower() if '%not%' in condition else None
        prepared.append((condition.lower(), negated))
    return prepared


def _conditions_met(text, conditions):
    """Return True when at least one prepared conditional phrase is satisfied."""
    for phrase, negated in conditions:
        if phrase in text:
            return True
        # NOT operator: satisfied when the negated phrase is absent.
        if negated is not None and negated not in text:
            return True
    return False


def content_analysis(vocabulary, tweets_df=None):
    """Tag every tweet with the vocabulary concepts whose rules it satisfies.

    A vocabulary row matches a tweet when at least one of its ``Phrases``
    occurs in the tweet text (case-insensitive substring match) and either
    the row has no ``Conditional_Phrases`` (NaN) or at least one conditional
    phrase is satisfied. A plain conditional phrase is satisfied when it
    occurs in the text; one containing ``%not%`` is also satisfied when the
    part after the operator does not occur in the text.

    Parameters
    ----------
    vocabulary : pandas.DataFrame
        Needs the columns ``Concept``, ``Phrases`` (list of str) and
        ``Conditional_Phrases`` (list of str, or NaN for "no condition").
        Rows whose ``Phrases`` value is missing are ignored.
    tweets_df : pandas.DataFrame, optional
        Frame with a ``text`` column. Defaults to the notebook-level
        ``tweets`` frame.

    Returns
    -------
    list
        One entry per tweet, in row order: the list of matched concepts
        (each concept at most once) or ``nan`` when nothing matched or the
        text is missing.
    """
    if tweets_df is None:
        tweets_df = tweets

    # Pre-process the vocabulary once instead of once per tweet.
    rules = []
    for concept, phrases, conditions in zip(vocabulary['Concept'],
                                            vocabulary['Phrases'],
                                            vocabulary['Conditional_Phrases']):
        if _is_missing(phrases):
            continue  # a row without phrases can never match
        prepared = None if _is_missing(conditions) else _prepare_conditions(conditions)
        rules.append((concept, [phrase.lower() for phrase in phrases], prepared))

    list_ = []
    for text in tweets_df['text']:
        sub_list = []
        if isinstance(text, str):
            # Lower-case the tweet once; the phrases are already lower-cased.
            text = text.lower()
            for concept, phrases, conditions in rules:
                if concept in sub_list:
                    continue  # record each concept only once per tweet
                if not any(phrase in text for phrase in phrases):
                    continue
                if conditions is None or _conditions_met(text, conditions):
                    sub_list.append(concept)
        list_.append(sub_list if sub_list else nan)
    return list_

### Best attempt using pandas functions instead of nested for loops
Use the method above by default; if it is too slow, come back to this one and try to improve it.

In [319]:
# s = grievances.explode('Phrases').set_index('Phrases')['Concept']
# tweets['Grievances'] = (tweets['text'].str.split()
#                  .explode().map(s).dropna()
#                  .groupby(level=0).agg(set)
#                 )

In [320]:
# Generic Code:
# s = df_2.explode('Words').set_index('Words')['Name']
# df_1['Names'] = (df_1['Paragraph'].str.split()
#                  .explode().map(s).dropna()
#                  .groupby(level=0).agg(set)
#                 )

### Load in All Vocabs
Each vocabulary's phrase columns are parsed from their text form into real lists.

In [321]:
def load_vocabulary(path):
    """Read one vocabulary workbook and parse its phrase columns into lists.

    Parameters
    ----------
    path : str
        Location of the Excel file. It must contain the columns
        'Unnamed: 0', 'Phrases' and 'Conditional_Phrases'.

    Returns
    -------
    pandas.DataFrame
        The vocabulary without the 'Unnamed: 0' column, with 'Phrases' and
        'Conditional_Phrases' converted by `split_string`.
    """
    vocabulary = pd.read_excel(path).drop(columns='Unnamed: 0')
    for column in ('Phrases', 'Conditional_Phrases'):
        vocabulary[column] = vocabulary[column].apply(split_string)
    return vocabulary


# One vocabulary per concept family; all files share the same layout.
grievances = load_vocabulary('DATA/Vocabularies/grievances.xlsx')
triggers = load_vocabulary('DATA/Vocabularies/trigger.xlsx')
tactics = load_vocabulary('DATA/Vocabularies/tactic.xlsx')
actors = load_vocabulary('DATA/Vocabularies/actors.xlsx')
locations = load_vocabulary('DATA/Vocabularies/locations.xlsx')
weapons = load_vocabulary('DATA/Vocabularies/weapons.xlsx')
eventualities = load_vocabulary('DATA/Vocabularies/eventuality.xlsx')
curiosities = load_vocabulary('DATA/Vocabularies/curiosity.xlsx')
non_protests = load_vocabulary('DATA/Vocabularies/non_protest.xlsx')

###  Content Analysis for all vocabs:
* Grievances
* Trigger
* Tactic
* Actors
* Location
* Weapons
* Eventuality
* Curiosity
* Non-Protest

<br>
Then remove any duplicate concepts from each tweet's list.

In [330]:
def unique_list(x):
    """Remove duplicate entries from a list, keeping first-seen order.

    Parameters
    ----------
    x : iterable, None or NaN
        Concepts found for one tweet. ``None`` and NaN mean "no concepts".

    Returns
    -------
    list or nan
        The distinct entries in order of first appearance, or ``nan`` when
        the input is missing.
    """
    if x is None or (isinstance(x, float) and x != x):
        return nan
    # dict keys are unique and keep insertion order, so the result is the
    # same on every run; a set orders strings differently between sessions.
    return list(dict.fromkeys(x))

In [331]:
# Tag every tweet with the concepts it matches in each vocabulary and then
# de-duplicate each tweet's concept list. Maps new tweet column -> vocabulary.
vocabularies = {
    'grievances': grievances,
    'triggers': triggers,
    'tactics': tactics,
    'actors': actors,
    'locations': locations,
    'weapons': weapons,
    'eventualities': eventualities,
    'curiosities': curiosities,
    'non_protests': non_protests,
}

for column, vocabulary in vocabularies.items():
    tweets[column] = content_analysis(vocabulary)
    tweets[column] = tweets[column].apply(unique_list)

### Drop Tweets That Are Non-Protest
A non-protest tweet is kept only if it also matches a grievance, trigger, or tactic.

In [340]:
tweets.shape

(46601, 22)

In [341]:
# Keep a tweet when it has no non-protest concept, or when it has at least one
# grievance, trigger or tactic concept.
tweets = tweets.loc[
    tweets['non_protests'].isna()
    | tweets[['grievances', 'triggers', 'tactics']].notna().any(axis=1)
]

In [342]:
tweets.shape

(45073, 22)

In [343]:
# Keep only tweets that matched at least one concept in any of these columns
# (non_protests is deliberately not part of the list).
tweets = tweets.loc[
    tweets[['grievances', 'triggers', 'tactics', 'actors',
            'locations', 'weapons', 'eventualities', 'curiosities']]
    .notna()
    .any(axis=1)
]

In [344]:
tweets.shape

(12509, 22)

In [345]:
tweets.head(3)

Unnamed: 0,tweet_id,author_followers,text,created_at,retweets,replies,likes,quote_count,place_id,place_name,...,geometry,grievances,triggers,tactics,actors,locations,weapons,eventualities,curiosities,non_protests
1,923338197563854848,598,@Baba_Bangude Shoda is a kakistðŸ˜‚,2017-10-25 23:59:15+00:00,0,0,0,0,{'place_id': '557dd67203c6e04c'},"Vereeniging, South Africa",...,POINT (27.93871415 -26.625155550000002),,,[Boycott],[Political Party],,,,,
2,923338139451772929,5590,My Zulu comprehension doesn't allow me to enjo...,2017-10-25 23:59:01+00:00,0,0,0,0,{'place_id': '8b9ec16fdc0d7e55'},"Cape Town, South Africa",...,POINT (18.4241 -33.9249),,,,[Political Party],,,,,
3,923338012288782336,9188,Today is going to be a beautiful day ðŸ˜Š https:/...,2017-10-25 23:58:31+00:00,0,0,0,1,{'place_id': 'e564d30dc173d2a8'},"Johannesburg, South Africa",...,POINT (27.968098050000002 -26.276138500000002),,,,[Political Party],,,,,


In [346]:
# Save the filtered tweets together with their concept columns.
# NOTE(review): the DataFrame index is written as well (pandas default), so
# reading this file back presumably yields an extra 'Unnamed: 0' column —
# confirm whether index=False is wanted before changing it.
tweets.to_csv('DATA/tweet_content.csv')

### Content Table and Frequency Analysis

In [347]:
# Load the rule sheet, keep only the remaining (non-rule) columns and start
# every concept with a zero occurrence count.
content_table = (
    pd.read_excel('DATA/Protest is SA   SAS rules V2 - Separated.xlsx')
    .drop(['Phrases', 'Conditional_Phrases', 'Afrikaans_Phrases', 'Rule'], axis=1)
)
content_table['Occurances'] = 0

In [348]:
content_table.head()

Unnamed: 0,Label,Concept,Occurances
0,Grievance,Capitalism,0
1,Grievance,Values,0
2,Grievance,Conditions,0
3,Grievance,Contract end,0
4,Grievance,Community recognition,0


In [349]:
def count_occurances(col_name, label):
    """Add the concept counts of one tweet column to the global content table.

    For every concept listed in ``tweets[col_name]``, the 'Occurances' value
    of each ``content_table`` row with that 'Concept' and the given 'Label'
    is increased by one. Tweets without concepts (None / NaN) are skipped.

    Parameters
    ----------
    col_name : str
        Column of the notebook-level ``tweets`` frame holding concept lists.
    label : str
        Value of the 'Label' column the counted rows must have.

    Returns
    -------
    None
        The notebook-level ``content_table`` is rebound to the updated frame.
    """
    global content_table

    # Row positions (not index labels) of this label's rows, keyed by concept,
    # so each concept is a dictionary lookup instead of a scan of the table.
    rows_by_concept = {}
    labelled_concepts = zip(content_table['Label'], content_table['Concept'])
    for position, (row_label, concept) in enumerate(labelled_concepts):
        if row_label == label:
            rows_by_concept.setdefault(concept, []).append(position)

    occurances = list(content_table['Occurances'])
    for concepts in tweets[col_name]:
        # NaN (a float that differs from itself) marks "no concepts".
        if concepts is None or (isinstance(concepts, float) and concepts != concepts):
            continue
        for content in concepts:
            for position in rows_by_concept.get(content, ()):
                occurances[position] += 1

    content_table = content_table.drop('Occurances', axis=1)
    content_table['Occurances'] = occurances

In [350]:
# (tweet column, content-table label, progress message) for every vocabulary.
counting_steps = [
    ('grievances', 'Grievance', 'Counted Grievances!'),
    ('triggers', 'Trigger', 'Counted Triggers!'),
    ('tactics', 'Tactic', 'Counted Tactics!'),
    ('actors', 'Actors', 'Counted Actors!'),
    ('locations', 'Location', 'Counted Locations!'),
    ('weapons', 'Weapons', 'Counted Weapons!'),
    ('eventualities', 'Eventuality', 'Counted Eventualities!'),
    ('curiosities', 'Curiosity', 'Counted Curiosities!'),
    ('non_protests', 'Non-protest', 'Counted Non-Protests!'),
]

for column, label, message in counting_steps:
    count_occurances(column, label)
    print(message)
print('Done!')

Counted Grievances!
Counted Triggers!
Counted Tactics!
Counted Actors!
Counted Locations!
Counted Weapons!
Counted Eventualities!
Counted Curiosities!
Counted Non-Protests!
Done!


In [351]:
content_table.head(10)

Unnamed: 0,Label,Concept,Occurances
0,Grievance,Capitalism,9
1,Grievance,Values,43
2,Grievance,Conditions,404
3,Grievance,Contract end,8
4,Grievance,Community recognition,16
5,Grievance,Demolitions,21
6,Grievance,Education,47
7,Grievance,Election outcome,0
8,Grievance,Electricity,321
9,Grievance,Labour related,153


In [352]:
# Save the per-concept occurrence counts.
content_table.to_csv('DATA/content_table.csv')

### Content Frequency Table

In [353]:
# Number of tweets left after filtering; used as the frequency denominator.
total = tweets.shape[0]
print(total)

12509


In [354]:
# Total occurrences per label, with the label as a regular column.
frequency_table = content_table.groupby('Label')['Occurances'].sum().reset_index()

In [355]:
# Occurrences per 100 retained tweets, rounded to two decimals. 'Occurances' is
# summed over concepts, so a tweet matching several concepts of one label is
# counted several times; the value is not the share of tweets with that label.
frequency_table['Frequency'] = frequency_table['Occurances'].apply(lambda x: round((x/total)*100,2))

In [356]:
frequency_table.head(10)

Unnamed: 0,Label,Occurances,Frequency
0,Actors,4347,34.75
1,Curiosity,458,3.66
2,Eventuality,188,1.5
3,Grievance,4107,32.83
4,Location,1684,13.46
5,Non-protest,596,4.76
6,Tactic,4521,36.14
7,Trigger,1857,14.85
8,Weapons,113,0.9
