# Cleaning up and Creating Content Vocabulary

### Import Everything
Just easier that way

In [109]:
import numpy as np
import pandas as pd
import os

# Word processing libraries
import re
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

#Nan
from cmath import nan

In [110]:
vocab = pd.read_excel('DATA/Protest is SA   SAS rules V2 - Separated.xlsx')

In [111]:
vocab.shape

(92, 6)

In [112]:
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Conditional_Phrases,Afrikaans_Phrases,Rule
0,Grievance,Capitalism,"Capitalism,neoliberalism,privatisation,privati...",,"Kapitalisme, privatisasie, privatiseer, neolib...",Multiple grievances ok
1,Grievance,Values,"Abuse,against xenophobia",,"mishandeling, teen xenofobie",Multiple grievances ok
2,Grievance,Conditions,"poverty,unemployment,jobs,job",,"armoede, werkloosheid, werke",Multiple grievances ok


In [113]:
vocab = vocab.dropna(how = 'all')

In [114]:
vocab.shape

(92, 6)

In [115]:
vocab[(vocab['Afrikaans_Phrases'].isnull())]

Unnamed: 0,Label,Concept,Phrases,Conditional_Phrases,Afrikaans_Phrases,Rule
9,Grievance,Labour related,"Work,workers,Company,Employ,employer,employee,...","corrupt,fired,dismissed,policy,other people,ho...",,Multiple grievances ok


### Drop Rule
Important to take note of what the rule column means: </br>
Rule indicates whether text can be grouped into multiple categories </br>
Rules are as follows:
Can Contain Multiple:
* Grievance
* Actors
* Locations
* Weapons
* Eventualities
* Curiosity

Can Only Have 1:
* Trigger
* Tactic</br>
Re-think this because it could be possible to have multiple. </br>
Events could be a combination of multiple things </br>
eg: could be a gathering and a march</br>

Discard:
* Non-protest</br>
    Unless it contains:
    * Grievance
    * Trigger
    * Tactic
    * Actor -> only for "16 days of activism"


Now we can drop Rule

In [116]:
vocab = vocab.drop('Rule', axis = 1)

In [117]:
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Conditional_Phrases,Afrikaans_Phrases
0,Grievance,Capitalism,"Capitalism,neoliberalism,privatisation,privati...",,"Kapitalisme, privatisasie, privatiseer, neolib..."
1,Grievance,Values,"Abuse,against xenophobia",,"mishandeling, teen xenofobie"
2,Grievance,Conditions,"poverty,unemployment,jobs,job",,"armoede, werkloosheid, werke"


### Drop Afrikaans for now
We can always create functionality for it at a later date

In [118]:
vocab = vocab.drop('Afrikaans_Phrases', axis = 1)

In [119]:
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Conditional_Phrases
0,Grievance,Capitalism,"Capitalism,neoliberalism,privatisation,privati...",
1,Grievance,Values,"Abuse,against xenophobia",
2,Grievance,Conditions,"poverty,unemployment,jobs,job",


### Lemmatise Vocabulary so that it can easily be matched to text in tweet

In [120]:
# Define the function to implement POS tagging:
def get_wordnet_pos(pos_tag):
    if pos_tag.startswith('J'):
        return wordnet.ADJ
    elif pos_tag.startswith('V'):
        return wordnet.VERB
    elif pos_tag.startswith('N'):
        return wordnet.NOUN
    elif pos_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


# Define the main function to clean text in various ways:
def clean_text(text):
    if text != text:
        return nan

    else:
        # Apply regex expressions first before converting string to list of tokens/words:

        # 5. Convert text to lowercase
        text = text.lower()
        
        # 6. tokenize text and remove punctuation
        text = [word.strip(string.punctuation) for word in text.split(" ")]
        
        # 7. remove numbers
        text = [word for word in text if not any(c.isdigit() for c in word)]
        
        # 8. remove stop words
        stop = stopwords.words('english')
        text = [x for x in text if x not in stop]
        
        # 9. remove empty tokens
        text = [t for t in text if len(t) > 0]
        
        # 10. pos tag text and lemmatize text
        pos_tags = pos_tag(text)
        text = [WordNetLemmatizer().lemmatize(t[0], get_wordnet_pos(t[1])) for t in pos_tags]
        
        # 11. remove words with only one letter
        text = [t for t in text if len(t) > 1]
        
        # join all
        text = " ".join(text)
    
    return(text)

In [121]:
vocab.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92 entries, 0 to 91
Data columns (total 4 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Label                92 non-null     object
 1   Concept              92 non-null     object
 2   Phrases              92 non-null     object
 3   Conditional_Phrases  15 non-null     object
dtypes: object(4)
memory usage: 3.0+ KB


In [122]:
vocab['cleaned_phrases'] = vocab['Phrases'].apply(lambda x: clean_text(x))
vocab['cleaned_conditional_phrases'] = vocab['Conditional_Phrases'].apply(lambda x: clean_text(x))

In [123]:
vocab.Phrases = vocab.Phrases.str. split(',')
vocab.Conditional_Phrases = vocab.Conditional_Phrases.str.split(',')
vocab.cleaned_phrases = vocab.cleaned_phrases.str. split(' ')
vocab.cleaned_conditional_phrases = vocab.cleaned_conditional_phrases.str. split(' ')

### Matching Content Analysis
Do content analysis for both lematised and non lematised text to make sure

In [124]:
vocab.head(10)

Unnamed: 0,Label,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
0,Grievance,Capitalism,"[Capitalism, neoliberalism, privatisation, pri...",,"[capitalism,neoliberalism,privatisation,privat...",
1,Grievance,Values,"[Abuse, against xenophobia]",,"[abuse,against, xenophobia]",
2,Grievance,Conditions,"[poverty, unemployment, jobs, job]",,"[poverty,unemployment,jobs,job]",
3,Grievance,Contract end,"[contracts expire, project end, completed, con...",,"[contract, expire,project, end,completed,contr...",
4,Grievance,Community recognition,"[their rights, recognition]",,"[rights,recognition]",
5,Grievance,Demolitions,"[destroyed, evicted, demolished, relocated, re...",,"[destroyed,evicted,demolished,relocated,remova...",
6,Grievance,Education,"[School, university, students, student, varsit...","[fees, costs, teacher, permission, results, po...","[school,university,students,student,varsity,va...","[fees,costs,teacher,permission,results,policy,..."
7,Grievance,Election outcome,[Election],"[outcome, result, winner, unfair, cheat, wrong...",[election],"[outcome,result,winner,unfair,cheat,wrong,coun..."
8,Grievance,Electricity,"[Electricity, Power, connections, loadshedding...","[cost, price, supply, cut, disconnect, loadshe...","[electricity,power,connections,loadshedding,lo...","[cost,price,supply,cut,disconnect,loadshedding..."
9,Grievance,Labour related,"[Work, workers, Company, Employ, employer, emp...","[corrupt, fired, dismissed, policy, other peop...","[work,workers,company,employ,employer,employee...","[corrupt,fired,dismissed,policy,other, people,..."


In [125]:
print(vocab.Phrases[0])

['Capitalism', 'neoliberalism', 'privatisation', 'privatise', 'privatize', 'privatization']


### Split Up various Labels Into Own Dataframe

#### Grievances

In [126]:
grievances = vocab.loc[vocab['Label'] == 'Grievance']
grievances = grievances.drop('Label', axis = 1)
grievances.to_excel('DATA/Vocabularies/grievances.xlsx')
grievances.shape

(36, 5)

In [127]:
grievances.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
0,Capitalism,"[Capitalism, neoliberalism, privatisation, pri...",,"[capitalism,neoliberalism,privatisation,privat...",
1,Values,"[Abuse, against xenophobia]",,"[abuse,against, xenophobia]",
2,Conditions,"[poverty, unemployment, jobs, job]",,"[poverty,unemployment,jobs,job]",
3,Contract end,"[contracts expire, project end, completed, con...",,"[contract, expire,project, end,completed,contr...",
4,Community recognition,"[their rights, recognition]",,"[rights,recognition]",


#### Trigger

In [128]:
trigger = vocab.loc[vocab['Label'] == 'Trigger']
trigger = trigger.drop('Label', axis = 1)
trigger.to_excel('DATA/Vocabularies/trigger.xlsx')
trigger.shape

(13, 5)

In [129]:
trigger.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
36,Court hearing,"[court, hearing]",,"[court,hearing]",
37,Project begin,"[not local, not from area, outside, only be lo...",,"[local,not, area,outside,only, local,only, loc...",
38,National Strike,"[COSATU, NUMSA, national strike]",,"[cosatu,numsa,national, strike]",
39,Dismissals,"[Fired, suspended, dismissed, discipline, diss...",,"[fired,suspended,dismissed,discipline,dissmiss...",
40,Wage disputes,"[Payment, salary, cheque, overtime, wage, wage...",,"[payment,salary,cheque,overtime,wage,wages,rem...",


#### Tactic

In [130]:
tactic = vocab.loc[vocab['Label'] == 'Tactic']
tactic = tactic.drop('Label', axis = 1)
tactic.to_excel('DATA/Vocabularies/tactic.xlsx')
tactic.shape

(14, 5)

In [131]:
tactic.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
49,Hostage,"[Captive, hostage, lock-in, abduct]",,"[captive,hostage,lock-in,abduct]",
50,Attack,"[Atack, Attack, Throw, threw, pelt, torch, cho...",,"[atack,attack,throw,threw,pelt,torch,choas,los...",
51,Disrupt,"[Tyre, Block, tire, burn, Fire, Bricks, Stones...",,"[tyre,block,tire,burn,fire,bricks,stones,rubbi...",
52,M&M,[March],[Memorandum],[march],[memorandum]
53,Vandalism,"[damaged, vandalised, break, broke]",,"[damaged,vandalised,break,broke]",


#### Actors

In [132]:
actors = vocab.loc[vocab['Label'] == 'Actors']
actors = actors.drop('Label', axis = 1)
actors.to_excel('DATA/Vocabularies/actors.xlsx')
actors.shape

(4, 5)

In [133]:
actors.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
63,Union,"[COSATU, FOSATU, NEHAWU, SATAWU, Allied, AMCU,...",,"[cosatu,fosatu,nehawu,satawu,allied,amcu,nuhhr...",
64,Political Party,"[ANC, Congress, DA, EFF, IFP, NFP, Cope, UDM, ...",,"[anc,congress,da,eff,ifp,nfp,cope,udm,sacp,acdp]",
65,Civic org,"[association, residents, concerned, ratepayers...",,"[association,residents,concerned,ratepayers,ta...",
66,Church,"[church, congregation, believers, temple, mosq...",,"[church,congregation,believers,temple,mosque,s...",


#### Locations

In [134]:
locations = vocab.loc[vocab['Label'] == 'Location']
locations = locations.drop('Label', axis = 1)
locations.to_excel('DATA/Vocabularies/locations.xlsx')
locations.shape

(7, 5)

In [135]:
locations.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
67,Informal area,"[Shacks, sharks, informal, hostel, squatter, c...",,"[shacks,sharks,informal,hostel,squatter,camp,t...",
68,Court,"[Court, hearing]",,"[court,hearing]",
69,Business premises,"[company, business, gate, premises, factory]",,"[company,business,gate,premises,factory]",
70,Stadium,"[Stadium, sports field]",,"[stadium,sports, field]",
71,Church,"[church, congregation, believers, temple, musl...",,"[church,congregation,believers,temple,muslim,j...",


#### Weapons

In [136]:
weapons = vocab.loc[vocab['Label'] == 'Weapons']
weapons = weapons.drop('Label', axis = 1)
weapons.to_excel('DATA/Vocabularies/weapons.xlsx')
weapons.shape

(3, 5)

In [137]:
weapons.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
74,Crowd weapons,"[Weapon, Knife, Traditional weapon, knobkierie...",,"[weapon,knife,traditional, weapon,knobkierie,k...",
75,Crowd projectiles,"[Throw, Threw, Attack, attacked ]","[Stone, rock, brick, brike ]","[throw,threw,attack,attacked]","[stone,rock,brick,brike]"
76,Police weapons,"[Gun, cannon, canon, shotgun, s/gun, rubber, s...",,"[gun,cannon,canon,shotgun,s/gun,rubber,stungre...",


#### Eventuality

In [138]:
eventuality = vocab.loc[vocab['Label'] == 'Eventuality']
eventuality = eventuality.drop('Label', axis = 1)
eventuality.to_excel('DATA/Vocabularies/eventuality.xlsx')
eventuality.shape

(2, 5)

In [139]:
eventuality.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
77,Police attack,"[Police, SAPS, officer]","[beat, raid, injury, rubber, stungrenade, stun...","[police,saps,officer]","[beat,raid,injury,rubber,stungrenade,stuntgren..."
78,Vigilantism,"[Beat, beating, mob justice, necklace, necklac...",,"[beat,beating,mob, justice,necklace,necklacing...",


#### Curiosity

In [140]:
curiosity = vocab.loc[vocab['Label'] == 'Curiosity']
curiosity = curiosity.drop('Label', axis = 1)
curiosity.to_excel('DATA/Vocabularies/curiosity.xlsx')
curiosity.shape

(2, 5)

In [141]:
curiosity.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
79,Movements,"[Rebel, rebellion, uprising, movement]",,"[rebel,rebellion,uprising,movement]",
80,Special Keywords,"[Lunch, scab, scabs, swearing, loot, looting, ...",,"[lunch,scab,scabs,swearing,loot,looting,sabc,p...",


#### Non-protest

In [142]:
non_protest = vocab.loc[vocab['Label'] == 'Non-protest']
non_protest = non_protest.drop('Label', axis = 1)
non_protest.to_excel('DATA/Vocabularies/non_protest.xlsx')
non_protest.shape

(11, 5)

In [143]:
non_protest.head()

Unnamed: 0,Concept,Phrases,Conditional_Phrases,cleaned_phrases,cleaned_conditional_phrases
81,Football matches,"[Football, soccer, PSL, score, Bafana]",,"[football,soccer,psl,score,bafana]",
82,Other sport,"[Cricket, rugby, game, athletics, race, marath...",,"[cricket,rugby,game,athletics,race,marathon,pr...",
83,Tribal court meetings,"[Tribal court, traditional court, imbizo]",,"[tribal, court,traditional, court,imbizo]",
84,Funerals,"[Memorial, funeral, burial, after tears, mourn...",,"[memorial,funeral,burial,after, tears,mourners...",
85,Church,"[Church service, ZCC, Shembe, prayer, dedicati...",,"[church, service,zcc,shembe,prayer,dedication]",
