# Cleaning up and Creating Content Vocabulary

### Import Everything
Just easier that way

In [279]:
import numpy as np
import pandas as pd
import os

# Word processing libraries
import re
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

#Nan
from cmath import nan

In [280]:
# Load the protest-coding rules workbook into a DataFrame using pandas' default read options.
vocab = pd.read_excel('DATA/Protest is SA   SAS rules V2.xlsx')

In [281]:
# Check the (rows, columns) size of the raw sheet before any cleaning.
vocab.shape

(100, 5)

In [282]:
# Preview the first three rows to see which columns the sheet provides.
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Afrikaans Phrases,Rule
0,Grievance,Capitalism,"Capitalism, neoliberalism, privatisation, priv...","Kapitalisme, privatisasie, privatiseer, neolib...",Multiple grievances ok
1,Grievance,Values,"Abuse, against xenophobia,","mishandeling, teen xenofobie",Multiple grievances ok
2,Grievance,Conditions,"poverty, unemployment, jobs,","armoede, werkloosheid, werke",Multiple grievances ok


In [283]:
# Discard rows in which every column is missing (how='all'); rows with only
# some missing values are kept.
# NOTE(review): presumably these are blank separator rows in the spreadsheet - confirm.
vocab = vocab.dropna(how = 'all')

In [284]:
# Re-check the size to see how many fully-empty rows were removed.
vocab.shape

(92, 5)

In [285]:
# Show the rows that have no value in the 'Afrikaans Phrases' column.
vocab.loc[vocab['Afrikaans Phrases'].isna()]

Unnamed: 0,Label,Concept,Phrases,Afrikaans Phrases,Rule
9,Grievance,Labour related,(Work(ers) OR Company OR Employ(er) OR Labour ...,,Multiple grievances ok


### Drop Rule
It is important to note what the Rule column means:<br>
Rule indicates whether text can be grouped into multiple categories.<br>
The rules are as follows:

Can Contain Multiple:
* Grievance
* Actors
* Locations
* Weapons
* Eventualities
* Curiosity

Can Only Have 1:
* Trigger
* Tactic

Discard:
* Non-protest<br>
    Unless it contains:
    * Grievance
    * Trigger
    * Tactic
    * Actor -> only for "16 days of activism"


Now we can drop Rule

In [286]:
# The Rule column is documented in the notes above and is not needed in the data itself.
vocab = vocab.drop(columns='Rule')

In [287]:
# Confirm that the Rule column is gone.
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Afrikaans Phrases
0,Grievance,Capitalism,"Capitalism, neoliberalism, privatisation, priv...","Kapitalisme, privatisasie, privatiseer, neolib..."
1,Grievance,Values,"Abuse, against xenophobia,","mishandeling, teen xenofobie"
2,Grievance,Conditions,"poverty, unemployment, jobs,","armoede, werkloosheid, werke"


### Drop Afrikaans for now
We can always create functionality for it at a later date

In [288]:
# Only the English phrases are used for now, so remove the Afrikaans column.
vocab = vocab.drop(columns='Afrikaans Phrases')

In [289]:
# Confirm that only Label, Concept and Phrases remain.
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases
0,Grievance,Capitalism,"Capitalism, neoliberalism, privatisation, priv..."
1,Grievance,Values,"Abuse, against xenophobia,"
2,Grievance,Conditions,"poverty, unemployment, jobs,"


### Lemmatise the vocabulary so that it can easily be matched to tweet text

In [None]:
# Define the function to implement POS tagging:
def get_wordnet_pos(pos_tag):
    """Translate a POS tag string into the matching WordNet POS constant.

    The decision is made on the tag's leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag is treated as
    a noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags fall back to noun.
    return wordnet.NOUN


# Define the main function to clean text in various ways:
def clean_text(text):
    """Normalise a string into a space-separated string of lemmatised tokens.

    Steps, in order: lowercase; split on whitespace and strip leading and
    trailing punctuation from each token; drop tokens containing a digit;
    drop English stop words; drop empty tokens; POS-tag and lemmatise with
    WordNet; drop single-character results.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if nothing survives).
    """
    # Convert text to lowercase
    text = text.lower()

    # Tokenise and strip punctuation. Without this step the comprehensions
    # below would iterate over the individual characters of the string, and
    # the final one-letter filter would then discard everything.
    tokens = [word.strip(string.punctuation) for word in text.split()]

    # Remove tokens that contain any digit
    tokens = [word for word in tokens if not any(c.isdigit() for c in word)]

    # Remove stop words (a set gives constant-time membership tests)
    stop = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop]

    # Remove empty tokens (left behind by punctuation-only words)
    tokens = [word for word in tokens if word]

    # POS-tag the tokens, then lemmatise each one using its WordNet POS
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # Remove results with only one letter
    lemmas = [lemma for lemma in lemmas if len(lemma) > 1]

    return " ".join(lemmas)

### Split Up the Various Labels Into Their Own DataFrames

#### Grievances

In [290]:
# Keep the rows labelled 'Grievance', remove the Label column,
# save the subset to its own workbook and show its size.
grievances = vocab[vocab['Label'] == 'Grievance'].drop(columns='Label')
grievances.to_excel('DATA/Vocabularies/grievances.xlsx')
grievances.shape

(36, 2)

In [291]:
# Preview the first rows of the grievance vocabulary.
grievances.head()

Unnamed: 0,Concept,Phrases
0,Capitalism,"Capitalism, neoliberalism, privatisation, priv..."
1,Values,"Abuse, against xenophobia,"
2,Conditions,"poverty, unemployment, jobs,"
3,Contract end,"contracts expire, project end, completed"
4,Community recognition,"their rights, recognition,"


#### Trigger

In [292]:
# Keep the rows labelled 'Trigger', remove the Label column,
# save the subset to its own workbook and show its size.
trigger = vocab[vocab['Label'] == 'Trigger'].drop(columns='Label')
trigger.to_excel('DATA/Vocabularies/trigger.xlsx')
trigger.shape

(13, 2)

In [293]:
# Preview the first rows of the trigger vocabulary.
trigger.head()

Unnamed: 0,Concept,Phrases
37,Court hearing,"court, hearing,"
38,Project begin,"not local, not from area, outside, only be loc..."
39,National Strike,"COSATU, NUMSA, national strike,"
40,Dismissals,"Fired, suspended, dismissed, disciplin*"
41,Wage disputes,"Payment, salary, cheque, overtime, wage, wages..."


#### Tactic

In [294]:
# Keep the rows labelled 'Tactic', remove the Label column,
# save the subset to its own workbook and show its size.
tactic = vocab[vocab['Label'] == 'Tactic'].drop(columns='Label')
tactic.to_excel('DATA/Vocabularies/tactic.xlsx')
tactic.shape

(14, 2)

In [295]:
# Preview the first rows of the tactic vocabulary.
tactic.head()

Unnamed: 0,Concept,Phrases
51,Hostage,"Captive, hostage, lock-in"
52,Attack,"Atack, Attack, Throw, threw, pelt, torch, choa..."
53,Disrupt,"Tyre, Block, tire, burn, Fire, Bricks, Stones,..."
54,M&M,March AND Memorandum
55,Vandalism,"damaged, vandalised, break, broke"


#### Actors

In [296]:
# Keep the rows labelled 'Actors', remove the Label column,
# save the subset to its own workbook and show its size.
actors = vocab[vocab['Label'] == 'Actors'].drop(columns='Label')
actors.to_excel('DATA/Vocabularies/actors.xlsx')
actors.shape

(4, 2)

In [297]:
# Preview the first rows of the actors vocabulary.
actors.head()

Unnamed: 0,Concept,Phrases
66,Union,"COSATU, FOSATU, NEHAWU, SATAWU, Allied, AMCU, ..."
67,Political Party,"ANC, Congress, DA, EFF, IFP, NFP, Cope, UDM, S..."
68,Civic org,"association, residents, concerned, ratepayers,..."
69,Church,"church, congregation, believers, temple, mosqu..."


#### Locations

In [298]:
# Keep the rows labelled 'Location' (singular in the sheet), remove the Label
# column, save the subset to its own workbook and show its size.
locations = vocab[vocab['Label'] == 'Location'].drop(columns='Label')
locations.to_excel('DATA/Vocabularies/locations.xlsx')
locations.shape

(7, 2)

In [299]:
# Preview the first rows of the locations vocabulary.
locations.head()

Unnamed: 0,Concept,Phrases
71,Informal area,"Shacks, sharks, informal, hostel, squatter, ca..."
72,Court,"Court, hearing"
73,Business premises,"company, business, gate, premises, factory,"
74,Stadium,"Stadium, sports field"
75,Church,"church, congregation, believers, temple, musli..."


#### Weapons

In [300]:
# Keep the rows labelled 'Weapons', remove the Label column,
# save the subset to its own workbook and show its size.
weapons = vocab[vocab['Label'] == 'Weapons'].drop(columns='Label')
weapons.to_excel('DATA/Vocabularies/weapons.xlsx')
weapons.shape

(3, 2)

In [301]:
# Preview the first rows of the weapons vocabulary.
weapons.head()

Unnamed: 0,Concept,Phrases
79,Crowd weapons,"Weapon, Knife, Traditional weapons, knobkierie..."
80,Crowd projectiles,"(Throw OR Threw OR Attack(ed) ) AND ( Stones, ..."
81,Police weapons,"Gun, cannon, canon, shotgun, s/gun, rubber, st..."


#### Eventuality

In [302]:
# Keep the rows labelled 'Eventuality', remove the Label column,
# save the subset to its own workbook and show its size.
eventuality = vocab[vocab['Label'] == 'Eventuality'].drop(columns='Label')
eventuality.to_excel('DATA/Vocabularies/eventuality.xlsx')
eventuality.shape

(2, 2)

In [303]:
# Preview the first rows of the eventuality vocabulary.
eventuality.head()

Unnamed: 0,Concept,Phrases
83,Police attack,"Police AND (beat, raid, injury, rubber, stungr..."
84,Vigilantism,"Beat, beating, mob justice, necklace, necklaci..."


#### Curiosity

In [304]:
# Keep the rows labelled 'Curiosity', remove the Label column,
# save the subset to its own workbook and show its size.
curiosity = vocab[vocab['Label'] == 'Curiosity'].drop(columns='Label')
curiosity.to_excel('DATA/Vocabularies/curiosity.xlsx')
curiosity.shape

(2, 2)

In [305]:
# Preview the first rows of the curiosity vocabulary.
curiosity.head()

Unnamed: 0,Concept,Phrases
86,Movements,"Rebel, rebellion, uprising, movement,"
87,Special Keywords,"Lunch, scab(s), swearing, loot, looting, SABC,..."


#### Non-protest

In [306]:
# Keep the rows labelled 'Non-protest', remove the Label column,
# save the subset to its own workbook and show its size.
non_protest = vocab[vocab['Label'] == 'Non-protest'].drop(columns='Label')
non_protest.to_excel('DATA/Vocabularies/non_protest.xlsx')
non_protest.shape

(11, 2)

In [307]:
# Preview the first rows of the non-protest vocabulary.
non_protest.head()

Unnamed: 0,Concept,Phrases
89,Football matches,"Football, soccer, PSL, score, Bafana"
90,Other sport,"Cricket, rugby, games, athletics, race, marathon"
91,Tribal court meetings,"Tribal court, traditional court, imbizo,"
92,Funerals,"Memorial, funeral, burial, after tears, mourners"
93,Church,"Church service, ZCC, Shembe, prayer, dedication"
