# Cleaning up and Creating Content Vocabulary

### Import Everything
Just easier that way

In [31]:
import numpy as np
import pandas as pd
import os

# Word processing libraries
import re
from nltk.corpus import wordnet
import string
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import WordNetLemmatizer

#Nan
from cmath import nan

In [32]:
# Load the raw content-analysis vocabulary workbook into a DataFrame.
vocab = pd.read_excel(
    io='DATA/Protest is SA   SAS rules V2 - Separated.xlsx',
)

In [33]:
vocab.shape

(100, 6)

In [34]:
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Conditional Phrases,Afrikaans Phrases,Rule
0,Grievance,Capitalism,"Capitalism, neoliberalism, privatisation, priv...",,"Kapitalisme, privatisasie, privatiseer, neolib...",Multiple grievances ok
1,Grievance,Values,"Abuse, against xenophobia,",,"mishandeling, teen xenofobie",Multiple grievances ok
2,Grievance,Conditions,"poverty, unemployment, jobs,",,"armoede, werkloosheid, werke",Multiple grievances ok


In [35]:
# Discard rows in which every column is missing (fully blank spreadsheet rows).
vocab = vocab.dropna(axis='index', how='all')

In [36]:
vocab.shape

(92, 6)

In [37]:
# Show the rows that have no Afrikaans phrases.
vocab.loc[vocab['Afrikaans Phrases'].isna()]

Unnamed: 0,Label,Concept,Phrases,Conditional Phrases,Afrikaans Phrases,Rule
9,Grievance,Labour related,"Work, workers, Company, Employ, employer, empl...","corrupt, fired, dismissed, policy, other peopl...",,Multiple grievances ok


### Drop Rule
Before dropping it, take note of what the `Rule` column means:<br>
`Rule` indicates whether text can be grouped into multiple categories.<br>
The rules are as follows:

Can Contain Multiple:
* Grievance
* Actors
* Locations
* Weapons
* Eventualities
* Curiosity

Can Only Have 1:
* Trigger
* Tactic

Discard:
* Non-protest<br>
    Unless it contains:
    * Grievance
    * Trigger
    * Tactic
    * Actor -> only for "16 days of activism"


Now we can drop Rule

In [38]:
# The Rule column is documented in the notes above and is not needed further.
vocab = vocab.drop(columns=['Rule'])

In [39]:
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Conditional Phrases,Afrikaans Phrases
0,Grievance,Capitalism,"Capitalism, neoliberalism, privatisation, priv...",,"Kapitalisme, privatisasie, privatiseer, neolib..."
1,Grievance,Values,"Abuse, against xenophobia,",,"mishandeling, teen xenofobie"
2,Grievance,Conditions,"poverty, unemployment, jobs,",,"armoede, werkloosheid, werke"


### Drop Afrikaans for now
We can always create functionality for it at a later date

In [40]:
# Remove the Afrikaans phrases column; only the remaining columns are used below.
vocab = vocab.drop(columns=['Afrikaans Phrases'])

In [41]:
vocab.head(3)

Unnamed: 0,Label,Concept,Phrases,Conditional Phrases
0,Grievance,Capitalism,"Capitalism, neoliberalism, privatisation, priv...",
1,Grievance,Values,"Abuse, against xenophobia,",
2,Grievance,Conditions,"poverty, unemployment, jobs,",


### Lemmatise the vocabulary so that it can easily be matched to the text in tweets

In [42]:
# Define the function to implement POS tagging:
def get_wordnet_pos(pos_tag):
    """Map a POS tag string to the matching WordNet part-of-speech constant.

    The tag is matched on its first letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb. Any other tag falls back to noun.

    Note: the parameter name shadows the imported ``pos_tag`` function inside
    this function only; it is kept so existing callers are unaffected.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if pos_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN


# Define the main function to clean text in various ways:
def clean_text(text):
    """Normalise a string into a space-joined sequence of lemmatised tokens.

    Steps, in order:
      1. lowercase the text;
      2. split on whitespace and strip leading/trailing punctuation from
         each token;
      3. drop empty tokens, tokens containing a digit, and English stop words;
      4. POS-tag the remaining tokens and lemmatise each one with WordNet;
      5. drop tokens that are a single character long;
      6. join the survivors with single spaces.

    Args:
        text: The string to clean.

    Returns:
        The cleaned text as one space-separated string; "" when no token
        survives the filters.
    """
    # 1. Lowercase.
    text = text.lower()

    # 2. Tokenise. This step used to be commented out, so every later step
    #    iterated over the individual characters of the string and the final
    #    one-letter filter then discarded them, leaving nothing to return.
    tokens = [word.strip(string.punctuation) for word in text.split()]

    # 3. Filter. The stop-word list is turned into a set once so each
    #    membership test is a hash lookup rather than a list scan.
    stop_words = set(stopwords.words('english'))
    tokens = [
        token
        for token in tokens
        if token
        and token not in stop_words
        and not any(char.isdigit() for char in token)
    ]

    # 4. POS-tag, then lemmatise with the tag mapped to a WordNet POS.
    #    A single lemmatizer instance is reused for all tokens.
    lemmatizer = WordNetLemmatizer()
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(tag))
        for word, tag in pos_tag(tokens)
    ]

    # 5./6. Drop one-letter lemmas and join the rest.
    return " ".join(lemma for lemma in lemmas if len(lemma) > 1)

### Matching Content Analysis
Do content analysis on both the lemmatised and the non-lemmatised text as a sanity check.

### Split Up various Labels Into Own Dataframe

#### Grievances

In [43]:
# Select the 'Grievance' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
grievances = vocab[vocab['Label'] == 'Grievance'].drop(columns=['Label'])
grievances.to_excel('DATA/Vocabularies/grievances.xlsx')
grievances.shape

(36, 3)

In [44]:
grievances.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
0,Capitalism,"Capitalism, neoliberalism, privatisation, priv...",
1,Values,"Abuse, against xenophobia,",
2,Conditions,"poverty, unemployment, jobs,",
3,Contract end,"contracts expire, project end, completed",
4,Community recognition,"their rights, recognition,",


#### Trigger

In [45]:
# Select the 'Trigger' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
trigger = vocab[vocab['Label'] == 'Trigger'].drop(columns=['Label'])
trigger.to_excel('DATA/Vocabularies/trigger.xlsx')
trigger.shape

(13, 3)

In [46]:
trigger.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
37,Court hearing,"court, hearing,",
38,Project begin,"not local, not from area, outside, only be loc...",
39,National Strike,"COSATU, NUMSA, national strike,",
40,Dismissals,"Fired, suspended, dismissed, discipline",
41,Wage disputes,"Payment, salary, cheque, overtime, wage, wages...",


#### Tactic

In [47]:
# Select the 'Tactic' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
tactic = vocab[vocab['Label'] == 'Tactic'].drop(columns=['Label'])
tactic.to_excel('DATA/Vocabularies/tactic.xlsx')
tactic.shape

(14, 3)

In [48]:
tactic.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
51,Hostage,"Captive, hostage, lock-in",
52,Attack,"Atack, Attack, Throw, threw, pelt, torch, choa...",
53,Disrupt,"Tyre, Block, tire, burn, Fire, Bricks, Stones,...",
54,M&M,March,Memorandum
55,Vandalism,"damaged, vandalised, break, broke",


#### Actors

In [49]:
# Select the 'Actors' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
actors = vocab[vocab['Label'] == 'Actors'].drop(columns=['Label'])
actors.to_excel('DATA/Vocabularies/actors.xlsx')
actors.shape

(4, 3)

In [50]:
actors.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
66,Union,"COSATU, FOSATU, NEHAWU, SATAWU, Allied, AMCU, ...",
67,Political Party,"ANC, Congress, DA, EFF, IFP, NFP, Cope, UDM, S...",
68,Civic org,"association, residents, concerned, ratepayers,...",
69,Church,"church, congregation, believers, temple, mosqu...",


#### Locations

In [51]:
# Select the 'Location' rows (the label value is singular), drop the
# now-constant Label column, save the subset and display its dimensions.
locations = vocab[vocab['Label'] == 'Location'].drop(columns=['Label'])
locations.to_excel('DATA/Vocabularies/locations.xlsx')
locations.shape

(7, 3)

In [52]:
locations.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
71,Informal area,"Shacks, sharks, informal, hostel, squatter, ca...",
72,Court,"Court, hearing",
73,Business premises,"company, business, gate, premises, factory,",
74,Stadium,"Stadium, sports field",
75,Church,"church, congregation, believers, temple, musli...",


#### Weapons

In [53]:
# Select the 'Weapons' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
weapons = vocab[vocab['Label'] == 'Weapons'].drop(columns=['Label'])
weapons.to_excel('DATA/Vocabularies/weapons.xlsx')
weapons.shape

(3, 3)

In [54]:
weapons.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
79,Crowd weapons,"Weapon, Knife, Traditional weapons, knobkierie...",
80,Crowd projectiles,"Throw, Threw, Attack, attacked","Stones, rocks, bricks, brike"
81,Police weapons,"Gun, cannon, canon, shotgun, s/gun, rubber, st...",


#### Eventuality

In [55]:
# Select the 'Eventuality' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
eventuality = vocab[vocab['Label'] == 'Eventuality'].drop(columns=['Label'])
eventuality.to_excel('DATA/Vocabularies/eventuality.xlsx')
eventuality.shape

(2, 3)

In [56]:
eventuality.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
83,Police attack,Police,"beat, raid, injury, rubber, stungrenade, stunt..."
84,Vigilantism,"Beat, beating, mob justice, necklace, necklaci...",


#### Curiosity

In [57]:
# Select the 'Curiosity' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
curiosity = vocab[vocab['Label'] == 'Curiosity'].drop(columns=['Label'])
curiosity.to_excel('DATA/Vocabularies/curiosity.xlsx')
curiosity.shape

(2, 3)

In [58]:
curiosity.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
86,Movements,"Rebel, rebellion, uprising, movement,",
87,Special Keywords,"Lunch, scab, scabs, swearing, loot, looting, S...",


#### Non-protest

In [59]:
# Select the 'Non-protest' rows, drop the now-constant Label column,
# save the subset to its own workbook and display its dimensions.
non_protest = vocab[vocab['Label'] == 'Non-protest'].drop(columns=['Label'])
non_protest.to_excel('DATA/Vocabularies/non_protest.xlsx')
non_protest.shape

(11, 3)

In [60]:
non_protest.head()

Unnamed: 0,Concept,Phrases,Conditional Phrases
89,Football matches,"Football, soccer, PSL, score, Bafana",
90,Other sport,"Cricket, rugby, games, athletics, race, marath...",
91,Tribal court meetings,"Tribal court, traditional court, imbizo,",
92,Funerals,"Memorial, funeral, burial, after tears, mourners",
93,Church,"Church service, ZCC, Shembe, prayer, dedication",
