# NLP

In [1]:
import re
import pandas as pd
import numpy as np
import pickle

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk.corpus
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer

import seaborn as sns

from math import ceil

from sklearn import metrics

[nltk_data] Downloading package punkt to /home/gbs/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/gbs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/gbs/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/gbs/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /home/gbs/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# 1. Preprocessing




## 1.1. Loading data

In [2]:
df = pd.read_csv("labeled_data.csv")
df.drop(df.columns[0], inplace=True, axis=1)
hatred_dict = pd.read_csv("refined_ngram_dict.csv")

In [3]:
df

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin you lies


In [4]:
hatred_dict

Unnamed: 0,ngram,prophate
0,allah akbar,0.870
1,blacks,0.583
2,chink,0.467
3,chinks,0.542
4,dykes,0.602
...,...,...
173,nigga you a lame,0.556
174,niggers are in my,0.714
175,wit a lame nigga,0.556
176,you a lame bitch,0.556


In [5]:
df.isna().sum()

count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0
mean,3.243473,0.280515,2.413711,0.549247,1.110277
std,0.88306,0.631851,1.399459,1.113299,0.462089
min,3.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,2.0,0.0,1.0
50%,3.0,0.0,3.0,0.0,1.0
75%,3.0,0.0,3.0,0.0,1.0
max,9.0,7.0,9.0,9.0,2.0


## 1.2. Preparing the data to preprocess

In [7]:
data_set = pd.DataFrame()
data_set['clean'] = df['tweet']

### Limiting processed data
To reduce memory usage a data limit can be applied.

In [8]:
N = None

In [9]:
if N is not None:
    data_set = data_set[0:N]
    df = df[0:N]

## 1.3. Correction dictionaries

### Contractions

In [10]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [11]:
ext = {d:d.replace("'", '') for d in contractions.keys()}
contractions.update({ext[d]:contractions[d] for d in ext.keys()})

### Others

In [12]:
adjustments = {
    'ya': 'you',
    'aw': '',
    'dat': 'that',
    'dem': 'them',
    'll': 'will',
    'u': 'you'
}

abbreviations = {
    'idk': 'I do not know',
    'btw': 'by the way',
    'pls': 'please'
}


### Emoticons

In [13]:
emoticons = {
    '&#8220': '', #"
    '&#128555': 'tired',
    '&#128553': 'weary',
    '&#128557': 'crying',
    '&#128514': 'joy',
    '&#128131': '',
    '&#128149': 'love',
    '&#128095': '',
    '&#128128': 'death',
    '&#127813': '',
    '&#127829': '',
    '&#128064': '',
    '&#128073': '',
    '&#128077': 'ok',
    '&#127867': '',
    '&#9733': '',
    '&#127942': '',
    '&#128034': '',
    '&#128072': '',
    '&#128075': '',
    '&#128530': 'unamused',
    '&#128563': '', #hands up
    '&#128175': '',
    '&#128588': '', #hands up
}

In [14]:
i = 0
for tweet in df['tweet']:
    if re.match('&#\d{4}', tweet):
        if any(True for emot in emoticons if emot in tweet):
            continue
        print(i, tweet)
        i += 1

0 &#128079; congrats you've turned a hoe into a housewife, don't get shitty when your guys start singing they hit it first. #ButThatsNoneOfMyBusiness
1 &#128165;&#128162; on the pussy http://t.co/mWXQnjm4So
2 &#128347; is the most important thing. All this temporary bullshit and lies is fa the birds. Kill that !
3 &#128520;&#127383; we snap chatted for one night lol. But you're cute. Snapchat me back nig
4 &#128527; haahaa ,dumb bitch
5 &#128532; RT @MichyDoe: Every week them hoes partying ! I see them hoes in every city partying for an event
6 &#128539;&#128120; you've been a good as friend to me , glad I got your honkie ass. Let's fuck shit up this year
7 &#128540;&#128583;&#128582; I hate ya bitch ass
8 &#128583;&#128583;&#128583;&#128583; can y'all females let this sink in for min? Moment of silence to get y'all bitches thinking right? http://t.co/cDN0dEYCkO
9 &#128700;&#128700;- you a little genius man i need you for a class so u can help a nig out but you cool af man
10 &#8216;Ch

## 1.4. Correcting words

In [15]:
from collections import Counter

def get_words(text):
    return re.findall(r'\w+', text.lower())

def replace_jargon(word):
    for jargon_dict in [contractions, abbreviations, adjustments]:
        if word in jargon_dict:
            return jargon_dict[word]
    return word

def replace_jargon_from_array(arr):
    return [replace_jargon(w) for w in arr]

def replace_jargon_from_text(text):
    words = get_words(text)
    return replace_jargon_from_array(words)

def delete_multiplied_letters(arr):
    changed = []
    for word in arr:
        word = re.sub(r'(\w)\1{2,}', r'\1', word)
        changed.append(word)
    return changed

WORDS = Counter(get_words(open('big.txt').read()))

def correct_array(words):
    return [(correction(w) if w not in WORDS else w) for w in words]

def correct_text(text):
    words = get_words(text)
    return correct_array(words)

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    best = max(candidates(word), key=P)
    #if best == 'a':
        #print(word, '=>', best)
    return best

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits(word)) or known(edits_gen(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    #inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces)
  
def edits_gen(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits(word) for e2 in edits(e1))

## Lemmatizing

def lemmatize_array(arr):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in arr]

def lemmatize_text(text):
    return lemmatize_array(get_words(text))

## 1.5. Cleaning tweets

In [16]:
special_characters_regex = '[!"_$%&/()=_ˆ*¡@' ',:;?#]'
retweet_regex = '(.*rt @\w+)+:'
space_regex = '\s+'
url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
             '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
emoticon_regex = '&#\d+;'
mention_regex = '@[\w\-]+'
number_regex = '\d+'

In [17]:
data_set['clean'] = data_set.apply(lambda row:
                        re.sub(space_regex, ' ',
                        re.sub(special_characters_regex, '', 
                        re.sub(number_regex, ' NUMBERHERE ',
                        re.sub('\s*RT MENTIONHERE', ' MENTIONHERE ',
                        re.sub(url_regex, ' LINKHERE ',
                        re.sub(mention_regex, ' MENTIONHERE ',
                        re.sub(retweet_regex, '',
                        re.sub(space_regex, ' ',
                              row['clean'])))))), flags=re.ASCII)), axis=1)

data_set['clean'] = data_set.apply(lambda row: row['clean'].lower(), axis=1)
data_set['clean'] = data_set.apply(lambda row: replace_jargon_from_text(row['clean']), axis=1)
data_set['clean'] = data_set.apply(lambda row: delete_multiplied_letters(row['clean']), axis=1)

## 1.6. Word Correction

In [18]:
data_set['clean']

0        [rt, mentionhere, as, a, woman, you, shouldn, ...
1        [rt, mentionhere, boy, dats, cold, tyga, dwn, ...
2        [rt, mentionhere, dawg, rt, mentionhere, you, ...
3        [rt, mentionhere, mentionhere, she, look, like...
4        [rt, mentionhere, the, shit, you, hear, about,...
                               ...                        
24778    [you, s, a, muthafin, lie, numberhere, mention...
24779    [you, ve, gone, and, broke, the, wrong, heart,...
24780    [young, buck, wanna, eat, that, nigguh, like, ...
24781        [youu, got, wild, bitches, tellin, you, lies]
24782    [ruffled, ntac, eileen, dahlia, beautiful, col...
Name: clean, Length: 24783, dtype: object

In [19]:
data_set['corrected'] = data_set.apply(lambda row: correct_array(row['clean']), axis=1)

In [20]:
data_set['corrected'][:30]

0     [it, mentionhere, as, a, woman, you, shouldn, ...
1     [it, mentionhere, boy, days, cold, tea, own, b...
2     [it, mentionhere, dawn, it, mentionhere, you, ...
3     [it, mentionhere, mentionhere, she, look, like...
4     [it, mentionhere, the, shit, you, hear, about,...
5     [mentionhere, the, shit, just, blows, me, clai...
6     [mentionhere, i, can, not, just, sit, up, and,...
7     [numberhere, mentionhere, because, i, m, tired...
8     [amp, you, might, not, get, you, bitch, back, ...
9     [mentionhere, hobbies, include, fighting, mari...
10    [weeks, is, a, bitch, she, curves, everyone, l...
11               [marya, gang, bitch, this, gang, land]
12    [so, hoes, that, smoke, are, loses, yea, go, o...
13    [bad, birches, is, the, only, thing, that, i, ...
14                            [bitch, get, up, off, me]
15                   [bitch, nigga, miss, me, with, it]
16                               [bitch, ply, whatever]
17                          [bitch, who, do, you

### 1.6.1. Setup working sets

In [39]:
def get_joined(r='corrected'):
    return data_set.apply(lambda row: ' '.join(row[r]), axis=1)

In [40]:
get_joined()

0        it mentionhere as a woman you shouldn t compla...
1        it mentionhere boy days cold tea own bad for c...
2        it mentionhere dawn it mentionhere you ever fu...
3        it mentionhere mentionhere she look like a granny
4        it mentionhere the shit you hear about me migh...
                               ...                        
24778    you s a muthafin lie numberhere mentionhere me...
24779    you ve gone and broke the wrong heart baby and...
24780    young buck anna eat that nigh like i amant muc...
24781                   you got wild birches tell you lies
24782    ruffled near spleen dahlia beautiful color com...
Length: 24783, dtype: object

### 1.6.2. Hatred n-gram dictionary

In [27]:
def get_weight(row):
    w = [hd['prophate'] for i,hd in hatred_dict.iterrows() if hd['ngram'] in row]
    return w if len(w) > 0 else [0.0]

In [28]:
data_set['hate'] = get_joined().apply(get_weight)

In [29]:
data_set['hate'].apply(lambda row: max(row)).sum()

766.3710000000002

In [30]:
data_set['hate'].apply(lambda row: max(row)).describe()

count    24783.000000
mean         0.030923
std          0.132975
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          0.889000
Name: hate, dtype: float64

## 1.7. Lemmatizing

In [31]:
data_set['lem'] = data_set.apply(lambda row: lemmatize_array(row['corrected']), axis=1)

In [32]:
data_set['lem']

0        [it, mentionhere, a, a, woman, you, shouldn, t...
1        [it, mentionhere, boy, day, cold, tea, own, ba...
2        [it, mentionhere, dawn, it, mentionhere, you, ...
3        [it, mentionhere, mentionhere, she, look, like...
4        [it, mentionhere, the, shit, you, hear, about,...
                               ...                        
24778    [you, s, a, muthafin, lie, numberhere, mention...
24779    [you, ve, gone, and, broke, the, wrong, heart,...
24780    [young, buck, anna, eat, that, nigh, like, i, ...
24781              [you, got, wild, birch, tell, you, lie]
24782    [ruffled, near, spleen, dahlia, beautiful, col...
Name: lem, Length: 24783, dtype: object

## 1.8. Tokenization

In [33]:
sentences = data_set.apply(lambda row: sent_tokenize(' '.join(row['lem'])),axis=1)
words = data_set.apply(lambda row: word_tokenize(' '.join(row['lem'])), axis=1)

In [34]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords += [',', '.', ';']
data_set['tokens'] = words.apply(lambda row: [w for w in row if w not in stopwords]) 
data_set['tokens']

0        [mentionhere, woman, complain, cleaning, house...
1        [mentionhere, boy, day, cold, tea, bad, coffin...
2        [mentionhere, dawn, mentionhere, ever, fuck, b...
3           [mentionhere, mentionhere, look, like, granny]
4        [mentionhere, shit, hear, might, true, might, ...
                               ...                        
24778    [muthafin, lie, numberhere, mentionhere, menti...
24779    [gone, broke, wrong, heart, baby, drove, redne...
24780    [young, buck, anna, eat, nigh, like, amant, mu...
24781                        [got, wild, birch, tell, lie]
24782    [ruffled, near, spleen, dahlia, beautiful, col...
Name: tokens, Length: 24783, dtype: object

# 2. Vectorization

## 2.1. TFiDF

In [42]:
vectorizer = TfidfVectorizer(min_df=1)
X_tf = vectorizer.fit_transform(get_joined('tokens'))
X_tf.column = vectorizer.get_feature_names()
X_tf.toarray()[0]
#X.column
X_tf.shape

(24783, 11920)

## 2.2. TFiDF + N-grams

In [46]:
vectorizer = TfidfVectorizer(ngram_range=(2,2), min_df=1)
X_ngram = vectorizer.fit_transform(get_joined('tokens'))
X_ngram.column = vectorizer.get_feature_names()
X_ngram.shape

(24783, 104400)

In [47]:
X_ngram.sum(axis=1)

matrix([[2.9893232 ],
        [2.97648589],
        [2.96771896],
        ...,
        [2.82163766],
        [1.99845111],
        [3.60304159]])

## 2.3. TFiDF + N-grams + POS tagging

In [55]:
tagged = data_set['tokens'].apply(nltk.pos_tag)
tagged

0        [(mentionhere, RB), (woman, NN), (complain, VB...
1        [(mentionhere, RB), (boy, JJ), (day, NN), (col...
2        [(mentionhere, RB), (dawn, NN), (mentionhere, ...
3        [(mentionhere, RB), (mentionhere, RB), (look, ...
4        [(mentionhere, RB), (shit, VBN), (hear, JJ), (...
                               ...                        
24778    [(muthafin, NN), (lie, NN), (numberhere, RB), ...
24779    [(gone, VBN), (broke, VBD), (wrong, JJ), (hear...
24780    [(young, JJ), (buck, NN), (anna, JJ), (eat, NN...
24781    [(got, VBD), (wild, JJ), (birch, NN), (tell, N...
24782    [(ruffled, VBN), (near, IN), (spleen, JJ), (da...
Name: tokens, Length: 24783, dtype: object

## 2.4. Other Features

#### RTs

In [56]:
data_set['RT'] = df.apply(lambda row: row["tweet"].count("RT") , axis=1)

#### Number of words

In [57]:
data_set['num_words'] = words.apply(len)

#### Number of sentences

In [60]:
data_set['num_sents'] = sentences.apply(len)

#### Class

In [71]:
data_set['class'] = df['class']

# 3. Export data

In [72]:
data_set

Unnamed: 0,clean,corrected,hate,lem,tokens,RT,num_words,num_sents,class
0,"[rt, mentionhere, as, a, woman, you, shouldn, ...","[it, mentionhere, as, a, woman, you, shouldn, ...",[0.0],"[it, mentionhere, a, a, woman, you, shouldn, t...","[mentionhere, woman, complain, cleaning, house...",1,25,1,2
1,"[rt, mentionhere, boy, dats, cold, tyga, dwn, ...","[it, mentionhere, boy, days, cold, tea, own, b...",[0.0],"[it, mentionhere, boy, day, cold, tea, own, ba...","[mentionhere, boy, day, cold, tea, bad, coffin...",1,17,1,1
2,"[rt, mentionhere, dawg, rt, mentionhere, you, ...","[it, mentionhere, dawn, it, mentionhere, you, ...",[0.0],"[it, mentionhere, dawn, it, mentionhere, you, ...","[mentionhere, dawn, mentionhere, ever, fuck, b...",2,20,1,1
3,"[rt, mentionhere, mentionhere, she, look, like...","[it, mentionhere, mentionhere, she, look, like...",[0.0],"[it, mentionhere, mentionhere, she, look, like...","[mentionhere, mentionhere, look, like, granny]",1,8,1,1
4,"[rt, mentionhere, the, shit, you, hear, about,...","[it, mentionhere, the, shit, you, hear, about,...",[0.0],"[it, mentionhere, the, shit, you, hear, about,...","[mentionhere, shit, hear, might, true, might, ...",1,25,1,1
...,...,...,...,...,...,...,...,...,...
24778,"[you, s, a, muthafin, lie, numberhere, mention...","[you, s, a, muthafin, lie, numberhere, mention...",[0.0],"[you, s, a, muthafin, lie, numberhere, mention...","[muthafin, lie, numberhere, mentionhere, menti...",0,22,1,1
24779,"[you, ve, gone, and, broke, the, wrong, heart,...","[you, ve, gone, and, broke, the, wrong, heart,...",[0.0],"[you, ve, gone, and, broke, the, wrong, heart,...","[gone, broke, wrong, heart, baby, drove, redne...",0,14,1,2
24780,"[young, buck, wanna, eat, that, nigguh, like, ...","[young, buck, anna, eat, that, nigh, like, i, ...",[0.0],"[young, buck, anna, eat, that, nigh, like, i, ...","[young, buck, anna, eat, nigh, like, amant, mu...",0,13,1,1
24781,"[youu, got, wild, bitches, tellin, you, lies]","[you, got, wild, birches, tell, you, lies]",[0.0],"[you, got, wild, birch, tell, you, lie]","[got, wild, birch, tell, lie]",0,7,1,1


In [66]:
def pickle_save(obj, fn):
    with open(fn, 'wb') as f:
        pickle.dump(obj, f)

In [73]:
pickle_save(data_set.drop('lem', axis=1), 'nlp_dataset.bin')
pickle_save(tagged, 'nlp_pos.bin')
pickle_save(X_tf, 'nlp_tf.bin')
pickle_save(X_ngram, 'nlp_tf_ngram.bin')