# NLP

In [1]:
import re
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk.corpus
from nltk.stem import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.sentiment.vader import SentimentIntensityAnalyzer
import seaborn as sns

from math import ceil

from sklearn import metrics

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!




# 1. Preprocessing




## Loading data

In [2]:
df = pd.read_csv("labeled_data.csv")
df.drop(df.columns[0], inplace=True, axis=1)
hatred_dict = pd.read_csv("refined_ngram_dict.csv")

In [3]:
df

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
...,...,...,...,...,...,...
24778,3,0,2,1,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...
24779,3,0,1,2,2,"you've gone and broke the wrong heart baby, an..."
24780,3,0,3,0,1,young buck wanna eat!!.. dat nigguh like I ain...
24781,6,0,6,0,1,youu got wild bitches tellin you lies


In [4]:
hatred_dict

Unnamed: 0,ngram,prophate
0,allah akbar,0.870
1,blacks,0.583
2,chink,0.467
3,chinks,0.542
4,dykes,0.602
...,...,...
173,nigga you a lame,0.556
174,niggers are in my,0.714
175,wit a lame nigga,0.556
176,you a lame bitch,0.556


In [5]:
df.isna().sum()

count                 0
hate_speech           0
offensive_language    0
neither               0
class                 0
tweet                 0
dtype: int64

In [6]:
df.describe()

Unnamed: 0,count,hate_speech,offensive_language,neither,class
count,24783.0,24783.0,24783.0,24783.0,24783.0
mean,3.243473,0.280515,2.413711,0.549247,1.110277
std,0.88306,0.631851,1.399459,1.113299,0.462089
min,3.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,2.0,0.0,1.0
50%,3.0,0.0,3.0,0.0,1.0
75%,3.0,0.0,3.0,0.0,1.0
max,9.0,7.0,9.0,9.0,2.0


## Preparing the data to preprocess

In [7]:
data_set = pd.DataFrame()
data_set['clean'] = df['tweet']

### Limiting processed data
To reduce memory usage a data limit can be applied.

In [8]:
N = None

In [9]:
if N is not None:
    data_set = data_set[0:N]
    df = df[0:N]

## Correction dictionaries - to hide

### Contractions

In [10]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

### Others

In [11]:
adjustments = {
    'ya': 'you',
    'aw': '',
    'dat': 'that',
    'dem': 'them',
    'll': 'will',
    'u': 'you'
}

abbreviations = {
    'idk': 'I do not know',
    'btw': 'by the way',
    'pls': 'please'
}


### Emoticons

In [12]:
emoticons = {
    '&#8220': '', #"
    '&#128555': 'tired',
    '&#128553': 'weary',
    '&#128557': 'crying',
    '&#128514': 'joy',
    '&#128131': '',
    '&#128149': 'love',
    '&#128095': '',
    '&#128128': 'death',
    '&#127813': '',
    '&#127829': '',
    '&#128064': '',
    '&#128073': '',
    '&#128077': 'ok',
    '&#127867': '',
    '&#9733': '',
    '&#127942': '',
    '&#128034': '',
    '&#128072': '',
    '&#128075': '',
    '&#128530': 'unamused',
    '&#128563': '', #hands up
    '&#128175': '',
    '&#128588': '', #hands up
}

In [13]:
i = 0
for tweet in df['tweet']:
  if re.match('&#\d{4}', tweet):
    if any(True for emot in emoticons if emot in tweet):
      continue
    print(i, tweet)
    i += 1

0 &#128079; congrats you've turned a hoe into a housewife, don't get shitty when your guys start singing they hit it first. #ButThatsNoneOfMyBusiness
1 &#128165;&#128162; on the pussy http://t.co/mWXQnjm4So
2 &#128347; is the most important thing. All this temporary bullshit and lies is fa the birds. Kill that !
3 &#128520;&#127383; we snap chatted for one night lol. But you're cute. Snapchat me back nig
4 &#128527; haahaa ,dumb bitch
5 &#128532; RT @MichyDoe: Every week them hoes partying ! I see them hoes in every city partying for an event
6 &#128539;&#128120; you've been a good as friend to me , glad I got your honkie ass. Let's fuck shit up this year
7 &#128540;&#128583;&#128582; I hate ya bitch ass
8 &#128583;&#128583;&#128583;&#128583; can y'all females let this sink in for min? Moment of silence to get y'all bitches thinking right? http://t.co/cDN0dEYCkO
9 &#128700;&#128700;- you a little genius man i need you for a class so u can help a nig out but you cool af man
10 &#8216;Ch

## Correcting words

In [14]:
from collections import Counter

def get_words(text):
    return re.findall(r'\w+', text.lower())

def replace_jargon(word):
    for jargon_dict in [contractions, abbreviations, adjustments]:
        if word in jargon_dict:
          return jargon_dict[word]
    return word

def replace_jargon_from_array(arr):
    return [replace_jargon(w) for w in arr]

def replace_jargon_from_text(text):
    words = get_words(text)
    return replace_jargon_from_array(words)

def delete_multiplied_letters(arr):
  changed = []
  for word in arr:
      word = re.sub(r'(\w)\1{2,}', r'\1', word)
      changed.append(word)
  return changed


WORDS = Counter(get_words(open('big.txt').read()))

#TO IMPROVE?
def correct_array(words):
   return [correction(replace_jargon(w)) for w in words if w not in WORDS]


def correct_text(text):
    words = get_words(text)
    return correct_array(words)

def P(word, N=sum(WORDS.values())): 
    "Probability of `word`."
    return WORDS[word] / N

def correction(word): 
    "Most probable spelling correction for word."
    best = max(candidates(word), key=P)
    if best == 'a':
      print(word, '=>', best)
    return best

def candidates(word): 
    "Generate possible spelling corrections for word."
    return (known([word]) or known(edits1(word)) or known(edits2(word)) or [word])

def known(words): 
    "The subset of `words` that appear in the dictionary of WORDS."
    return set(w for w in words if w in WORDS)

def edits1(word):
    "All edits that are one edit away from `word`."
    letters    = 'abcdefghijklmnopqrstuvwxyz'
    splits     = [(word[:i], word[i:])    for i in range(len(word) + 1)]
    deletes    = [L + R[1:]               for L, R in splits if R]
    transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R)>1]
    replaces   = [L + c + R[1:]           for L, R in splits if R for c in letters]
    inserts    = [L + c + R               for L, R in splits for c in letters]
    return set(deletes + transposes + replaces + inserts)
  
def edits2(word): 
    "All edits that are two edits away from `word`."
    return (e2 for e1 in edits1(word) for e2 in edits1(e1))

## Lemmatizing

def lemmatize_array(arr):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in arr]

def lemmatize_text(text):
    return lemmatize_array(get_words(text))

## Cleaning tweets

In [15]:
special_characters_regex = '[!"_$%&/()=_ˆ*¡@' ',:;?#]'
retweet_regex = '(.*rt @\w+)+:'
space_regex = '\s+'
url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
             '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
emoticon_regex = '&#\d+;'
mention_regex = '@[\w\-]+'
number_regex = '\d+'

In [16]:
data_set['clean'] = data_set.apply(lambda row:
                        re.sub(space_regex, ' ',
                        re.sub(special_characters_regex, '', 
                        re.sub(number_regex, ' NUMBERHERE ',
                        re.sub('\s*RT MENTIONHERE', ' MENTIONHERE ',
                        re.sub(url_regex, ' LINKHERE ',
                        re.sub(mention_regex, ' MENTIONHERE ',
                        re.sub(retweet_regex, '',
                        re.sub(space_regex, ' ',
                              row['clean'])))))), flags=re.ASCII)), axis=1)

data_set['clean'] = data_set.apply(lambda row: row['clean'].lower(), axis=1)
data_set['clean'] = data_set.apply(lambda row: replace_jargon_from_text(row['clean']), axis=1)
data_set['clean'] = data_set.apply(lambda row: delete_multiplied_letters(row['clean']), axis=1)

## Word Correction

In [17]:
data_set['clean']

0        [rt, mentionhere, as, a, woman, you, shouldn, ...
1        [rt, mentionhere, boy, dats, cold, tyga, dwn, ...
2        [rt, mentionhere, dawg, rt, mentionhere, you, ...
3        [rt, mentionhere, mentionhere, she, look, like...
4        [rt, mentionhere, the, shit, you, hear, about,...
                               ...                        
24778    [you, s, a, muthafin, lie, numberhere, mention...
24779    [you, ve, gone, and, broke, the, wrong, heart,...
24780    [young, buck, wanna, eat, that, nigguh, like, ...
24781        [youu, got, wild, bitches, tellin, you, lies]
24782    [ruffled, ntac, eileen, dahlia, beautiful, col...
Name: clean, Length: 24783, dtype: object

In [None]:
data_set['clean'] = data_set.apply(lambda row: correct_array(row['clean']), axis=1)

In [None]:
data_set['clean'][:30]

In [None]:
#data_set['clean'][24778]

## Lemmatizing

In [None]:
data_set['clean'] = data_set.apply(lambda row: lemmatize_array(row['clean']), axis=1)

In [None]:
data_set['clean'][:20]

### Setup working sets

In [None]:
y = df[['class']]
X = pd.DataFrame() #df[['hate_speech', 'offensive_language', 'neither']]

## Tokenization

In [None]:
sentences = data_set.apply(lambda row: sent_tokenize(' '.join(row['clean'])),axis=1)
words = data_set.apply(lambda row: word_tokenize(' '.join(row['clean'])), axis=1)

In [None]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords += [',', '.', ';']
data_set['clean'] = words.apply(lambda row: [w for w in row if w not in stopwords]) 
data_set['clean']

# 2. Vectorization

In [None]:
joined = data_set.apply(lambda row: ' '.join(row['clean']), axis=1)

In [None]:
joined

### TFiDF

In [None]:
vectorizer = TfidfVectorizer(min_df=1)
X_tf = vectorizer.fit_transform(joined)
X_tf.column = vectorizer.get_feature_names()
X_tf.toarray()[0]
#X.column
X_tf.shape

In [None]:
X_tf.column

### TFiDF + N-grams

In [None]:
vectorizer = TfidfVectorizer(ngram_range=(2,3), min_df=1)
X_ngram = vectorizer.fit_transform(joined)
X_ngram.column = vectorizer.get_feature_names()
X_ngram.shape

In [None]:
X_ngram.sum(axis=1)

### TFiDF + N-grams + POS tagging

In [None]:
tagged = data_set['clean'].apply(nltk.pos_tag)
tagged

### Other Features

#### RTs

In [None]:
X['RT'] = df.apply(lambda row: row["tweet"].count("RT") , axis=1)

#### Number of words

In [None]:
X['num_words'] = words.apply(len)
X.isna().sum()

#### Number of sentences

In [None]:
X['num_sents'] = sentences.apply(len)
X.isna().sum()

In [None]:
X

#### Sentiment analisis

In [None]:
sentiment_analyzer  = SentimentIntensityAnalyzer() 
sentiment = joined.apply(lambda row: sentiment_analyzer.polarity_scores(row))
sentiment = pd.DataFrame.from_records(sentiment)
if not any(c == 'neg' for c in X.columns):
    X = pd.concat([X, sentiment], axis=1)
else:
    X.update(sentiment)
X.isna().sum()

#### Hatred n-gram dictionary

In [None]:
def get_weight(row):
    return max(hd['prophate'] if hd['ngram'] in row else 0 for i,hd in hatred_dict.iterrows())

X['hatedict'] = joined.apply(get_weight) 

In [None]:
X['hatedict'].sum(), X['hatedict'].max()

In [None]:
X.describe()

# 3. Feature selection

'''from functools import reduce
from sklearn.impute import SimpleImputer
imp = SimpleImputer(strategy='constant', fill_value=0)
imp.fit(X_tf)
X_tf = imp.transform(X_tf)'''

In [None]:
X_tf_df = pd.DataFrame.sparse.from_spmatrix(X_tf)

In [None]:
X_tf_df.shape

In [None]:
X = pd.concat([X, X_tf_df], axis=1)

In [None]:
X.isna().sum().sum()

In [None]:
X

In [None]:
size = len(X.columns)
to_cut = ceil(0.7*len(X.columns))
to_save = size - to_cut
print('100% of features: {}\n 70% of features: {}'.format(size, to_cut))

In [None]:
from sklearn.feature_selection import SelectKBest, chi2, f_regression

selector = SelectKBest(f_regression, k=to_save)
selector.fit(X, y)
X_new = selector.transform(X)
columns = list(X.columns[selector.get_support(indices=True)])

In [None]:
columns[:9]

# 4. Classification algorithm

In [None]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, ytrain, ytest = train_test_split(X_new, y, random_state=2, test_size=0.3)

##Random forest

###Model training

In [None]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(Xtrain, ytrain)
ypred_forest = model.predict(Xtest)

###Test results

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt

mat_forest = confusion_matrix(ytest, ypred_forest)
sns.heatmap(mat_forest.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
print(metrics.classification_report(ypred_forest, ytest))

In [None]:
forest_score = metrics.accuracy_score(ytest, ypred_forest)
print(f'{forest_score}')

##Support vector classifier

In [None]:
from sklearn.svm import SVC
model_svc = SVC(kernel='linear', C=1E10)
model_svc.fit(Xtrain, ytrain)
ypred_svm = model.predict(Xtest)

###Test results

In [None]:
mat_svm = confusion_matrix(ytest, ypred)
sns.heatmap(mat_svc.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label');
print(metrics.classification_report(ytest, ypred_svm))

In [None]:
svm_score = metrics.accuracy_score(ytest, ypred)
print(f'{svm_score}')