In [885]:
import os
import pandas as pd
import numpy as np
import scipy as sp
import re

from gensim.models import Word2Vec
from nltk.corpus import stopwords
from autocorrect import spell

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

## Modeling 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from xgboost.sklearn import XGBClassifier

from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer

In [865]:
os.listdir(os.getcwd()+"\\training-v1")

['offenseval-annotation.txt',
 'offenseval-training-v1.tsv',
 'readme-trainingset-v1.txt']

### 1. Reading in the dataset

In [954]:
train_data = []
train_df = pd.DataFrame()

with open('.\\training-v1\\offenseval-training-v1.tsv','r', encoding = 'utf-8') as in_file:
    #train_data = [line.strip().split('\t') for line in in_file]
    
    for line in in_file:
        train_df = pd.concat([train_df,pd.DataFrame([line.strip().split('\t')])], axis = 0)
        
train_df.reset_index(inplace=True)

### 2. Analyzing the data and response variables

In [955]:
## Setting the first row as the column headers
train_df.columns = train_df.iloc[0,:].values
train_df = train_df.iloc[1:,]

In [956]:
train_df.head()

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1,0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
2,0,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
3,0,16820,Amazon is investigating Chinese employees who ...,NOT,,
4,0,62688,"""@USER Someone should'veTaken"""" this piece of ...",OFF,UNT,
5,0,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [957]:
train_df.shape

(13240, 6)

#### Distribution of each of the sub-tasks

In [958]:
train_df.subtask_a.value_counts()

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64

In [959]:
train_df.subtask_b.value_counts()

NULL    8840
TIN     3876
UNT      524
Name: subtask_b, dtype: int64

In [960]:
train_df.subtask_c.value_counts()

NULL    9364
IND     2407
GRP     1074
OTH      395
Name: subtask_c, dtype: int64

In [961]:
#train_df.tweet.value_counts()

In [962]:
train_df[train_df.tweet == '@USER Looks Like The Jokes On Liberals Again.  #FortTrump #Poland #BoomingEconomy URL']

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1270,0,45643,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
4073,0,22953,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
8333,0,60513,@USER Looks Like The Jokes On Liberals Again. ...,OFF,TIN,GRP
10460,0,66322,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
10624,0,38491,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
11304,0,73520,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,


In [963]:
train_df[train_df.tweet == "@USER An obvious last minute liberal ploy to delay confirmation. More dirty tricks since the liberals lost the previous election."]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
5166,0,81140,@USER An obvious last minute liberal ploy to d...,NOT,,
5307,0,46503,@USER An obvious last minute liberal ploy to d...,NOT,,


In [964]:
train_df[train_df.tweet ==  "@USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER Following all #Maga patriots please follow back 👍  #LionsDen 🦁  #MAGA2KAG 🇺🇸"]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
2144,0,14617,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
2724,0,16759,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
4223,0,15862,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,


### Preprocessing

In [965]:
#train_df.tweet = train_df.tweet.str.lower()

In [1130]:
### Spelling correction
def spell_correct(text):
    ### Removes letter repitition and corrects spellings
    pattern = re.compile(r"(.)\1{2,}")
    return spell(pattern.sub(r"\1\1", text))

In [1136]:
### Correct spellings
# clean_words = train_df.tweet.apply(lambda x: ' '.join([spell_correct(words) for words in x.split(" ")]))

### 3. Feature Engineering

#### 3.1 Sentiment Polarity

In [878]:
sentiment_polarity_values = train_df.tweet.apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))

In [879]:
polarity_array = []

for each in sentiment_polarity_values:
    temp_polarity_array = list(each.values())
    polarity_array.append(temp_polarity_array)

#### 3.2 Word2Vec Embeddings

#### Stopword collection

In [1031]:
### Getting stopwords
stop_words_list = stopwords.words('english')
tweet_tokens = train_df.tweet.apply(word_tokenize)

In [1032]:
## Combining all the tokenized words
words = ' '.join(words for sent in tweet_tokens.values for words in sent )

In [1033]:
word_count = Counter(words.split(" "))

In [1034]:
## Selecting words with frequency 1
less_freq_words = [word for (word,count) in word_count.most_common()[-16200:]]

In [1035]:
word_count.most_common(200)

[('@', 33437),
 ('USER', 33412),
 ('.', 14360),
 ('the', 8071),
 ('is', 6392),
 ('to', 6170),
 ("''", 5565),
 ('#', 5399),
 ('a', 5103),
 ('!', 5099),
 ('and', 4588),
 ('you', 4063),
 ('of', 3731),
 ('are', 3465),
 ('I', 3440),
 ('?', 3016),
 ('that', 2629),
 ('in', 2575),
 ('’', 2574),
 ('for', 2382),
 ('URL', 2058),
 ('it', 2026),
 ('he', 1880),
 ('...', 1806),
 ('on', 1648),
 ('she', 1559),
 ('not', 1466),
 ('with', 1450),
 ('have', 1415),
 ("'s", 1393),
 ('be', 1373),
 ('this', 1372),
 ('``', 1347),
 ("n't", 1289),
 ('You', 1245),
 ('do', 1239),
 (',', 1235),
 ('they', 1221),
 ('He', 1159),
 ('gun', 1144),
 ('control', 1114),
 ('all', 1055),
 ('your', 1047),
 ('like', 1025),
 ('s', 995),
 ('was', 984),
 ('about', 982),
 ('as', 976),
 ('t', 958),
 ('so', 945),
 ('her', 933),
 ('She', 922),
 ('will', 898),
 ('MAGA', 883),
 (';', 883),
 ('liberals', 866),
 ('who', 853),
 ('The', 836),
 ('what', 828),
 ('just', 827),
 ('people', 824),
 ('but', 786),
 ('&', 784),
 ('from', 730),
 ('has'

In [1036]:
high_freq_words = [word for (word,count) in word_count.most_common(100)]
words_to_keep = ['gun','control','MAGA','liberals','Antifa','conservatives','Trump','Liberals','amp','people','control','god',
                'right','Conservatives','good','antifa','don','love','better','most','Kavanaugh','ANTIFA','Gun','support',
                'america','thank','lol','nothing','Democrats','vote','trying','money','laws','party','very','country','down','No','man','president']
words_to_keep = [each.lower() for each in words_to_keep]
_ = [high_freq_words.remove(each) for each in set(words_to_keep) if each in high_freq_words]

In [1037]:
stop_words_list.extend(less_freq_words)
stop_words_list.extend(high_freq_words)

#### Removing stopwords from tweets

In [973]:
tweet_tokens_clean = tweet_tokens.apply(lambda x: [words for words in x if words not in stop_words_list])

In [974]:
tweet_tokens_clean_list = [list(np.where(len(each) > 0,each,['UNK'])) for each in tweet_tokens_clean]

#### Creating Word2Vec models

In [895]:
from gensim.models import KeyedVectors
from gensim.test.utils import datapath, get_tmpfile
tmp_file = get_tmpfile("glove.twitter.27B.200d.txt")

In [907]:
def prepend_slow(infile, outfile, line):
	"""
	Slower way to prepend the line by re-creating the inputfile.
	"""
	with open(infile, 'r',encoding = 'utf8') as fin:
		with open(outfile, 'w',encoding = 'utf8') as fout:
			fout.write(line + "\n")
			for line in fin:
				fout.write(line)

In [908]:
# prepend_slow("glove.twitter.27B.200d.txt","glove.twitter.27B.200d_out.txt","400000 200")

In [909]:
model = KeyedVectors.load_word2vec_format("glove.twitter.27B.200d_out.txt")

In [332]:
#model = Word2Vec(tweet_tokens_clean_list, size = 200, window=5, min_count=1)

In [917]:
#model.vocab.keys()

In [920]:
random_array = np.random.rand(100)
embedding_list = []
for each in tweet_tokens_clean_list:
    each_filter = [words for words in each if words in model.vocab.keys()]
    
    if(len(each_filter) == 0):
        embedding_list.append(list(random_array))
    else:
        embedding_list.append(list(np.mean(model[each_filter],axis = 0)))

In [928]:
word_embeddings_df = pd.DataFrame(embedding_list)
word_embeddings_df = word_embeddings_df.fillna(0)

#### 3.3 Number of Hashtags and Number of User count

In [949]:
tweet_token_space = train_df.tweet.apply(lambda x: x.split(" "))

In [950]:
user_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('@')])).values
hast_tag_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('#')])).values

#### 3.4 Character n-grams

In [344]:
def word2ngrams(text, n=3, exact=True):
    """ Convert text into character ngrams. """
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

In [345]:
char_3_gram = train_df.tweet.apply(word2ngrams,n = 3)
char_4_gram = train_df.tweet.apply(word2ngrams,n = 4)
char_5_gram = train_df.tweet.apply(word2ngrams,n = 5)
char_6_gram = train_df.tweet.apply(word2ngrams,n = 6)
char_7_gram = train_df.tweet.apply(word2ngrams,n = 7)

In [346]:
char_3_gram_dict = {}
char_4_gram_dict = {}
char_5_gram_dict = {}
char_6_gram_dict = {}
char_7_gram_dict = {}

for row_num, text in enumerate(train_df.tweet):
    char_3_gram_dict[text] = char_3_gram.values[row_num]
    char_4_gram_dict[text] = char_4_gram.values[row_num]
    char_5_gram_dict[text] = char_5_gram.values[row_num]
    char_6_gram_dict[text] = char_6_gram.values[row_num]
    char_7_gram_dict[text] = char_7_gram.values[row_num]

#### 3.5 Dictionary of cuss words

In [27]:
import xml.etree.ElementTree as ET
import requests

In [63]:
from lxml import html

In [303]:
cuss_words_df = pd.DataFrame()

for alphas in list(string.ascii_lowercase):
    #print(alphas)
    url = "https://www.noswearing.com//dictionary//"+alphas
    XML = requests.get(url)

    tree = html.fromstring(XML.content)
    
    td_files = tree.xpath("//center")[4].getchildren()[0].getchildren()[1].getchildren()[0].getchildren()[0]

    for tags in td_files.getchildren():
        if(tags.text != None):

            #print(tags.text)
            #print(tags.tail)
            test = pd.DataFrame({'Cuss_word':[tags.text.strip()],'Meaning':[tags.tail.replace("-","").strip()]})
            cuss_words_df = pd.concat([cuss_words_df,test],axis = 0)

In [304]:
cuss_words_df.reset_index(inplace=True,drop = True)

In [1084]:
all_cuss = list(set(cuss_words_df.Cuss_word.values)) + list(set(cuss_words_df.Meaning.values))

In [1113]:
### Presence of cuss word
cuss_total = train_df.tweet.apply(lambda x: len([words for words in word_tokenize(x) if words.lower() in all_cuss]))

## Position of cuss word
cuss_position = train_df.tweet.apply(lambda x: np.mean([pos for pos,words in enumerate(word_tokenize(x)) if words.lower() in all_cuss]))
cuss_position = cuss_position.fillna(0)

  out=out, **kwargs)


#### 3.6 POS Tagging

In [645]:
#tweet_pos = train_df.tweet.apply(lambda x: (x.split(" ")))

#### Bag of Words

In [951]:
#https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer
stop_words_list.append('user')
stop_words_list.append('url')

In [984]:
bow_counts_char_n_gram = CountVectorizer(tokenizer=lambda key: char_4_gram_dict[key],
                                         preprocessor= lambda x:x, lowercase=False,stop_words = stop_words_list)
bow_data_char_n_gram = bow_counts_char_n_gram.fit_transform(train_df.tweet)

In [1063]:
bow_counts = CountVectorizer(tokenizer= word_tokenize, stop_words = stop_words_list,ngram_range=(1,2))
bow_data = bow_counts.fit_transform(train_df.tweet)

#### Tf Idf Vectorizer

In [527]:
tfidf_counts_char_gram = TfidfVectorizer(tokenizer=lambda key: char_5_gram_dict[key], preprocessor= lambda x:x, lowercase=False)
tfidf_data_char_gram = tfidf_counts_char_gram.fit_transform(train_df.tweet)

In [519]:
tfidf_counts = TfidfVectorizer(tokenizer=word_tokenize, stop_words = stop_words_list,ngram_range=(1,4))
tfidf_data = tfidf_counts.fit_transform(train_df.tweet)

### Train test split

In [520]:
#senti_analyser = SentimentAnalyzer()
#senti_analyser.all_words(train_df.tweet.values[0])

In [937]:
#tfidf_data2 = np.concatenate((list(user_counts),embedding_list),axis=0)

In [1140]:
### Input data
#tfidf_data_char_gram
X = sp.sparse.hstack((bow_data,pd.DataFrame([user_counts,hast_tag_counts]).T.values,
                      word_embeddings_df.values,pd.DataFrame([cuss_total,cuss_position]).T.values,
                     pd.DataFrame(polarity_array).values ) ,format='csr')

### Input Column names
X_columns=bow_counts.get_feature_names()+['user_counts','hash_tag_counts'] + list(word_embeddings_df.columns.values) + \
                                        ['Cuss_word','Cuss_position'] + ['Polarity1','Polarity2','Polarity3','Polarity4']

In [1141]:
X_train_bow, X_test_bow, y_train_bow, y_test_bow =  \
                            train_test_split(X,train_df.subtask_a,test_size = 0.2,random_state = 0)

In [1142]:
X_train_char, X_test_char, y_train_char, y_test_char = \
                train_test_split(bow_data_char_n_gram,train_df.subtask_a,test_size = 0.2,random_state = 0)

### Model Creation

#### 1. Logistic Regression Model - BOW

In [1143]:
for C in [0.75,1,2,0.5]:
    lr_model = LogisticRegression(C = C)
    lr_model.fit(X_train_bow,y_train_bow)
    test_pred_lr_bow = lr_model.predict(X_test_bow)

    #acc = 100 * np.sum(test_pred == y_test)/len(y_test)
    print(C,": ",f1_score(y_test_bow,test_pred_lr_bow,average='weighted') )

0.75 :  0.7722846821720458
1 :  0.7718397045390523
2 :  0.7701547658068698
0.5 :  0.77667312116357


In [1144]:
(y_test_bow == test_pred_lr_bow).sum()/len(y_test_bow)

0.7862537764350453

#### 2. Logistic Regression Model - character n gram

In [727]:
for C in [0.75,1,2,0.5]:
    lr_model = LogisticRegression(C = C)
    lr_model.fit(X_train_char,y_train_char)
    test_pred_lr_char = lr_model.predict(X_test_char)

    #acc = 100 * np.sum(test_pred == y_test)/len(y_test)
    print(C,": ",f1_score(y_test_char,test_pred_lr_char,average='weighted') )

0.75 :  0.7515914006050404
1 :  0.7530801481915396
2 :  0.7519957127859939
0.5 :  0.7514206118958465


In [728]:
(y_test_char == test_pred_lr_char).sum()/len(y_test_char)

0.7620845921450151

### Random Forest

In [740]:
rf_model = RandomForestClassifier(n_estimators = 50)
rf_model.fit(X_train_bow,y_train_bow)
test_pred_rf = rf_model.predict(X_test_bow)

In [741]:
acc = 100 * np.sum(test_pred_rf == y_test_bow)/len(y_test_bow)
print("F1 : ",f1_score(y_test_bow,test_pred_rf,average='weighted'))
print("Acc:", acc)

F1 :  0.7519213753831259
Acc: 77.56797583081571


### XGBoost

In [935]:
xgb_model = XGBClassifier(n_estimators = 600, max_depth= 6,learning_rate = 0.3)
xgb_model.fit(X_train_bow,y_train_bow)
test_pred_xgb = xgb_model.predict(X_test_bow)

  if diff:


In [936]:
acc = 100 * np.sum(test_pred_xgb == y_test_bow)/len(y_test_bow)
print("F1 : ",f1_score(y_test,test_pred_xgb,average='weighted'))
print("Acc:", acc)

F1 :  0.7653851532482638
Acc: 77.56797583081571


### Ensemble

In [753]:
result_df = pd.DataFrame([test_pred_lr_bow,test_pred_lr_char,test_pred_rf,test_pred_xgb]).T

In [764]:
result_df['prediction'] = result_df.apply(lambda x: x.value_counts().index[0],axis= 1)

In [768]:
100 * np.sum(result_df['prediction'].values == y_test_bow)/len(y_test_bow)

78.20996978851964

## Validating the result

In [388]:
top_n = 20
top_features = np.array(X_columns)[(-np.abs(lr_model.coef_)).argsort()[0][0:top_n]]

In [389]:
pd.DataFrame(top_features)

Unnamed: 0,0
0,hing!
1,Ouch!
2,ws. .
3,fuck
4,fuck
5,shit
6,bitch
7,ass
8,ews.
9,shit


### Looking at some of the misclassified tweets

In [773]:
misclassified_tweets = train_df.iloc[(y_test_bow[(y_test_bow != result_df['prediction'].values)].index.values - 1).tolist(),:]

In [775]:
y_test_bow[(y_test_bow != result_df['prediction'].values)].index.values[0:5]

array([  398,  3162,  2071,  8001, 12048], dtype=int64)

In [778]:
#### Inputs with misclassification

#X_test_misclassified = X_test_bow.toarray()[(y_test_bow != result_df['prediction'].values),:]

In [777]:
misclassified_tweets.head(10)

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
398,0,58548,@USER @USER Wonder if he apologized to Diamond...,OFF,TIN,IND
3162,0,34571,"""an already-debunked viral hoax"""" Tell me who ...",OFF,TIN,IND
2071,0,49760,"""Black Female Democrats Call for Changes in Pa...",OFF,TIN,GRP
8001,0,18083,@USER @USER @USER @USER Loved to hate MR in Bo...,OFF,TIN,IND
12048,0,98896,@USER @USER Just saw it. Still vague though. H...,OFF,TIN,IND
7132,0,90130,"""@USER @USER @USER @USER @USER @USER @USER @US...",OFF,TIN,IND
7328,0,16191,@USER @USER @USER @USER shitt on Pitt - perfec...,OFF,TIN,IND
13115,0,30186,@USER @USER @USER I think he is on crack!,OFF,TIN,IND
3356,0,55498,@USER @USER Tucker Carlson is not an idiot. H...,NOT,,
314,0,42133,@USER @USER @USER @USER @USER @USER You must b...,OFF,TIN,GRP


In [607]:
misclassified_tweets.head(1)

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
148,0,68629,@USER Let the leftist democrats riot in the st...,OFF,TIN,IND


In [779]:
misclassified_tweets.to_csv('misclassified_tweets_ensemble.csv',index = False)

### Tokens and Features

In [614]:
position_of_misclassified = 4


non_zero_features = [pos for pos,each in enumerate(X_test_misclassified[position_of_misclassified]) if each > 0]

tokens = np.array(X_columns)[non_zero_features]

words = [X_columns.index(each) for each in tokens]
score = [lr_model.coef_[0][each] for each in words]

pd.DataFrame(tokens,score)

Unnamed: 0,0
0.012921,disgrace
-0.042033,DISG
0.004921,a DI
0.419129,are
0.086794,you
-0.423991,@USER
0.004921,DISGR
0.033888,ER yo
0.004921,GRACE
0.004921,ISGRA


In [780]:
train_df[train_df.tweet.str.contains('hate')].subtask_a.value_counts()

OFF    133
NOT    125
Name: subtask_a, dtype: int64

In [781]:
train_df[train_df.tweet.str.contains('hate')]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
74,0,27645,"""@USER I do remember. :( But somehow centrist""...",NOT,,
137,0,28812,@USER This all I get 😢 damn these haters delet...,OFF,TIN,GRP
234,0,34317,"""@USER @USER @USER @USER @USER @USER @USER @US...",OFF,TIN,IND
314,0,42133,@USER @USER @USER @USER @USER @USER You must b...,OFF,TIN,GRP
486,0,30426,@USER @USER NOT A SNOWBALLS CHANCE IN HELL DIM...,OFF,TIN,IND
524,0,85623,@USER Because he hates,NOT,,
525,0,81520,@USER The hypocrisy of the #NEverTrump movemen...,NOT,,
574,0,93516,@USER Get the hell out of my country that u ha...,NOT,,
625,0,14239,@USER @USER Such delusional liberals..! So twi...,OFF,TIN,OTH
642,0,83576,@USER @USER @USER We have a stupid problem. Co...,OFF,UNT,


In [706]:
lr_model = LogisticRegression(C = 2500)
lr_model.fit(X_train,y_train)
test_pred = lr_model.predict(X_test)

print(C,": ",100 * np.sum(test_pred == y_test)/len(y_test))
confusion_matrix(y_test,test_pred)

5000 :  72.84743202416918


array([[1629,  141],
       [ 578,  300]], dtype=int64)

#### Random Forest

In [707]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)

test_pred = lr_model.predict(X_test)
f1_score(y_test,test_pred,average='weighted') 

0.6984123845797418

#### Top Features

In [2]:
from autocorrect import spell

In [588]:
spell('helooo')

'hello'

In [9]:
tweet = '@USER ðŸ‘ŒðŸ» Iâ€™ve never seen anyone talk like that on Twitter before and Iâ€™ve seen some really messed up shit'

In [21]:
spell('he!!')

'he'

'hello'