In [307]:
import os
import pandas as pd
import numpy as np
import scipy as sp

from gensim.models import Word2Vec
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

## Modeling 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer



In [308]:
os.listdir(os.getcwd()+"\\training-v1")

['offenseval-annotation.txt',
 'offenseval-training-v1.tsv',
 'readme-trainingset-v1.txt']

### 1. Reading in the dataset

In [309]:
train_data = []
train_df = pd.DataFrame()

with open('.\\training-v1\\offenseval-training-v1.tsv','r', encoding = 'utf-8') as in_file:
    #train_data = [line.strip().split('\t') for line in in_file]
    
    for line in in_file:
        train_df = pd.concat([train_df,pd.DataFrame([line.strip().split('\t')])], axis = 0)
        
train_df.reset_index(inplace=True)

### 2. Analyzing the data and response variables

In [310]:
## Setting the first row as the column headers
train_df.columns = train_df.iloc[0,:].values
train_df = train_df.iloc[1:,]

In [311]:
train_df.head()

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1,0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
2,0,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
3,0,16820,Amazon is investigating Chinese employees who ...,NOT,,
4,0,62688,"""@USER Someone should'veTaken"""" this piece of ...",OFF,UNT,
5,0,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [312]:
train_df.shape

(13240, 6)

#### Distribution of each of the sub-tasks

In [313]:
train_df.subtask_a.value_counts()

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64

In [314]:
train_df.subtask_b.value_counts()

NULL    8840
TIN     3876
UNT      524
Name: subtask_b, dtype: int64

In [315]:
train_df.subtask_c.value_counts()

NULL    9364
IND     2407
GRP     1074
OTH      395
Name: subtask_c, dtype: int64

In [316]:
#train_df.tweet.value_counts()

In [317]:
train_df[train_df.tweet == '@USER Looks Like The Jokes On Liberals Again.  #FortTrump #Poland #BoomingEconomy URL']

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1270,0,45643,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
4073,0,22953,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
8333,0,60513,@USER Looks Like The Jokes On Liberals Again. ...,OFF,TIN,GRP
10460,0,66322,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
10624,0,38491,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
11304,0,73520,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,


In [318]:
train_df[train_df.tweet == "@USER An obvious last minute liberal ploy to delay confirmation. More dirty tricks since the liberals lost the previous election."]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
5166,0,81140,@USER An obvious last minute liberal ploy to d...,NOT,,
5307,0,46503,@USER An obvious last minute liberal ploy to d...,NOT,,


In [319]:
train_df[train_df.tweet ==  "@USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER Following all #Maga patriots please follow back 👍  #LionsDen 🦁  #MAGA2KAG 🇺🇸"]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
2144,0,14617,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
2724,0,16759,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
4223,0,15862,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,


### 3. Feature Engineering

#### 3.1 Sentiment Polarity

In [320]:
sentiment_polarity_values = train_df.tweet.apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))

In [321]:
polarity_array = []

for each in sentiment_polarity_values:
    temp_polarity_array = list(each.values())
    polarity_array.append(temp_polarity_array)

#### 3.2 Word2Vec Embeddings

#### Stopword collection

In [322]:
### Getting stopwords
stop_words_list = stopwords.words('english')
tweet_tokens = train_df.tweet.apply(word_tokenize)

In [323]:
## Combining all the tokenized words
words = ' '.join(words for sent in tweet_tokens.values for words in sent )

In [324]:
word_count = Counter(words.split(" "))

In [325]:
## Selecting words with frequency 1
less_freq_words = [word for (word,count) in word_count.most_common()[-16200:]]

In [326]:
word_count.most_common(200)

[('@', 33437),
 ('USER', 33412),
 ('.', 14360),
 ('the', 8071),
 ('is', 6392),
 ('to', 6170),
 ("''", 5565),
 ('#', 5399),
 ('a', 5103),
 ('!', 5099),
 ('and', 4588),
 ('you', 4063),
 ('of', 3731),
 ('are', 3465),
 ('I', 3440),
 ('?', 3016),
 ('that', 2629),
 ('in', 2575),
 ('’', 2574),
 ('for', 2382),
 ('URL', 2058),
 ('it', 2026),
 ('he', 1880),
 ('...', 1806),
 ('on', 1648),
 ('she', 1559),
 ('not', 1466),
 ('with', 1450),
 ('have', 1415),
 ("'s", 1393),
 ('be', 1373),
 ('this', 1372),
 ('``', 1347),
 ("n't", 1289),
 ('You', 1245),
 ('do', 1239),
 (',', 1235),
 ('they', 1221),
 ('He', 1159),
 ('gun', 1144),
 ('control', 1114),
 ('all', 1055),
 ('your', 1047),
 ('like', 1025),
 ('s', 995),
 ('was', 984),
 ('about', 982),
 ('as', 976),
 ('t', 958),
 ('so', 945),
 ('her', 933),
 ('She', 922),
 ('will', 898),
 ('MAGA', 883),
 (';', 883),
 ('liberals', 866),
 ('who', 853),
 ('The', 836),
 ('what', 828),
 ('just', 827),
 ('people', 824),
 ('but', 786),
 ('&', 784),
 ('from', 730),
 ('has'

In [327]:
high_freq_words = [word for (word,count) in word_count.most_common(100)]
words_to_keep = ['gun','MAGA','liberals','Antifa','conservatives','Trump','Liberals','amp','people','control']

_ = [high_freq_words.remove(each) for each in words_to_keep]

In [328]:
#high_freq_words

In [329]:
stop_words_list.extend(less_freq_words)
stop_words_list.extend(high_freq_words)

#### Removing stopwords from tweets

In [330]:
tweet_tokens_clean = tweet_tokens.apply(lambda x: [words for words in x if words not in stop_words_list])

In [331]:
tweet_tokens_clean_list = [list(np.where(len(each) > 0,each,['UNK'])) for each in tweet_tokens_clean]

#### Creating Word2Vec models

In [332]:
model = Word2Vec(tweet_tokens_clean_list, size = 200, window=5, min_count=1)

In [333]:
random_array = np.random.rand(100)
embedding_list = []
for each in tweet_tokens_clean_list:
    if(len(each) == 0):
        embedding_list.append(list(random_array))
    else:
        embedding_list.append(list(np.mean(model[each],axis = 0)))

  import sys


In [412]:
word_embeddings_df = pd.DataFrame(embedding_list)

#### 3.3 Number of Hashtags and Number of User count

In [334]:
tweet_token_space = train_df.tweet.apply(lambda x: x.split(" "))

In [335]:
user_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('@')])).values
hast_tag_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('#')])).values

#### 3.4 Character n-grams

In [344]:
def word2ngrams(text, n=3, exact=True):
    """ Convert text into character ngrams. """
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

In [345]:
char_3_gram = train_df.tweet.apply(word2ngrams,n = 3)
char_4_gram = train_df.tweet.apply(word2ngrams,n = 4)
char_5_gram = train_df.tweet.apply(word2ngrams,n = 5)
char_6_gram = train_df.tweet.apply(word2ngrams,n = 6)
char_7_gram = train_df.tweet.apply(word2ngrams,n = 7)

In [346]:
char_3_gram_dict = {}
char_4_gram_dict = {}
char_5_gram_dict = {}
char_6_gram_dict = {}
char_7_gram_dict = {}

for row_num, text in enumerate(train_df.tweet):
    char_3_gram_dict[text] = char_3_gram.values[row_num]
    char_4_gram_dict[text] = char_4_gram.values[row_num]
    char_5_gram_dict[text] = char_5_gram.values[row_num]
    char_6_gram_dict[text] = char_6_gram.values[row_num]
    char_7_gram_dict[text] = char_7_gram.values[row_num]

#### 3.5 Dictionary of cuss words

In [27]:
import xml.etree.ElementTree as ET
import requests

In [63]:
from lxml import html

In [303]:
cuss_words_df = pd.DataFrame()

for alphas in list(string.ascii_lowercase):
    #print(alphas)
    url = "https://www.noswearing.com//dictionary//"+alphas
    XML = requests.get(url)

    tree = html.fromstring(XML.content)
    
    td_files = tree.xpath("//center")[4].getchildren()[0].getchildren()[1].getchildren()[0].getchildren()[0]

    for tags in td_files.getchildren():
        if(tags.text != None):

            #print(tags.text)
            #print(tags.tail)
            test = pd.DataFrame({'Cuss_word':[tags.text.strip()],'Meaning':[tags.tail.replace("-","").strip()]})
            cuss_words_df = pd.concat([cuss_words_df,test],axis = 0)

In [304]:
cuss_words_df.reset_index(inplace=True,drop = True)

In [336]:
cuss_words_df.head()

Unnamed: 0,Cuss_word,Meaning
0,anus,butt
1,arse,butt
2,arsehole,butt
3,ass,butt
4,ass-hat,idiot


#### 3.6 POS Tagging

In [645]:
#tweet_pos = train_df.tweet.apply(lambda x: (x.split(" ")))

#### Bag of Words

In [342]:
#https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer
stop_words_list.append('user')
stop_words_list.append('url')

In [516]:
bow_counts_char_n_gram = CountVectorizer(tokenizer=lambda key: char_4_gram_dict[key],
                                         preprocessor= lambda x:x, lowercase=False,stop_words = stop_words_list)
bow_data_char_n_gram = bow_counts_char_n_gram.fit_transform(train_df.tweet)

In [517]:
bow_counts = CountVectorizer(tokenizer= word_tokenize, stop_words = stop_words_list,ngram_range=(1,2))
bow_data = bow_counts.fit_transform(train_df.tweet)

#### Tf Idf Vectorizer

In [527]:
tfidf_counts_char_gram = TfidfVectorizer(tokenizer=lambda key: char_5_gram_dict[key], preprocessor= lambda x:x, lowercase=False)
tfidf_data_char_gram = tfidf_counts_char_gram.fit_transform(train_df.tweet)

In [519]:
tfidf_counts = TfidfVectorizer(tokenizer=word_tokenize, stop_words = stop_words_list,ngram_range=(1,3),)
tfidf_data = tfidf_counts.fit_transform(train_df.tweet)

### Train test split

In [520]:
#senti_analyser = SentimentAnalyzer()
#senti_analyser.all_words(train_df.tweet.values[0])

In [521]:
#tfidf_data2 = np.concatenate((list(user_counts),embedding_list),axis=0)

In [522]:
#word_embeddings_df = word_embeddings_df.fillna(0)

In [None]:
pd.DataFrame(user_counts,hast_tag_counts)

In [532]:
### Input data
#tfidf_data_char_gram
X = sp.sparse.hstack((bow_data,bow_data_char_n_gram,tfidf_data),format='csr')

### Input Column names
X_columns=bow_counts.get_feature_names()+  \
            bow_counts_char_n_gram.get_feature_names()+\
                    tfidf_counts.get_feature_names()  

In [564]:
X_train, X_test, y_train, y_test =  train_test_split(X,train_df.subtask_a,test_size = 0.2)

### Model Creation

#### Logistic Regression Model

In [565]:
for C in [0.75,1,2,0.5]:
    lr_model = LogisticRegression(C = C)
    lr_model.fit(X_train,y_train)
    test_pred = lr_model.predict(X_test)

    #acc = 100 * np.sum(test_pred == y_test)/len(y_test)
    print(C,": ",f1_score(y_test,test_pred,average='weighted') )

0.75 :  0.7534852387252842
1 :  0.7537323564566173
2 :  0.7501793131352553
0.5 :  0.7537325258552041


In [566]:
(y_test == test_pred).sum()/len(y_test)

0.7662386706948641

### Random Forest

In [567]:
rf_model = RandomForestClassifier(n_estimators = 50)
rf_model.fit(X_train,y_train)
test_pred = rf_model.predict(X_test)

In [568]:
acc = 100 * np.sum(test_pred == y_test)/len(y_test)
print("F1 : ",f1_score(y_test,test_pred,average='weighted'))
print("Acc:", acc)

F1 :  0.7365723641233011
Acc: 76.54833836858006


### XGBoost

In [569]:
from xgboost.sklearn import XGBClassifier

In [572]:
xgb_model = XGBClassifier(n_estimators = 500, max_depth= 6,learning_rate = 0.3)
xgb_model.fit(X_train,y_train)
test_pred = xgb_model.predict(X_test)

  if diff:


In [573]:
acc = 100 * np.sum(test_pred == y_test)/len(y_test)
print("F1 : ",f1_score(y_test,test_pred,average='weighted'))
print("Acc:", acc)

F1 :  0.7678135614867583
Acc: 77.9833836858006


## Validating the result

In [388]:
top_n = 20
top_features = np.array(X_columns)[(-np.abs(lr_model.coef_)).argsort()[0][0:top_n]]

In [389]:
pd.DataFrame(top_features)

Unnamed: 0,0
0,hing!
1,Ouch!
2,ws. .
3,fuck
4,fuck
5,shit
6,bitch
7,ass
8,ews.
9,shit


### Looking at some of the misclassified tweets

In [603]:
misclassified_tweets = train_df.iloc[(y_test[(y_test != test_pred)].index.values - 1).tolist(),:]

In [604]:
y_test[(y_test != test_pred)].index.values[0:5]

array([  148, 12436,  1748,  9803, 10525], dtype=int64)

In [605]:
#### Inputs with misclassification

X_test_misclassified = X_test.toarray()[(y_test != test_pred),:]

In [606]:
misclassified_tweets.head(10)

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
148,0,68629,@USER Let the leftist democrats riot in the st...,OFF,TIN,IND
12436,0,43619,"""....."""" And he is shuting up. Don't mess wi...",OFF,TIN,IND
1748,0,99677,@USER The #metoo movement has turned out just ...,NOT,,
9803,0,37881,@USER He is the worst person.,OFF,TIN,IND
10525,0,72803,@USER you are a DISGRACE,OFF,TIN,IND
4476,0,93549,@USER @USER @USER @USER @USER @USER @USER @USE...,OFF,UNT,
4270,0,69659,@USER Do not think so maybe do a little resear...,OFF,TIN,OTH
3919,0,42219,"""@USER There are deals on Mexico EU trade and ...",OFF,UNT,
9493,0,37828,@USER She is by far the most pretentious littl...,OFF,TIN,IND
12530,0,81420,@USER @USER Liberals didn't want to hear from ...,OFF,TIN,IND


In [607]:
misclassified_tweets.head(1)

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
148,0,68629,@USER Let the leftist democrats riot in the st...,OFF,TIN,IND


In [273]:
#misclassified_tweets.to_csv('misclassified_tweets.csv',index = False)

### Tokens and Features

In [337]:
position_of_misclassified = 3


non_zero_features = [pos for pos,each in enumerate(X_test_misclassified[position_of_misclassified]) if each > 0]

tokens = np.array(bow_counts.get_feature_names())[non_zero_features]

words = [bow_counts.get_feature_names().index(each) for each in tokens]
score = [lr_model.coef_[0][each] for each in words]

pd.DataFrame(tokens,score)

Unnamed: 0,0
-0.389097,church
0.0,church god
10.755536,disgusting
0.0,disgusting going
-2.699415,god
-2.753855,going
0.002113,going taking
-1.856062,taking
0.0,taking whole
1.368911,whole


In [339]:
train_df[train_df.tweet.str.contains('shit')].subtask_a.value_counts()

OFF    369
NOT     61
Name: subtask_a, dtype: int64

In [706]:
lr_model = LogisticRegression(C = 2500)
lr_model.fit(X_train,y_train)
test_pred = lr_model.predict(X_test)

print(C,": ",100 * np.sum(test_pred == y_test)/len(y_test))
confusion_matrix(y_test,test_pred)

5000 :  72.84743202416918


array([[1629,  141],
       [ 578,  300]], dtype=int64)

#### Random Forest

In [707]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)

test_pred = lr_model.predict(X_test)
f1_score(y_test,test_pred,average='weighted') 

0.6984123845797418

#### Top Features

In [2]:
from autocorrect import spell

In [588]:
spell('helooo')

'hello'

In [9]:
tweet = '@USER ðŸ‘ŒðŸ» Iâ€™ve never seen anyone talk like that on Twitter before and Iâ€™ve seen some really messed up shit'

In [21]:
spell('he!!')

'he'

In [602]:
import re
def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

word = 'helllllllo'
word_wlf = reduce_lengthening(word)
spell(word_wlf)

'hello'