In [15]:
import os
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize
from nltk import pos_tag
from collections import Counter

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

## Modeling 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer

In [2]:
os.listdir(os.getcwd()+"\\training-v1")

['offenseval-annotation.txt',
 'offenseval-training-v1.tsv',
 'readme-trainingset-v1.txt']

### 1. Reading in the dataset

In [3]:
train_data = []
train_df = pd.DataFrame()

with open('.\\training-v1\\offenseval-training-v1.tsv','r', encoding = 'utf-8') as in_file:
    #train_data = [line.strip().split('\t') for line in in_file]
    
    for line in in_file:
        train_df = pd.concat([train_df,pd.DataFrame([line.strip().split('\t')])], axis = 0)
        
train_df.reset_index(inplace=True)

### 2. Analyzing the data and response variables

In [4]:
## Setting the first row as the column headers
train_df.columns = train_df.iloc[0,:].values
train_df = train_df.iloc[1:,]

In [5]:
train_df.head()

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1,0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
2,0,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
3,0,16820,Amazon is investigating Chinese employees who ...,NOT,,
4,0,62688,"""@USER Someone should'veTaken"""" this piece of ...",OFF,UNT,
5,0,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [6]:
train_df.shape

(13240, 6)

#### Distribution of each of the sub-tasks

In [7]:
train_df.subtask_a.value_counts()

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64

In [8]:
train_df.subtask_b.value_counts()

NULL    8840
TIN     3876
UNT      524
Name: subtask_b, dtype: int64

In [9]:
train_df.subtask_c.value_counts()

NULL    9364
IND     2407
GRP     1074
OTH      395
Name: subtask_c, dtype: int64

In [685]:
#train_df.tweet.value_counts()

In [659]:
train_df[train_df.tweet == '@USER Looks Like The Jokes On Liberals Again.  #FortTrump #Poland #BoomingEconomy URL']

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1270,0,45643,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
4073,0,22953,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
8333,0,60513,@USER Looks Like The Jokes On Liberals Again. ...,OFF,TIN,GRP
10460,0,66322,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
10624,0,38491,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
11304,0,73520,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,


In [661]:
train_df[train_df.tweet == "@USER An obvious last minute liberal ploy to delay confirmation. More dirty tricks since the liberals lost the previous election."]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
5166,0,81140,@USER An obvious last minute liberal ploy to d...,NOT,,
5307,0,46503,@USER An obvious last minute liberal ploy to d...,NOT,,


In [663]:
train_df[train_df.tweet ==  "@USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER Following all #Maga patriots please follow back 👍  #LionsDen 🦁  #MAGA2KAG 🇺🇸"]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
2144,0,14617,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
2724,0,16759,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
4223,0,15862,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,


### 3. Feature Engineering

#### 3.1 Sentiment Polarity

In [10]:
sentiment_polarity_values = train_df.tweet.apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))

In [11]:
polarity_array = []

for each in sentiment_polarity_values:
    temp_polarity_array = list(each.values())
    polarity_array.append(temp_polarity_array)

#### 3.2 Word2Vec Embeddings

#### Stopword collection

In [12]:
### Getting stopwords
stop_words_list = stopwords.words('english')
tweet_tokens = train_df.tweet.apply(word_tokenize)

In [13]:
## Combining all the tokenized words
words = ' '.join(words for sent in tweet_tokens.values for words in sent )

In [16]:
word_count = Counter(words.split(" "))

In [17]:
## Selecting words with frequency 1
less_freq_words = [word for (word,count) in word_count.most_common()[-16200:]]

In [18]:
high_freq_words = [word for (word,count) in word_count.most_common(100)]
words_to_keep = ['gun','MAGA','liberals','Antifa','conservatives','Trump','Liberals','amp','people','control']

_ = [high_freq_words.remove(each) for each in words_to_keep]

In [19]:
stop_words_list.extend(less_freq_words)
stop_words_list.extend(high_freq_words)

#### Removing stopwords from tweets

In [20]:
tweet_tokens_clean = tweet_tokens.apply(lambda x: [words for words in x if words not in stop_words_list])

In [21]:
tweet_tokens_clean_list = [list(np.where(len(each) > 0,each,['UNK'])) for each in tweet_tokens_clean]

#### Creating Word2Vec models

In [22]:
model = Word2Vec(tweet_tokens_clean_list, size = 200, window=5, min_count=1)

In [23]:
random_array = np.random.rand(100)
embedding_list = []
for each in tweet_tokens_clean_list:
    if(len(each) == 0):
        embedding_list.append(list(random_array))
    else:
        embedding_list.append(list(np.mean(model[each],axis = 0)))

  import sys


#### 3.3 Number of Hashtags and Number of User count

In [24]:
tweet_token_space = train_df.tweet.apply(lambda x: x.split(" "))

In [25]:
user_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('@')])).values
hast_tag_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('#')])).values

#### 3.4 Character n-grams

In [26]:
def word2ngrams(text, n=3, exact=True):
    """ Convert text into character ngrams. """
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

In [27]:
char_3_gram = train_df.tweet.apply(word2ngrams,n = 3)
char_4_gram = train_df.tweet.apply(word2ngrams,n = 4)
char_5_gram = train_df.tweet.apply(word2ngrams,n = 5)
char_6_gram = train_df.tweet.apply(word2ngrams,n = 6)
char_7_gram = train_df.tweet.apply(word2ngrams,n = 7)

In [28]:
char_3_gram_dict = {}
char_4_gram_dict = {}
char_5_gram_dict = {}
char_6_gram_dict = {}
char_7_gram_dict = {}

for row_num, text in enumerate(train_df.tweet):
    char_3_gram_dict[text] = char_3_gram.values[row_num]
    char_4_gram_dict[text] = char_4_gram.values[row_num]
    char_5_gram_dict[text] = char_5_gram.values[row_num]
    char_6_gram_dict[text] = char_6_gram.values[row_num]
    char_7_gram_dict[text] = char_7_gram.values[row_num]

#### 3.5 Dictionary of cuss words

In [590]:
#https://www.noswearing.com/dictionary/a

#### 3.6 POS Tagging

In [645]:
#tweet_pos = train_df.tweet.apply(lambda x: (x.split(" ")))

#### Bag of Words

In [112]:
#https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer
stop_words_list.append('user')
stop_words_list.append('url')

In [94]:
bow_counts_char_n_gram = CountVectorizer(tokenizer=lambda key: char_7_gram_dict[key],
                                         preprocessor= lambda x:x, lowercase=False,stop_words = stop_words_list)
bow_data_char_n_gram = bow_counts_char_n_gram.fit_transform(train_df.tweet)

In [246]:
bow_counts = CountVectorizer(tokenizer= word_tokenize, stop_words = stop_words_list,ngram_range=(1,2))
bow_data = bow_counts.fit_transform(train_df.tweet)

#### Tf Idf Vectorizer

In [696]:
tfidf_counts = TfidfVectorizer(tokenizer=lambda key: char_7_gram_dict[key], preprocessor= lambda x:x, lowercase=False)
tfidf_data = tfidf_counts.fit_transform(train_df.tweet)

In [103]:
tfidf_counts = TfidfVectorizer(tokenizer=word_tokenize, stop_words = stop_words_list,ngram_range=(1,3))
tfidf_data = tfidf_counts.fit_transform(train_df.tweet)

### Feature Engineering

### Train test split

In [25]:
#senti_analyser = SentimentAnalyzer()
#senti_analyser.all_words(train_df.tweet.values[0])

In [33]:
#tfidf_data2 = np.concatenate((list(user_counts),embedding_list),axis=0)

In [247]:
X_train, X_test, y_train, y_test =  train_test_split(bow_data,train_df.subtask_a,test_size = 0.2)

### Model Creation

#### Logistic Regression Model

In [248]:
for C in [0.75,1,10,50,100,1000,5000]:
    lr_model = LogisticRegression(C = C)
    lr_model.fit(X_train,y_train)
    test_pred = lr_model.predict(X_test)

    #acc = 100 * np.sum(test_pred == y_test)/len(y_test)
    print(C,": ",f1_score(y_test,test_pred,average='weighted') )

0.75 :  0.7211360693000004
1 :  0.7228951055796126
10 :  0.7352755306200399
50 :  0.7262840124894246
100 :  0.7215120950312894
1000 :  0.7111800718384211
5000 :  0.7016610964280146


In [249]:
(y_test == test_pred).sum()/len(y_test)

0.7122356495468278

In [250]:
top_n = 15
top_features = np.array(bow_counts.get_feature_names())[(-np.abs(lr_model.coef_)).argsort()[0][0:top_n]]

In [251]:
pd.DataFrame(top_features)

Unnamed: 0,0
0,cute 😍
1,right though
2,life lol
3,awfully good
4,bitch
5,go hell
6,sure hell
7,president bad
8,fuck
9,shit come


### Looking at some of the misclassified tweets

In [266]:
misclassified_tweets = train_df.iloc[(y_test[(y_test != test_pred)].index.values - 1).tolist(),:]

In [267]:
y_test[(y_test != test_pred)].index.values[0:5]

array([9747, 1170, 2262, 4395, 3080], dtype=int64)

In [299]:
#### Inputs with misclassification

X_test_misclassified = X_test.toarray()[(y_test != test_pred),:]

In [300]:
misclassified_tweets.head(10)

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
9747,0,18688,@USER 👌🏻 I’ve never seen anyone talk like that...,OFF,TIN,GRP
1170,0,77539,@USER Liberals are dangerous. They'll say and ...,NOT,,
2262,0,67144,@USER Maxine is just a mean spirited woman. I ...,OFF,TIN,IND
4395,0,70414,@USER Disgusting. He is going down and taking...,NOT,,
3080,0,11325,@USER @USER Capitalism has killed more than th...,NOT,,
10478,0,25533,@USER Not really. Gun control limits one capab...,OFF,TIN,IND
10607,0,43602,@USER @USER She’s a class act isn’t she. Kim t...,NOT,,
10920,0,48185,@USER I do not know who these POS Nazi Liberal...,OFF,TIN,GRP
8838,0,29727,"""@USER If the media stop reporting what happen...",OFF,TIN,GRP
4400,0,72490,@USER Liberals have the numbers and money. Con...,OFF,TIN,GRP


In [273]:
#misclassified_tweets.to_csv('misclassified_tweets.csv',index = False)

### Tokens and Features

In [326]:
position_of_misclassified = 12


non_zero_features = [pos for pos,each in enumerate(X_test_misclassified[position_of_misclassified]) if each > 0]

tokens = np.array(bow_counts.get_feature_names())[non_zero_features]

words = [bow_counts.get_feature_names().index(each) for each in tokens]
score = [lr_model.coef_[0][each] for each in words]

pd.DataFrame(tokens,score)

Unnamed: 0,0
0.0,fuckingndbs
0.0,fuckingndbs funnyhejs
0.0,funnyhejs


In [321]:
train_df[train_df.tweet.str.contains('act')].subtask_a.value_counts()

NOT    555
OFF    254
Name: subtask_a, dtype: int64

In [706]:
lr_model = LogisticRegression(C = 2500)
lr_model.fit(X_train,y_train)
test_pred = lr_model.predict(X_test)

print(C,": ",100 * np.sum(test_pred == y_test)/len(y_test))
confusion_matrix(y_test,test_pred)

5000 :  72.84743202416918


array([[1629,  141],
       [ 578,  300]], dtype=int64)

#### Random Forest

In [707]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)

test_pred = lr_model.predict(X_test)
f1_score(y_test,test_pred,average='weighted') 

0.6984123845797418

#### Top Features

<function numpy.lib.function_base.append>