In [None]:
import os
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize
from nltk import pos_tag

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer

## Modeling 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer

In [210]:
os.listdir(os.getcwd()+"\\training-v1")

['offenseval-annotation.txt',
 'offenseval-training-v1.tsv',
 'readme-trainingset-v1.txt']

### 1. Reading in the dataset

In [211]:
train_data = []
train_df = pd.DataFrame()

with open('.\\training-v1\\offenseval-training-v1.tsv','r', encoding = 'utf-8') as in_file:
    #train_data = [line.strip().split('\t') for line in in_file]
    
    for line in in_file:
        train_df = pd.concat([train_df,pd.DataFrame([line.strip().split('\t')])], axis = 0)
        
train_df.reset_index(inplace=True)

### 2. Analyzing the data and response variables

In [212]:
## Setting the first row as the column headers
train_df.columns = train_df.iloc[0,:].values
train_df = train_df.iloc[1:,]

In [218]:
train_df.head()

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1,0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
2,0,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
3,0,16820,Amazon is investigating Chinese employees who ...,NOT,,
4,0,62688,"""@USER Someone should'veTaken"""" this piece of ...",OFF,UNT,
5,0,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


In [219]:
train_df.shape

(13240, 6)

#### Distribution of each of the sub-tasks

In [220]:
train_df.subtask_a.value_counts()

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64

In [221]:
train_df.subtask_b.value_counts()

NULL    8840
TIN     3876
UNT      524
Name: subtask_b, dtype: int64

In [222]:
train_df.subtask_c.value_counts()

NULL    9364
IND     2407
GRP     1074
OTH      395
Name: subtask_c, dtype: int64

In [685]:
#train_df.tweet.value_counts()

In [659]:
train_df[train_df.tweet == '@USER Looks Like The Jokes On Liberals Again.  #FortTrump #Poland #BoomingEconomy URL']

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
1270,0,45643,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
4073,0,22953,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
8333,0,60513,@USER Looks Like The Jokes On Liberals Again. ...,OFF,TIN,GRP
10460,0,66322,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
10624,0,38491,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,
11304,0,73520,@USER Looks Like The Jokes On Liberals Again. ...,NOT,,


In [661]:
train_df[train_df.tweet == "@USER An obvious last minute liberal ploy to delay confirmation. More dirty tricks since the liberals lost the previous election."]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
5166,0,81140,@USER An obvious last minute liberal ploy to d...,NOT,,
5307,0,46503,@USER An obvious last minute liberal ploy to d...,NOT,,


In [663]:
train_df[train_df.tweet ==  "@USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER @USER Following all #Maga patriots please follow back 👍  #LionsDen 🦁  #MAGA2KAG 🇺🇸"]

Unnamed: 0,0,id,tweet,subtask_a,subtask_b,subtask_c
2144,0,14617,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
2724,0,16759,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,
4223,0,15862,@USER @USER @USER @USER @USER @USER @USER @USE...,NOT,,


### 3. Feature Engineering

#### 3.1 Sentiment Polarity

In [223]:
sentiment_polarity_values = train_df.tweet.apply(lambda x: SentimentIntensityAnalyzer().polarity_scores(x))

In [226]:
polarity_array = []

for each in sentiment_polarity_values:
    temp_polarity_array = list(each.values())
    polarity_array.append(temp_polarity_array)

#### 3.2 Word2Vec Embeddings

#### Stopword collection

In [448]:
### Getting stopwords
stop_words_list = stopwords.words('english')
tweet_tokens = train_df.tweet.apply(word_tokenize)

In [449]:
## Combining all the tokenized words
words = ' '.join(words for sent in tweet_tokens.values for words in sent )

In [450]:
word_count = Counter(words.split(" "))

In [451]:
## Selecting words with frequency 1
less_freq_words = [word for (word,count) in word_count.most_common()[-16200:]]

In [452]:
high_freq_words = [word for (word,count) in word_count.most_common(100)]
words_to_keep = ['gun','MAGA','liberals','Antifa','conservatives','Trump','Liberals','amp','people','control']

_ = [high_freq_words.remove(each) for each in words_to_keep]

In [453]:
stop_words_list.extend(less_freq_words)
stop_words_list.extend(high_freq_words)

#### Removing stopwords from tweets

In [455]:
tweet_tokens_clean = tweet_tokens.apply(lambda x: [words for words in x if words not in stop_words_list])

In [516]:
tweet_tokens_clean_list = [list(np.where(len(each) > 0,each,['UNK'])) for each in tweet_tokens_clean]

#### Creating Word2Vec models

In [517]:
model = Word2Vec(tweet_tokens_clean_list, size = 200, window=5, min_count=1)

In [546]:
random_array = np.random.rand(200)
embedding_list = []
for each in tweet_tokens_clean_list:
    if(len(each) == 0):
        embedding_list.append(list(random_array))
    else:
        embedding_list.append(list(np.mean(model[each],axis = 0)))

  import sys


#### 3.3 Number of Hashtags and Number of User count

In [561]:
tweet_token_space = train_df.tweet.apply(lambda x: x.split(" "))

In [583]:
user_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('@')])).values
hast_tag_counts = tweet_token_space.apply(lambda x:  len( [each for each in x if each.startswith('#')])).values

#### 3.4 Character n-grams

In [591]:
def word2ngrams(text, n=3, exact=True):
    """ Convert text into character ngrams. """
    return ["".join(j) for j in zip(*[text[i:] for i in range(n)])]

In [651]:
char_3_gram = train_df.tweet.apply(word2ngrams,n = 3)
char_4_gram = train_df.tweet.apply(word2ngrams,n = 4)
char_5_gram = train_df.tweet.apply(word2ngrams,n = 5)
char_6_gram = train_df.tweet.apply(word2ngrams,n = 6)
char_7_gram = train_df.tweet.apply(word2ngrams,n = 7)

In [616]:
char_3_gram_dict = {}
char_4_gram_dict = {}
char_5_gram_dict = {}
char_6_gram_dict = {}
char_7_gram_dict = {}

for row_num, text in enumerate(train_df.tweet):
    char_3_gram_dict[text] = char_3_gram.values[row_num]
    char_4_gram_dict[text] = char_4_gram.values[row_num]
    char_5_gram_dict[text] = char_5_gram.values[row_num]
    char_6_gram_dict[text] = char_6_gram.values[row_num]
    char_7_gram_dict[text] = char_7_gram.values[row_num]

In [618]:
#char_7_gram_dict

#### 3.5 Dictionary of cuss words

In [590]:
#https://www.noswearing.com/dictionary/a

#### 3.6 POS Tagging

In [645]:
#tweet_pos = train_df.tweet.apply(lambda x: (x.split(" ")))

#### Bag of Words

In [606]:
#https://stackoverflow.com/questions/35867484/pass-tokens-to-countvectorizer

In [671]:
#stop_words=stopwords.words('english')
bow_counts = CountVectorizer(tokenizer=lambda key: char_6_gram_dict[key], preprocessor= lambda x:x, lowercase=False)
bow_data = bow_counts.fit_transform(train_df.tweet)

#### Tf Idf Vectorizer

In [24]:
tfidf_counts = CountVectorizer(tokenizer = word_tokenize,ngram_range=(1,2))
tfidf_data = tfidf_counts.fit_transform(train_df.tweet)

### Feature Engineering

### Train test split

In [25]:
#senti_analyser = SentimentAnalyzer()
#senti_analyser.all_words(train_df.tweet.values[0])

In [683]:
X_train, X_test, y_train, y_test =  train_test_split(embedding_list,train_df.subtask_a,test_size = 0.2)

### Model Creation

#### Logistic Regression Model

In [684]:
for C in [0.75,1,10,50,100]:
    lr_model = LogisticRegression(C = C)
    lr_model.fit(X_train,y_train)
    test_pred = lr_model.predict(X_test)

    #acc = 100 * np.sum(test_pred == y_test)/len(y_test)
    print(C,": ",f1_score(y_test,test_pred,average='weighted') )

  'precision', 'predicted', average, warn_for)


0.75 :  0.5516274559978714
1 :  0.5516274559978714
10 :  0.5516274559978714
50 :  0.5516274559978714
100 :  0.5566844140266969


In [680]:
(y_test == test_pred).sum()/len(y_test)

0.67472306143001

In [31]:
y_test.value_counts()

NOT    2661
OFF    1311
Name: subtask_a, dtype: int64

In [33]:
674 + 637

1311

In [30]:
confusion_matrix(y_test,test_pred)

array([[2319,  342],
       [ 637,  674]], dtype=int64)

In [137]:
lr_model = LogisticRegression(C = 0.75)
lr_model.fit(X_train,y_train)
test_pred = lr_model.predict(X_test)

print(C,": ",100 * np.sum(test_pred == y_test)/len(y_test))
confusion_matrix(y_test,test_pred)

100 :  71.82779456193353


array([[2632,   30],
       [1089,  221]], dtype=int64)

#### Random Forest

In [138]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)

test_pred = lr_model.predict(X_test)
f1_score(y_test,test_pred,average='weighted') 

0.646086411710712

#### Top Features