In [2]:
import os
import pandas as pd
import numpy as np

from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import confusion_matrix
from nltk.corpus import stopwords
from nltk.sentiment.sentiment_analyzer import SentimentAnalyzer



In [3]:
os.listdir(os.getcwd()+"\\training-v1")

['offenseval-annotation.txt',
 'offenseval-training-v1.tsv',
 'readme-trainingset-v1.txt']

In [13]:
train_data = []
train_df = pd.DataFrame()

with open('.\\training-v1\\offenseval-training-v1.tsv','r', encoding = 'utf-8') as in_file:
    #train_data = [line.strip().split('\t') for line in in_file]
    
    for line in in_file:
        train_df = pd.concat([train_df,pd.DataFrame([line.strip().split('\t')])], axis = 0)

In [14]:
train_df.columns = train_df.iloc[0,:].values
train_df = train_df.iloc[1:,]

In [15]:
train_df.shape

(13240, 5)

In [16]:
train_df.subtask_a.value_counts()

NOT    8840
OFF    4400
Name: subtask_a, dtype: int64

In [17]:
train_df.subtask_b.value_counts()

NULL    8840
TIN     3876
UNT      524
Name: subtask_b, dtype: int64

In [18]:
train_df.subtask_c.value_counts()

NULL    9364
IND     2407
GRP     1074
OTH      395
Name: subtask_c, dtype: int64

In [19]:
train_df.head()

Unnamed: 0,id,tweet,subtask_a,subtask_b,subtask_c
0,86426,@USER She should ask a few native Americans wh...,OFF,UNT,
0,90194,@USER @USER Go home you’re drunk!!! @USER #MAG...,OFF,TIN,IND
0,16820,Amazon is investigating Chinese employees who ...,NOT,,
0,62688,"""@USER Someone should'veTaken"""" this piece of ...",OFF,UNT,
0,43605,@USER @USER Obama wanted liberals &amp; illega...,NOT,,


#### Tokenization

In [20]:
train_df['tokenized_words'] = train_df.tweet.apply(lambda x:word_tokenize(x))

In [21]:
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.apply_features(train_df.tokenized_words.values[10])
all_words_neg

[{}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, {}, ...]

In [22]:
train_df.iloc[1,].values

array(['90194',
       '@USER @USER Go home you’re drunk!!! @USER #MAGA #Trump2020 👊🇺🇸👊 URL',
       'OFF', 'TIN', 'IND',
       list(['@', 'USER', '@', 'USER', 'Go', 'home', 'you', '’', 're', 'drunk', '!', '!', '!', '@', 'USER', '#', 'MAGA', '#', 'Trump2020', '👊🇺🇸👊', 'URL'])],
      dtype=object)

### Feature Engineering

#### Bag of Words

In [23]:
#stop_words=stopwords.words('english')
bow_counts = CountVectorizer(tokenizer = word_tokenize,ngram_range=(1,2))
bow_data = bow_counts.fit_transform(train_df.tweet)

#### Tf Idf Vectorizer

In [24]:
tfidf_counts = CountVectorizer(tokenizer = word_tokenize,ngram_range=(1,2))
tfidf_data = tfidf_counts.fit_transform(train_df.tweet)

### Train test split

In [25]:
#senti_analyser = SentimentAnalyzer()
#senti_analyser.all_words(train_df.tweet.values[0])

In [26]:
X_train, X_test, y_train, y_test =  train_test_split(bow_data,train_df.subtask_a,test_size = 0.3)

### Model Creation

#### Logistic Regression Model

In [27]:
for C in [0.75,1,10,50,100]:
    lr_model = LogisticRegression(C = C)
    lr_model.fit(X_train,y_train)
    test_pred = lr_model.predict(X_test)

    #acc = 100 * np.sum(test_pred == y_test)/len(y_test)
    print(C,": ",f1_score(y_test,test_pred,average='weighted') )

0.75 :  0.7405156645908003
1 :  0.7410220250787326
10 :  0.7430990980508277
50 :  0.7434590177652464
100 :  0.744373841112178


In [34]:
(y_test == test_pred).sum()/len(y_test)

0.7535246727089627

In [31]:
y_test.value_counts()

NOT    2661
OFF    1311
Name: subtask_a, dtype: int64

In [33]:
674 + 637

1311

In [30]:
confusion_matrix(y_test,test_pred)

array([[2319,  342],
       [ 637,  674]], dtype=int64)

In [137]:
lr_model = LogisticRegression(C = 0.75)
lr_model.fit(X_train,y_train)
test_pred = lr_model.predict(X_test)

print(C,": ",100 * np.sum(test_pred == y_test)/len(y_test))
confusion_matrix(y_test,test_pred)

100 :  71.82779456193353


array([[2632,   30],
       [1089,  221]], dtype=int64)

#### Random Forest

In [138]:
rf_model = RandomForestClassifier()
rf_model.fit(X_train,y_train)

test_pred = lr_model.predict(X_test)
f1_score(y_test,test_pred,average='weighted') 

0.646086411710712

#### Top Features