### Model training

#### Import all the required packages

In [127]:
import pandas as pd
import numpy as np
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import feature_extraction,model_selection,preprocessing
from sklearn.metrics import accuracy_score

#### load processed dataset

In [128]:
df = pd.read_csv('../data/processed/processed_data.csv')

In [129]:
### dropping the original unclean comment coloum from dataset
df = df.drop('comment_text', axis = 1)

In [130]:
### Renaming the clean comment colum to comment_text for ease
df = df.rename({'clean_comment': 'comment_text'}, axis=1)

In [131]:
df.head()

Unnamed: 0.1,Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate,comment_text
0,0,0000997932d777bf,0,0,0,0,0,0,explanation edits made username hardcore metal...
1,1,000103f0d9cfb60f,0,0,0,0,0,0,aww match background colour seemingly stuck th...
2,2,000113f07ec002fd,0,0,0,0,0,0,hey man really trying edit war guy constantly ...
3,3,0001b41b1c6bb37e,0,0,0,0,0,0,ca make real suggestion improvement wondered s...
4,4,0001d958c54c6e35,0,0,0,0,0,0,sir hero chance remember page


In [132]:
### fill NA for any missing data 
df['comment_text'].fillna("missing", inplace=True)

In [133]:
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
corpus = df['comment_text']

### Split the date into train test datasets

In [134]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(corpus,df[labels],test_size=0.25,random_state=42)

In [135]:
X_train.shape, X_test.shape

((119678,), (39893,))

In [136]:
# Stats of X_train labels
counts = []
for i in labels:
    counts.append((i, y_train[i].sum()))
df_stats = pd.DataFrame(counts, columns=['Labels', 'number_of_comments'])
df_stats

Unnamed: 0,Labels,number_of_comments
0,toxic,11479
1,severe_toxic,1189
2,obscene,6306
3,threat,373
4,insult,5866
5,identity_hate,1048


In [137]:
#stats of X_test labels
counts = []
for i in labels:
    counts.append((i, y_test[i].sum()))
df_stats = pd.DataFrame(counts, columns=['Labels', 'number_of_comments'])
df_stats

Unnamed: 0,Labels,number_of_comments
0,toxic,3815
1,severe_toxic,406
2,obscene,2143
3,threat,105
4,insult,2011
5,identity_hate,357


#### Converting text comments into vectors using bag of words or TF-IDF 

In [138]:
def word_embeddings(X_train, X_test, embedding_type = "tfidf"):
    if embedding_type == "bow":
        bw_vectorizer = feature_extraction.text.CountVectorizer(max_features= 100)
        X_train = bw_vectorizer.fit_transform(X_train).toarray()
        X_test = bw_vectorizer.fit_transform(X_test).toarray()
    if embedding_type == "tfidf":
        tf_vectorizer = feature_extraction.text.TfidfVectorizer(max_features=100)
        X_train = tf_vectorizer.fit_transform(X_train).toarray()
        X_test = tf_vectorizer.fit_transform(X_test).toarray()
    return X_train, X_test

In [139]:
Xv_train, Xv_test = word_embeddings(X_train, X_test, "tfidf")

### Training

In [140]:
### Linear regression 
for label in labels:
    print('... Processing {}'.format(label))
    # train the model 
    logreg = OneVsRestClassifier(LogisticRegression(solver='sag'))
    logreg.fit(Xv_train, y_train[label])
    # compute the testing accuracy
    prediction = logreg.predict(Xv_test)
    print('Validation accuracy is {}'.format(accuracy_score(y_test[label], prediction)))

... Processing toxic
Validation accuracy is 0.9188328779485123
... Processing severe_toxic
Validation accuracy is 0.9899982453061941
... Processing obscene
Validation accuracy is 0.9598425788985536
... Processing threat
Validation accuracy is 0.9973679592911037
... Processing insult
Validation accuracy is 0.9562579901235806
... Processing identity_hate
Validation accuracy is 0.9910510615897525


<br> Checked the impact of use of Bag of words and TF-IDF on the accuracy of Linear regression. 
<br>In this use case accuracy:
<br>Remained same - Identity hate, threat
<br>Almost same - Severe_toxic
<br>Little bit improved with the use of TF-IDF but not very significant change - Toxic, Obscene, insult