# <font color = 'green'>Vectorization</font> and <font color = 'green'>Model Building</font> 
#### <b>Using TF-IDF and Unigram Approach</b>

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, classification_report

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import ComplementNB, BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import lazypredict
from lazypredict.Supervised import LazyClassifier

## Function to perform Vectorization and model building

In [3]:
def vector_model(df, category, vectorizer, ngram):
    """Vectorize the comments, fit seven baseline classifiers and tabulate their F1 scores.

    The data is split 70/30 (``random_state=42``); the vectorizer is fitted on the
    training split only and then applied to the test split. A classification
    report is printed for every classifier.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``comment_text`` column and a column named ``category``.
        Missing comments are replaced by a single space before vectorizing.
    category : str
        Name of the label column. ``f1_score`` is called with its defaults, so
        the labels are presumably binary 0/1 -- confirm against the data files.
    vectorizer : callable
        Vectorizer class (e.g. ``TfidfVectorizer``); instantiated here with
        ``ngram_range=ngram`` and ``stop_words='english'``.
    ngram : tuple
        Forwarded as ``ngram_range``.

    Returns
    -------
    pandas.DataFrame
        One row per classifier and a single column ``F1_Score - {category}``
        holding the test-set F1 score rounded to two decimals.
    """
    X = df['comment_text'].fillna(' ')
    Y = df[category]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=42)

    # Learn the vocabulary from the training split only; the test split is just transformed.
    vector = vectorizer(ngram_range=ngram, stop_words='english')
    X_train_scal = vector.fit_transform(X_train)
    X_test_scal = vector.transform(X_test)

    # (row label in the returned frame, tag used in the printed report, estimator)
    candidates = [
        ('KNN', 'Knn', KNeighborsClassifier(n_neighbors=5)),
        ('Logistic Regression', 'Lr', LogisticRegression()),
        ('SVM', 'svm', SVC(kernel='rbf')),
        ('Complement NB', 'cnb', ComplementNB()),
        ('Bernoulli NB', 'bnb', BernoulliNB()),
        ('Decision Tree', 'DT',
         DecisionTreeClassifier(criterion='entropy', min_samples_split=2, random_state=42)),
        ('Random Forest', 'RF',
         RandomForestClassifier(n_estimators=105, min_samples_split=2, random_state=42)),
    ]

    rule = "\n----------------------------------------------------------------------"
    f1_scores = []
    for position, (_, tag, model) in enumerate(candidates):
        model.fit(X_train_scal, Y_train)
        Y_pred = model.predict(X_test_scal)

        # The first report was always printed without the extra blank lines;
        # keep the console output identical to the previous version.
        pad = "" if position == 0 else "\n"
        print(f"{pad}{tag} done -> It's classification report for {category} category \n {classification_report(Y_test, Y_pred)} ")
        print(f"{rule}{pad}")

        # scikit-learn metrics take (y_true, y_pred), in that order.
        f1_scores.append(round(f1_score(Y_test, Y_pred), 2))
    print(f"F1_scores for {category} category Are calculated")

    Scores = {f'F1_Score - {category}': f1_scores}
    Scores_df = pd.DataFrame(Scores, index=[row_label for row_label, _, _ in candidates])
    return Scores_df

### Toxic

In [5]:
# Toxic: read the dataset from the 'Balanced Data' folder and benchmark
# every classifier on TF-IDF unigram features.
df_toxic = pd.read_csv('Balanced Data/Toxic.csv')
result_toxic = vector_model(
    df=df_toxic,
    category='toxic',
    vectorizer=TfidfVectorizer,
    ngram=(1, 1),
)
result_toxic

Knn done -> It's classification report for toxic category 
               precision    recall  f1-score   support

           0       0.55      0.63      0.59      4535
           1       0.58      0.50      0.54      4642

    accuracy                           0.56      9177
   macro avg       0.57      0.57      0.56      9177
weighted avg       0.57      0.56      0.56      9177
 

----------------------------------------------------------------------

Lr done -> It's classification report for toxic category 
               precision    recall  f1-score   support

           0       0.85      0.92      0.89      4535
           1       0.92      0.85      0.88      4642

    accuracy                           0.88      9177
   macro avg       0.89      0.88      0.88      9177
weighted avg       0.89      0.88      0.88      9177
 

----------------------------------------------------------------------


svm done -> It's classification report for toxic category 
               prec

Unnamed: 0,F1_Score - toxic
KNN,0.54
Logistic Regression,0.88
SVM,0.89
Complement NB,0.88
Bernoulli NB,0.77
Decision Tree,0.83
Random Forest,0.85


### Severe_toxic

In [None]:
# Severe toxic: read the dataset from the 'Balanced Data' folder and benchmark
# every classifier on TF-IDF unigram features.
df_severe_toxic = pd.read_csv('Balanced Data/Severe_toxic.csv')
result_severe_toxic = vector_model(
    df=df_severe_toxic,
    category='severe_toxic',
    vectorizer=TfidfVectorizer,
    ngram=(1, 1),
)
result_severe_toxic

### Threat

In [None]:
# Threat: read the dataset from the 'Balanced Data' folder and benchmark
# every classifier on TF-IDF unigram features.
df_threat = pd.read_csv('Balanced Data/Threat.csv')
result_threat = vector_model(
    df=df_threat,
    category='threat',
    vectorizer=TfidfVectorizer,
    ngram=(1, 1),
)
result_threat

### Obscene

In [None]:
# Obscene: read the dataset from the 'Balanced Data' folder and benchmark
# every classifier on TF-IDF unigram features.
df_obscene = pd.read_csv('Balanced Data/Obscene.csv')
result_obscene = vector_model(
    df=df_obscene,
    category='obscene',
    vectorizer=TfidfVectorizer,
    ngram=(1, 1),
)
result_obscene

### Insult

In [None]:
# Insult: read the dataset from the 'Balanced Data' folder and benchmark
# every classifier on TF-IDF unigram features.
df_insult = pd.read_csv('Balanced Data/Insult.csv')
result_insult = vector_model(
    df=df_insult,
    category='insult',
    vectorizer=TfidfVectorizer,
    ngram=(1, 1),
)
result_insult

### Identity_hate

In [None]:
# Identity hate: read the dataset from the 'Balanced Data' folder and benchmark
# every classifier on TF-IDF unigram features.
df_identity_hate = pd.read_csv('Balanced Data/Identity_hate.csv')
result_identity_hate = vector_model(
    df=df_identity_hate,
    category='identity_hate',
    vectorizer=TfidfVectorizer,
    ngram=(1, 1),
)
result_identity_hate

### Visualization of F1-Score of all Categories

In [None]:
# Visualization of F1-Score of all categories
# Each per-category frame holds one score column indexed by classifier. Join them
# side by side, then transpose so rows are categories and columns are classifiers.
result = pd.concat(
    [result_toxic, result_severe_toxic, result_threat, result_obscene, result_insult, result_identity_hate],
    axis=1,
).transpose()
# There are six category rows; head() shows only five and would hide the last one,
# so display the whole (small) frame instead.
result

In [None]:
# Wide-form line plot: one line per column of `result`, drawn against its index.
fig, ax = plt.subplots(figsize=(15, 15))
sns.lineplot(data=result, markers=True, ax=ax)
# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set(title='F1 score by classifier across categories', xlabel='Category', ylabel='F1 score')
ax.legend(loc='best');

### <b>Trying test predictions - using the logistic regression model picked from the graph above</b>

- <b>Can it Differentiate between <font color = 'red'>Toxic</font> and non-toxic comments</b>

In [None]:
# Toxic: 70/30 split, TF-IDF unigrams fitted on the training part only.
x = df_toxic['comment_text'].fillna(' ')
y = df_toxic['toxic']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

tfv1 = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x_train_scal, x_test_scal = tfv1.fit_transform(x_train), tfv1.transform(x_test)

# fit() returns the estimator itself, so it can be bound in one step.
lrt = LogisticRegression().fit(x_train_scal, y_train)
# Peek at the first 100 test predictions.
lrt.predict(x_test_scal)[:100]

- <b>Can it Differentiate between <font color = 'red'>Severe Toxic</font> and non-severe-toxic comments</b>

In [None]:
# Severe toxic: 70/30 split, TF-IDF unigrams fitted on the training part only.
x = df_severe_toxic['comment_text'].fillna(' ')
y = df_severe_toxic['severe_toxic']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

tfv2 = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x_train_scal, x_test_scal = tfv2.fit_transform(x_train), tfv2.transform(x_test)

# fit() returns the estimator itself, so it can be bound in one step.
lrst = LogisticRegression().fit(x_train_scal, y_train)
# Peek at the first 100 test predictions.
lrst.predict(x_test_scal)[:100]

- <b>Can it Differentiate between <font color = 'red'>Threat</font> and non-threat comments</b>

In [None]:
# Threat: 70/30 split, TF-IDF unigrams fitted on the training part only.
x = df_threat['comment_text'].fillna(' ')
y = df_threat['threat']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

tfv3 = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x_train_scal, x_test_scal = tfv3.fit_transform(x_train), tfv3.transform(x_test)

# fit() returns the estimator itself, so it can be bound in one step.
lrth = LogisticRegression().fit(x_train_scal, y_train)
# Peek at the first 100 test predictions.
lrth.predict(x_test_scal)[:100]

- <b>Can it Differentiate between <font color = 'red'>obscene</font> and non-obscene comments</b>

In [None]:
# Obscene: 70/30 split, TF-IDF unigrams fitted on the training part only.
x = df_obscene['comment_text'].fillna(' ')
y = df_obscene['obscene']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

tfv4 = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x_train_scal, x_test_scal = tfv4.fit_transform(x_train), tfv4.transform(x_test)

# fit() returns the estimator itself, so it can be bound in one step.
lro = LogisticRegression().fit(x_train_scal, y_train)
# Peek at the first 100 test predictions.
lro.predict(x_test_scal)[:100]

- <b>Can it Differentiate between <font color = 'red'>Insult</font> and non-Insult comments</b>

In [None]:
# Insult: 70/30 split, TF-IDF unigrams fitted on the training part only.
x = df_insult['comment_text'].fillna(' ')
y = df_insult['insult']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

tfv5 = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x_train_scal, x_test_scal = tfv5.fit_transform(x_train), tfv5.transform(x_test)

# fit() returns the estimator itself, so it can be bound in one step.
lri = LogisticRegression().fit(x_train_scal, y_train)
# Peek at the first 100 test predictions.
lri.predict(x_test_scal)[:100]

- <b> Can it Differentiate between <font color = 'red'>identity_hate</font> and non-Identity_hate comments</b>

In [None]:
# Identity hate: 70/30 split, TF-IDF unigrams fitted on the training part only.
x = df_identity_hate['comment_text'].fillna(' ')
y = df_identity_hate['identity_hate']
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)

tfv6 = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
x_train_scal, x_test_scal = tfv6.fit_transform(x_train), tfv6.transform(x_test)

# fit() returns the estimator itself, so it can be bound in one step.
lrid = LogisticRegression().fit(x_train_scal, y_train)
# Peek at the first 100 test predictions.
lrid.predict(x_test_scal)[:100]

In [None]:
# Two hand-written probe comments, each wrapped in a one-element list because
# the vectorizers' transform() expects an iterable of documents.
example1 = [
    '7th person on the edge of the cliff is a fucked up person',
]
example2 = [
    'if you have a look back at the source the information i updated was the correct form i can only guess the source hadnt updated i shall update the information once again but thank you for your message',
]

- <b>toxic or not ?</b>

In [None]:
example1_transform = tfv1.transform(example1)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrt.predict_proba(example1_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Toxic Category')
elif one > 0.58:
    print('Toxic')
else:
    print('Non Toxic')

In [None]:
example2_transform = tfv1.transform(example2)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrt.predict_proba(example2_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Toxic Category')
elif one > 0.58:
    print('Toxic')
else:
    print('Non Toxic')

- <b>severe_toxic or not ?</b>

In [None]:
example1_transform = tfv2.transform(example1)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrst.predict_proba(example1_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Severe Toxic Category')
elif one > 0.58:
    print('Severe Toxic')
else:
    print('Non Severe Toxic')

In [None]:
example2_transform = tfv2.transform(example2)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrst.predict_proba(example2_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Severe Toxic Category')
elif one > 0.58:
    print('Severe Toxic')
else:
    print('Non Severe Toxic')

- <b>threat or not ?</b>

In [None]:
example1_transform = tfv3.transform(example1)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrth.predict_proba(example1_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Threat Category')
elif one > 0.58:
    print('Threat')
else:
    print('Non Threat')

In [None]:
example2_transform = tfv3.transform(example2)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrth.predict_proba(example2_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Threat Category')
elif one > 0.58:
    print('Threat')
else:
    print('Non Threat')

- <b>obscene or not ?</b>

In [None]:
example1_transform = tfv4.transform(example1)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lro.predict_proba(example1_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Obscene Category')
elif one > 0.58:
    print('Obscene')
else:
    print('Non Obscene')

In [None]:
example2_transform = tfv4.transform(example2)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lro.predict_proba(example2_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Obscene Category')
elif one > 0.58:
    print('Obscene')
else:
    print('Non Obscene')

- <b>insult or not ?</b>

In [None]:
example1_transform = tfv5.transform(example1)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lri.predict_proba(example1_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Insult Category')
elif one > 0.58:
    print('Insult')
else:
    print('Non Insult')

In [None]:
example2_transform = tfv5.transform(example2)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lri.predict_proba(example2_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Insult Category')
elif one > 0.58:
    print('Insult')
else:
    print('Non Insult')

- <b>identity_hate or not ?</b>

In [None]:
example1_transform = tfv6.transform(example1)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrid.predict_proba(example1_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Identity Hate Category')
elif one > 0.58:
    print('Identity hate')
else:
    print('Non Identity hate')

In [None]:
example2_transform = tfv6.transform(example2)
# One predict_proba call instead of two: column 0 -> zero, column 1 -> one.
zero, one = lrid.predict_proba(example2_transform)[0, :2]
# Both probabilities inside the 0.42-0.58 band means the model is too close to call.
if 0.42 <= zero <= 0.58 and 0.42 <= one <= 0.58:
    print('Neutral for Identity Hate Category')
elif one > 0.58:
    print('Identity hate')
else:
    print('Non Identity hate')

### Exporting Trained Models as Pickle Files

In [None]:
def getfiles(df, label):
    """Fit a TF-IDF vectorizer and a logistic regression on all of ``df`` and pickle both.

    Two files are written to the current working directory:
    ``{label}_vect.pkl`` (the fitted vectorizer) and ``{label}_model.pkl``
    (the fitted classifier). Nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``comment_text`` column and a column named ``label``.
        Missing comments are replaced by a single space.
    label : str
        Target column name; also the prefix of both output file names.
    """
    comments = df['comment_text'].fillna(' ')
    targets = df[label]

    # Unigram TF-IDF with English stop words removed, fitted on the full frame.
    vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 1))
    features = vectorizer.fit_transform(comments)
    with open(f'{label}_vect.pkl', 'wb') as vect_file:
        pickle.dump(vectorizer, vect_file)

    # The vectorizer is written before the classifier is trained, as before.
    classifier = LogisticRegression().fit(features, targets)
    with open(f'{label}_model.pkl', 'wb') as model_file:
        pickle.dump(classifier, model_file)


In [None]:
# Export one vectorizer/model pickle pair per category; the two lists are
# paired by position, so their order must stay in sync.
list_c = ['toxic', 'severe_toxic', 'threat', 'obscene', 'insult', 'identity_hate']
list_d = [df_toxic, df_severe_toxic, df_threat, df_obscene, df_insult, df_identity_hate]
for label, frame in zip(list_c, list_d):
    getfiles(frame, label)