In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
data = pd.read_csv("archive/FinalBalancedDataset.csv")

In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56745 entries, 0 to 56744
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  56745 non-null  int64 
 1   Toxicity    56745 non-null  int64 
 2   tweet       56745 non-null  object
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


In [15]:
data.head()

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
0,0,0,@user when a father is dysfunctional and is s...
1,1,0,@user @user thanks for #lyft credit i can't us...
2,2,0,bihday your majesty
3,3,0,#model i love u take with u all the time in ...
4,4,0,factsguide: society now #motivation


In [19]:
data = data.drop("Unnamed: 0", axis=1)

In [22]:
data.head()

Unnamed: 0,Toxicity,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [20]:
data["Toxicity"].value_counts()

Toxicity
0    32592
1    24153
Name: count, dtype: int64

In [25]:
import nltk
nltk.download()
from nltk import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import stopwords as nltk_stopwords
from nltk.corpus import wordnet

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


# Lemmatizer
* Leaves
* Leafs Leaf
### Text pre-processing

In [26]:
wordnet_lemmatizer = WordNetLemmatizer()

In [27]:
import re

In [28]:
def prepare_text(text):
    def get_wordnet_pos(treebank_tag):
        if treebank_tag.startswith('J'):
            return wordnet.ADJ
        elif treebank_tag.startswith('V'):
            return wordnet.VERB
        elif treebank_tag.startswith('N'):
            return wordnet.NOUN
        elif treebank_tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN
    text = re.sub(r'[^a-zA-Z\']', ' ', text)
    text = text.split()
    text = ' '.join(text)
    text = word_tokenize(text)
    text = pos_tag(text)
    lemma = []
    for i in text: lemma.append(wordnet_lemmatizer.lemmatize(i[0], pos = get_wordnet_pos(i[1])))
    lemma = ' '.join(lemma)
    return lemma

In [29]:
data['clean_tweets'] = data['tweet'].apply(lambda x: prepare_text(x))

In [34]:
data.tail()

Unnamed: 0,Toxicity,tweet,clean_tweets
56740,1,you's a muthaf***in lie &#8220;@LifeAsKing: @2...,you 's a muthaf in lie LifeAsKing Pearls corey...
56741,1,"you've gone and broke the wrong heart baby, an...",you 've go and break the wrong heart baby and ...
56742,1,young buck wanna eat!!.. dat nigguh like I ain...,young buck wan na eat dat nigguh like I aint f...
56743,1,youu got wild bitches tellin you lies,youu get wild bitch tellin you lie
56744,0,~~Ruffled | Ntac Eileen Dahlia - Beautiful col...,Ruffled Ntac Eileen Dahlia Beautiful color com...


# Tfidf for features

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
import pickle

In [41]:
corpus = data['clean_tweets'].values.astype('U')

In [42]:
stopwords = set(nltk_stopwords.words('english'))

In [47]:
stopwords_list = list(stopwords)

count_tf_idf = TfidfVectorizer(stop_words=stopwords_list)
tf_idf = count_tf_idf.fit_transform(corpus)

In [48]:
pickle.dump(count_tf_idf, open("tf_idf.pkt", "wb"))

In [49]:
tf_idf_train, tf_idf_test, target_train, target_test = train_test_split(
    tf_idf, data['Toxicity'], test_size = 0.8, random_state= 42, shuffle=True
)

# Create a Binary Classification Model

In [50]:
model_bayes = MultinomialNB()

In [51]:
model_bayes = model_bayes.fit(tf_idf_train, target_train)

In [52]:
y_pred_proba = model_bayes.predict_proba(tf_idf_test)[::, 1]

In [53]:
y_pred_proba

array([0.90151866, 0.27913231, 0.79020483, ..., 0.09474976, 0.20552427,
       0.32089036])

In [54]:
fpr, tpr, _ = roc_curve(target_test, y_pred_proba)

In [55]:
final_roc_auc = roc_auc_score(target_test, y_pred_proba)

In [56]:
final_roc_auc

np.float64(0.9659014779038171)

In [62]:
test_text = "I hate you moron"
test_tfidf = count_tf_idf.transform([test_text])
display(model_bayes.predict_proba(test_tfidf))
display(model_bayes.predict(test_tfidf))

array([[0.39921021, 0.60078979]])

array([1])

# Save the model

In [58]:
pickle.dump(model_bayes, open("toxicity_model.pkt", "wb"))