In [1]:
import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas / scikit-learn warnings raised by later cells — confirm that is intended.
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd

# Load the comment dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('proj.csv')

In [3]:
# Show the freshly loaded frame.
df

Unnamed: 0.1,Unnamed: 0,comment,label
0,0,"> i must study politics and war, that our sons...",normal
1,1,"i was literally told as a child ""i want you ki...",normal
2,2,how did this even happen? \n\nmy grandmother u...,normal
3,3,"bootstraps, ignore the house i bought for $10,...",normal
4,4,it's been a strange realization to slowly unde...,normal
...,...,...,...
293519,293519,i'm a bills fan. spent my childhood rooting fo...,normal
293520,293520,"he ones told his mother he can fly, he called ...",normal
293521,293521,that playoff luck from last year reverses itse...,normal
293522,293522,with lamar this would be the game i’d most loo...,normal


In [4]:
# Discard the 'Unnamed: 0' column; only 'comment' and 'label' are used afterwards.
df = df.drop(columns=['Unnamed: 0'])

In [5]:
# Confirm the column was dropped.
df

Unnamed: 0,comment,label
0,"> i must study politics and war, that our sons...",normal
1,"i was literally told as a child ""i want you ki...",normal
2,how did this even happen? \n\nmy grandmother u...,normal
3,"bootstraps, ignore the house i bought for $10,...",normal
4,it's been a strange realization to slowly unde...,normal
...,...,...
293519,i'm a bills fan. spent my childhood rooting fo...,normal
293520,"he ones told his mother he can fly, he called ...",normal
293521,that playoff luck from last year reverses itse...,normal
293522,with lamar this would be the game i’d most loo...,normal


### Data Exploration and Cleaning

In [6]:
# Missing-value count per column.
df.isna().sum()

comment    5
label      0
dtype: int64

**Observation:**
- There are 5 Null values in the Dataset which we need to drop or fill

In [7]:
# Drop every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

**Observation:**
- It is better to drop those rows, as removing them has no significant effect on the model.

In [8]:
# Class distribution of the raw string labels.
df['label'].value_counts()

normal       286624
provoking      6308
racist          587
Name: label, dtype: int64

**Observation:**
- In the data we scraped there are more than 2.8 lakh normal comments, about 6.3K provoking comments, and about 0.6K (587) racist comments. From this we can infer that either most Reddit users do not post racist or provoking comments, or Reddit's policy against such comments is very strict.
- As the Data is imbalanced for classification, so we need to apply some kind of balancing techniques.

In [9]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the string labels and keep it in `le`; it is pickled at the end of the notebook.
le = LabelEncoder().fit(df['label'])

# Replace the string labels with their integer codes, then display the frame.
df = df.assign(label=le.transform(df['label']))
df

Unnamed: 0,comment,label
0,"> i must study politics and war, that our sons...",0
1,"i was literally told as a child ""i want you ki...",0
2,how did this even happen? \n\nmy grandmother u...,0
3,"bootstraps, ignore the house i bought for $10,...",0
4,it's been a strange realization to slowly unde...,0
...,...,...
293519,i'm a bills fan. spent my childhood rooting fo...,0
293520,"he ones told his mother he can fly, he called ...",0
293521,that playoff luck from last year reverses itse...,0
293522,with lamar this would be the game i’d most loo...,0


**Observation:**
- I have label encoded the labels as string values can't be given to a model
- normal has been mapped to 0, provoking has been mapped to 1 and racist has been mapped to 2

In [10]:
# Work on a 20,000-row random subset of the cleaned data.
# The seed is fixed so the subset, and every result derived from it, is the same on each run.
df = df.sample(n=20000, random_state=42)

In [11]:
# Inspect the sampled subset.
df

Unnamed: 0,comment,label
71149,gentlemen... behold!!! *beep beep beep* .....c...,0
96137,i can't even imagine infecting a complete stra...,0
235479,losing my job in solidarity with bob sarver is...,0
192289,sim! no meio da confusão!,0
119142,lol thor isn't human got to be a new argument.,0
...,...,...
244035,"fuck, now i can’t go",0
143677,what a piece of shit lol.,0
63918,i just commented this elsewhere but as a daugh...,0
50626,i feel like this is exactly the argument they ...,0


**Explanation**
- The scraped dataset is too large to fit models on with our local system, so I have taken a random sample of 20K rows and will fit the models on that sample.
- If we used all 2.9 lakh data points, the accuracy would likely be higher.

In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words("english"))


def _strip_stop_words(text):
    """Tokenize `text`, lowercase each token, and re-join the tokens that are not stop words."""
    lowered = (token.lower() for token in nltk.word_tokenize(text))
    return " ".join(token for token in lowered if token not in stop_words)


# Add the cleaned text as 'cleaned_comment' next to the original 'comment' column
df = df.assign(cleaned_comment=df['comment'].apply(_strip_stop_words))

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:852)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed
[nltk_data]     (_ssl.c:852)>


In [13]:
# Compare 'comment' with the new 'cleaned_comment' column.
df

Unnamed: 0,comment,label,cleaned_comment
71149,gentlemen... behold!!! *beep beep beep* .....c...,0,gentlemen ... behold ! ! ! * beep beep beep * ...
96137,i can't even imagine infecting a complete stra...,0,ca n't even imagine infecting complete strange...
235479,losing my job in solidarity with bob sarver is...,0,losing job solidarity bob sarver something wan...
192289,sim! no meio da confusão!,0,sim ! meio da confusão !
119142,lol thor isn't human got to be a new argument.,0,lol thor n't human got new argument .
...,...,...,...
244035,"fuck, now i can’t go",0,"fuck , ’ go"
143677,what a piece of shit lol.,0,piece shit lol .
63918,i just commented this elsewhere but as a daugh...,0,commented elsewhere daughter conservatives pro...
50626,i feel like this is exactly the argument they ...,0,feel like exactly argument would make steer ar...


**Explanation**
- Using "nltk" library we are ignoring the common words which are not significant for this problem

In [14]:
# Class distribution of the encoded labels within the 20,000-row sample.
df['label'].value_counts()

0    19534
1      437
2       29
Name: label, dtype: int64

In [15]:
# Feature text (X) and encoded target (y) used by the cells that follow.
X = df['cleaned_comment']
y = df['label']

**Explanation**
- Selecting X(Feature) and y(Target) 

In [16]:
# Peek at the underlying array of cleaned comment strings.
X.values

array(['gentlemen ... behold ! ! ! * beep beep beep * ..... corn ! ! !',
       "ca n't even imagine infecting complete stranger , let alone kids . really sorry happened parents cultists said , little hope right thing going forward . 'm sure many others infected , besides , . seen nation makes afraid future , esp . light pandemics coming pike . polio re-emerged uk monkeypox may `` pox '' horizon .",
       'losing job solidarity bob sarver something want lol', ...,
       'commented elsewhere daughter conservatives promise ’ care woman regardless relation',
       "feel like exactly argument would make steer argument away something . 're still much higher countries think worse united states , think 's clear something needs done getting `` data countries might accurate might bad '' constructive .",
       '’ start initiative ! reducing stress 101 - unreasonable deadlines ( _really_ need ? )'],
      dtype=object)

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse import csr_matrix

# Learn the vocabulary from X and build the token-count matrix, stored as CSR.
# `cv` is kept as a separate name because it is pickled alongside the model later on.
cv = CountVectorizer()
X_count = csr_matrix(cv.fit_transform(X))

**Observation:**
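- To convert the text into a numeric feature matrix I have used Count Vectorizer, which produces a sparse matrix of token counts (one row per comment, one column per vocabulary term).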

In [18]:
# Class counts before resampling (y is already a Series, so it is counted directly).
print("Original dataset shape:", "\n", y.value_counts())

Original dataset shape: 
 0    19534
1      437
2       29
Name: label, dtype: int64


In [19]:
from imblearn.over_sampling import SMOTE

# Oversampler with a fixed seed so the synthetic rows are the same on every run.
smote = SMOTE(random_state=42)

# NOTE(review): resampling is applied to the full matrix, before any train/test split.
# The model functions below split X_resampled, so synthetic rows can land in the test set
# next to the real rows they were derived from — the reported scores are likely optimistic.
X_resampled, y_resampled = smote.fit_resample(X_count, y)

# Show the per-class counts after resampling.
print("Resampled dataset shape:",'\n',pd.Series(y_resampled).value_counts())

Resampled dataset shape: 
 2    19534
1    19534
0    19534
Name: label, dtype: int64


**Explanation**
- As the sampled data was also imbalanced, I have applied the SMOTE technique to balance the dataset.
- SMOTE is a statistical technique for increasing the number of cases in your dataset in a balanced way. The component works by generating new instances from existing minority cases that you supply as input.

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score

In [21]:
def MNB(test_size, random_state):
    """Fit a MultinomialNB on one split of the resampled data and score it.

    Parameters
    ----------
    test_size : passed to train_test_split as the held-out fraction.
    random_state : passed to train_test_split to seed the split.

    Returns
    -------
    (accuracy, auc) : accuracy on the test split and one-vs-rest ROC AUC
    computed from the predicted class probabilities.
    """
    # NOTE(review): X_resampled was oversampled before this split, so the
    # test split may contain synthetic rows.
    train_X, test_X, train_y, test_y = train_test_split(
        X_resampled, y_resampled, test_size=test_size, random_state=random_state
    )
    clf = MultinomialNB()
    clf.fit(train_X, train_y)
    accuracy = accuracy_score(test_y, clf.predict(test_X))
    auc_ovr = roc_auc_score(test_y, clf.predict_proba(test_X), multi_class='ovr')
    return accuracy, auc_ovr

In [22]:
def LGR(test_size, random_state):
    """Fit a LogisticRegression on one split of the resampled data and score it.

    Parameters
    ----------
    test_size : passed to train_test_split as the held-out fraction.
    random_state : passed to train_test_split to seed the split.

    Returns
    -------
    (acc, auc) : accuracy on the test split and one-vs-rest ROC AUC
    computed from the predicted class probabilities.
    """
    # NOTE(review): X_resampled was oversampled before this split, so the
    # test split may contain synthetic rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=test_size, random_state=random_state
    )
    # The default iteration cap is 100. This notebook silences all warnings, so a
    # fit that stops before converging would go unnoticed; give the solver more room.
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
    return acc, auc

In [23]:
def DTC(test_size, random_state):
    """Fit a DecisionTreeClassifier on one split of the resampled data and score it.

    Parameters
    ----------
    test_size : passed to train_test_split as the held-out fraction.
    random_state : seeds both the split and the tree, so the same arguments
        give the same scores on every call.

    Returns
    -------
    (acc, auc) : accuracy on the test split and one-vs-rest ROC AUC
    computed from the predicted class probabilities.
    """
    # NOTE(review): X_resampled was oversampled before this split, so the
    # test split may contain synthetic rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=test_size, random_state=random_state
    )
    # Tree construction is randomized; seed it so results are reproducible.
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
    return acc, auc

In [24]:
def RFC(test_size, random_state):
    """Fit a RandomForestClassifier on one split of the resampled data and score it.

    Parameters
    ----------
    test_size : passed to train_test_split as the held-out fraction.
    random_state : seeds both the split and the forest, so the same arguments
        give the same scores on every call.

    Returns
    -------
    (acc, auc) : accuracy on the test split and one-vs-rest ROC AUC
    computed from the predicted class probabilities.
    """
    # NOTE(review): X_resampled was oversampled before this split, so the
    # test split may contain synthetic rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X_resampled, y_resampled, test_size=test_size, random_state=random_state
    )
    # Bootstrapping and feature sub-sampling are randomized; seed them so
    # results are reproducible.
    model = RandomForestClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, model.predict_proba(X_test), multi_class='ovr')
    return acc, auc

In [25]:
# Grid of split seeds and held-out fractions shared by all model sweeps.
random_states = [10,20,30,40,42,50,60]
test_sizes = [0.20,0.25,0.30]

# Collect one record per (seed, test size) and build the frame once;
# DataFrame.append is deprecated and was removed in pandas 2.0.
records = []
for r_s in random_states:
    for ts in test_sizes:
        acc, auc = MNB(ts, r_s)
        records.append({
            'Model': 'MultinomialNB',
            'Test Size': ts,
            'Random State': r_s,
            'Accuracy': acc,
            'AUC': auc,
        })

# Starts the results table; the sweeps for the other models add their rows to it.
results_dataframe = pd.DataFrame(records, columns = ['Model', 'Test Size', 'Random State', 'Accuracy','AUC'])

In [26]:
# Sweep LogisticRegression over the same grid and add its rows to the results table.
# Rows are gathered in a list and concatenated once; DataFrame.append is
# deprecated and was removed in pandas 2.0.
records = []
for r_s in random_states:
    for ts in test_sizes:
        acc, auc = LGR(ts, r_s)
        records.append({
            'Model': 'LogisticRegression',
            'Test Size': ts,
            'Random State': r_s,
            'Accuracy': acc,
            'AUC': auc,
        })

results_dataframe = pd.concat([results_dataframe, pd.DataFrame(records)], ignore_index = True)

In [27]:
# Sweep DecisionTreeClassifier over the same grid and add its rows to the results table.
# Rows are gathered in a list and concatenated once; DataFrame.append is
# deprecated and was removed in pandas 2.0.
records = []
for r_s in random_states:
    for ts in test_sizes:
        acc, auc = DTC(ts, r_s)
        records.append({
            'Model': 'DecisionTreeClassifier',
            'Test Size': ts,
            'Random State': r_s,
            'Accuracy': acc,
            'AUC': auc,
        })

results_dataframe = pd.concat([results_dataframe, pd.DataFrame(records)], ignore_index = True)

In [None]:
# Sweep RandomForestClassifier over the same grid and add its rows to the results table.
# Rows are gathered in a list and concatenated once; DataFrame.append is
# deprecated and was removed in pandas 2.0.
records = []
for r_s in random_states:
    for ts in test_sizes:
        acc, auc = RFC(ts, r_s)
        records.append({
            'Model': 'RandomForestClassifier',
            'Test Size': ts,
            'Random State': r_s,
            'Accuracy': acc,
            'AUC': auc,
        })

results_dataframe = pd.concat([results_dataframe, pd.DataFrame(records)], ignore_index = True)

In [None]:
# Display the accumulated accuracy / AUC table for all models.
results_dataframe

In [None]:
# Persist the results table; the row index is written as the first column.
results_dataframe.to_csv("preddd.csv", index=True)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Accuracy grouped by test size, one box per model.
sns.catplot(
    data=results_dataframe,
    x='Test Size',
    y='Accuracy',
    hue='Model',
    kind='box',
)

**Observation:**
- As per the Catplot b/w Accuracy and Test Size Random Forest has the highest Accuracy with the test size 0.2.

In [None]:
# AUC grouped by test size, one box per model.
sns.catplot(
    data=results_dataframe,
    x='Test Size',
    y='AUC',
    hue='Model',
    kind='box',
)

**Observation:**
- As per the Catplot b/w AUC and Test Size Random Forest has the highest AUC with test size 0.2

In [None]:
# Accuracy grouped by split seed, one box per model.
sns.catplot(
    data=results_dataframe,
    x='Random State',
    y='Accuracy',
    hue='Model',
    kind='box',
)

**Observation:**
- As per the Catplot b/w Accuracy and Random State, Random Forest has the highest Accuracy with Random State 40.

In [None]:
# AUC grouped by split seed, one box per model.
sns.catplot(
    data=results_dataframe,
    x='Random State',
    y='AUC',
    hue='Model',
    kind='box',
)

**Observation:**
- As per the Catplot b/w AUC and Random State, Random Forest has almost the highest AUC with Random State 20, 30, 40.

**Overall Observation:**
- Random Forest is the best model for this case with Test Size 0.2 and Random State 40

In [None]:
# Accuracy and AUC of the selected configuration (test size 0.2, random state 40).
RFC(test_size=0.2, random_state=40)

In [None]:
# Refit the selected configuration (test size 0.2, random state 40) at module level
# so that `model` is available for export.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size = 0.2, random_state = 40)
# Seed the forest as well as the split, so the exported model is the same on every run.
model = RandomForestClassifier(random_state = 40)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

### Exporting Model for Deployment

In [None]:
import pickle

In [None]:
# Write the fitted model, label encoder and vectorizer to disk.
# Each file is opened in a `with` block so the handle is flushed and closed
# even if pickling raises.
with open("model.pickle", 'wb') as model_file:
    pickle.dump(model, model_file)
with open("le.pickle", 'wb') as le_file:
    pickle.dump(le, le_file)
with open("cv.pickle", 'wb') as cv_file:
    pickle.dump(cv, cv_file)