In [1]:

# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re

from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix



### 1.0 Load data

In [2]:
# Load the cleaned Reddit dataset from the local CSV (path is relative to the notebook's working directory).
df = pd.read_csv('./data/reddit_cleaned_data.csv')

In [3]:
# Dimensions of the data frame as (rows, columns)
display(df.shape)
# Total number of missing cells across every column (0 means nothing is null)
display(df.isna().sum().sum())
# Column labels, shown as this cell's output
df.columns

(18000, 2)

0

Index(['clean_text', 'label'], dtype='object')

In [4]:
# Preview the first rows to eyeball the text column and its label.
df.head()

Unnamed: 0,clean_text,label
0,locked macbook lost find activate,0
1,apple m2 macbook pro see first ever discount 2...,0
2,home automation,0
3,apple customer service beyond apple care actua...,0
4,sure right place get quick,0


### 2.0 Define X, Y

In [5]:
# Features: X is kept as a one-column DataFrame (double brackets) holding the cleaned text.
# Target:   y is the 'label' column as a Series.
X, y = df[['clean_text']], df['label']

### 3.0 Baseline accuracy

In [6]:
# Share of each class in the target; the largest share is the accuracy of always
# predicting the majority class, i.e. the baseline a model has to beat.
y.value_counts(normalize=True)

0    0.5
1    0.5
Name: label, dtype: float64

###  4.0 Train Test Split

In [7]:
# Hold out 25% of the rows for testing. Stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    stratify=y,
    random_state=42,
)

( 1.1 ) CountVectorizer + LogisticRegression

### 5.0 Modelling

We will evaluate the following models and compare their performance:

- Multinomial Naive Bayes (`MultinomialNB`)
- Random Forest Classifier
- Support Vector Machine

Each model will be paired with both CountVectorizer and TF-IDF Vectorizer.

#### 5.1 CountVectorizer

In [8]:
# Helper that reports train/test accuracy for a fitted model and plots its test confusion matrix.
def get_metrics(model, X_train, X_test, y_train=None, y_test=None):
    """Print train/test accuracy of a fitted model and plot its test confusion matrix.

    Parameters
    ----------
    model : fitted estimator
        Must expose ``score(X, y)`` and be usable with
        ``ConfusionMatrixDisplay.from_estimator``.
    X_train, X_test : array-like
        Feature matrices for the training and testing rows, already
        transformed into whatever form ``model`` was fitted on.
    y_train, y_test : array-like, optional
        Labels matching ``X_train`` / ``X_test``. When omitted, the
        notebook-level ``y_train`` / ``y_test`` variables are used, so
        existing three-argument calls keep working.

    Returns
    -------
    sklearn.metrics.ConfusionMatrixDisplay
        The display object for the confusion matrix on the test set.
    """
    # `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and removed in 1.2;
    # `ConfusionMatrixDisplay.from_estimator` is its replacement. Imported here so
    # this cell does not depend on any other cell being edited.
    from sklearn.metrics import ConfusionMatrixDisplay

    # The parameter names shadow the notebook globals, so look the globals up explicitly.
    if y_train is None:
        y_train = globals()["y_train"]
    if y_test is None:
        y_test = globals()["y_test"]

    divider = u'\u2500' * 30
    print(divider)
    print(f"Accuracy score of training set: {round(100 * model.score(X_train, y_train), 2)}%")
    print(f"Accuracy score of testing set: {round(100 * model.score(X_test, y_test), 2)}%")
    print(divider)
    return ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

In [28]:
# Bag-of-words vectorizer configuration:
#   - drop English stop words
#   - ignore terms appearing in more than 90% of documents or in fewer than 2 documents
#   - keep at most the 2000 most frequent remaining terms
cvec = CountVectorizer(
    stop_words='english',
    max_df=0.9,
    min_df=2,
    max_features=2000,
)

In [29]:
# Learn the vocabulary from the training text only, then build the training
# document-term count matrix as a DataFrame with one column per vocabulary term.
# `.toarray()` yields a plain ndarray; `.todense()` would return an `np.matrix`,
# a class NumPy no longer recommends using.
X_train_cvec = pd.DataFrame(
    cvec.fit_transform(X_train['clean_text']).toarray(),
    columns=cvec.get_feature_names_out(),
)

In [30]:
# Encode the test text with the vocabulary already learned from the training
# data (transform only, no refit) so the test set does not influence the features.
# `.toarray()` yields a plain ndarray; `.todense()` would return an `np.matrix`,
# a class NumPy no longer recommends using.
X_test_cvec = pd.DataFrame(
    cvec.transform(X_test['clean_text']).toarray(),
    columns=cvec.get_feature_names_out(),
)

In [31]:
# Preview the first rows of the training document-term count matrix.
X_train_cvec.head()

Unnamed: 0,00,000,01,02,03,05,06,07,08,10,...,yesterday,yogesh,young,youre,youtube,zenfone,zero,zone,zoom,zte
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
# Instantiate a Multinomial Naive Bayes classifier with its default settings.
nb = MultinomialNB()

In [33]:
# Fit the classifier on the count-vectorized training features and their labels.
# The value returned by fit() is kept as cvec_nb and used by the prediction/scoring cells.
# NOTE(review): scikit-learn's fit() conventionally returns the estimator itself, so
# cvec_nb is presumably the same object as nb - confirm before reusing nb elsewhere.
cvec_nb = nb.fit(X_train_cvec, y_train)

In [34]:
# Predict labels for the held-out test rows (reused by the confusion-matrix cell).
predictions = cvec_nb.predict(X_test_cvec)

# Report accuracy on both splits, framed by horizontal rules, in a single print call.
print(
    u'\u2500' * 30,
    f"Accuracy score of training set: {round(100* cvec_nb.score(X_train_cvec, y_train),2)}%",
    f"Accuracy score of testing set: {round(100* cvec_nb.score(X_test_cvec, y_test),2)}%",
    u'\u2500' * 30,
    sep="\n",
)

──────────────────────────────
Accuracy score of training set: 85.01%
Accuracy score of testing set: 82.89%
──────────────────────────────


In [35]:
# Confusion matrix on the test set: rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[2062,  188],
       [ 582, 1668]], dtype=int64)

#### 