# Modeling - Count Vectorization (bk 6)

#### In this Notebook:
        6.1: Train, Test, Split
        6.2: Bernoulli Naive Bayes
            6.2.1: Bernoulli Naive Bayes
            6.2.2: Multinomial Naive Bayes
            6.2.3: Gaussian Naive Bayes
            6.2.4: Optimisation of Bernoulli Naive Bayes
        6.3: Logistic Regression
            6.3.1: Model Evaluation
        6.4: KNN Neighbor
            6.4.1: Model Fitting and Evaluation
         

In [1]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib as plt

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline

In [2]:
# Load the feature matrix that is used as X further down.
# NOTE(review): if this CSV was written together with its index, read_csv will bring in an
# extra 'Unnamed: 0' column that would then be treated as a feature — confirm the columns.
X_counts_df = pd.read_csv('../data/X_counts_df_CV.csv')

In [3]:
# Load the cleaned posts; the `subreddit` column of this frame becomes the target below
data_clean = pd.read_csv('../data/data_clean.csv')

In [4]:
# Preview the first rows of the loaded posts
data_clean.head()

Unnamed: 0,subreddit,selftext_title,created_utc
0,stocks,I am earning very little at the moment but I w...,1626851004
1,stocks,"The stocks I chose were aapl, net, asts, icln,...",1626847423
2,stocks,Retail owns the companies so it could happen i...,1626846017
3,stocks,"Hi,\n\nI'm looking for the best software to tr...",1626845812
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...,1626840162


In [5]:
# Drop the created_utc timestamp column; only `subreddit` is read from this frame below.
# errors='ignore' keeps the cell re-runnable: without it, a second run raises KeyError
# because the column is already gone.
data_clean = data_clean.drop(columns=['created_utc'], errors='ignore')

In [6]:
# Confirm that created_utc is no longer among the columns
data_clean.head()

Unnamed: 0,subreddit,selftext_title
0,stocks,I am earning very little at the moment but I w...
1,stocks,"The stocks I chose were aapl, net, asts, icln,..."
2,stocks,Retail owns the companies so it could happen i...
3,stocks,"Hi,\n\nI'm looking for the best software to tr..."
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...


In [7]:
# Look at the last rows as well (these belong to the other subreddit)
data_clean.tail()

Unnamed: 0,subreddit,selftext_title
3395,CryptoCurrency,I’ve started seeing posts on here acknowledgin...
3396,CryptoCurrency,Let me get started.\n\nIt was after the Snowde...
3397,CryptoCurrency,I have been thinking about this for sometime a...
3398,CryptoCurrency,I'm curious as to how mining and the price of ...
3399,CryptoCurrency,"Hey guys, \n\nSo the post a few days ago about..."


In [8]:
# Encode the target as integers: stocks -> 1, CryptoCurrency -> 0
di = {'stocks': 1, 'CryptoCurrency' :0}

# Map only while the column still holds the raw labels. Series.map turns every value that
# is not a key of `di` into NaN, so re-running this cell on the already-encoded 0/1 column
# would otherwise wipe out the whole target.
if not pd.api.types.is_numeric_dtype(data_clean['subreddit']):
    data_clean['subreddit'] = data_clean['subreddit'].map(di)

In [9]:
# Rows and columns remaining after dropping created_utc
data_clean.shape

(3400, 2)

In [10]:
# Target: the encoded subreddit label; features: the matrix loaded from X_counts_df_CV.csv.
# Assumes both CSVs list the posts in the same row order — TODO confirm.
y = data_clean['subreddit']
X = X_counts_df

In [11]:
# Feature matrix dimensions (rows, feature columns)
X.shape

(3400, 23127)

In [12]:
# Target length; should match the number of rows in X
y.shape

(3400,)

In [13]:
# First encoded labels
y.head()

0    1
1    1
2    1
3    1
4    1
Name: subreddit, dtype: int64

In [14]:
# Last encoded labels
y.tail()

3395    0
3396    0
3397    0
3398    0
3399    0
Name: subreddit, dtype: int64

#### Baseline score

In [15]:
# Share of each class in the target; the largest share is the accuracy of always
# predicting the majority class, i.e. the baseline the models have to beat
y.value_counts(normalize = True)

0    0.5
1    0.5
Name: subreddit, dtype: float64

### 6.1 Train, Test, Split

In [16]:
# Hold out 30% of the rows for testing; stratifying on y keeps the class proportions
# equal in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=40,
)

### 6.2 Bernoulli Naive Bayes

#### 6.2.1 Using Bernoulli Naive Bayes to predict 

In [17]:
# Baseline Bernoulli NB fitted on the training split.
# NOTE(review): `binarize` is a numeric threshold in scikit-learn; True presumably behaves
# as 1 here, which would treat features with a count of exactly 1 as absent — confirm
# that this is intended.
BernNB = BernoulliNB(binarize = True)
BernNB.fit(X_train, y_train)
print(BernNB)

# Held-out labels and the model's predicted subreddit for each of them
y_expect, y_pred = y_test, BernNB.predict(X_test)

# Mean accuracy across 5 cross-validation folds of the training split
bern_cv_scores = cross_val_score(BernNB, X_train, y_train, cv=5)
print(bern_cv_scores.mean())

# Accuracy on the training split first, then on the test split
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(BernNB.score(split_X, split_y))

# Test accuracy again, this time computed from the stored predictions
print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=True)
0.7218487394957983
0.7714285714285715
0.7245098039215686
0.7245098039215686


#### 6.2.2 Using Multinomial Naive Bayes to predict

In [18]:
# Multinomial NB with default settings, fitted on the training split
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)
print(MultiNB)

# Predicted subreddit for every test row
y_pred = MultiNB.predict(X_test)

# Mean accuracy across 5 cross-validation folds of the training split
multi_cv_scores = cross_val_score(estimator=MultiNB, X=X_train, y=y_train, cv=5)
print(multi_cv_scores.mean())

# Accuracy on the training split, then on the test split
print(MultiNB.score(X_train, y_train))
print(MultiNB.score(X_test, y_test))

# Test accuracy recomputed from the stored predictions
print(accuracy_score(y_true=y_test, y_pred=y_pred))

MultinomialNB()
0.7546218487394958
0.7777310924369748
0.7637254901960784
0.7637254901960784


#### 6.2.3 Using Gaussian Naive Bayes to predict

In [19]:
# Gaussian NB with default settings, fitted on the training split
GausNB = GaussianNB()
GausNB.fit(X_train, y_train)
print(GausNB)

# Predicted subreddit for every test row
y_pred = GausNB.predict(X_test)

# Mean accuracy across 5 cross-validation folds of the training split
gaus_cv_scores = cross_val_score(GausNB, X_train, y_train, cv=5)
print(gaus_cv_scores.mean())

# Accuracy on the training split, then on the test split
gaus_train_acc = GausNB.score(X_train, y_train)
print(gaus_train_acc)
gaus_test_acc = GausNB.score(X_test, y_test)
print(gaus_test_acc)

# Test accuracy recomputed from the stored predictions
print(accuracy_score(y_test, y_pred))

GaussianNB()
0.7100840336134454
0.7890756302521008
0.7186274509803922
0.7186274509803922


#### 6.2.4 Optimisation of Bernoulli Naive Bayes

In [20]:
# Bernoulli NB again, now with a binarize threshold of 0.1 instead of True
BernNB = BernoulliNB(binarize = 0.1)
BernNB.fit(X_train, y_train)
print(BernNB)

# Held-out labels and the model's predicted subreddit for each of them
y_expect, y_pred = y_test, BernNB.predict(X_test)

# Mean accuracy across 5 cross-validation folds of the training split
bern_tuned_cv_scores = cross_val_score(BernNB, X_train, y_train, cv=5)
print(bern_tuned_cv_scores.mean())

# Accuracy on the training split first, then on the test split
for split_X, split_y in ((X_train, y_train), (X_test, y_test)):
    print(BernNB.score(split_X, split_y))

# Test accuracy again, this time computed from the stored predictions
print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=0.1)
0.7638655462184873
0.8445378151260504
0.7823529411764706
0.7823529411764706


#### Model Evaluation of Multinomial Naive Bayes

In [21]:
# Refit Multinomial NB so that `y_pred` holds its test predictions again;
# the cells in between reassigned `y_pred` to the output of other models
MultiNB = MultinomialNB().fit(X_train, y_train)
print(MultiNB)

# Predicted subreddit for every test row
y_pred = MultiNB.predict(X_test)

MultinomialNB()


In [22]:
# Per-class precision, recall and F1 for the Multinomial NB test predictions
mnb_report = classification_report(y_true=y_test, y_pred=y_pred)
print(mnb_report)

              precision    recall  f1-score   support

           0       0.69      0.96      0.80       510
           1       0.94      0.56      0.71       510

    accuracy                           0.76      1020
   macro avg       0.81      0.76      0.75      1020
weighted avg       0.81      0.76      0.75      1020



In [23]:
# Confusion matrix of the test predictions; for the two labels 0/1, ravel() flattens it
# in the order tn, fp, fn, tp
cm = confusion_matrix(y_test, y_pred)

tn, fp, fn, tp = cm.ravel()

# scores
# The results are stored under their own names on purpose: assigning them to
# `accuracy_score`, `precision_score` or `recall_score` would replace the imported sklearn
# functions with floats, so re-running this cell (or any cell that calls those functions)
# would fail with "'float' object is not callable".
test_accuracy = accuracy_score(y_test, y_pred)
test_precision = precision_score(y_test, y_pred)
test_recall = recall_score(y_test, y_pred)
# Specificity = share of actual class-0 rows that were predicted as class 0
specificity_score = tn / (tn + fp)

print('accuracy score: ' + str(test_accuracy))
print('precision score: ' + str(test_precision))
print('recall score: ' + str(test_recall))
print('specificity score: ' + str(specificity_score))

accuracy score: 0.7637254901960784
precision score: 0.9381107491856677
recall score: 0.5647058823529412
specificity score: 0.9627450980392157


### 6.3 Logistic Regression

In [24]:
# Re-create the split with the same arguments as in section 6.1 (same seed, same
# stratification), so this section can be run without relying on the earlier cell
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=40,
)

In [25]:
# Logistic regression using the liblinear solver, trained on the training split.
# fit() is deliberately the last expression so the fitted estimator is displayed.
LogReg = LogisticRegression(solver='liblinear')
LogReg.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [26]:
# Predicted subreddit for every test row, from the fitted logistic regression
y_pred = LogReg.predict(X_test)

#### 6.3.1 Model Evaluation

In [27]:
# Per-class precision, recall and F1 for the logistic regression test predictions
logreg_report = classification_report(y_true=y_test, y_pred=y_pred)
print(logreg_report)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98       510
           1       0.99      0.97      0.98       510

    accuracy                           0.98      1020
   macro avg       0.98      0.98      0.98      1020
weighted avg       0.98      0.98      0.98      1020



In [28]:
# Out-of-fold predictions for every training row (5 folds), then the confusion matrix
# of those predictions against the true training labels
y_train_pred = cross_val_predict(estimator=LogReg, X=X_train, y=y_train, cv=5)
confusion_matrix(y_true=y_train, y_pred=y_train_pred)

array([[1175,   15],
       [  23, 1167]], dtype=int64)

### 6.4 KNN Neighbor

In [29]:
# Re-create the split with the same arguments as in section 6.1 (same seed, same
# stratification), so this section can be run without relying on the earlier cells
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=40,
)

In [30]:
# Learn the scaling statistics from the training split only, then apply that same
# transformation to both splits so no information from the test rows is used for fitting
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [31]:
# K-nearest-neighbours classifier that votes among the 3 closest training rows
knn = KNeighborsClassifier(n_neighbors=3)

# Mean accuracy across 5 cross-validation folds of the scaled training split
cross_val_score(estimator=knn, X=X_train_sc, y=y_train, cv=5).mean()

0.5579831932773109

#### 6.4.1 Model Fitting and Evaluation

In [32]:
# Fit on the scaled training split and report accuracy on that same data
knn.fit(X_train_sc, y_train).score(X_train_sc, y_train)

0.980672268907563

In [33]:
# Accuracy of the fitted KNN model on the scaled test split
knn.score(X=X_test_sc, y=y_test)

0.5852941176470589

In [34]:
# Mean 5-fold cross-validation accuracy computed on the scaled test split.
# NOTE(review): cross_val_score fits fresh copies of `knn` on folds of the data it is
# given, so this trains on test rows and does not score the model fitted above —
# confirm this is the intended evaluation.
cross_val_score(estimator=knn, X=X_test_sc, y=y_test, cv=5).mean()

0.5107843137254902