# Modeling -  TF/IDF (bk 7)

#### In this Notebook:
        7.1: Train, Test, Split
        7.2: Bernoulli Naive Bayes
            7.2.1: Bernoulli Naive Bayes
            7.2.2: Multinomial Naive Bayes
            7.2.3: Gaussian Naive Bayes
            7.2.4: Optimisation of Bernoulli Naive Bayes
        7.4: Logistic Regression
            7.4.1: Model Evaluation
        7.3: KNN Neighbor
            7.3.1: Model Fitting and Evaluation

In [1]:
import numpy as np
import pandas as pd
import sklearn
import seaborn as sns
import matplotlib as plt

from sklearn.model_selection import train_test_split, cross_val_score 
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, Pipeline

In [2]:
# Sparse Metric
X_tfidf_df = pd.read_csv('../data/X_tfidf_df.csv')

In [3]:
data_clean = pd.read_csv ('../data/data_clean.csv')

In [4]:
data_clean.head()

Unnamed: 0,subreddit,selftext_title,created_utc
0,stocks,I am earning very little at the moment but I w...,1626851004
1,stocks,"The stocks I chose were aapl, net, asts, icln,...",1626847423
2,stocks,Retail owns the companies so it could happen i...,1626846017
3,stocks,"Hi,\n\nI'm looking for the best software to tr...",1626845812
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...,1626840162


In [5]:
# Dropping the created_utc 
data_clean = data_clean.drop( ['created_utc'], axis = 1)

In [6]:
data_clean.head()

Unnamed: 0,subreddit,selftext_title
0,stocks,I am earning very little at the moment but I w...
1,stocks,"The stocks I chose were aapl, net, asts, icln,..."
2,stocks,Retail owns the companies so it could happen i...
3,stocks,"Hi,\n\nI'm looking for the best software to tr..."
4,stocks,I'm not a car guy and I'm not an EV guy. Def ...


In [7]:
data_clean.tail()

Unnamed: 0,subreddit,selftext_title
3395,CryptoCurrency,I’ve started seeing posts on here acknowledgin...
3396,CryptoCurrency,Let me get started.\n\nIt was after the Snowde...
3397,CryptoCurrency,I have been thinking about this for sometime a...
3398,CryptoCurrency,I'm curious as to how mining and the price of ...
3399,CryptoCurrency,"Hey guys, \n\nSo the post a few days ago about..."


In [8]:
di = {'stocks': 1, 'CryptoCurrency' :0}
data_clean['subreddit'] = data_clean['subreddit'].map(di)

In [9]:
data_clean.shape

(3400, 2)

In [10]:
# assigning X and y
X = X_tfidf_df
y = data_clean['subreddit']

In [11]:
y.shape

(3400,)

In [12]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: subreddit, dtype: int64

In [13]:
y.tail()

3395    0
3396    0
3397    0
3398    0
3399    0
Name: subreddit, dtype: int64

### 7.1 Train, Test, Split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, stratify = y, random_state = 40)

### 7.2 Bernoulli Naive Bayes

#### 7.2.1 Using Bernoulli Naive Bayes to predict

In [15]:
BernNB = BernoulliNB(binarize = True)
BernNB.fit(X_train, y_train)
print(BernNB)

y_expect = y_test

# Predicting Y = subreddit
y_pred = BernNB.predict(X_test)

# Cross-Validation
print(cross_val_score(BernNB, X_train, y_train, cv = 5).mean())

# Accuracy score of train
print(BernNB.score(X_train, y_train))

# Accuracy score of test
print(BernNB.score(X_test, y_test))

# Accuracy for y_expect and y_pred
print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=True)
0.5
0.5004201680672269
0.5009803921568627
0.5009803921568627


#### 7.2.3 Using Multinomial Naive Bayes to predict

In [16]:
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)
print(MultiNB)

# Predicting Y = subreddit 
y_pred = MultiNB.predict(X_test)

# Cross-Validation
print(cross_val_score(MultiNB, X_train, y_train, cv = 5).mean())

# Accuracy score of train
print(MultiNB.score(X_train, y_train))

# Accuracy score of test
print(MultiNB.score(X_test, y_test))

# Accuracy for y_test and y_pred
print(accuracy_score(y_test, y_pred))

MultinomialNB()
0.6029411764705882
0.6348739495798319
0.6137254901960785
0.6137254901960785


#### 7.2.4 Using Gaussian Naive Bayes to predict

In [17]:
GausNB = GaussianNB()
GausNB.fit(X_train, y_train)
print(GausNB)

# Predicting Y = subreddit 
y_pred = GausNB.predict(X_test)

# Cross-Validation
print(cross_val_score(GausNB, X_train, y_train, cv = 5).mean())

# Accuracy score of train
print(GausNB.score(X_train, y_train))

# Accuracy score of test
print(GausNB.score(X_test, y_test))

# Accuracy for y_test and y_pred
print(accuracy_score(y_test, y_pred))

GaussianNB()
0.9239495798319328
0.9794117647058823
0.9303921568627451
0.9303921568627451


#### 7.2.5 Optimisation of Bernoulli Naive Bayes

In [18]:
BernNB = BernoulliNB(binarize = 0.1)
BernNB.fit(X_train, y_train)
print(BernNB)

y_expect = y_test

# Predicting Y = subreddit 
y_pred = BernNB.predict(X_test)

# Cross-Validation
print(cross_val_score(BernNB, X_train, y_train, cv = 5).mean())

# Accuracy score of train
print(BernNB.score(X_train, y_train))

# Accuracy score of test
print(BernNB.score(X_test, y_test))

# Accuracy for y_expect and y_pred
print(accuracy_score(y_expect, y_pred))

BernoulliNB(binarize=0.1)
0.9281512605042017
0.9869747899159664
0.9333333333333333
0.9333333333333333


#### 7.2.4 Model Evaluation of Multinomial Naive Bayes

In [19]:
MultiNB = MultinomialNB()
MultiNB.fit(X_train, y_train)
print(MultiNB)

# predicting Y = subreddit 
y_pred = MultiNB.predict(X_test) 

MultinomialNB()


In [20]:
# classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.56      1.00      0.72       510
           1       1.00      0.23      0.37       510

    accuracy                           0.61      1020
   macro avg       0.78      0.61      0.55      1020
weighted avg       0.78      0.61      0.55      1020



In [21]:
cm = confusion_matrix(y_test, y_pred)

tn, fp, fn, tp = cm.ravel()

# scores
accuracy_score = accuracy_score(y_test, y_pred)
precision_score = precision_score(y_test, y_pred)
recall_score = recall_score = recall_score(y_test, y_pred)
specificity_score = tn / (tn + fp)

print('accuracy score: ' + str(accuracy_score))
print('precision score: ' + str(precision_score))
print('recall score: ' + str(recall_score))
print('specificity score: ' + str(specificity_score))

accuracy score: 0.6137254901960785
precision score: 1.0
recall score: 0.22745098039215686
specificity score: 1.0


### 7.2.3 Logistic Regression

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, stratify = y, random_state = 40)

In [23]:
LogReg = LogisticRegression(solver = 'liblinear')
LogReg.fit(X_train, y_train)

LogisticRegression(solver='liblinear')

In [24]:
y_pred = LogReg.predict(X_test)

#### Model Evaluation

In [25]:
# classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       510
           1       1.00      0.99      0.99       510

    accuracy                           0.99      1020
   macro avg       0.99      0.99      0.99      1020
weighted avg       0.99      0.99      0.99      1020



In [26]:
y_train_pred = cross_val_predict(LogReg, X_train, y_train, cv = 5)
confusion_matrix(y_train, y_train_pred)

array([[1190,    0],
       [   6, 1184]], dtype=int64)

### 7.2.4 KNN Neighbor

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .3, stratify = y, random_state = 40)

In [28]:
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

In [29]:
# Instantiate KNN
knn = KNeighborsClassifier(n_neighbors = 3)

# Cross-validation
cross_val_score(knn, X_train_sc, y_train, cv = 5).mean()

0.5336134453781513

#### Model Fitting and Evaluation

In [30]:
knn.fit(X_train_sc, y_train)
knn.score(X_train_sc, y_train)

0.9983193277310924

In [31]:
knn.score(X_test_sc, y_test)

0.5450980392156862

In [32]:
cross_val_score(knn, X_test_sc, y_test, cv = 5).mean()

0.5019607843137255