## Modeling with Multinomial Naive Bayes regression

In [22]:
#import the libraries
import numpy as np
import pandas as pd
from nltk import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB

from sklearn.compose import  ColumnTransformer, make_column_transformer

In [23]:
%store -r X
%store -r y

In [24]:
X.shape

(3020, 1)

### Train/test/split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                   stratify = y, random_state = 42)

**lemmatokenizer**

In [26]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
class LemmaTokenizer:
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]

In [29]:
col_transformer_cv = make_column_transformer(
  (CountVectorizer(tokenizer = LemmaTokenizer()), 'cleaned'),
  remainder = 'passthrough'
)
col_transformer_td = make_column_transformer(
  (TfidfVectorizer(tokenizer = LemmaTokenizer()), 'cleaned'),
  remainder = 'passthrough'
)
pipe_mnb_cv = Pipeline([
    ("col_trans", col_transformer_cv),
    ('mnb', MultinomialNB())])

pipe_mnb_td = Pipeline([
    ("col_trans", col_transformer_td),
    ('mnb', MultinomialNB())])
    
# Construct Grid Parameters for CountVectorizer
hyperparams_cv = {
               'col_trans__countvectorizer__ngram_range': [(1,1), (1,2)],
               'col_trans__countvectorizer__stop_words': [None,'english'],
               'col_trans__countvectorizer__max_features': [None, 100, 500,1000],
               'col_trans__countvectorizer__min_df': [1, 3, 4],
               'col_trans__countvectorizer__max_df': [0.9, 0.95, .9],

                 }
# Construct Grid Parameters for TDIDFFVectorizer
hyperparams_td = {'col_trans__tfidfvectorizer__ngram_range': [(1,1), (1,2)],
               'col_trans__tfidfvectorizer__stop_words': [None, 'english'],
               'col_trans__tfidfvectorizer__max_features': [None, 100, 500,1000],
               'col_trans__tfidfvectorizer__min_df': [1, 3, 4],
               'col_trans__tfidfvectorizer__max_df': [0.9, 0.95, .99]
                 }

**Multinomial Naive Bayes & countvectorizer**

In [30]:
gs_mnb_cv = GridSearchCV(pipe_mnb_cv, param_grid= hyperparams_cv, verbose=1,cv=5,n_jobs=4)
gs_mnb_cv.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   13.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   50.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.9min
[Parallel(n_jobs=4)]: Done 720 out of 720 | elapsed:  3.1min finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('col_trans',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('countvectorizer',
                                                                         CountVectorizer(analyzer='word',
                                                                                         binary=False,
                                                                                         decode_error='strict',
                                                                                         dtype=<class 'numpy.int64'>,
                 

In [31]:
print(f'Train score: {gs_mnb_cv.score(X_train, y_train)}')
print(f'Test score: {gs_mnb_cv.score(X_test, y_test)}')

Train score: 0.9130242825607064
Test score: 0.8649006622516556


In [32]:
#best parameters
gs_mnb_cv.best_params_

{'col_trans__countvectorizer__max_df': 0.9,
 'col_trans__countvectorizer__max_features': 1000,
 'col_trans__countvectorizer__min_df': 3,
 'col_trans__countvectorizer__ngram_range': (1, 1),
 'col_trans__countvectorizer__stop_words': 'english'}

In [33]:
#confusion Matrix
pred_y_cv = gs_mnb_cv.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, pred_y_cv), 
             columns=['predict_bitcoin', 'predict_ethereum'], 
             index=['actual_bitcoin', 'actual_ethereum'])

Unnamed: 0,predict_bitcoin,predict_ethereum
actual_bitcoin,256,28
actual_ethereum,74,397


**Multinomial Naive Bayes & TFIDFVectorizer**

In [34]:
gs_mnb_td = GridSearchCV(pipe_mnb_td, param_grid= hyperparams_td, verbose=1,cv=5,n_jobs=4)
gs_mnb_td.fit(X_train, y_train)

Fitting 5 folds for each of 144 candidates, totalling 720 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    9.4s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:   44.9s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=4)]: Done 720 out of 720 | elapsed:  2.9min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('col_trans',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='passthrough',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('tfidfvectorizer',
                                                                         TfidfVectorizer(analyzer='word',
                                                                                         binary=False,
                                                                                         decode_error='strict',
                                                                                         dtype=<class 'numpy.float64'>,
               

In [35]:
print(f'Train score: {gs_mnb_td.score(X_train, y_train)}')
print(f'Test score: {gs_mnb_td.score(X_test, y_test)}')

Train score: 0.8984547461368654
Test score: 0.856953642384106


In [36]:
#best parameters
gs_mnb_td.best_params_

{'col_trans__tfidfvectorizer__max_df': 0.9,
 'col_trans__tfidfvectorizer__max_features': 1000,
 'col_trans__tfidfvectorizer__min_df': 1,
 'col_trans__tfidfvectorizer__ngram_range': (1, 2),
 'col_trans__tfidfvectorizer__stop_words': None}

In [52]:
gs_mnb_td.get_feature_names()

AttributeError: 'GridSearchCV' object has no attribute 'get_feature_names'

In [47]:
#find the best estimators
pd.DataFrame(gs_mnb_td.best_estimator_.named_steps['mnb'].coef_, columns = df.columns)

NameError: name 'df' is not defined

In [38]:
#confusion Matrix
pred_y_td = gs_mnb_td.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, pred_y_td), 
             columns=['predict_bitcoin', 'predict_ethereum'], 
             index=['actual_bitcoin', 'actual_ethereum'])

Unnamed: 0,predict_bitcoin,predict_ethereum
actual_bitcoin,204,80
actual_ethereum,28,443


## Summary of Multinomial Naive Bayes regression

**Hyperparameters**

|  	|TFIDFVectorizer  	|CountVectorizer  	|
|---	|---	|---	|
|max_df  	|0.9  	|0.9  	|
|min_df  	|1  	|3  	|
|max_features  	|1000  	|1000  	|
|ngram_range  	|(1, 2)  	|(1, 1)  	|
|stop_words  	|None  	|english  	|

**Accuracy score**

|  	|TFIDFVectorizer  	|CountVectorizer  	|
|---	|---	|---	|
|Train   	|0.898  	|0.913  	|
|Test  	|0.857  	|0.865  	|

**Confusion Matrix**

- **TFIDFVectorizer**

|  	|predict_bitcoin  	|predict_ethereum  	|
|---	|---	|---	|
|actual_bitcoin  	|215  	|69  	|
|actual_ethereum	  	|43  	|428  	|

- **CountVectorizer**

|  	|predict_bitcoin  	|predict_ethereum  	|
|---	|---	|---	|
|actual_bitcoin  	|256  	|28  	|
|actual_ethereum	  	|74  	|397  	|

**Precision**

In [17]:
#precision = TP / (TP + FP) 


|MODEL|VECTORIZER  	|TP  	|TN  	|FP  	|FN  	|Precision  	|
|---|---	|---	|---	|---	|---	|---	|
|MNB|TFIDF  	|204  	|443  	|28  	|80  	|0.879  	|
||COUNT  	|256  	|397  	|74  	|28  	|0.776  	|
|LOG REG|TFIDF  	|220  	|436  	|35  	|64  	|0.863  	|
||COUNT  	|254  	|419  	|52  	|30  	|0.830  	|

In [39]:
%store X_train
%store y

Stored 'X_train' (DataFrame)
Stored 'y' (Series)


In [40]:
%store y_train

Stored 'y_train' (Series)
