In [40]:
import numpy as np
import time
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [42]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('wordnet')
nltk.download('punkt')

class LemmaTokenizer(object):
  def __init__(self):
    self.wnl = WordNetLemmatizer()
  def __call__(self, articles):
    return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\heath\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\heath\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [36]:
# currently looking at all categories
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers', 'footers', 'quotes'))
1
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))
# For IMDB
train = pd.read_csv('train_IMDb.csv')
test = pd.read_csv('test_IMDb.csv')


In [37]:
class IMDB:
  def __init__(self,subset = 'train'):
    if (subset == 'train'):
      self.allData = train.to_numpy()
    else:
      self.allData = test.to_numpy()
    np.random.shuffle(self.allData)
    self.data = self.allData[:,0]
    self.target = self.allData[:,1]
    self.target=self.target.astype('int')


In [32]:
imdb_train = IMDB()
imdb_test = IMDB('test')

In [18]:
model_names = ['SVM', 'Logistic Regression', 'AdaBoost', 'Decision Tree', 'Random Forest']
# just need to add the FINAL PARAMETERS!!!!!!!
models = [
LinearSVC(random_state=0,max_iter=2000),
LogisticRegression(random_state=0,max_iter=1000),
AdaBoostClassifier(n_estimators=50, learning_rate=1,random_state=0),
DecisionTreeClassifier(random_state=0),
RandomForestClassifier(max_depth=2, random_state=0)]


In [19]:
def get_final_accuracy(train, test, model, model_name, parameters):
  start = time.time()
  text_clf = Pipeline([('vect', CountVectorizer()),
  ('tfidf', TfidfTransformer()),
  ('clf', model)])
  text_clf.set_params(**parameters)
  print("About to fit ",model_name, " with ", text_clf.get_params)
  text_clf.fit(train.data, train.target)
  print("Final accuracy for: ", model_name)
  print(text_clf.score(test.data, test.target))
  print("Time taken: ", time.time()-start)


In [20]:
def get_final_accuracy_lemm(train, test, model, model_name, parameters):
  start = time.time()

  text_clf = Pipeline([('vect', CountVectorizer(tokenizer=LemmaTokenizer())),
  ('tfidf', TfidfTransformer()),
  ('clf', model)])
  text_clf.set_params(**parameters)
  print("About to fit ",model_name, " with ", text_clf.get_params)
  text_clf.fit(train.data, train.target)
  print("Final accuracy for: ", model_name)
  print(text_clf.score(test.data, test.target))
  print("Time taken: ", time.time()-start)


### LR News --- Stop Words = None 

In [24]:
lr_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
'clf__C': 60,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)


get_final_accuracy(twenty_train, twenty_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=60, class_weight=None, dual=False,
         



Final accuracy for:  Logistic Reg
0.6890600106213489
Time taken:  606.0015366077423


### LR News --- Unigrams

In [46]:
lr_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 60,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)


get_final_accuracy(twenty_train, twenty_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=60, class_weight=None, dual=False,
    



Final accuracy for:  Logistic Reg
0.6886617100371747
Time taken:  129.280113697052


### LR News --- Unigrams, Bigrams

In [47]:
lr_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 60,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)


get_final_accuracy(twenty_train, twenty_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=60, class_weight=None, dual=False,
    



Final accuracy for:  Logistic Reg
0.7027349973446628
Time taken:  426.02043175697327


### LR News --- Unigrams, Bigrams, Trigrams

In [48]:
lr_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 60,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)


get_final_accuracy(twenty_train, twenty_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=60, class_weight=None, dual=False,
    



Final accuracy for:  Logistic Reg
0.6995485926712692
Time taken:  654.2867667675018


### LR News --- Lemmatization included 

In [49]:
lr_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 60,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)


get_final_accuracy_lemm(twenty_train, twenty_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.Lemm...
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegress

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Logistic Reg
0.6856080722251726
Time taken:  840.8142807483673


### LR IMDb --- StopWords = None 

In [33]:
lr_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
'clf__C': 55,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)


get_final_accuracy(imdb_train, imdb_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=55, class_weight=None, dual=False,
         

### LR IMDb --- Lemmatization Included

In [50]:
lr_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 55,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)

get_final_accuracy_lemm(imdb_train, imdb_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.Lemm...
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegress

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Logistic Reg
0.887
Time taken:  374.9038083553314


### LR IMDb --- Unigrams 

In [51]:
lr_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 55,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)

get_final_accuracy(imdb_train, imdb_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=55, class_weight=None, dual=False,
    

### LR IMDb --- Unigrams, Bigrams

In [52]:
lr_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 55,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)

get_final_accuracy(imdb_train, imdb_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=55, class_weight=None, dual=False,
    

### LR IMDb --- Unigrams, Bigrams, Trigrams

In [53]:
lr_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 55,
'clf__penalty': 'l2',
'clf__solver': 'saga'
}

lr = LogisticRegression(random_state=0,max_iter=1000)

get_final_accuracy(imdb_train, imdb_test, lr, 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=55, class_weight=None, dual=False,
    

### SVM News --- StopWords = None

In [55]:
svm_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
'clf__C': 75,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=75, class_weight=None, dual=True,
                



Final accuracy for:  Linear SVM News
0.695432819968136
Time taken:  434.29689836502075


### SVM News --- Lemmatization Included

In [56]:
svm_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 75,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy_lemm(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x000001A21CEADDC8>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
               

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Linear SVM News
0.6848114710568242
Time taken:  471.4971921443939


### SVM News --- Unigrams

In [57]:
svm_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 75,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=75, class_weight=None, dual=True,
           



Final accuracy for:  Linear SVM News
0.6678173127987255
Time taken:  71.43506026268005


### SVM News --- Unigrams, Bigrams

In [58]:
svm_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 75,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=75, class_weight=None, dual=True,
           



Final accuracy for:  Linear SVM News
0.7035315985130112
Time taken:  344.3998954296112


### SVM News --- Unigrams, Bigrams, Trigrams

In [59]:
svm_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 75,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=75, class_weight=None, dual=True,
           



Final accuracy for:  Linear SVM News
0.703797132235794
Time taken:  598.383136510849


### SVM IMDb --- StopWords = None

In [60]:
svm_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
'clf__C': 5,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=5, class_weight=None, dual=True,
                 

### SVM IMDb --- Lemmatization Included

In [61]:
svm_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 5,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy_lemm(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x000001A20FA86A08>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
               

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Linear SVM News
0.6930430164630909
Time taken:  63.99591779708862


### SVM IMDb --- Unigrams

In [62]:
svm_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 5,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=5, class_weight=None, dual=True,
            

### SVM IMDb --- Unigrams, Bigrams

In [63]:
svm_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 5,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=5, class_weight=None, dual=True,
            

### SVM IMDb --- Unigrams, Bigrams, Trigrams

In [64]:
svm_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__C': 5,
'clf__penalty': 'l2',
'clf__loss': 'squared_hinge'
}

svm = LinearSVC(random_state=0,max_iter=2000)

get_final_accuracy(twenty_train, twenty_test, svm, 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=5, class_weight=None, dual=True,
            

### Decision Tree News --- StopWords = None

In [65]:
dt_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
'clf__max_depth': 15,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0015
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(twenty_train, twenty_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

About to fit  Decision Tree CHANGED DEPTH  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                 

### Decision Tree News --- Lemmatization Included

In [66]:
dt_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 15,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0015
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy_lemm(twenty_train, twenty_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.Lemm...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Decision Tree
0.26447158789166225
Time taken:  52.10894846916199


### Decision Tree News --- Unigrams

In [67]:
dt_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 15,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0015
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(twenty_train, twenty_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

### Decision Tree News --- Unigrams, Bigrams

In [68]:
dt_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 15,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0015
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(twenty_train, twenty_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

### Decision Tree News --- Unigrams, Bigrams, Trigrams

In [69]:
dt_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 15,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0015
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(twenty_train, twenty_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

### Decision Tree IMDb --- StopWords = None

In [70]:
dt_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
'clf__max_depth': 40,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0005
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(imdb_train, imdb_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

### Decision Tree IMDb --- Lemmatization Included

In [71]:
dt_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 40,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0005
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy_lemm(imdb_train, imdb_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.Lemm...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                                

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Decision Tree
0.74376
Time taken:  150.08497786521912


### Decision tree IMDb --- Unigrams

In [72]:
dt_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 40,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0005
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(imdb_train, imdb_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

### DEcision Tree IMDb --- Unigrams, Bigrams

In [73]:
dt_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 40,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0005
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(imdb_train, imdb_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

### dt imdb uni, bi, tri

In [74]:
dt_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
'clf__max_depth': 40,
'clf__max_features': None,
'clf__min_impurity_decrease': 0.0005
}

dt = DecisionTreeClassifier(random_state=0)

get_final_accuracy(imdb_train, imdb_test, dt, 'Decision Tree', dt_parameters)

About to fit  Decision Tree  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(class_weight=None, criterion='gini',
                               

### Naive Bayes News StopWords = None 

In [75]:
nb_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(twenty_train, twenty_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
    

### Naive Bayes News --- Lemmatization Included

In [76]:
nb_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy_lemm(twenty_train, twenty_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x000001A20DF27308>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('c

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Naive Bayes
0.605416887944769
Time taken:  44.95395755767822


### Naive Bayes News --- Unigrams

In [77]:
nb_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(twenty_train, twenty_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],

### Naive Bayes News --- Unigrams, Bigrams


In [78]:
nb_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(twenty_train, twenty_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],

### Naive Bayes News --- Unigrams, Bigrams, Trigrams

In [79]:
nb_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(twenty_train, twenty_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],

### Naive Bayes IMDb --- StopWords=None

In [44]:
nb_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': None,
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(imdb_train, imdb_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
    

### Naive Bayes IMDb --- Lemmatization Included

In [83]:
nb_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy_lemm(imdb_train, imdb_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=<__main__.LemmaTokenizer object at 0x000001A21A33F288>,
                                 vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('c

  'stop_words.' % sorted(inconsistent))


Final accuracy for:  Naive Bayes
0.85292
Time taken:  125.98744058609009


### Naive Bayes IMDb --- Unigrams

In [45]:
nb_parameters = {
'vect__ngram_range': (1,1),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(imdb_train, imdb_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],

### Naive Bayes IMDb --- Unigrams, Bigrams

In [80]:
nb_parameters = {
'vect__ngram_range': (1,2),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(imdb_train, imdb_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],

### Naive Bayes IMDb --- Unigrams, Bigrams, Trigrams

In [81]:
nb_parameters = {
'vect__ngram_range': (1,3),
'vect__stop_words': 'english',
'tfidf__use_idf': True,
}

nb = MultinomialNB()

get_final_accuracy(imdb_train, imdb_test, nb, 'Naive Bayes', nb_parameters)

About to fit  Naive Bayes  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 3), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],