In [0]:
import numpy as np
import time
import pandas as pd
from sklearn.datasets import fetch_20newsgroups 

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import BaggingClassifier

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


In [0]:
categories = ['alt.atheism', 'soc.religion.christian', 
              'comp.graphics', 'sci.med']

# currently looking at all categories 
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers', 'footers', 'quotes'))
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))

# For IMDB 
train = pd.read_csv('train_IMDb.csv')
test = pd.read_csv('test_IMDb.csv')

In [0]:
class IMDB:
    
    def __init__(self,subset = 'train'):
        if (subset == 'train'):
            self.allData = train.to_numpy()
        else:
            self.allData = test.to_numpy()

        np.random.shuffle(self.allData)

        self.data = self.allData[:,0]
        self.target = self.allData[:,1]
        self.target=self.target.astype('int')

In [0]:
imdb_train = IMDB()
imdb_test = IMDB('test')

In [0]:
model_names = ['SVM', 'Logistic Regression', 'AdaBoost', 'Decision Tree', 'Random Forest']

# just need to add the FINAL PARAMETERS!!!!!!! 
models = [
    LinearSVC(random_state=0,max_iter=2000),
    LogisticRegression(random_state=0,max_iter=1000),
    BaggingClassifier(random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=2, random_state=0)
]

In [0]:
def get_final_accuracy(train, test, estimator, model_name, parameters):
    
    adaboost = BaggingClassifier(random_state=0)
    
    start = time.time()

    text_clf = Pipeline([('vect', CountVectorizer()),
                                ('tfidf', TfidfTransformer()),
                                ('clf', adaboost)])
            
        
    text_clf.set_params(clf__base_estimator=estimator)
    print("About to fit ",model_name, " with ", text_clf.get_params)
    text_clf.fit(train.data, train.target)
    print("Final accuracy for: ", model_name)
    print(text_clf.score(test.data, test.target))
    print("Time taken: ", time.time()-start)

### Logistic Regression News

In [0]:
lr_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}

lr_news = LogisticRegression(random_state=0,max_iter=1000,C=60,penalty='l2',solver='saga')

# Just testing
get_final_accuracy(twenty_train, twenty_test, lr_news, 'Adaboost Logistic Reg News', lr_parameters)

About to fit  Adaboost Logistic Reg News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                                                     fit_intercept=True,
                                                                     intercept_scaling=1,
                                                                     l1_



Final accuracy for:  Adaboost Logistic Reg News
0.6894583112055231
Time taken:  1169.680920124054


### Logistic Regression IMDB

In [0]:
lr_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}

lr_imdb = LogisticRegression(random_state=0,max_iter=1000,C=55,penalty='l2',solver='saga')


# Just testing
get_final_accuracy(imdb_train, imdb_test, lr_imdb, 'Adaboost Logistic Reg IMDB', lr_parameters)

About to fit  Adaboost Logistic Reg IMDB  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                                                     fit_intercept=True,
                                                                     intercept_scaling=1,
                                                                     l1_

### SVM News

In [0]:
svm_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__algorithm': 'SAMME'
}

svm_news = LinearSVC(random_state=0,max_iter=2000, C=75,loss='squared_hinge',penalty='l2')


# Just testing
get_final_accuracy(twenty_train, twenty_test, svm_news, 'Adaboost Linear SVM News', svm_parameters)

About to fit  Adaboost Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 BaggingClassifier(base_estimator=LinearSVC(C=75,
                                                            class_weight=None,
                                                            dual=True,
                                     



Final accuracy for:  Adaboost Linear SVM News
0.6642326075411578
Time taken:  431.23776054382324


### SVM IMDb


In [0]:
svm_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__algorithm': 'SAMME'
}

svm_imdb = LinearSVC(random_state=0,max_iter=2000, C=5,loss='squared_hinge',penalty='l2')

# Just testing
get_final_accuracy(imdb_train, imdb_test, svm_imdb, 'Adaboost Linear SVM IMDb', svm_parameters)

About to fit  Adaboost Linear SVM IMDb  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 BaggingClassifier(base_estimator=LinearSVC(C=5,
                                                            class_weight=None,
                                                            dual=True,
                                      

### Decision Tree News


In [0]:
dt_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}
        
dt_news = DecisionTreeClassifier(random_state=0, max_depth=15, max_features=None, min_impurity_decrease=0.0015)

# Just testing
get_final_accuracy(twenty_train, twenty_test, dt_news, 'Adaboost Decision Tree News', dt_parameters)

About to fit  Adaboost Decision Tree News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                                                         max_leaf_nodes=None,
                                                                         min_impurity_decrease=0.0015,
                                                     

### Decision Tree IMDb

In [0]:
dt_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}

dt_imdb = DecisionTreeClassifier(random_state=0, max_depth=40, max_features=None, min_impurity_decrease=0.0005)

# Just testing
get_final_accuracy(imdb_train, imdb_test, dt_imdb, 'Adaboost Decision Tree IMDb', dt_parameters)

About to fit  Adaboost Decision Tree IMDb  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                                                         max_leaf_nodes=None,
                                                                         min_impurity_decrease=0.0005,
                                                     

### Random Forest News

In [0]:
rf_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,

}

#     'clf__bootstrap': False,
#     'clf__max_depth': None,
#     'clf__max_features': 'auto',
#     'clf__min_samples_split': 10, 
#     'clf__n_estimators': 800
        
rf_news = RandomForestClassifier(max_depth=None, random_state=0, bootstrap=False, max_features='auto', min_samples_split=10,n_estimators=800)

# Just testing
get_final_accuracy(twenty_train, twenty_test, rf_news, 'Random Forest News', rf_parameters)

About to fit  Random Forest News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                                                                         min_impurity_split=None,
                                                                         min_samples_leaf=1,
                                                                    

### Random Forest IMDb

### Naive Bayes News

In [0]:
nb_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}

get_final_accuracy(twenty_train, twenty_test, MultinomialNB(), 'AdaBoost Naive Bayes News', nb_parameters)

About to fit  AdaBoost Naive Bayes News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 BaggingClassifier(base_estimator=MultinomialNB(alpha=1.0,
                     

### Naive Bayes IMDb

In [0]:
nb_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}


get_final_accuracy(imdb_train, imdb_test, MultinomialNB(), 'AdaBoost Naive Bayes News', nb_parameters)

About to fit  AdaBoost Naive Bayes News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 BaggingClassifier(base_estimator=MultinomialNB(alpha=1.0,
                     