In [10]:
import numpy as np
import time
import pandas as pd
from sklearn.datasets import fetch_20newsgroups 

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


In [24]:
categories = ['alt.atheism', 'soc.religion.christian', 
              'comp.graphics', 'sci.med']

# currently looking at all categories 
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers', 'footers', 'quotes'))
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))

# For IMDB 
train = pd.read_csv('train_IMDb.csv')
test = pd.read_csv('test_IMDb.csv')

In [25]:
class IMDB:
    
    def __init__(self,subset = 'train'):
        if (subset == 'train'):
            self.allData = train.to_numpy()
        else:
            self.allData = test.to_numpy()

        np.random.shuffle(self.allData)

        self.data = self.allData[:,0]
        self.target = self.allData[:,1]
        self.target=self.target.astype('int')

In [26]:
imdb_train = IMDB()
imdb_test = IMDB('test')

In [27]:
model_names = ['SVM', 'Logistic Regression', 'AdaBoost', 'Decision Tree', 'Random Forest']

# just need to add the FINAL PARAMETERS!!!!!!! 
models = [
    LinearSVC(random_state=0,max_iter=2000),
    LogisticRegression(random_state=0,max_iter=1000),
    AdaBoostClassifier(n_estimators=50, learning_rate=1,random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=2, random_state=0)
]

In [28]:
def get_final_accuracy(train, test, model, model_name, parameters):
    start = time.time()

    text_clf = Pipeline([('vect', CountVectorizer()),
                                ('tfidf', TfidfTransformer()),
                                ('clf', model)])
    text_clf.set_params(**parameters)
    print("About to fit ",model_name, " with ", text_clf.get_params)
    text_clf.fit(train.data, train.target)
    print("Final accuracy for: ", model_name)
    print(text_clf.score(test.data, test.target))
    print("Time taken: ", time.time()-start)

### Logistic Regression News

In [29]:
lr_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__C': 60,
    'clf__penalty': 'l2',
    'clf__solver': 'saga'
}
# Just testing
get_final_accuracy(twenty_train, twenty_test, models[1], 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=60, class_weight=None, dual=False,
    

### Logistic Regression IMDB

In [30]:
lr_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__C': 55,
    'clf__penalty': 'l2',
    'clf__solver': 'saga'
}
# Just testing
get_final_accuracy(imdb_train, imdb_test, models[1], 'Logistic Reg', lr_parameters)

About to fit  Logistic Reg  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=55, class_weight=None, dual=False,
    

### SVM News

In [31]:
svm_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__C': 75,
    'clf__loss': 'squared_hinge',
    'clf__penalty': 'l2'
}
# Just testing
get_final_accuracy(twenty_train, twenty_test, models[0], 'Linear SVM News', svm_parameters)

About to fit  Linear SVM News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=75, class_weight=None, dual=True,
           



Final accuracy for:  Linear SVM News
0.7036643653744026
Time taken:  140.64012265205383


### SVM IMDb


In [32]:
svm_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__C': 5,
    'clf__loss': 'squared_hinge',
    'clf__penalty': 'l2'
}
# Just testing
get_final_accuracy(imdb_train, imdb_test, models[0], 'Linear SVM IMDb', svm_parameters)

About to fit  Linear SVM IMDb  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LinearSVC(C=5, class_weight=None, dual=True,
            

### Decision Tree News


In [33]:
dt_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__max_depth': 15,
    'clf__max_features': None,
    'clf__min_impurity_decrease': 0.0015
}
# Just testing
get_final_accuracy(twenty_train, twenty_test, models[3], 'Decision Tree News', dt_parameters)

About to fit  Decision Tree News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=15,
                   

### Decision Tree IMDb

In [34]:
dt_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__max_depth': 40,
    'clf__max_features': None,
    'clf__min_impurity_decrease': 0.0005
}
# Just testing
get_final_accuracy(imdb_train, imdb_test, models[3], 'Decision Tree IMDb', dt_parameters)

About to fit  Decision Tree IMDb  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                        criterion='gini', max_depth=40,
                   

### Random Forest News

In [None]:
rf_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
    'clf__bootstrap': False,
    'clf__max_depth': None,
    'clf__max_features': 'auto',
    'clf__min_samples_split': 10, 
    'clf__n_estimators': 800
}
# Just testing
get_final_accuracy(twenty_train, twenty_test, models[4], 'Random Forest News', rf_parameters)

About to fit  Random Forest News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabular...
                 RandomForestClassifier(bootstrap=False, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=None, max_features='auto',
                                 

### Random Forest IMDb

### Naive Bayes News

In [11]:
nb_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}

get_final_accuracy(twenty_train, twenty_test, MultinomialNB(), 'Naive Bayes News', nb_parameters)

About to fit  Naive Bayes News  with  <bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 2), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=Tru

### Naive Bayes IMDb

In [None]:
nb_parameters = {
    'vect__ngram_range': (1,2),
    'vect__stop_words': 'english',
    'tfidf__use_idf': True,
}

get_final_accuracy(imdb_train, imdb_test, MultinomialNB(), 'Naive Bayes IMDb', nb_parameters)