In [1]:
import numpy as np
import time
from sklearn.datasets import fetch_20newsgroups 

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV


## Creating test + train sets

In [2]:
categories = ['alt.atheism', 'soc.religion.christian', 
              'comp.graphics', 'sci.med']

# currently looking at all categories 
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers', 'footers', 'quotes'))
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))


In [3]:
model_names = ['SVM', 'Logistic Regression', 'AdaBoost', 'Decision Tree', 'Random Forest']

# just need to add randomforest 
models = [
    LinearSVC(random_state=0,max_iter=2000),
    LogisticRegression(random_state=0,max_iter=1000),
    AdaBoostClassifier(n_estimators=50, learning_rate=1,random_state=0),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=2, random_state=0)
]

parameter_grids = [
    [ # Parameter Grid for Linear SVC.
#         { # l1 Case: Dual False, Squared_hinge
#             'vect__ngram_range': [(1, 1), (1, 2)],
#             'tfidf__use_idf': [True, False],
#             'clf__loss': ['squared_hinge'],
#             'clf__penalty': ['l1'],
#             'clf__C': range(1,100,10),
#             'clf__tol': [1e-2, 1e-4, 1e-9],
#             'clf__dual': [False]
#         },
#         { # l2 Hinge Case: Dual True
#             'vect__ngram_range': [(1, 1), (1, 2)],
#             'tfidf__use_idf': [True, False],
#             'clf__loss': ['hinge'],
#             'clf__penalty': ['l2'],
#             'clf__C': range(1,100,10),
#             'clf__tol': [1e-2, 1e-4, 1e-9],
#             'clf__dual': [True]
#         },
#         { # l2 Square Hinged Case
#             'vect__ngram_range': [(1, 1), (1, 2)],
#             'tfidf__use_idf': [True, False],
#             'clf__loss': ['squared_hinge'],
#             'clf__penalty': ['l2'],
#             'clf__C': range(1,100,10),
#             'clf__tol': [1e-2, 1e-4, 1e-9],
#             'clf__dual': [True,False]
#         },
        {
            'vect__ngram_range': [(1,1), (1,2)],
            'tfidf__use_idf': (True, False),
            'vect__stop_words': ('english', None)
        }
    ],
    [
        {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__penalty': ('l2', 'l1')
        }
    ],
    [
        {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__learning_rate': (1, 2, 0.5),
            'clf__n_estimators': (10, 50, 100)
        }
    ],
    [
        {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__criterion': ('gini', 'entropy'),
        },
    ],
    [
        {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__criterion': ('gini', 'entropy'),
        }
    ]
]

In [4]:
def runGridSearchCV(parameter_grids, models, model_names):
    start = time.time()
    i = 0
    best_scores = []
    best_params = []
    for model in models: 
        print("Currently training model: ", model_names[i])

        text_clf = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('clf', model)])

        _  = text_clf.fit(twenty_train.data, twenty_train.target)    

        gs_clf = GridSearchCV(text_clf, parameter_grids[i], n_jobs=-1, cv=3,error_score=0.0)
        gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

        best_scores.append(gs_clf.best_score_)
        best_params.append(gs_clf.best_params_)

        print("Time taken: ", time.time()-start)
        print("Best score : ", gs_clf.best_score_)
        print("Best params: ", gs_clf.best_params_)
        print("\n\n")

        i+=1

In [5]:
def runRandomizedSearchCV(parameter_grids, models, model_names, num_iters=100):
    start = time.time()
    i = 0
    best_scores = []
    best_params = []
    for model in models: 
        print("Currently training model: ", model_names[i])

        text_clf = Pipeline([('vect', CountVectorizer()),
                            ('tfidf', TfidfTransformer()),
                            ('clf', model)])

        _  = text_clf.fit(twenty_train.data, twenty_train.target)    

        rs_clf = RandomizedSearchCV(text_clf, parameter_grids[i], n_jobs=-1, cv=3,error_score=0.0,n_iter = num_iters, verbose = 2, random_state=0)
        rs_clf = rs_clf.fit(twenty_train.data, twenty_train.target)

        best_scores.append(rs_clf.best_score_)
        best_params.append(rs_clf.best_params_)

        print("Time taken: ", time.time()-start)
        print("Best score : ", rs_clf.best_score_)
        print("Best params: ", rs_clf.best_params_)
        print("\n\n")

        i+=1

In [21]:
runGridSearchCV(parameter_grids, models, model_names)

TypeError: runGridSearchCV() takes 2 positional arguments but 3 were given

## Finding Hyperparameters For Each Model

### Linear SVC Parameters

In [6]:
linear_svc_rs = [
    {
        'vect__ngram_range': [(1,1),(1,2)],
        'vect__stop_words': ['english',None],
        'tfidf__use_idf': [True,False],
        'clf__C': [0.01, 0.05, 0.1, 0.5, 1.0, 10, 20 , 30, 40, 50, 60, 70, 80, 90, 100],
        'clf__penalty': ['l2','l1'],
        'clf__loss': ['hinge','squared_hinge'],
    }
]

In [36]:
# Narrowed from randomized search
linear_svc_gs = [
    {
        'vect__ngram_range': [(1,1),(1,2)],
        'vect__stop_words': ['english',None],
        
        'tfidf__use_idf': [True,False],
        'clf__C': [75,80,85],
        'clf__penalty': ['l2','l1'],
        'clf__loss': ['hinge','squared_hinge'],
    }
]


In [None]:
runGridSearchCV(linear_svc_gs, [models[0]], [model_names[0]])

In [None]:
runRandomizedSearchCV(linear_svc_rs, [models[0]], [model_names[0]])

Currently training model:  SVM
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:   10.0s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed:  8.8min


In [None]:
runGridSearchCV(linear_svc_pg, [models[0]], [model_names[0]])

### Logistic Regression Parameters

In [None]:
logistic_regression_gs = [
        {
            'vect__ngram_range': [(1,1),(1,2)],
            'vect__stop_words': ['english'],
            'tfidf__use_idf': [True],
            'clf__C': [55,60,65],
            'clf__penalty': ['l2'],
            'clf__solver': ['newton-cg','sag','lbfgs','liblinear','saga'],
        }
]

In [None]:
runGridSearchCV(logistic_regression_gs, [models[1]], [model_names[1]])

In [6]:
logistic_regression_rs = [
        {
            'vect__ngram_range': [(1,1),(1,2)],
            'vect__stop_words': ['english',None],
            'tfidf__use_idf': [True,False],
            'clf__C': [0.01, 0.05, 0.1, 0.5, 1.0, 10, 20 , 30, 40, 50, 60, 70, 80, 90, 100],
            'clf__penalty': ['l2','l1','elasticnet'],
            'clf__solver': ['newton-cg','sag','lbfgs','liblinear','saga'],
        }
]

In [7]:
runRandomizedSearchCV(logistic_regression_rs, [models[1]], [model_names[1]])

Currently training model:  Logistic Regression




Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    9.2s
[Parallel(n_jobs=-1)]: Done 130 tasks      | elapsed: 26.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 180.4min finished


Time taken:  10982.911972284317
Best score :  0.7578221672264451
Best params:  {'vect__stop_words': 'english', 'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'clf__solver': 'saga', 'clf__penalty': 'l2', 'clf__C': 60}





In [None]:
runGridSearchCV(logistic_regression_pg, [models[1]], [model_names[1]])

### Random Forest Parameters

In [None]:
random_forest_rs = [
        {
            'vect__ngram_range': [(1,1),(1,2)],
            'vect__stop_words': ['english',None],
            'tfidf__use_idf': [True,False],
            'clf__bootstrap': [True,False],
            'clf__max_depth' : [10,20,30,40,50,60,70,80,90,100,None],
            'clf__n_estimators' : [100, 200, 400, 700, 800, 1000, 1200, 1400, 1600, 1800, 2000],
#             'clf__C': [0.01, 0.05, 0.1, 0.5, 1.0, 10, 20 , 30, 40, 50, 60, 70, 80, 90, 100],
            'clf__min_samples_split': [2,5,10],
            'clf__max_features': ['auto','sqrt'],
#             'clf__penalty': ['l2','l1','elasticnet'],
#             'clf__solver': ['newton-cg','sag','lbfgs','liblinear','saga'],
        }
]

In [None]:
runRandomizedSearchCV(random_forest_rs, [models[4]], [model_names[4]])

In [None]:
random_forest_gs = [
        {
            'vect__ngram_range': [(1,2)],
            'vect__stop_words': ['english'],
            'tfidf__use_idf': [True],
            'clf__bootstrap': [True,False],
            'clf__max_depth' : [20,50,100,None],
            'clf__n_estimators' : [600, 700, 800],
            'clf__min_samples_split': [2,10],
            'clf__max_features': ['auto','sqrt'],
        }
]

In [None]:
runGridSearchCV(logistic_regression_pg, [models[4]], [model_names[4]])

### Decision Tree

In [None]:
decision_tree_rs = [
        {
            'vect__ngram_range': [(1,1),(1,2)],
            'vect__stop_words': ['english',None],
            'tfidf__use_idf': [True,False],
            'clf__max_features': [None,'auto','sqrt','log2'],
            'clf__max_depth': [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
            'clf__min_impurity_decrease': [0.00005,0.0001,0.0002,0.0005,0.001,0.0015,0.002,0.005,0.01]
        }
]

In [None]:
runRandomizedSearchCV(decision_tree_rs, [models[3]], [model_names[3]])