In [27]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups 

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV


## Creating test + train sets

In [28]:
# Four-topic subset kept around for quicker experiments. It is NOT passed to
# the fetch calls below, so every newsgroup category is currently loaded.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Parts of each post that are removed on load; the same setting is used for
# the train and the test split so both are preprocessed identically.
stripped_parts = ('headers', 'footers', 'quotes')

twenty_train = fetch_20newsgroups(
    subset='train',
    shuffle=True,
    remove=stripped_parts,
)
twenty_test = fetch_20newsgroups(
    subset='test',
    shuffle=True,
    remove=stripped_parts,
)


In [46]:
# model_names, models and parameter_grids are parallel lists: position k in
# each one describes the same classifier. Keep their order in sync.
model_names = ['SVM', 'Logistic Regression', 'AdaBoost', 'Decision Tree', 'Random Forest']

# Unfitted estimators; each becomes the final 'clf' step of the text pipeline.
models = [
    LinearSVC(),
    LogisticRegression(random_state=0),
    AdaBoostClassifier(n_estimators=50, learning_rate=1),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=2, random_state=0)
]

# One entry per model, in the same order as `models`. Each entry is a list of
# dicts in GridSearchCV's `param_grid` format. Keys are prefixed with the name
# of the pipeline step they configure: 'vect__' (CountVectorizer),
# 'tfidf__' (TfidfTransformer) and 'clf__' (the classifier).
parameter_grids = [
    [ # Parameter Grid for Linear SVC.
      # Split into several dicts so that only loss/penalty/dual combinations
      # that LinearSVC accepts are searched.
        { # l1 Case: Dual False, Squared_hinge
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': [True, False],
            'clf__loss': ['squared_hinge'],
            'clf__penalty': ['l1'],
            'clf__C': range(1,100,10),
            'clf__tol': [1e-2, 1e-4, 1e-9],
            'clf__dual': [False]
        },
        { # l2 Hinge Case: Dual True
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': [True, False],
            'clf__loss': ['hinge'],
            'clf__penalty': ['l2'],
            'clf__C': range(1,100,10),
            'clf__tol': [1e-2, 1e-4, 1e-9],
            'clf__dual': [True]
        },
        { # l2 Square Hinged Case
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': [True, False],
            'clf__loss': ['squared_hinge'],
            'clf__penalty': ['l2'],
            'clf__C': range(1,100,10),
            'clf__tol': [1e-2, 1e-4, 1e-9],
            'clf__dual': [True,False]
        }
    ],
    [ # Parameter Grid for Logistic Regression.
      # The penalties are searched in separate dicts because the default
      # 'lbfgs' solver (scikit-learn >= 0.22) rejects penalty='l1'. Since the
      # search runs with error_score=0.0, those failing candidates would be
      # silently scored as 0 instead of being evaluated.
        { # l2 Case: keep the estimator's default solver.
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__penalty': ('l2',)
        },
        { # l1 Case: needs a solver that supports l1; 'saga' does.
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__penalty': ('l1',),
            'clf__solver': ('saga',)
        }
    ],
    [ # Parameter Grid for AdaBoost.
        {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__learning_rate': (1, 2, 0.5),
            'clf__n_estimators': (10, 50, 100)
        }
    ],
    [ # Parameter Grid for Decision Tree.
        {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__criterion': ('gini', 'entropy'),
        },
    ],
    [ # Parameter Grid for Random Forest.
      # Only the split criterion is searched; max_depth stays at the value
      # set on the estimator in `models`.
        {
            'vect__ngram_range': [(1, 1), (1, 2)],
            'tfidf__use_idf': (True, False),
            'clf__criterion': ('gini', 'entropy'),
        }
    ]
]

In [None]:
# Run a 3-fold grid search for every model and record the best
# cross-validation score and the parameter combination that produced it.
# best_scores[k] / best_params[k] correspond to models[k].
best_scores = []
best_params = []

# The three lists are walked in lockstep; fail up front if they have drifted
# apart rather than silently skipping or mismatching a model.
assert len(model_names) == len(models) == len(parameter_grids), \
    "model_names, models and parameter_grids must have the same length"

for name, model, param_grid in zip(model_names, models, parameter_grids):
    print("Currently training model: ", name)

    # Bag-of-words counts -> tf-idf weighting -> classifier. The step names
    # ('vect', 'tfidf', 'clf') are the prefixes used in the parameter grids.
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', model)])

    # The pipeline is intentionally not fitted on its own first:
    # GridSearchCV clones it for every candidate and (with the default
    # refit=True) refits the best one on the full training set, so a
    # separate fit would only add a wasted full-data training pass.
    # error_score=0.0 makes candidates whose fit raises score 0 instead of
    # aborting the whole search.
    gs_clf = GridSearchCV(text_clf, param_grid, n_jobs=-1, cv=3, error_score=0.0)
    gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)

    best_scores.append(gs_clf.best_score_)
    best_params.append(gs_clf.best_params_)

    print("Best score : ", gs_clf.best_score_)
    print("Best params: ", gs_clf.best_params_)
    print("\n\n")


Currently training model:  SVM
