In [72]:
import numpy as np

from sklearn.datasets import fetch_20newsgroups 

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV


## Creating test + train sets

In [66]:
categories = ['alt.atheism', 'soc.religion.christian', 
              'comp.graphics', 'sci.med']

# currently looking at all categories 
twenty_train = fetch_20newsgroups(subset='train', shuffle=True, remove=('headers', 'footers', 'quotes'))
twenty_test = fetch_20newsgroups(subset='test', shuffle=True, remove=('headers', 'footers', 'quotes'))

In [70]:
model_names = ['SVM', 'Logistic Regression', 'AdaBoost', 'Decision Tree', 'Random Forest']

# just need to add randomforest 
models = [
    LinearSVC(),
    LogisticRegression(random_state=0),
    AdaBoostClassifier(n_estimators=50, learning_rate=1),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(max_depth=2, random_state=0)
]

parameters = [
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__loss': ('hinge', 'squared_hinge')
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__penalty': ('l2', 'l1')
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__learning_rate': (1, 2, 0.5),
        'clf__n_estimators': (10, 50, 100)
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__criterion': ('gini', 'entropy'),
    },
    {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tfidf__use_idf': (True, False),
        'clf__criterion': ('gini', 'entropy'),
    }    
]

In [73]:
i = 0
best_scores = []
best_params = []
for model in models: 
    print("Currently training model: ", model_names[i])
    
    text_clf = Pipeline([('vect', CountVectorizer()),
                        ('tfidf', TfidfTransformer()),
                        ('clf', model)])

    _  = text_clf.fit(twenty_train.data, twenty_train.target)    
    
    gs_clf = GridSearchCV(text_clf, parameters[i], n_jobs=-1, cv=3)
    gs_clf = gs_clf.fit(twenty_train.data, twenty_train.target)
    
    best_scores.append(gs_clf.best_score_)
    best_params.append(gs_clf.best_params_)
    
    print("Best score : ", gs_clf.best_score_)
    print("Best params: ", gs_clf.best_params_)
    print("\n\n")
    
    i+=1 


Currently training model:  SVM




Best score :  0.9209828531023511
Best params:  {'clf__loss': 'hinge', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}



Currently training model:  Logistic Regression




Best score :  0.8830652289199222
Best params:  {'clf__penalty': 'l2', 'tfidf__use_idf': True, 'vect__ngram_range': (1, 2)}



Currently training model:  AdaBoost
Best score :  0.555506452183136
Best params:  {'clf__learning_rate': 0.5, 'clf__n_estimators': 100, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}



Currently training model:  Decision Tree
Best score :  0.626922397030228
Best params:  {'clf__criterion': 'gini', 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}



