In [1]:
import numpy as np
import time
from sklearn.datasets import fetch_20newsgroups 

from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

from sklearn.linear_model import SGDClassifier 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV,RandomizedSearchCV

In [2]:
# Four-topic subset. It is not passed to the loaders below, so every
# newsgroup in the dataset is fetched.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

# Load the train split, then the test split, with identical settings:
# shuffled, and with remove=('headers', 'footers', 'quotes') on both.
twenty_train, twenty_test = (
    fetch_20newsgroups(subset=split, shuffle=True,
                       remove=('headers', 'footers', 'quotes'))
    for split in ('train', 'test')
)


In [3]:
# Ensemble wrappers under evaluation (bagging may be added here later).
# The names list is index-aligned with the models list.
ensembleModels_names = ['AdaBoost']
ensembleModels = [AdaBoostClassifier(random_state=0)]

# Candidate base learners, with names index-aligned to the estimators.
# TODO: replace these settings with the optimal parameters once the grid
# search has finished.
baseEstimators_names = [
    'SVM',
    'Logistic Regression',
    'Random Forest',
    'Decision Tree',
]
baseEstimators = [
    LinearSVC(
        C=80,
        penalty='l2',
        loss='squared_hinge',
        random_state=0,
    ),
    LogisticRegression(random_state=0),
    RandomForestClassifier(max_depth=2, random_state=0),
    DecisionTreeClassifier(random_state=0),
]

In [4]:
def runAdaBoost(parameter_grids, models, model_names):
    """Grid-search a bag-of-words text pipeline around each given model.

    Every model becomes the final ``clf`` step of a
    ``CountVectorizer -> TfidfTransformer -> clf`` pipeline, which is then
    tuned with a 3-fold ``GridSearchCV`` on the module-level ``twenty_train``
    data. Progress, the best score and the best parameters are printed for
    each model.

    Parameters
    ----------
    parameter_grids : sequence
        ``parameter_grids[i]`` is the ``param_grid`` used for ``models[i]``.
        Keys address pipeline steps (``vect__``, ``tfidf__``, ``clf__``).
    models : iterable of estimators
        Classifiers to wrap in the pipeline, one grid search per entry.
    model_names : sequence of str
        ``model_names[i]`` is the label printed for ``models[i]``.

    Returns
    -------
    tuple of (list, list)
        ``(best_scores, best_params)``, each with one entry per model, in the
        order the models were given.
    """
    # Started once, so the "Time taken" printed below is cumulative across
    # all models processed so far, not per model.
    start = time.time()
    best_scores = []
    best_params = []

    for idx, model in enumerate(models):
        print("Currently training model: ", model_names[idx])

        text_clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', model)])

        # No standalone text_clf.fit() here: GridSearchCV clones the pipeline
        # for every candidate and refits the best one itself, so a fit done
        # beforehand is thrown away and only costs one extra full training run.
        # error_score=0.0 scores a failing candidate as 0.0 instead of raising.
        gs_clf = GridSearchCV(text_clf, parameter_grids[idx], n_jobs=-1, cv=3,
                              error_score=0.0)
        gs_clf.fit(twenty_train.data, twenty_train.target)

        best_scores.append(gs_clf.best_score_)
        best_params.append(gs_clf.best_params_)

        print("Time taken: ", time.time() - start)
        print("Best score : ", gs_clf.best_score_)
        print("Best params: ", gs_clf.best_params_)
        print("\n\n")

    # Hand the collected results back so callers can compare models
    # programmatically instead of reading them off the printed log.
    return best_scores, best_params
    

### Linear SVC AdaBoost

In [5]:
# Search space for AdaBoost boosted over the LinearSVC base learner.
# TODO: tune the vect/tfidf settings (stop words, n-gram range) per model.
Ada_linear_svc = [{
    # --- text feature extraction ---
    "vect__ngram_range": [(1, 2)],
    "vect__stop_words": ["english"],
    "tfidf__use_idf": [True],

    # --- base learner: change the index to try another baseEstimators entry ---
    "clf__base_estimator": [baseEstimators[0]],

    # --- AdaBoost parameters ---
    # SAMME is required for this base learner: the SAMME.R variant needs
    # predict_proba, which LinearSVC does not provide. Original author's
    # caveat: SVC is a strong learner, so boosting it may overfit.
    "clf__algorithm": ["SAMME"],
    "clf__n_estimators": [50, 100],
    "clf__learning_rate": [0.01, 0.05, 0.1, 0.3, 1],
}]

In [None]:
# Grid-search the first ensemble model (AdaBoost) with the LinearSVC grid.
runAdaBoost(
    parameter_grids=Ada_linear_svc,
    models=[ensembleModels[0]],
    model_names=[ensembleModels_names[0]],
)

Currently training model:  AdaBoost


### Logistic Regression AdaBoost

In [None]:
# Search space for AdaBoost boosted over the LogisticRegression base learner.
# TODO: tune the vect/tfidf settings (stop words, n-gram range) per model.
Ada_Logistic_Reg = [{
    # --- text feature extraction ---
    "vect__ngram_range": [(1, 2)],
    "vect__stop_words": ["english"],
    "tfidf__use_idf": [True],

    # --- base learner: change the index to try another baseEstimators entry ---
    "clf__base_estimator": [baseEstimators[1]],

    # --- AdaBoost parameters (no 'clf__algorithm' key, so the default is used) ---
    "clf__n_estimators": [50, 100],
    "clf__learning_rate": [0.01, 0.05, 0.1, 0.3, 1],
}]