In [1]:
######################################
########## DATA PREPARATION ##########
######################################

import numpy as np
import matplotlib.pyplot as plt

"""Script to download the 20 newsgroups text classification set"""
from sklearn.datasets import fetch_20newsgroups

########## TRAIN DATA ##############
# Load the training split with headers, footers and quoted replies removed from
# every post. `remove` is given as a tuple because that is the documented type
# of this parameter; a list is not guaranteed to be accepted.
# To restrict the run to the tutorial subset, additionally pass
# categories=['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med'].
twenty_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

########## TEST DATA ##############
# Same stripping as the training split so both splits are preprocessed alike.
twenty_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

"""Define count vectorizer and tf-idf transformations"""

###### Count Vectorizer #######
from sklearn.feature_extraction.text import CountVectorizer
### Words removed by the count vectorizer before counting ####
# Kept as an explicit list (rather than a library default) so the exact
# vocabulary filter is visible in the notebook. Order is irrelevant to the
# vectorizer but is preserved here.
stopwords = [
    # personal pronouns and possessives
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself",
    "it", "its", "itself", "they", "them", "their", "theirs", "themselves",
    # interrogatives and demonstratives
    "what", "which", "who", "whom", "this", "that", "these", "those",
    # auxiliary verbs
    "am", "is", "are", "was", "were", "be", "been", "being",
    "have", "has", "had", "having", "do", "does", "did", "doing",
    # articles and conjunctions
    "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while",
    # prepositions
    "of", "at", "by", "for", "with", "about", "against", "between", "into",
    "through", "during", "before", "after", "above", "below", "to", "from",
    "up", "down", "in", "out", "on", "off", "over", "under",
    # adverbs and quantifiers
    "again", "further", "then", "once", "here", "there", "when", "where",
    "why", "how", "all", "any", "both", "each", "few", "more", "most",
    "other", "some", "such", "no", "nor", "not", "only", "own", "same",
    "so", "than", "too", "very",
    # contraction fragments and remaining function words
    "s", "t", "can", "will", "just", "don", "should", "now",
]
#### Word-level bag-of-words counts with the custom stop-word list above.
#### token_pattern only matches runs of characters that are neither digits,
#### non-word characters, underscores nor slashes, i.e. letter-only tokens.
count_vect = CountVectorizer(
    analyzer='word',
    stop_words=stopwords,
    token_pattern=r'\b[^\d\W_/]+\b',
)

########## TFIDF ############
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weights the raw counts; idf weighting and idf smoothing are both
# requested explicitly rather than left implicit.
tfidf_transformer = TfidfTransformer(
    smooth_idf=True,
    use_idf=True,
)

In [None]:
########################################
### Random Forest: best n estimators ###
########################################

# packages
import os  
from sklearn.pipeline import Pipeline  
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Fixed seed shared by the forest and the search so that cross-validation
# scores, best parameters and test accuracies are repeatable between runs.
RANDOM_SEED = 42

# Forest sizes evaluated for every model variant below.
N_ESTIMATORS_GRID = [100, 200, 300, 400, 500, 600]

randforest_clf = RandomForestClassifier(random_state=RANDOM_SEED)


def _fit_randforest_search(criterion, min_samples_split, label):
    """Tune ``n_estimators`` for one random-forest variant and report the result.

    Builds a vect -> tfidf -> clf pipeline, runs a 5-fold cross-validated
    RandomizedSearchCV over ``N_ESTIMATORS_GRID`` on the training split, prints
    the best parameters, the best cross-validation score and the accuracy of
    the refitted best estimator on the test split.

    Parameters
    ----------
    criterion : str
        Split criterion handed to the classifier ('gini' or 'entropy').
    min_samples_split : int
        Value for the classifier's ``min_samples_split`` parameter.
    label : str
        Criterion name exactly as it should appear in the printed messages.

    Returns
    -------
    tuple
        ``(params, pipeline, search, fitted_search)``. ``fitted_search`` is the
        object returned by ``search.fit``.
    """
    params = {
        'clf__criterion': [criterion],
        'clf__min_samples_leaf': [1],
        'clf__min_samples_split': [min_samples_split],
        'clf__n_estimators': list(N_ESTIMATORS_GRID),
    }
    pipeline = Pipeline([('vect', count_vect), ('tfidf', tfidf_transformer), ('clf', randforest_clf)])
    # Only n_estimators has more than one value, so the number of distinct
    # candidates equals len(N_ESTIMATORS_GRID). Requesting exactly that many
    # iterations evaluates every candidate without asking for more iterations
    # than there are candidates (there is no advantage to randomized search here).
    search = RandomizedSearchCV(
        pipeline,
        params,
        n_iter=len(N_ESTIMATORS_GRID),
        cv=5,  # 5-fold cross-val
        n_jobs=-1,
        random_state=RANDOM_SEED,
    )
    fitted_search = search.fit(twenty_train.data, twenty_train.target)

    setting = "{} criterion, min sample split = {}".format(label, min_samples_split)
    print("Best estimator with " + setting + " (randomized search): " + str(fitted_search.best_params_))
    print("Best score with " + setting + " (randomized search):" + str(fitted_search.best_score_))
    print("Test accuracy of Random Forest ({}, min sample split = {}) with best params: ".format(label, min_samples_split)
          + str(fitted_search.best_estimator_.score(twenty_test.data, twenty_test.target)))
    return params, pipeline, search, fitted_search


# Model 1: Gini Criterion, Min Samples Split = 2
(randforest_params_1, randforest_pipeline_1,
 randforest_CV_1, search_randforest_CV_1) = _fit_randforest_search('gini', 2, 'Gini')

# Model 2: Gini Criterion, Min Samples Split = 3
(randforest_params_2, randforest_pipeline_2,
 randforest_CV_2, search_randforest_CV_2) = _fit_randforest_search('gini', 3, 'Gini')

# Model 3: Entropy Criterion, Min Samples Split = 2
(randforest_params_3, randforest_pipeline_3,
 randforest_CV_3, search_randforest_CV_3) = _fit_randforest_search('entropy', 2, 'entropy')

# Model 4: Entropy Criterion, Min Samples Split = 3
(randforest_params_4, randforest_pipeline_4,
 randforest_CV_4, search_randforest_CV_4) = _fit_randforest_search('entropy', 3, 'Entropy')



Best estimator with Gini criterion, min sample split = 2 (randomized search): {'clf__n_estimators': 500, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 1, 'clf__criterion': 'gini'}
Best score with Gini criterion, min sample split = 2 (randomized search):0.679335336750928
Test accuracy of Random Forest (Gini, min sample split = 2) with best params: 0.6254646840148699




Best estimator with Gini criterion, min sample split = 3 (randomized search): {'clf__n_estimators': 500, 'clf__min_samples_split': 3, 'clf__min_samples_leaf': 1, 'clf__criterion': 'gini'}
Best score with Gini criterion, min sample split = 3 (randomized search):0.6799540392434152
Test accuracy of Random Forest (Gini, min sample split = 3) with best params: 0.6330323951141795




Best estimator with entropy criterion, min sample split = 2 (randomized search): {'clf__n_estimators': 600, 'clf__min_samples_split': 2, 'clf__min_samples_leaf': 1, 'clf__criterion': 'entropy'}
Best score with entropy criterion, min sample split = 2 (randomized search):0.6225030935124625
Test accuracy of Random Forest (entropy, min sample split = 2) with best params: 0.5770047796070101




In [None]:
##############################################
### Random Forest: best n estimators graph ### 
##############################################

# Create the output directory if needed; exist_ok avoids the separate
# exists-then-create check.
os.makedirs('random_forest', exist_ok=True)

# Fitted searches paired with the legend label used for each of them.
_randforest_searches = [
    (search_randforest_CV_1, 'Gini, min sample split = 2'),
    (search_randforest_CV_2, 'Gini, min sample split = 3'),
    (search_randforest_CV_3, 'Entropy, min sample split = 2'),
    (search_randforest_CV_4, 'Entropy, min sample split = 3'),
]


def _plot_metric_over_estimators(searches, metric, ylabel, title, out_path):
    """Plot one cv_results_ metric against n_estimators for several searches.

    Parameters
    ----------
    searches : list of (fitted search, str)
        Fitted search objects and the legend label for each.
    metric : str
        Key into ``cv_results_`` to plot on the y axis.
    ylabel, title : str
        Axis label and figure title.
    out_path : str
        File the figure is saved to before it is shown.
    """
    fig, ax = plt.subplots()
    for search, label in searches:
        results = search.cv_results_
        n_estimators = list(results.get('param_clf__n_estimators'))
        values = results.get(metric)
        # A randomized search records candidates in the order they were
        # sampled, which need not be ascending. Sort by n_estimators so each
        # line runs left to right instead of zig-zagging.
        order = sorted(range(len(n_estimators)), key=n_estimators.__getitem__)
        ax.plot([n_estimators[i] for i in order], [values[i] for i in order], label=label)
    ax.set_xlabel("n estimators")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.legend()
    fig.savefig(out_path)
    plt.show()


_plot_metric_over_estimators(
    _randforest_searches,
    'mean_test_score',
    "Mean accuracy",
    "Random Forest: Accuracy over # of estimators (randomized search)",
    "random_forest/random_forest_estimators_accuracy.png",
)

_plot_metric_over_estimators(
    _randforest_searches,
    'mean_fit_time',
    "Mean Fit Time (Seconds)",
    "Random Forest: Mean Fit Time over # of estimators (randomized search)",
    "random_forest/random_forest_estimators_time.png",
)
