In [1]:
######################################
########## DATA PREPARATION ##########
######################################

import numpy as np
import matplotlib.pyplot as plt

"""Script to download the 20 newsgroups text classification set"""
from sklearn.datasets import fetch_20newsgroups

########## TRAIN DATA ##############
# Parts of each post that are stripped before vectorizing. fetch_20newsgroups
# documents `remove` as a tuple, so a tuple (not a list) is passed; the same
# value is shared by both subsets so train and test are cleaned identically.
removed_parts = ('headers', 'footers', 'quotes')

# categories = ['alt.atheism' , 'soc.religion.christian' , 'comp.graphics' , 'sci.med'] # to match tutorial
twenty_train = fetch_20newsgroups(subset='train', remove=removed_parts)
# twenty_train = fetch_20newsgroups(subset = 'train', categories=categories)

########## TEST DATA ##############
twenty_test = fetch_20newsgroups(subset='test', remove=removed_parts)

"""Define count vectorizer and tf-idf transformations"""

###### Count Vectorizer #######
from sklearn.feature_extraction.text import CountVectorizer
# English stop words dropped from the vocabulary. Written as a
# whitespace-separated block and split into a list of individual words.
stopwords = """
i me my myself we our ours ourselves you your yours yourself yourselves
he him his himself she her hers herself it its itself
they them their theirs themselves
what which who whom this that these those
am is are was were be been being have has had having do does did doing
a an the and but if or because as until while
of at by for with about against between into through during before after
above below to from up down in out on off over under
again further then once here there when where why how
all any both each few more most other some such
no nor not only own same so than too very
s t can will just don should now
""".split()

# Word-level bag-of-words counter: the token pattern keeps only runs of
# letters (no digits, underscores or slashes) and the stop words above are
# filtered out.
count_vect = CountVectorizer(
    analyzer='word',
    stop_words=stopwords,
    token_pattern=r'\b[^\d\W_/]+\b',
)

########## TFIDF ############
from sklearn.feature_extraction.text import TfidfTransformer
# TF-IDF re-weighting of the raw counts, with idf weighting and idf smoothing
# both switched on explicitly.
tfidf_transformer = TfidfTransformer(
    smooth_idf=True,
    use_idf=True,
)

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [0]:
####################################################
########## Decision Tree: rough estimates ##########
####################################################

# packages:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# function to roughly test parameters
def rough_param_estimate(clf__min_samples_leaf, clf__min_samples_split, clf__max_depth, n_jobs=-1):
  """Run a rough randomized search for each split criterion and print a summary.

  For 'gini' and then 'entropy', a vect -> tfidf -> decision-tree pipeline
  (built around the notebook-level ``count_vect`` and ``tfidf_transformer``)
  is tuned with a 5-fold RandomizedSearchCV on ``twenty_train``. After each
  search the best parameters, the best cross-validated score and the mean fit
  time of every sampled candidate are printed.

  Parameters
  ----------
  clf__min_samples_leaf, clf__min_samples_split, clf__max_depth :
      Candidate values handed to the search for the pipeline parameter of
      the same name.
  n_jobs : int, default=-1
      Number of parallel jobs used by BOTH searches. The two criteria used
      to run with different settings (-1 vs 4), which made the printed fit
      times incomparable; they now always share one value.

  Returns
  -------
  None
      Results are only printed.
  """
  for criterion in ('gini', 'entropy'):
    label = criterion.capitalize()  # 'Gini' / 'Entropy', as used in the printed summary

    param_distributions = {
      'clf__criterion': [criterion],
      'clf__min_samples_leaf': clf__min_samples_leaf,    # sklearn default = 1
      'clf__min_samples_split': clf__min_samples_split,  # sklearn default = 2
      'clf__max_depth': clf__max_depth,                  # sklearn default = None
    }
    pipeline = Pipeline([
      ('vect', count_vect),
      ('tfidf', tfidf_transformer),
      ('clf', DecisionTreeClassifier()),
    ])

    search = RandomizedSearchCV(pipeline, param_distributions, cv=5, n_jobs=n_jobs)
    search = search.fit(twenty_train.data, twenty_train.target)

    print("Best estimator with " + label + " criterion (randomized search): " + str(search.best_params_))
    print("Best score with " + label + " criterion (randomized search):" + str(search.best_score_))
    print("Mean Fit Time with " + label + " criterion (randomized search):" + str(search.cv_results_.get('mean_fit_time')))
# Candidate values for the rough search: the leaf size is fixed at 1 while
# the split threshold and the depth cap each get three options.
clf__min_samples_leaf = [1]
clf__min_samples_split = [2, 3, 4]
clf__max_depth = [250, 500, 750]

rough_param_estimate(
  clf__min_samples_leaf=clf__min_samples_leaf,
  clf__min_samples_split=clf__min_samples_split,
  clf__max_depth=clf__max_depth,
)

In [0]:
#####################################################
########## Decision Tree: model comparison ##########
#####################################################

# packages:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
import os

decisiontree_clf = DecisionTreeClassifier()


def _build_depth_search(criterion, min_samples_split):
  """Build the parameter grid, pipeline and 5-fold randomized search for one model.

  Only ``clf__max_depth`` varies (250, 500, 750, 1000); the criterion and
  ``min_samples_split`` are pinned to the given values and
  ``min_samples_leaf`` is pinned to 1.

  Returns a ``(params, pipeline, search)`` tuple; nothing is fitted yet.
  """
  params = {'clf__criterion': [criterion], 'clf__min_samples_leaf': [1],
            'clf__min_samples_split': [min_samples_split],
            'clf__max_depth': [250, 500, 750, 1000]}
  pipeline = Pipeline([('vect', count_vect), ('tfidf', tfidf_transformer), ('clf', decisiontree_clf)])
  search = RandomizedSearchCV(pipeline, params, cv=5, n_jobs=-1)
  return params, pipeline, search


def _fit_and_report(search, criterion, min_samples_split):
  """Fit ``search`` on the training posts, print its summary and return the fitted search.

  The printed labels are derived from the same ``criterion`` and
  ``min_samples_split`` that were used to build the search, so the report
  cannot drift away from the parameters that were actually searched.
  """
  fitted = search.fit(twenty_train.data, twenty_train.target)
  label = criterion.capitalize()
  split = str(min_samples_split)
  print("Best estimator with " + label + " criterion, min sample split = " + split + " (randomized search): " + str(fitted.best_params_))
  print("Best score with " + label + " criterion, min sample split = " + split + " (randomized search):" + str(fitted.best_score_))
  print("Test accuracy of Decision Tree (" + label + ", min sample split = " + split + ") with best params: " + str(fitted.best_estimator_.score(twenty_test.data, twenty_test.target)))
  return fitted


# Model 1: Gini, min samples split = 2
decisiontree_params_gini_1, decision_tree_pipeline_gini_1, clf_gini_1 = _build_depth_search('gini', 2)
search_gini_1 = _fit_and_report(clf_gini_1, 'gini', 2)

# Model 2: Entropy, min samples split = 2
decisiontree_params_entropy_1, decision_tree_pipeline_entropy_1, clf_entropy_1 = _build_depth_search('entropy', 2)
search_entropy_1 = _fit_and_report(clf_entropy_1, 'entropy', 2)

# Model 3: Gini, min samples split = 3
decisiontree_params_gini_2, decision_tree_pipeline_gini_2, clf_gini_2 = _build_depth_search('gini', 3)
search_gini_2 = _fit_and_report(clf_gini_2, 'gini', 3)

# Model 4: Entropy, min samples split = 3
# (this model previously searched min_samples_split = 2 while being reported
# as 3, which made it a duplicate of Model 2)
decisiontree_params_entropy_2, decision_tree_pipeline_entropy_2, clf_entropy_2 = _build_depth_search('entropy', 3)
search_entropy_2 = _fit_and_report(clf_entropy_2, 'entropy', 3)


Best estimator with Gini criterion (randomized search): {'clf__min_samples_split': 7, 'clf__min_samples_leaf': 1, 'clf__criterion': 'gini'}
Best score with Gini criterion (randomized search):0.4658834524408145
Mean Fit Time with Gini criterion (randomized search):[40.56883984 37.67145109 34.87990828 38.43001657 35.11679025 34.54104257
 37.10123801 36.06419778 35.72085252 29.14020457]
Best estimator with Entropy criterion (randomized search): {'clf__min_samples_split': 6, 'clf__min_samples_leaf': 1, 'clf__criterion': 'entropy'}
Best score with Entropy criterion (randomized search):0.35734592508633684
Mean Fit Time with Entropy criterion (randomized search):[66.62211881 63.0900722  69.01786757 58.4111279  56.44176087 64.15077138
 60.74244342 57.4838346  61.34027867 47.6233387 ]


In [0]:
###########################################################
########## Decision Tree: model comparison graph ##########
###########################################################

# Output folder for the figures; exist_ok makes re-running the cell safe.
os.makedirs('decision_tree', exist_ok=True)

# Fitted searches from the model-comparison cell, paired with their legend labels.
model_searches = [
  (search_gini_1, 'Gini, min sample split = 2'),
  (search_entropy_1, 'Entropy, min sample split = 2'),
  (search_gini_2, 'Gini, min sample split = 3'),
  (search_entropy_2, 'Entropy, min sample split = 3'),
]


def _plot_metric_over_depth(metric, ylabel, title, out_path):
  """Draw one line per model of ``cv_results_[metric]`` against max tree depth.

  The randomized search evaluates candidates in sampled (random) order, so
  the points are sorted by depth before plotting; otherwise the line would
  zig-zag between depths. The figure is saved to ``out_path`` and shown.
  """
  for search, label in model_searches:
    results = search.cv_results_
    depths = list(results.get('param_clf__max_depth'))
    values = results.get(metric)
    # indices of the candidates ordered by increasing max depth
    order = sorted(range(len(depths)), key=depths.__getitem__)
    plt.plot([depths[i] for i in order], [values[i] for i in order], label=label)

  plt.xlabel("Max Tree Depth")
  plt.ylabel(ylabel)
  plt.title(title)
  plt.legend()
  plt.savefig(out_path)
  plt.show()


_plot_metric_over_depth('mean_test_score', "Mean accuracy",
                        "Decision Tree: Accuracy over Max Tree Depth (randomized search)",
                        "decision_tree/decision_tree_accuracy.png")

_plot_metric_over_depth('mean_fit_time', "Mean Fit Time (Seconds)",
                        "Decision Tree: Mean Fit Time over Max Tree Depth (randomized search)",
                        "decision_tree/decision_tree_time.png")
