# Importing the tools

In [11]:
import numpy as np
import pandas as pd
import os
import sklearn
import sklearn.linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import grid_search
from sklearn import tree
from sklearn import model_selection
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
def segmentWords(s): 
    return s.split()

def readFile(fileName):
    # Function for reading file
    # input: filename as string
    # output: contents of file as list containing single words
    contents = []
    f = open(fileName)
    for line in f:
        contents.append(line)
    f.close()
    result = segmentWords('\n'.join(contents))
    return result

#### Create a Dataframe containing the counts of each word in a file

In [3]:
d = []

for c in os.listdir("data_training/train"):
    directory = "data_training/train/" + c
    for f in os.listdir(directory):
        words = readFile(directory + "/" + f)
        e = {x:words.count(x) for x in words}
        e['__FileID__'] = f
        e['__CLASS__'] = 1 if c[:3] == 'pos' else 0
        d.append(e)

**Create a dataframe from d - make sure to fill all the nan values with zeros.**


In [4]:
df = pd.DataFrame(d).fillna(0)

In [71]:
print(df.shape)
df.head()

(1400, 42776)


Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
df.describe()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
count,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,...,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0,1400.0
mean,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.000714,0.003571,0.008571,...,0.000714,0.001429,0.000714,0.000714,0.006429,0.002857,0.001429,0.000714,0.000714,0.001429
std,0.026726,0.026726,0.026726,0.026726,0.026726,0.026726,0.026726,0.026726,0.080127,0.272517,...,0.026726,0.053452,0.026726,0.026726,0.128058,0.065426,0.037783,0.026726,0.026726,0.037783
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,10.0,...,1.0,2.0,1.0,1.0,3.0,2.0,1.0,1.0,1.0,1.0


In [7]:
print(df.__FileID__.head())
df.__CLASS__.tail()

0    cv676_22202.txt
1     cv155_7845.txt
2    cv465_23401.txt
3    cv398_17047.txt
4    cv206_15893.txt
Name: __FileID__, dtype: object


1395    1
1396    1
1397    1
1398    1
1399    1
Name: __CLASS__, dtype: int64

#### Split data into training and validation set 

In [8]:
features = df.drop(['__FileID__', '__CLASS__'], axis=1)
labels = df.__CLASS__
X_train, X_val, Y_train, Y_val = sklearn.model_selection.train_test_split(features, labels, test_size=0.2, 
                                                                         random_state=42)

In [9]:
# this step was done above before splitting data into training and validation set
print(X_train.shape, X_val.shape, Y_train.shape, Y_val.shape)

(1120, 42774) (280, 42774) (1120,) (280,)


In [12]:
logreg = sklearn.linear_model.LogisticRegression()
logreg.fit(X_train, Y_train)
print("Train acc:", logreg.score(X_train, Y_train), "\nValidation acc:", 
      logreg.score(X_val, Y_val))

Train acc: 1.0 
Validation acc: 0.839285714286


## Changing Parameters

In [13]:
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
# gamma parameter which inversely controls the standard deviation of our kernel's distribution
penalty = ['l1', 'l2']
# initialize the dictionary of parameters
param_grid = {'C': Cs, 'penalty' : penalty}
# initialize the search using input as nfold cross validation
lr = sklearn.linear_model.LogisticRegression()
search = grid_search.GridSearchCV(lr, param_grid)
# fit the search object to our input training data
search.fit(X_train, Y_train)
# output the best parameters
search.best_params_

{'C': 1000, 'penalty': 'l1'}

In [16]:
logreg2 = sklearn.linear_model.LogisticRegression(penalty='l1', C=1000)
logreg2.fit(X_train, Y_train)
print("Training acc:", logreg2.score(X_train, Y_train), "\nValidation acc:", 
      logreg2.score(X_val, Y_val))

Training acc: 1.0 
Validation acc: 0.885714285714


In [18]:
# A recursive feature elimination approach
from sklearn.feature_selection import RFE

# A new logistic regression model with parameters from above and a feature selector
lr2 = sklearn.linear_model.LogisticRegression(C=1000, penalty='l1')
selector = RFE(lr2, step=10000, n_features_to_select=41000)

In [19]:
# fit RFE selector to training set
selector.fit(X_train, Y_train)
lr2 = selector.estimator_

In [20]:
# figure out which columns to drop
columns = features.columns
feature_mask = selector.support_
columns_to_drop = [columns[i] for i in range(columns.size) if not feature_mask[i]]

In [22]:
# Create print function to print scores of estimators
def print_results(estimator, X, y, leadingString=''):
    print(leadingString, estimator.score(X, y))

In [23]:
# show training and testing accuracies after feature reduction
print_results(lr2, X_train.drop(columns_to_drop, axis=1), Y_train, "Training results: ")
print_results(lr2, X_val.drop(columns_to_drop, axis=1), Y_val, "Testing results: ")

Training results:  1.0
Testing results:  0.892857142857


# Single Decision Tree

In [25]:
dt_clf = tree.DecisionTreeClassifier(criterion='entropy')
dt_clf.fit(X_train, Y_train)
print("Training acc:", dt_clf.score(X_train, Y_train), "\nValidation acc:", dt_clf.score(X_val, Y_val))

Training acc: 1.0 
Validation acc: 0.635714285714


In [26]:
parameters = {"max_depth": [None, 10, 100, 1000, 10000],
              "min_samples_split": [5, 10, 50, 100, 500, 1000],
              "min_samples_leaf": [10, 100, 1000, 10000],
              "max_leaf_nodes": [None, 10, 100, 1000, 10000],
              }
gridsearch = GridSearchCV(dt_clf, parameters)
gridsearch.fit(X_train, Y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_split': [5, 10, 50, 100, 500, 1000], 'min_samples_leaf': [10, 100, 1000, 10000], 'max_depth': [None, 10, 100, 1000, 10000], 'max_leaf_nodes': [None, 10, 100, 1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
# show the best parameters of the gridsearchCV regularized decision tree
gridsearch.best_params_

{'max_depth': None,
 'max_leaf_nodes': 10000,
 'min_samples_leaf': 10,
 'min_samples_split': 100}

In [29]:
# use parameters from gridsearchCV above in new decision tree model
reg_tree = gridsearch.best_estimator_

print("Training acc:", reg_tree.score(X_train, Y_train), "\nValidation acc:",
      reg_tree.score(X_val, Y_val))

Training acc: 0.757142857143 
Validation acc: 0.621428571429


In [32]:
# create decision tree model with manually searched parameters (best in class)
reg_tree2 = tree.DecisionTreeClassifier(criterion = "entropy", max_depth = None, max_leaf_nodes = 125, min_samples_leaf = 2, min_samples_split = 60)
reg_tree2.fit(X_train, Y_train)

# print model training and test accuracies
print_results(reg_tree2, X_train, Y_train, "Training score: ")
print_results(reg_tree2, X_val, Y_val, "Testing score: ")

Training score:  0.858035714286
Testing score:  0.692857142857


**Add a Boost to your Decision Tree Classifer using AdaBoost( )**

In [33]:
# train an AdaBoost classifier using the tuned random forest model above as the base estimator
boost_clf = AdaBoostClassifier(base_estimator=reg_tree2, n_estimators=100)
boost_clf.fit(X_train, Y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=125,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=2, min_samples_split=60,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1.0, n_estimators=100, random_state=None)

In [34]:
print("Training acc:", boost_clf.score(X_train, Y_train), "\nValidation acc:",
      boost_clf.score(X_val, Y_val))

Training acc: 1.0 
Validation acc: 0.739285714286


In [40]:
rfc = RandomForestClassifier(criterion = 'entropy', n_estimators=100)
rfc.fit(X_train, Y_train)
print("Training acc:", rfc.score(X_train, Y_train), "\nValidation acc:",
      rfc.score(X_val, Y_val))

Training acc: 1.0 
Validation acc: 0.821428571429


## Changing Parameters

parameters = {"min_samples_split": [2, 5, 10],
              "max_depth": [None, 2, 5, 10],
              "min_samples_leaf": [1, 5, 10],
              "max_leaf_nodes": [None, 5, 10, 20],
              }
gridsearch2 = GridSearchCV(rfc, parameters)
gridsearch2.fit(X_train, Y_train)

In [42]:
gridsearch2.best_params_

{'max_leaf_nodes': None, 'min_samples_leaf': 10, 'min_samples_split': 100}

In [63]:
reg_forest = gridsearch2.best_estimator_
reg_forest.fit(X_train, Y_train)
print("Training acc:", reg_forest.score(X_train, Y_train), "\nValidation acc:",
      reg_forest.score(X_val, Y_val))

Training acc: 0.935714285714 
Validation acc: 0.817857142857


**Add a Boost to Random Forest model with sklearn's AdaBoostClassifier( )**

In [65]:
boost_reg2 = AdaBoostClassifier(base_estimator=reg_forest)
boost_reg2.fit(X_train, Y_train)
print("Training acc:", boost_reg2.score(X_train, Y_train), "\nValidation acc:",
      boost_reg2.score(X_val, Y_val))

Training acc: 1.0 
Validation acc: 0.882142857143
