In [1]:
import numpy as np
import pandas as pd
import os
import sklearn

# Reading Data

In [2]:
def segmentWords(s): 
    return s.split()

def readFile(fileName):
    contents = []
    f = open(fileName)
    for line in f:
        contents.append(line)
    f.close()
    result = segmentWords('\n'.join(contents))
    return result

#### Create a Dataframe containing the counts of each word in a file

In [3]:
d = []

for c in os.listdir("data_training"):
    directory = "data_training/" + c
    for file in os.listdir(directory):
        words = readFile(directory + "/" + file)
        e = {x:words.count(x) for x in words}
        e['__FileID__'] = str(file)[2:-4]
        e['__CLASS__'] = str(c)
        d.append(e)

In [4]:
df = pd.DataFrame(d)

In [5]:
df.shape[1]

45673

In [6]:
df = df.fillna(0)
df.head()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Split data into training and validation set 

In [7]:
train_df = df.sample(frac = 0.8, replace = False)
train_df.head()
train_df.shape[0]
train_df.head()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
1051,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
513,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1086,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
921,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
val_df = df.drop(train_df.index)
val_df.shape[0]
val_df.head()

Unnamed: 0,,earth,goodies,if,ripley,suspend,they,white,,,...,zukovsky,zundel,zurg's,zweibel,zwick,zwick's,zwigoff's,zycie,zycie',|
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Split the dataframe for both training and validation data into x and y dataframes - where y contains the labels and x contains the words


In [9]:
train_x = train_df.drop('__CLASS__', axis=1)
train_y = train_df.drop(train_x.axes[1], axis=1)


val_x = val_df.drop('__CLASS__', axis=1)
val_y = val_df.drop(val_x.axes[1], axis=1)

In [10]:
train_y.head()

Unnamed: 0,__CLASS__
1051,pos
513,neg
1086,pos
1116,pos
921,pos


# Logistic Regression

In [11]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(penalty= 'l1', C=2500.0)
LR.fit(train_x, train_y) 
print(LR.score(val_x, val_y))

  y = column_or_1d(y, warn=True)


0.85


#### Changing Parameters

We changed the parameters of penalty to 'L1' and C = 2500.0

# Single Decision Tree

#### Basic Decision Tree

* Initialize your model as a decision tree with sklearn.
* Fit the data and labels to the model.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html


In [12]:
from sklearn.tree import DecisionTreeClassifier
d_tree = DecisionTreeClassifier(random_state=0)
d_tree.fit(train_x, train_y) 
print(d_tree.score(val_x, val_y))

0.6125


In [13]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
from sklearn import svm, grid_search




In [27]:
from sklearn.tree import DecisionTreeClassifier
d_tree = DecisionTreeClassifier(criterion = "gini", random_state = 100,
                               max_depth=4, min_samples_leaf=5)
%time d_tree.fit(train_x, train_y) 
print(d_tree.score(val_x, val_y))


CPU times: user 4.6 s, sys: 4.47 s, total: 9.06 s
Wall time: 10.8 s
0.609375


In [20]:
train_x_mtx = train_x.as_matrix()
train_y_ravel = train_y.values.ravel()

In [None]:
param_grid = {'max_depth': np.arange(3, 6), 
              'min_samples_leaf' : np.arange(2, 5),
              "criterion": ["gini"],
              "max_features": np.arange(1,5)}


search = grid_search.GridSearchCV(DecisionTreeClassifier(), param_grid, cv=3)
search.fit(train_x_mtx, train_y_ravel)
search.best_params_


In [None]:
search.best_score_

I decided to change these parameters: 'max_depth', 'min_samples_leaf', 'random_state', criterion, and max_features. I decided to experiment with various values that were suggested online or that appeared in Stack Overflow posts. 

"Decision trees can learn a training set to a point of high granularity that makes them easily overfit. 
Allowing a decision tree to split to a granular degree, is the behavior of this model that makes it prone to learning every point extremely well — to the point of perfect classification — ie: overfitting." (source: https://gist.github.com/dyerrington/b136a24e4137415b307fde68aa8cb53b)
In essence, if you overcomplicate the model, i.e. split into many different branches, the model can be prone to overfitting.

# Random Forest Classifier

#### Basic Random Forest

* Use sklearn's ensemble.RandomForestClassifier() to create your model.
* Fit the data and labels with your model.
* Score your model with the same data and labels.

References:

http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(train_x, train_y)
print(clf.score(val_x, val_y))


#### Changing Parameters

In [None]:
#score should be in high 70s or 80s
param_grid = {"max_depth": np.arange(3, 6),
              "max_features": np.arange(1,4),
              "criterion": ["gini"],
              'n_estimators': np.arange(10, 15)}


forest_grid_search = GridSearchCV(clf, param_grid=param_grid, cv = 3)
forest_grid_search.fit(train_x_mtx, train_y_ravel)
forest_grid_search.best_params_


In [None]:
forest_grid_search.best_score_

What parameters did you choose to change and why?

I chose to change the "max_depth", "max_features", "min_samples_split", "min_samples_leaf", "bootstrap", "criterion", 'n_estimators' parameters just to see all the possible combinations I could get to output the best parameters to get us the highest score. I thought that the more parameters I decide to change, the better of a chance we have of finding a more accurate score. I also saw a tutorial on github that changed these parameters to similar ranges and went from there.


How does a random forest classifier prevent overfitting better than a single decision tree?

Random forest classifier creates random subsets of the features and building smaller (shallow) trees using the subsets and then it combines the subtrees. It then votes on the decision tree with the highest accuracy and least overfitting.

