In [1]:
import pandas as pd
import numpy as np
import ml_utils

from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from ml_utils import edaDF

##Seaborn for fancy plots. 
import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams["figure.figsize"] = (8,8)

## 3950 Assignment 1: Part 2

For this assignment we want to use some sort of tree based model to classify the data below. We have a very small training set, so overfitting is a very real concern. 

Some specifics for this assignment:
<ul>
<li>Please use the show_eda to control if EDA stuff is shown. I don't really need to see all the EDA stuff (nor do you after you've done it), so we can make it configurable with a variable to speed up time. Please set this FALSE when you submit, so I can run all and see the outcome without histograms etc...
<li>Please ensure that whatever model you end up with is in a variable named best at the end.
<li>Please use some pipeline in prepping the data. The test data is in an identical format to the training data, so whatever pipeline you've created for your training will work for the testing. 
<li>The accuracy scoring will be an average of accuracy and roc_auc. 
</ul>

### Grading Metrics
<ul>
<li><b>Pipeline Used - 10pts</b> The data loading needs to be in a pipeline. See the test part for illustration. When testing I'll call your pipe with the new data (format is identical to training), so any prep stuff should be in the pipeline. 
<li><b>Tree Based Model Used - 5pts</b> The model used for classification needs to be some variety of tree, beyond that it is up to you. 
<li><b>Accuracy - 5pts</b> The final accuracy acheived. This will be a rough ranking, I'm assuming most people will get a similar level of accuracy, marks will only be deducted if yours is far wosrse, as that's an indication that you probably didn't take any/many steps to improve things. 
<li><b>Clarity and Formatting - 5pts</b> Is it organized and can I read it?
    <ul>
    <li> <b>Note:</b> for this assignment, and in general, please get rid of my comments and replace them with your own. I'm going to read this, so all of these instructions aren't really required. Think of this as a template, get rid of the stuff that isn't needed, and leave only the things you need to explain your code. 
    </ul>
</ul>

For submission, please drop the URL for your repository in the dropbox.

In [2]:
name = "Lucas Riehl"

show_eda = False


In [3]:
#Load data
df = pd.read_csv("training.csv")
df = df.drop(columns={"id"})
df.head(5)

Unnamed: 0,target,var_1,var_2,var_3,var_4,var_5,var_6,var_7,var_8,var_9,...,var_191,var_192,var_193,var_194,var_195,var_196,var_197,var_198,var_199,var_200
0,0,0.66,0.106,0.434,0.387,0.903,0.661,0.158,0.291,0.21,...,0.015,0.377,0.479,0.05,0.395,0.123,0.833,0.461,0.99,0.105
1,1,0.844,0.813,0.03,0.939,0.721,0.287,0.539,0.874,0.787,...,0.112,0.048,0.088,0.86,0.56,0.346,0.511,0.883,0.858,0.599
2,0,0.56,0.567,0.568,0.434,0.414,0.18,0.448,0.888,0.023,...,0.874,0.236,0.599,0.602,0.005,0.493,0.122,0.395,0.782,0.943
3,0,0.681,0.245,0.909,0.785,0.738,0.57,0.692,0.411,0.182,...,0.219,0.691,0.261,0.031,0.968,0.353,0.798,0.104,0.944,0.09
4,0,0.846,0.431,0.805,0.237,0.465,0.642,0.219,0.102,0.795,...,0.704,0.242,0.089,0.605,0.577,0.043,0.686,0.07,0.666,0.572


In [4]:
eda = ml_utils.edaDF(df,"target")
print(eda.giveTarget())

target


#### EDA

In [5]:
if show_eda == True: 
    eda.fullEDA(k=1.5, scatterplot=False, optional_countplots=False, optional_histplots=False)

#### Creating list of columns for pipeline constructor

In [6]:
eda.num

['var_1',
 'var_2',
 'var_3',
 'var_4',
 'var_5',
 'var_6',
 'var_7',
 'var_8',
 'var_9',
 'var_10',
 'var_11',
 'var_12',
 'var_13',
 'var_14',
 'var_15',
 'var_16',
 'var_17',
 'var_18',
 'var_19',
 'var_20',
 'var_21',
 'var_22',
 'var_23',
 'var_24',
 'var_25',
 'var_26',
 'var_27',
 'var_28',
 'var_29',
 'var_30',
 'var_31',
 'var_32',
 'var_33',
 'var_34',
 'var_35',
 'var_36',
 'var_37',
 'var_38',
 'var_39',
 'var_40',
 'var_41',
 'var_42',
 'var_43',
 'var_44',
 'var_45',
 'var_46',
 'var_47',
 'var_48',
 'var_49',
 'var_50',
 'var_51',
 'var_52',
 'var_53',
 'var_54',
 'var_55',
 'var_56',
 'var_57',
 'var_58',
 'var_59',
 'var_60',
 'var_61',
 'var_62',
 'var_63',
 'var_64',
 'var_65',
 'var_66',
 'var_67',
 'var_68',
 'var_69',
 'var_70',
 'var_71',
 'var_72',
 'var_73',
 'var_74',
 'var_75',
 'var_76',
 'var_77',
 'var_78',
 'var_79',
 'var_80',
 'var_81',
 'var_82',
 'var_83',
 'var_84',
 'var_85',
 'var_86',
 'var_87',
 'var_88',
 'var_89',
 'var_90',
 'var_91',
 'var_92

#### GridsearchCV so I can find the best model and hyperparameters to fit my model. Train test and split data

In [7]:

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.utils import column_or_1d

y = df["target"]
X = df.drop(columns={"target"})

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

y_test=y_test.ravel()
y_train=y_train.ravel()

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', DecisionTreeClassifier())
])

# Define the parameters for the grid search
param_grid = {
    # 'classifier__n_estimators': [10, 50, 100],
    'classifier__max_depth': [5, 10, 20],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

# Create the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5)

# Fit the GridSearchCV object to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_search.best_estimator_)

# Print the accuracy on the test set
print(grid_search.score(X_test, y_test))

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 DecisionTreeClassifier(max_depth=20, min_samples_leaf=4,
                                        min_samples_split=5))])
0.5396825396825397


#### Pipeline constructor, only numeric was used cause we have all numeric data. Then i plugged that into my preprocessor to be used in my model

In [8]:

numeric_transformer = Pipeline( steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler())
    ])

preprocessor = Pipeline( steps=[
        ("num", numeric_transformer)
    ])

#### Implemented my tree model

In [9]:

best=Pipeline(steps=[('pre', preprocessor), ('classifier', DecisionTreeClassifier(max_depth=10, min_samples_leaf=4, max_features=7, random_state=42))])

best.fit(X_train, y_train)

print("Training Accuracy:", best.score(X_train, y_train))
print("Testing Accuracy:", best.score(X_test, y_test)) 
print(best._final_estimator)

Training Accuracy: 0.893048128342246
Testing Accuracy: 0.6507936507936508
DecisionTreeClassifier(max_depth=10, max_features=7, min_samples_leaf=4,
                       random_state=42)


#### Everything below was just me testing out other models and trying to see what worked and what didnt

In [10]:

# # Define the pipeline with a StandardScaler and a RandomForestClassifier
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('classifier', RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=5, min_samples_leaf=1))
# ])

# # Fit the pipeline to the training data
# pipeline.fit(X_train, y_train)

# # Print the train accuracy
# print("Train accuracy:", pipeline.score(X_train, y_train))

# # Print the test accuracy
# print("Test accuracy:", pipeline.score(X_test, y_test))






In [11]:

# # Define the pipeline with a StandardScaler and a RandomForestClassifier
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('classifier', RandomForestClassifier(n_estimators=100, max_depth=20, min_samples_split=5, min_samples_leaf=1, 
#                                           max_features='sqrt', random_state=42, criterion='entropy', 
#                                           min_impurity_decrease=0.01))
# ])

# # Fit the pipeline to the training data
# pipeline.fit(X_train, y_train)

# # Print the train accuracy
# print("Train accuracy:", pipeline.score(X_train, y_train))

# # Print the test accuracy
# print("Test accuracy:", pipeline.score(X_test, y_test))

In [12]:
# from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

# # Define the pipeline with a StandardScaler and an ExtraTreesClassifier
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('classifier', ExtraTreesClassifier(n_estimators=100, max_depth=10, min_samples_split=2, min_samples_leaf=1, 
#                                           max_features='sqrt', random_state=42, criterion='entropy', 
#                                           min_impurity_decrease=0.01, bootstrap=True, n_jobs=-1))
# ])

# # Fit the pipeline to the training data
# pipeline.fit(X_train, y_train)

# # Print the train accuracy
# print("Train accuracy:", pipeline.score(X_train, y_train))

# # Print the test accuracy
# print("Test accuracy:", pipeline.score(X_test, y_test))

In [13]:
# from sklearn.ensemble import BaggingClassifier
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.pipeline import Pipeline
# from sklearn.preprocessing import StandardScaler

# # Define the pipeline with a StandardScaler and a BaggingClassifier
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('classifier', BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42), n_estimators=100, max_samples=0.5, max_features=1.0, bootstrap=True, bootstrap_features=False, n_jobs=-1))
# ])

# # Fit the pipeline to the training data
# pipeline.fit(X_train, y_train)

# # Print the train accuracy
# print("Train accuracy:", pipeline.score(X_train, y_train))

# # Print the test accuracy
# print("Test accuracy:", pipeline.score(X_test, y_test))

In [14]:
print(best.score(X_test, y_test))
print(best)

0.6507936507936508
Pipeline(steps=[('pre',
                 Pipeline(steps=[('num',
                                  Pipeline(steps=[('imputer', SimpleImputer()),
                                                  ('scaler',
                                                   StandardScaler())]))])),
                ('classifier',
                 DecisionTreeClassifier(max_depth=10, max_features=7,
                                        min_samples_leaf=4, random_state=42))])


### Testing

Please leave the stuff below as-is in your file. 

This will take your best model and score it with the test data. If you want to test to make sure that yours works, make a copy of the data file and rename it testing.csv, then make sure this runs ok. I will do the same, but the contents of my test file will be different. 

In [15]:
#Load Test Data
test_df = pd.read_csv("testing.csv")
test_df = test_df.drop(columns={"id"})
#Create tests and score
test_y = np.array(test_df["target"]).reshape(-1,1)
test_X = np.array(test_df.drop(columns={"target"}))

preds = best.predict(test_X)

roc_score = roc_auc_score(test_y, preds)
acc_score = accuracy_score(test_y, preds)

print(roc_score)
print(acc_score)
print(name, np.mean([roc_score, acc_score]))


FileNotFoundError: [Errno 2] No such file or directory: 'testing.csv'

### What Accuracy Changes Were Used

Please list here what you did to try to increase accuracy and/or limit overfitting:
<li>Used GridsearchCV to help me find the most accurate model to use and the most accurate hyperparameters
<li>used max depth and experimented the amount of layers to balance overfitting
<li>min_samples_leaf was used to control the minimum number of samples required to be at a leaf node, this helped to prevent overfitting by making the tree more robust to noise in the data.
<li> And lastly I just expereimented with different tree models and tried to get to using different hyperparamters. This led to me fine tuning what worked best and then finally giving me a reasonably accurate score while compensating for overfitting.