### Load the libraries

In [17]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler

from joblib import dump

from sklearn.metrics import confusion_matrix, accuracy_score

from sklearn.tree import DecisionTreeClassifier

import os
import sys
sys.path.append(os.path.abspath('..'))
from src.common_lib import DataReader, NBARawData

### Load the data

In [18]:
data_reader = DataReader()

# Load Raw Train Data
df = data_reader.read_data(NBARawData.TRAIN)

# Load Test Raw Data
test_df = data_reader.read_data(NBARawData.TEST)

In [9]:
df

Unnamed: 0,Id_old,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,10556,3799,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,...,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,5342,3800,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,...,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,5716,3801,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,...,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,13790,3802,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,...,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,5470,3803,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,...,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,2996,11794,32,9.2,1.8,0.7,1.8,40.3,-0.1,-0.2,...,0.6,65.7,0.3,1.8,1.9,0.5,0.3,0.2,0.4,1
7996,11679,11795,54,6.0,1.8,0.7,1.4,48.7,0.1,0.1,...,0.4,70.1,1.0,1.1,2.0,0.1,0.0,0.3,0.3,1
7997,5537,11796,85,28.2,10.7,4.0,9.0,45.1,0.2,0.6,...,3.9,69.7,1.0,2.1,3.1,3.4,1.2,0.2,1.8,1
7998,1292,11797,39,7.7,2.5,1.0,2.3,40.1,-0.3,-0.5,...,0.7,74.3,0.4,0.6,0.9,0.2,0.3,0.3,0.5,1


### Explore Data and Quality Check

In [3]:
df.isnull().values.any()
df.isnull().sum()

cols_drop = ["Id", "Id_old", "MIN", "FGM", "FGA", 'TOV', '3PA', 'FTM', 'FTA', 'REB']
df.drop(cols_drop, axis=1, inplace=True)

### Prepare the data

In [4]:
# # Scaling
# scaler = StandardScaler()
# df_cleaned = scaler.fit_transform(df_cleaned)
# df_cleaned

# # Save the scaler 
# dump(scaler, '../models/standard_scaler.joblib')

# Split the original dataset into Data-Test set
X_train, X_val, y_train, y_val  = data_reader.split_data(df)

### Build The Decision Tree Model

In [7]:
model = DecisionTreeClassifier(max_depth = 10)
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

### Make Predictions and Check Model Accuracy on Training Data Set

In [8]:
y_train_prediction = model.predict(X_train)
confusion_matrix(y_train, y_train_prediction)
accuracy_score(y_train, y_train_prediction)

# y_train_prediction = model.predict_proba(X_train) 
# y_train_prediction

0.88859375

### Make Predictions and Check Model Accuracy on Valiation Data Set

In [9]:
y_val_prediction = model.predict(X_val)
confusion_matrix(y_val, y_val_prediction)
accuracy_score(y_val, y_val_prediction)

# y_val_prediction = model.predict_proba(X_val)
# y_val_prediction

0.800625

### Prepare Test Data Set

In [14]:
cols_drop = ["Id", "Id_old", "MIN", "FGM", "FGA", 'TOV', '3PA', 'FTM', 'FTA', 'REB']
test_df.drop(cols_drop, axis=1, inplace=True)

test_df.head(5)

Unnamed: 0,GP,PTS,FG%,3P Made,3P%,FT%,OREB,DREB,AST,STL,BLK
0,56,4.0,43.7,0.1,7.3,63.4,1.2,0.8,0.4,0.2,0.3
1,43,10.1,46.0,0.6,35.1,75.3,0.5,0.9,3.5,0.6,0.0
2,82,11.3,45.6,0.5,44.8,71.2,1.3,3.3,2.5,1.3,0.3
3,86,18.8,42.9,0.5,13.5,70.9,1.5,3.2,4.1,0.9,0.1
4,58,4.7,40.0,0.5,38.7,76.9,0.2,0.6,1.5,0.5,-0.4


###  Prediction on Test Set

In [15]:
# scaler = StandardScaler()
# df_test_cleaned = scaler.fit_transform(df_test)

y_test_prediction = model.predict_proba(test_df)[:,1]

final_prediction_test = pd.DataFrame({'Id': range(0,3799), 'TARGET_5Yrs': [p for p in y_test_prediction]})

final_prediction_test.to_csv("../reports/tin_submission_decision_tree.csv", index=False)

In [16]:
y_test_prediction

array([0.88679245, 0.8683274 , 1.        , ..., 0.96428571, 0.98342541,
       0.8683274 ])