# B3
Partition the continuous variable LOGSALAR into three intervals: (0, Mean –
0.25*StdDev), (Mean – 0.25*StdDev, Mean + 0.25*StdDev], (Mean + 0.25*StdDev, +].
Given this partition, generate the best’ Decision Tree. You should consider the use of
derived variables.


# Load Data

In [1]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

import re
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None) 
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore

from sklearn.svm import SVC
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier


In [5]:
from google.colab import drive
drive.mount ('/content/drive')
file_path = '/content/drive/MyDrive/IST340_Final_Project/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
df = pd.read_csv(file_path + 'TaskB.csv')
df_input=df.copy()
df_input.head()

Unnamed: 0,no_atbat,no_hits,no_home,no_runs,no_rbi,no_bb,yr_major,cr_atbat,cr_hits,cr_home,cr_runs,cr_rbi,cr_bb,no_outs,no_assts,no_error,logsalar,ranking,position_copy
0,293.0,66.0,1.0,30.0,29.0,14.0,1.0,293.0,66.0,1.0,30.0,29.0,14.0,446.0,33.0,20.0,4.347245,11.0,2.0
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,6.163315,2.0,2.0
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,6.173786,25.0,3.0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,6.214608,14.0,9.0
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,4.516339,14.0,3.0


In [34]:
mean = df['logsalar'].mean()
std = df['logsalar'].std()

# Generate Test Design

## Data Splitting

In [35]:
partitions = [0, mean - 0.25 * std, mean + 0.25 * std, float('inf')]
categories = ['0', '1', '2']
df['logsalar_split'] = pd.cut(df['logsalar'], bins=partitions, labels=categories, include_lowest= True)

In [36]:
# Split the data into training and testing sets - 80% training, 20% testing
X = df.drop('logsalar_split', axis=1)
y = df['logsalar_split']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=666)

print("The length of training set:", len(X_train))
print("The length of testing  set:", len(y_test))

The length of training set: 257
The length of testing  set: 65


# Decision Tree

In [37]:
# Define the hyperparameter grid
param_grid = {
    'max_depth': [3, 5, 7, 9, 11, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10, int(len(X_train)*0.03), int(len(X_train)*0.06)],
    'criterion': ['gini', 'entropy']}

# Create the decision tree classifier model
dt_model = DecisionTreeClassifier(random_state=666)

# Create the grid search object
grid_search = GridSearchCV(estimator=dt_model, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)

# Fit the grid search object to the training data
grid_search.fit(X_train, y_train)

# Display the best hyperparameters found
print("Best Hyperparameters:")
print(grid_search.best_params_)

# Get the best model from grid search
best_dt_model = grid_search.best_estimator_

Best Hyperparameters:
{'criterion': 'gini', 'max_depth': 3, 'min_samples_leaf': 1, 'min_samples_split': 2}


In [38]:
# Create the decision tree classifier model
dt_model = DecisionTreeClassifier(random_state=666, **grid_search.best_params_)

# Train the model
dt_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_dt = dt_model.predict(X_test)

# Display the confusion matrix, classification report, and accuracy score
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_dt))
print("\nClassification Report:")
print(classification_report(y_test, y_pred_dt))
print("\nAccuracy Score:")
print(accuracy_score(y_test, y_pred_dt))


Confusion Matrix:
[[27  0  0]
 [ 0 10  0]
 [ 0  0 28]]

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        27
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        28

    accuracy                           1.00        65
   macro avg       1.00      1.00      1.00        65
weighted avg       1.00      1.00      1.00        65


Accuracy Score:
1.0
