# Name: Suong Tran

In [49]:
# Import modules
import pandas as pd # v 1.4.4
import numpy as np # v 1.21.5
import matplotlib.pyplot as plt # v 3.5.2

import scikitplot as skplt #v 0.3.7

from sklearn.model_selection import train_test_split #v 1.0.2
from sklearn import metrics  #v 1.0.2
from sklearn.tree import DecisionTreeRegressor,export_text, DecisionTreeClassifier #v 1.0.2
from sklearn.linear_model import Lasso, LinearRegression #v 1.0.2

# REGRESSION

### Load data

In [50]:
sales = pd.read_csv("https://raw.githubusercontent.com/KennedyOdongo/DATA-300-Statistical-Machine-Learning-Fall-2023-/main/Data/sales_market_data.csv")
sales

Unnamed: 0,Sale,InStrSpending,Discount,TVSpending,StockRate,Price,Radio,OnlineAdsSpending
0,240368,59.90,0.46,46.30,0.45,12.56,1065,1081.60
1,207276,7.28,0.39,166.91,0.05,5.44,1832,651.24
2,172572,9.81,0.41,73.67,0.06,3.54,587,1066.56
3,82697,46.94,0.27,33.95,0.88,28.90,1407,1855.27
4,141762,2.59,0.26,82.63,0.36,4.66,2057,1608.91
...,...,...,...,...,...,...,...,...
987,26191,19.69,0.18,32.60,0.81,26.22,1840,2238.49
988,132714,19.48,0.23,116.64,0.21,17.53,2156,1564.64
989,32894,22.79,0.49,39.33,0.26,29.23,503,1020.22
990,39091,13.84,0.14,37.71,0.33,23.13,2902,1201.03


## 1. Exploratory Data Analysis (EDA)

In [51]:
sales.describe()

Unnamed: 0,Sale,InStrSpending,Discount,TVSpending,StockRate,Price,Radio,OnlineAdsSpending
count,992.0,992.0,992.0,992.0,992.0,992.0,992.0,992.0
mean,171327.118952,30.593034,0.251139,98.67874,0.494526,14.599829,1479.569556,1596.504284
std,81397.843301,17.493103,0.145348,57.117347,0.28681,8.715533,885.419636,927.474787
min,1992.0,0.19,0.0,0.13,0.0,0.14,4.0,12.54
25%,112479.25,14.83,0.13,49.6375,0.25,6.9175,708.25,786.3275
50%,170390.5,31.385,0.25,97.51,0.49,14.82,1413.5,1595.455
75%,226027.25,45.66,0.38,147.62,0.74,22.1,2273.0,2420.6875
max,393914.0,59.96,0.5,199.91,1.0,29.99,2997.0,3198.27


## No Missing values 

In [52]:
sales.isna().sum()

Sale                 0
InStrSpending        0
Discount             0
TVSpending           0
StockRate            0
Price                0
Radio                0
OnlineAdsSpending    0
dtype: int64

## 2. Decision Tree Regressor vs LASSO vs LR with forward selection

In [67]:
X = sales.drop('Sale', axis=1)
y = sales['Sale']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Decision Tree Regressor
tree_regressor = DecisionTreeRegressor()
tree_regressor.fit(X_train, y_train)

# LASSO
lasso_regressor = Lasso()
lasso_regressor.fit(X_train, y_train)

#forward selection
selected_features = []

for feature in X.columns:
    candidates = selected_features + [feature]
    X_train_subset = X_train[candidates]
    X_test_subset = X_test[candidates]

    LR = LinearRegression()
    LR.fit(X_train_subset, y_train)
    predictions = LR.predict(X_test_subset)
    mse= metrics.mean_squared_error(y_test, predictions)
    
    #condition
    if not selected_features or mse < min_mse:
        selected_features = candidates
        min_mse = metrics.mean_squared_error(y_test, predictions)
        
print("Features-fw:", candidates, "MSE forward selection:", metrics.mean_squared_error(y_test, predictions))
print("Features-fw:", candidates, "R-squared forward selection:", metrics.r2_score(y_test, predictions))

# Predictions
tree_predictions = tree_regressor.predict(X_test)
lasso_predictions = lasso_regressor.predict(X_test)

# MSE for DT and LASSO
print("MSE Decision Tree Regressor:", metrics.mean_squared_error(y_test, tree_predictions))
print("MSE LASSO:", metrics.mean_squared_error(y_test, lasso_predictions))
                            
#R2 for DT and LASSO
print("R-squared Decision Tree Regressor:", metrics.r2_score(y_test, tree_predictions))
print("R-squared LASSO:", metrics.r2_score(y_test, lasso_predictions)) 
                               

Features-fw: ['InStrSpending', 'Discount', 'TVSpending', 'Price', 'Radio', 'OnlineAdsSpending'] MSE forward selection: 20781388.04230963
Features-fw: ['InStrSpending', 'Discount', 'TVSpending', 'Price', 'Radio', 'OnlineAdsSpending'] R-squared forward selection: 0.9970143751298145
MSE Decision Tree Regressor: 518602106.0954774
MSE LASSO: 8367367.183238087
R-squared Decision Tree Regressor: 0.9254933624964381
R-squared LASSO: 0.9987978753147101


### Prefere LASSO and LR with forward selection as MSE is smaller and R-squared is larger

## 3.Fit Decision Tree Regressor again with criterion="mae", splitter="random"

In [54]:
X = sales.drop('Sale', axis=1)
y = sales['Sale']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Decision Tree Regressor
tree_regressor = DecisionTreeRegressor(criterion="mae", splitter="random")
tree_regressor.fit(X_train, y_train)

tree_predictions = tree_regressor.predict(X_test)

print("MSE Decision Tree Regressor:", metrics.mean_squared_error(y_test, tree_predictions))
print("R-squared Decision Tree Regressor:", metrics.r2_score(y_test, tree_predictions))             
                               

MSE Decision Tree Regressor: 560091807.4723618
R-squared Decision Tree Regressor: 0.9087860974136961




### Still prefer LASSO and LR with forward selection more.

## 4. The most predictive of the dependent variable:

In [55]:
#decision rule
tree_rules = export_text(tree_regressor, feature_names=X.columns.tolist())
print("Decision Rules:", tree_rules)

Decision Rules: |--- Price <= 11.60
|   |--- InStrSpending <= 10.19
|   |   |--- TVSpending <= 170.10
|   |   |   |--- TVSpending <= 91.02
|   |   |   |   |--- TVSpending <= 65.61
|   |   |   |   |   |--- Price <= 8.58
|   |   |   |   |   |   |--- TVSpending <= 55.98
|   |   |   |   |   |   |   |--- Price <= 1.81
|   |   |   |   |   |   |   |   |--- InStrSpending <= 5.06
|   |   |   |   |   |   |   |   |   |--- StockRate <= 0.49
|   |   |   |   |   |   |   |   |   |   |--- value: [126782.00]
|   |   |   |   |   |   |   |   |   |--- StockRate >  0.49
|   |   |   |   |   |   |   |   |   |   |--- value: [131631.00]
|   |   |   |   |   |   |   |   |--- InStrSpending >  5.06
|   |   |   |   |   |   |   |   |   |--- value: [138225.00]
|   |   |   |   |   |   |   |--- Price >  1.81
|   |   |   |   |   |   |   |   |--- StockRate <= 0.13
|   |   |   |   |   |   |   |   |   |--- InStrSpending <= 9.91
|   |   |   |   |   |   |   |   |   |   |--- value: [143800.00]
|   |   |   |   |   |   |   |   

In [56]:
print("The most predictive variable based on decision rules:",X.columns[abs(tree_regressor.feature_importances_).argmax()])

The most predictive variable based on decision rules: Price


In [57]:
#LASSO and forward selection is chosen based on their coefficient
print("The most predictive variable based on LASSO:",X.columns[abs(lasso_regressor.coef_).argmax()])
print("The most predictive variable based on forward selection:",X.columns[abs(LR.coef_).argmax()])

The most predictive variable based on LASSO: StockRate
The most predictive variable based on forward selection: TVSpending


# Classification


### Load data

In [74]:
nba = pd.read_csv("https://raw.githubusercontent.com/KennedyOdongo/DATA-300-Statistical-Machine-Learning-Fall-2023-/main/Data/nba.csv")
nba

Unnamed: 0,Name,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,Brandon Ingram,36,27.4,7.4,2.6,7.6,34.7,0.5,2.1,25.0,...,2.3,69.9,0.7,3.4,4.1,1.9,0.4,0.4,1.3,0.0
1,Andrew Harrison,35,26.9,7.2,2.0,6.7,29.6,0.7,2.8,23.5,...,3.4,76.5,0.5,2.0,2.4,3.7,1.1,0.5,1.6,0.0
2,JaKarr Sampson,74,15.3,5.2,2.0,4.7,42.2,0.4,1.7,24.4,...,1.3,67.0,0.5,1.7,2.2,1.0,0.5,0.3,1.0,0.0
3,Malik Sealy,58,11.6,5.7,2.3,5.5,42.6,0.1,0.5,22.6,...,1.3,68.9,1.0,0.9,1.9,0.8,0.6,0.1,1.0,1.0
4,Matt Geiger,48,11.5,4.5,1.6,3.0,52.4,0.0,0.1,0.0,...,1.9,67.4,1.0,1.5,2.5,0.3,0.3,0.4,0.8,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1335,Chris Smith,80,15.8,4.3,1.6,3.6,43.3,0.0,0.2,14.3,...,1.5,79.2,0.4,0.8,1.2,2.5,0.6,0.2,0.8,0.0
1336,Brent Price,68,12.6,3.9,1.5,4.1,35.8,0.1,0.7,16.7,...,1.0,79.4,0.4,1.1,1.5,2.3,0.8,0.0,1.3,1.0
1337,Marlon Maxey,43,12.1,5.4,2.2,3.9,55.0,0.0,0.0,0.0,...,1.6,64.3,1.5,2.3,3.8,0.3,0.3,0.4,0.9,0.0
1338,Litterial Green,52,12.0,4.5,1.7,3.8,43.9,0.0,0.2,10.0,...,1.8,62.5,0.2,0.4,0.7,2.2,0.4,0.1,0.8,1.0


## 1. Exploratory Data Analysis (EDA)

In [69]:
nba.describe()

Unnamed: 0,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1329.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0,1340.0
mean,60.414179,17.624627,6.801493,2.629104,5.885299,44.169403,0.247612,0.779179,19.308126,1.297687,1.82194,70.300299,1.009403,2.025746,3.034478,1.550522,0.618507,0.368582,1.193582,0.620149
std,17.433992,8.307964,4.357545,1.683555,3.593488,6.137679,0.383688,1.061847,16.022916,0.987246,1.322984,10.578479,0.777119,1.360008,2.057774,1.471169,0.409759,0.429049,0.722541,0.485531
min,11.0,3.1,0.7,0.3,0.8,23.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.3,0.0,0.0,0.0,0.1,0.0
25%,47.0,10.875,3.7,1.4,3.3,40.2,0.0,0.0,0.0,0.6,0.9,64.7,0.4,1.0,1.5,0.6,0.3,0.1,0.7,0.0
50%,63.0,16.1,5.55,2.1,4.8,44.1,0.1,0.3,22.4,1.0,1.5,71.25,0.8,1.7,2.5,1.1,0.5,0.2,1.0,1.0
75%,77.0,22.9,8.8,3.4,7.5,47.9,0.4,1.2,32.5,1.6,2.3,77.6,1.4,2.6,4.0,2.0,0.8,0.5,1.5,1.0
max,82.0,40.9,28.2,10.2,19.8,73.7,2.3,6.5,100.0,7.7,10.2,100.0,5.3,9.6,13.9,10.6,2.5,3.9,4.4,1.0


### Column "3P%" has Missing values. But do not replace N/A values 

In [70]:
nba.isna().sum()

Name            0
GP              0
MIN             0
PTS             0
FGM             0
FGA             0
FG%             0
3P Made         0
3PA             0
3P%            11
FTM             0
FTA             0
FT%             0
OREB            0
DREB            0
REB             0
AST             0
STL             0
BLK             0
TOV             0
TARGET_5Yrs     0
dtype: int64

### Classes are imbalance, TARGET_5Yrs = 0 is minority. Decide not to resample 

In [71]:
nba['TARGET_5Yrs'].value_counts(normalize = True)

1.0    0.620149
0.0    0.379851
Name: TARGET_5Yrs, dtype: float64

## 2. Fit Decision Tree Classifier with default parameters

In [82]:
#when running having this error
#ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

nba = nba.replace([np.inf, -np.inf], np.nan).dropna()

In [83]:
X = nba.drop(['TARGET_5Yrs','Name'], axis=1) 
# aslo drop "Name" because it does not have any meaningful contribution to the prediction
# Every players has unique names so "Name" is excluded
y = nba['TARGET_5Yrs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

#Decision Tree Regressor
tree_classifier2 = DecisionTreeClassifier()
tree_classifier2.fit(X_train, y_train)

tree_predictions2 = tree_classifier2.predict(X_test)

print("Precision score of Decision Tree Classifier:", metrics.precision_score(y_test, tree_predictions2))             
                               

Precision score of Decision Tree Classifier: 0.6911764705882353


## 3. Fit Decision Tree Classifier with criterion='gini' and max_depth = 4

In [87]:
X = nba.drop(['TARGET_5Yrs', 'Name'], axis=1)
y = nba['TARGET_5Yrs']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

tree_classifier3 = DecisionTreeClassifier(criterion='gini', max_depth=4)
tree_classifier3.fit(X_train, y_train)

tree_predictions3 = tree_classifier3.predict(X_test)

print("Precision score of Decision Tree Classifier with non-default parameters:", metrics.precision_score(y_test, tree_predictions3))


Precision score of Decision Tree Classifier with non-default parameters: 0.7678571428571429


### Prefer the Decision Tree Classifier with criterion='gini' and max_depth = 4

## 4. Decision rule with the tree from 3

In [88]:
tree_rules = export_text(tree_classifier3, feature_names=X.columns.tolist())
print("Decision Rules:", tree_rules)

Decision Rules: |--- GP <= 59.50
|   |--- FG% <= 39.65
|   |   |--- FT% <= 97.05
|   |   |   |--- GP <= 45.50
|   |   |   |   |--- class: 0.0
|   |   |   |--- GP >  45.50
|   |   |   |   |--- class: 0.0
|   |   |--- FT% >  97.05
|   |   |   |--- class: 1.0
|   |--- FG% >  39.65
|   |   |--- FTM <= 0.85
|   |   |   |--- MIN <= 6.85
|   |   |   |   |--- class: 1.0
|   |   |   |--- MIN >  6.85
|   |   |   |   |--- class: 0.0
|   |   |--- FTM >  0.85
|   |   |   |--- GP <= 32.50
|   |   |   |   |--- class: 0.0
|   |   |   |--- GP >  32.50
|   |   |   |   |--- class: 1.0
|--- GP >  59.50
|   |--- DREB <= 1.25
|   |   |--- GP <= 75.50
|   |   |   |--- FG% <= 41.35
|   |   |   |   |--- class: 0.0
|   |   |   |--- FG% >  41.35
|   |   |   |   |--- class: 1.0
|   |   |--- GP >  75.50
|   |   |   |--- 3P% <= 29.35
|   |   |   |   |--- class: 1.0
|   |   |   |--- 3P% >  29.35
|   |   |   |   |--- class: 1.0
|   |--- DREB >  1.25
|   |   |--- FGM <= 3.15
|   |   |   |--- 3PA <= 2.05
|   |   |   | 

#### First Split (Root Node):
#### If the player has fewer than 55.5 games played (GP <= 55.5), we move to the left branch.
#### If the player has 55.5 or more games played (GP > 55.5), we move to the right branch.
#### Left Branch (GP <= 55.5):
#### If FTM is less than or equal to 1.05 and BLK are less than or equal to 0.55:
#### If MIN is less than or equal to 13.05, the predicted value is approximately 0.38.
#### If MIN is greater than 13.05, the predicted value is approximately 0.19.
#### If BLK are greater than 0.55:
#### If PTS are less than or equal to 3.25, the predicted value is approximately 0.33.
#### If PTS are greater than 3.25, the predicted value is approximately 0.88.
#### Similiar interpretation for Right Branch

#### The root node with "GP" feature is often considered the most important feature in rapport with all other features.

In [66]:
print("The most predictive variable based on decision rules:",X.columns[abs(tree_classifier3.feature_importances_).argmax()])

The most predictive variable based on decision rules: GP
