### GitHub setup instructions are in a separate notebook

#### That notebook is published in the GitHub repo

# LOGISTIC REGRESSION MODEL FOR NBA ROOKIE PLAYERS

# Load Packages

In [1]:
# Load the packages needed for Logistic Regression Modelling upfront
import os
from pathlib import Path

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import roc_auc_score

from joblib import dump

# Load the data

In [2]:
# Load the training data set.
# The raw-data folder defaults to the original hard-coded location; set the
# ADSI_DATA_DIR environment variable to point at the folder on another machine.
raw_data_dir = Path(os.environ.get('ADSI_DATA_DIR', '/Users/jasle1/Desktop/MDSI/ADSI/AT_1/adsi_at1/data/raw'))
train = pd.read_csv(raw_data_dir / 'train.csv')

In [3]:
# Load the test data set.
# The raw-data folder defaults to the original hard-coded location; set the
# ADSI_DATA_DIR environment variable to point at the folder on another machine.
raw_data_dir = Path(os.environ.get('ADSI_DATA_DIR', '/Users/jasle1/Desktop/MDSI/ADSI/AT_1/adsi_at1/data/raw'))
test = pd.read_csv(raw_data_dir / 'test.csv')

# Explore the Data

In [4]:
# Check the (rows, columns) dimensions of the training set
train.shape

(8000, 21)

In [5]:
# Check the (rows, columns) dimensions of the test set
# (one column fewer than the training set: the test file has no TARGET_5Yrs label)
test.shape

(3799, 20)

In [6]:
# Preview the first five rows of the training set
train.head()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
0,10556,80,24.3,7.8,3.0,6.4,45.7,0.1,0.3,22.6,...,2.9,72.1,2.2,2.0,3.8,3.2,1.1,0.2,1.6,1
1,5342,75,21.8,10.5,4.2,7.9,55.1,-0.3,-1.0,34.9,...,3.6,67.8,3.6,3.7,6.6,0.7,0.5,0.6,1.4,1
2,5716,85,19.1,4.5,1.9,4.5,42.8,0.4,1.2,34.3,...,0.6,75.7,0.6,1.8,2.4,0.8,0.4,0.2,0.6,1
3,13790,63,19.1,8.2,3.5,6.7,52.5,0.3,0.8,23.7,...,1.5,66.9,0.8,2.0,3.0,1.8,0.4,0.1,1.9,1
4,5470,63,17.8,3.7,1.7,3.4,50.8,0.5,1.4,13.7,...,0.5,54.0,2.4,2.7,4.9,0.4,0.4,0.6,0.7,1


In [7]:
# Preview the first five rows of the test set
test.head()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,FTM,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV
0,1,56,9.1,4.0,1.6,3.7,43.7,0.1,0.3,7.3,0.7,1.2,63.4,1.2,0.8,1.7,0.4,0.2,0.3,0.8
1,8194,43,19.3,10.1,3.7,8.1,46.0,0.6,1.7,35.1,1.8,2.5,75.3,0.5,0.9,1.5,3.5,0.6,-0.0,1.8
2,3,82,33.9,11.3,4.9,10.6,45.6,0.5,1.9,44.8,1.8,2.7,71.2,1.3,3.3,4.5,2.5,1.3,0.3,2.0
3,8196,86,44.7,18.8,6.8,15.9,42.9,0.5,1.8,13.5,4.5,6.3,70.9,1.5,3.2,5.0,4.1,0.9,0.1,3.6
4,8197,58,12.3,4.7,1.6,4.0,40.0,0.5,1.7,38.7,1.1,1.3,76.9,0.2,0.6,0.9,1.5,0.5,-0.4,0.9


In [8]:
# List each training column with its non-null count and dtype
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 21 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Id           8000 non-null   int64  
 1   GP           8000 non-null   int64  
 2   MIN          8000 non-null   float64
 3   PTS          8000 non-null   float64
 4   FGM          8000 non-null   float64
 5   FGA          8000 non-null   float64
 6   FG%          8000 non-null   float64
 7   3P Made      8000 non-null   float64
 8   3PA          8000 non-null   float64
 9   3P%          8000 non-null   float64
 10  FTM          8000 non-null   float64
 11  FTA          8000 non-null   float64
 12  FT%          8000 non-null   float64
 13  OREB         8000 non-null   float64
 14  DREB         8000 non-null   float64
 15  REB          8000 non-null   float64
 16  AST          8000 non-null   float64
 17  STL          8000 non-null   float64
 18  BLK          8000 non-null   float64
 19  TOV   

In [9]:
# List each test column with its non-null count and dtype
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3799 entries, 0 to 3798
Data columns (total 20 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Id       3799 non-null   int64  
 1   GP       3799 non-null   int64  
 2   MIN      3799 non-null   float64
 3   PTS      3799 non-null   float64
 4   FGM      3799 non-null   float64
 5   FGA      3799 non-null   float64
 6   FG%      3799 non-null   float64
 7   3P Made  3799 non-null   float64
 8   3PA      3799 non-null   float64
 9   3P%      3799 non-null   float64
 10  FTM      3799 non-null   float64
 11  FTA      3799 non-null   float64
 12  FT%      3799 non-null   float64
 13  OREB     3799 non-null   float64
 14  DREB     3799 non-null   float64
 15  REB      3799 non-null   float64
 16  AST      3799 non-null   float64
 17  STL      3799 non-null   float64
 18  BLK      3799 non-null   float64
 19  TOV      3799 non-null   float64
dtypes: float64(18), int64(2)
memory usage: 593.7 KB


In [10]:
# Summary statistics for every numeric training column.
# NOTE(review): the summary shows values that look impossible for these stats
# (e.g. negative minima for GP, 3P Made, 3PA, 3P%, FT% and BLK, and an FT% maximum
# above 100); nothing later in this notebook cleans them - confirm whether that is intended.
train.describe()

Unnamed: 0,Id,GP,MIN,PTS,FGM,FGA,FG%,3P Made,3PA,3P%,...,FTA,FT%,OREB,DREB,REB,AST,STL,BLK,TOV,TARGET_5Yrs
count,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,...,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0,8000.0
mean,6856.971,62.777875,18.576662,7.267088,2.807037,6.231212,44.6089,0.264525,0.816562,19.5837,...,1.947788,71.365825,1.077838,2.1685,3.2453,1.624513,0.648687,0.245212,1.257763,0.833625
std,3977.447579,17.118774,8.935263,4.318732,1.693373,3.584559,6.155453,0.384093,1.060964,16.003155,...,1.252352,10.430447,0.78567,1.392224,2.085154,1.355986,0.407626,0.821037,0.72327,0.37244
min,4.0,-8.0,2.9,0.8,0.3,0.8,21.3,-1.1,-3.1,-38.5,...,0.0,-13.3,0.0,0.2,0.3,0.0,0.0,-17.9,0.1,0.0
25%,3413.75,51.0,12.0,4.1,1.6,3.6,40.4,0.0,0.1,8.4,...,1.0,65.0,0.5,1.1,1.7,0.7,0.3,0.1,0.7,1.0
50%,6787.5,63.0,16.8,6.3,2.4,5.4,44.4,0.3,0.8,19.5,...,1.7,71.4,0.9,1.9,2.8,1.3,0.6,0.2,1.1,1.0
75%,10299.25,74.0,23.5,9.5,3.7,8.1,48.7,0.5,1.5,30.6,...,2.6,77.5,1.5,2.9,4.3,2.2,0.9,0.4,1.6,1.0
max,13798.0,123.0,73.8,34.2,13.1,28.9,67.2,1.7,4.7,82.1,...,11.1,168.9,5.5,11.0,15.9,12.8,3.6,18.9,5.3,1.0


# Transform the Data

In [4]:
# Separate the label column from the features: keep it as y_train and remove it
# from `train` in place (so this cell cannot be re-run without reloading `train`).
y_train = train['TARGET_5Yrs']
del train['TARGET_5Yrs']
y_train.shape

(8000,)

In [48]:
# Standardise the dataset - SKIP THIS FOR NOW, IT CAUSES PROBLEMS AT THE END
# NOTE(review): presumably the problem is that StandardScaler returns NumPy arrays,
# so the column labels (including 'Id') needed when building the submission are lost - confirm.
# NOTE(review): if this is re-enabled, the test set should be scaled with
# scaler.transform(test); calling fit_transform on it refits the scaler on the test data.
# scaler = StandardScaler()
# train = scaler.fit_transform(train)
# test = scaler.fit_transform(test)

In [5]:
# Hold out 20% of the training rows as a validation set (seeded for a repeatable split).
# Note: y_train is rebound here from the full label column to the training-split labels.
split_parts = train_test_split(train, y_train, random_state=42, test_size=0.2)
X_train, X_val, y_train, y_val = split_parts

# Train Logistic Regression Model

In [6]:
# Configure the logistic regression classifier.
# solver='liblinear' can be paired with either an L1 or an L2 penalty; no penalty is
# passed here, so scikit-learn's default (L2) is the one in effect.
# class_weight='balanced' weights the classes inversely proportional to their
# frequencies in y, which offsets the imbalance in the target.
model = LogisticRegression(
    class_weight='balanced',
    solver='liblinear',
)

In [7]:
# Fit the classifier on the training split
# (as the last expression of the cell, the fitted estimator's repr is displayed).
# NOTE(review): only TARGET_5Yrs was removed from the features, so X_train still
# contains the 'Id' column and the identifier is used as a predictor - confirm this is intended.
model.fit(X_train, y_train)

LogisticRegression(class_weight='balanced', solver='liblinear')

In [8]:
# Predict class probabilities for the validation split.
# The result has one row per sample and one column per class; the second column
# (index 1) is the one scored with ROC AUC further down.
val_pred = model.predict_proba(X_val)
val_pred

array([[0.61261071, 0.38738929],
       [0.39816654, 0.60183346],
       [0.44198532, 0.55801468],
       ...,
       [0.15190973, 0.84809027],
       [0.30436975, 0.69563025],
       [0.21793354, 0.78206646]])

In [9]:
#pd.DataFrame(val_pred[:,1])

Unnamed: 0,0
0,0.387389
1,0.601833
2,0.558015
3,0.477971
4,0.624276
...,...
1595,0.757248
1596,0.533790
1597,0.848090
1598,0.695630


# Evaluate Model Performance

In [11]:
# Score the validation predictions with ROC AUC, using the second
# predict_proba column as the score for each sample
positive_class_scores = val_pred[:, 1]
auc_score = roc_auc_score(y_true=y_val, y_score=positive_class_scores)

print(f'The Logistic regression ROC AUC score is {auc_score}')

The Logistic regression ROC AUC score is 0.7165409193159875


In [12]:
# Predict class probabilities for the unlabelled test set
# (one row per test sample, one column per class)
test_pred = model.predict_proba(test)
test_pred

array([[0.50972983, 0.49027017],
       [0.56457628, 0.43542372],
       [0.30361411, 0.69638589],
       ...,
       [0.63323303, 0.36676697],
       [0.18647159, 0.81352841],
       [0.5965568 , 0.4034432 ]])

In [13]:
# pd.DataFrame(test_pred[:,1])

Unnamed: 0,0
0,0.490270
1,0.435424
2,0.696386
3,0.767211
4,0.396864
...,...
3794,0.797431
3795,0.510827
3796,0.366767
3797,0.813528


# Prepare the file for extracting final prediction output - for Kaggle submission

In [15]:
# `test` was loaded with pd.read_csv and is already a DataFrame, so this conversion
# is only a safeguard before it is merged with the predictions.
test = pd.DataFrame(test)
# Preview a few rows instead of rendering the whole frame in the notebook output
test.head()

In [21]:
# Wrap the predicted probabilities in a DataFrame so they can be joined to the test features
test_pred = pd.DataFrame(test_pred)
test_pred.head()
# predict_proba yields one probability column per class, so the frame has two columns
# (labelled 0 and 1 by default).
# NOTE(review): column 1 is presumably P(TARGET_5Yrs == 1), following model.classes_ - confirm

In [23]:
# Join the test features and the predicted probabilities side by side (column-wise);
# rows are matched on the index labels of the two frames
df = pd.concat(objs=[test, test_pred], axis='columns')
df.head()

In [36]:
# Give the class-1 probability column the name required for the Kaggle submission
kaggle_column_names = {1: 'TARGET_5Yrs'}
df_final = df.rename(columns=kaggle_column_names)
df_final.head()

In [38]:
# Keep only the identifier and the predicted probability for the Kaggle submission
submission_columns = ['Id', 'TARGET_5Yrs']
submission = df_final[submission_columns]
submission.head()

Unnamed: 0,Id,TARGET_5Yrs
0,1,0.49027
1,8194,0.435424
2,3,0.696386
3,8196,0.767211
4,8197,0.396864


In [40]:
# Write the submission to disk for the Kaggle upload (the row index is omitted).
# NOTE(review): the file name has no '.csv' extension - confirm the upload accepts it.
submission.to_csv('jasleen_logistic_reg_week1', index=False)