In [1]:
! pip install category-encoders

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


# Importing the necessary libraries

-  Importing the libraries

In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from category_encoders.target_encoder import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Data import and EDA

- Loadind the data from the CSV file
- Spliting the data into features and target.
- Target encoding is been performed on the categorical features
- Imputing the missing values with the mean of the features.
- Spliting the data into training and testing sets

In [3]:
data = pd.read_csv("/Users/nitheshrajmohan/Desktop/Ganesh_review/data/shot_logs.csv")

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128069 entries, 0 to 128068
Data columns (total 21 columns):
 #   Column                      Non-Null Count   Dtype  
---  ------                      --------------   -----  
 0   GAME_ID                     128069 non-null  int64  
 1   MATCHUP                     128069 non-null  object 
 2   LOCATION                    128069 non-null  object 
 3   W                           128069 non-null  object 
 4   FINAL_MARGIN                128069 non-null  int64  
 5   SHOT_NUMBER                 128069 non-null  int64  
 6   PERIOD                      128069 non-null  int64  
 7   GAME_CLOCK                  128069 non-null  object 
 8   SHOT_CLOCK                  122502 non-null  float64
 9   DRIBBLES                    128069 non-null  int64  
 10  TOUCH_TIME                  128069 non-null  float64
 11  SHOT_DIST                   128069 non-null  float64
 12  PTS_TYPE                    128069 non-null  int64  
 13  SHOT_RESULT   

In [5]:
data.isnull().sum()

GAME_ID                          0
MATCHUP                          0
LOCATION                         0
W                                0
FINAL_MARGIN                     0
SHOT_NUMBER                      0
PERIOD                           0
GAME_CLOCK                       0
SHOT_CLOCK                    5567
DRIBBLES                         0
TOUCH_TIME                       0
SHOT_DIST                        0
PTS_TYPE                         0
SHOT_RESULT                      0
CLOSEST_DEFENDER                 0
CLOSEST_DEFENDER_PLAYER_ID       0
CLOSE_DEF_DIST                   0
FGM                              0
PTS                              0
player_name                      0
player_id                        0
dtype: int64

### Splitting the data into features and target

In [6]:
X = data.drop(columns=['PTS'])
y = data['PTS']

### Identifying the categorical features and performing the target encoding on the categorical features

In [7]:
encoder = TargetEncoder()

categorical_cols = X.select_dtypes(include=['object']).columns
X_encoded = encoder.fit_transform(X[categorical_cols], y)

### Imputing the missing values with the mean

In [8]:
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X_encoded)

### Spliting the data into training and testing sets

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Feature Selection Technique 1: SelectKBest

-  Training a linear regression model on the reduced feature set using SelectKBest
-  Making predictions on the test set using SelectKBest
-  Calculating the mean squared error using SelectKBest

In [10]:
selector = SelectKBest(score_func=f_regression, k=5)
X_train_kbest = selector.fit_transform(X_train, y_train)
X_test_kbest = selector.transform(X_test)

### Train a linear regression model on the reduced feature set using SelectKBest

In [11]:
model_kbest = LinearRegression()
model_kbest.fit(X_train_kbest, y_train)

### Make predictions on the test set using SelectKBest

In [12]:
y_pred_kbest = model_kbest.predict(X_test_kbest)

### Calculate the mean squared error using SelectKBest

In [13]:
mse_kbest = mean_squared_error(y_test, y_pred_kbest)

# Feature Selection Technique 2: PCA

- Training a linear regression model on the reduced feature set using PCA
- Making predictions on the test set using PCA
- Calculating the mean squared error using PCA

In [14]:
pca = PCA(n_components=5)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

### Train a linear regression model on the reduced feature set using PCA

In [15]:
model_pca = LinearRegression()
model_pca.fit(X_train_pca, y_train)

### Make predictions on the test set using PCA

In [16]:
y_pred_pca = model_pca.predict(X_test_pca)

### Calculate the mean squared error using PCA

In [17]:
mse_pca = mean_squared_error(y_test, y_pred_pca)

### Calculate the mean squared error using PCA

In [18]:
mse_pca = mean_squared_error(y_test, y_pred_pca)

# Print the mean squared error for each technique

In [19]:
print("Mean Squared Error using SelectKBest: ", mse_kbest)
print("Mean Squared Error using PCA: ", mse_pca)

Mean Squared Error using SelectKBest:  0.07518167691904357
Mean Squared Error using PCA:  0.07518836292889028


# INFERENCE

### The SelectKBest feature selection strategy appears to have outperformed PCA in terms of lowering the number of features while preserving model performance, based on the mean squared error values. The model trained on the smaller feature set using SelectKBest was able to generate better predictions on the test data, as evidenced by the fact that the MSE using SelectKBest was lower than the MSE using PCA. It's crucial to remember that the selection of the feature selection or reduction strategy should be based on the particular dataset and problem at hand. This is because different techniques may perform better or worse depending on the circumstances.