### **Importing the required packages**

In [301]:
import numpy as np
import pandas  as pd

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.decomposition import PCA

### **Reading and Exploring the Data**

In [302]:
df = pd.read_csv('/content/nba_final.csv')

In [303]:
df.head()

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,FG,FGA,FG.,X3P,X3PA,X3P.,X2P,X2PA,X2P.,eFG.,FT,FTA,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Salary,mean_views,Season,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
0,170,A.J. Hammons,hammoaj01,C,,24,DAL,22,0,7.4,0.8,1.9,0.405,0.2,0.5,0.5,0.5,1.5,0.375,0.464,0.4,0.9,0.45,0.4,1.3,1.6,0.2,0.0,0.6,0.5,1.0,2.2,,3.32,2016-17,West,Front,786,123,,,,,83.5,No
1,58,Aaron Brooks,brookaa01,PG,,32,IND,65,0,13.8,1.9,4.6,0.403,0.7,2.0,0.375,1.1,2.6,0.424,0.483,0.5,0.6,0.8,0.3,0.8,1.1,1.9,0.4,0.1,1.0,1.4,5.0,2700000.0,11.155738,2016-17,Est,Back,2474,64,,,,,48.2,No
2,157,Aaron Gordon,gordoaa01,SF,,21,ORL,80,72,28.7,4.9,10.8,0.454,1.0,3.3,0.288,4.0,7.5,0.528,0.499,2.0,2.7,0.719,1.5,3.6,5.1,1.9,0.8,0.5,1.1,2.2,12.7,4351320.0,1713.986339,2016-17,Est,Front,22774,29,,,,,40.0,No
3,352,Adreian Payne,paynead01,PF,,25,MIN,18,0,7.5,1.3,3.0,0.426,0.2,0.8,0.2,1.1,2.2,0.513,0.454,0.8,1.1,0.737,0.5,1.3,1.8,0.4,0.4,0.4,0.4,1.8,3.5,2022240.0,205.855191,2016-17,West,Front,861,120,1.0,52.0,,,75.5,No
4,10,Al-Farouq Aminu,aminual01,PF,,26,POR,61,25,29.1,3.0,7.6,0.393,1.1,3.5,0.33,1.9,4.2,0.445,0.468,1.6,2.2,0.706,1.3,6.1,7.4,1.6,1.0,0.7,1.5,1.7,8.7,7680965.0,604.34153,2016-17,West,Front,4971,69,7.0,23.0,,,42.8,No


In [304]:
df.shape

(1408, 45)

In [305]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1408 entries, 0 to 1407
Data columns (total 45 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Rk          1408 non-null   int64  
 1   Player.x    1408 non-null   object 
 2   Player_ID   1408 non-null   object 
 3   Pos1        1408 non-null   object 
 4   Pos2        12 non-null     object 
 5   Age         1408 non-null   int64  
 6   Tm          1408 non-null   object 
 7   G           1408 non-null   int64  
 8   GS          1408 non-null   int64  
 9   MP          1408 non-null   float64
 10  FG          1408 non-null   float64
 11  FGA         1408 non-null   float64
 12  FG.         1404 non-null   float64
 13  X3P         1408 non-null   float64
 14  X3PA        1408 non-null   float64
 15  X3P.        1309 non-null   float64
 16  X2P         1408 non-null   float64
 17  X2PA        1408 non-null   float64
 18  X2P.        1393 non-null   float64
 19  eFG.        1404 non-null  

In [306]:
# If there was a misclassified datatype
# df[''] = df[''].astype(dtype)

In [307]:
df.isnull().sum() # print the total number of missing values columns-wise

Unnamed: 0,0
Rk,0
Player.x,0
Player_ID,0
Pos1,0
Pos2,1396
Age,0
Tm,0
G,0
GS,0
MP,0


In [308]:
df.isnull().sum(axis=1) # print the total number of missing values row-wise

Unnamed: 0,0
0,6
1,5
2,5
3,3
4,3
...,...
1403,1
1404,1
1405,4
1406,1


In [309]:
df[df.isnull().sum(axis=1) > 6]

Unnamed: 0,Rk,Player.x,Player_ID,Pos1,Pos2,Age,Tm,G,GS,MP,FG,FGA,FG.,X3P,X3PA,X3P.,X2P,X2PA,X2P.,eFG.,FT,FTA,FT.,ORB,DRB,TRB,AST,STL,BLK,TOV,PF,PTS,Salary,mean_views,Season,Conference,Role,Fvot,FRank,Pvot,PRank,Mvot,MRank,Score,Play
27,341,Arinze Onuaku,onuakar01,C,,29,ORL,8,0,3.5,0.3,0.5,0.5,0.0,0.0,,0.3,0.5,0.5,0.5,0.0,0.0,,0.3,0.5,0.8,0.3,0.0,0.1,0.3,0.4,0.5,426775.0,,2016-17,Est,Front,216,129,,,,,90.0,No
78,235,Damian Jones,jonesda03,C,,21,GSW,10,0,8.5,0.8,1.6,0.5,0.0,0.0,,0.8,1.6,0.5,0.5,0.3,1.0,0.3,0.9,1.4,2.3,0.0,0.1,0.4,0.6,1.5,1.9,1171560.0,,2016-17,West,Front,12176,46,,,,,45.0,No
86,204,Danuel House,houseda01,SG,,23,WAS,1,0,1.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,543471.0,74.25,2016-17,Est,Front,653,121,2.0,40.0,,,73.5,No
169,360,Jakob Pöltl,poeltja01,C,,21,TOR,54,4,11.6,1.2,2.1,0.583,0.0,0.0,,1.2,2.1,0.583,0.583,0.6,1.1,0.544,1.4,1.6,3.1,0.2,0.3,0.4,0.5,2.1,3.1,,,2016-17,Est,Front,42,73,,,,,62.0,No
472,348,Ben Moore,moorebe01,PF,,22,IND,2,0,4.5,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,0.5,0.5,0.5,0.0,0.0,0.0,2.0,0.0,77250.0,,2017-18,Est,Front,267,136,0.0,74.0,0.0,8.0,158.5,No
904,307,Tyler Lydon,lydonty01,PF,,21,DEN,1,0,2.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1579440.0,,2017-18,West,Front,850,135,1.0,47.0,0.0,9.0,118.2,No
1089,282,George King,kingge03,SF,,25,PHO,1,0,6.0,0.0,0.0,,0.0,0.0,,0.0,0.0,,,0.0,0.0,,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,77250.0,,2018-19,West,Front,2964,119,0.0,84.0,0.0,8.0,152.0,No


In [310]:
df = df[df.isnull().sum(axis=1) <= 6]
# alternate method of running the above condition
# df = df[~(df.isnull().sum(axis=1) >6)]

In [311]:
df['Pos2']

Unnamed: 0,Pos2
0,
1,
2,
3,
4,
...,...
1403,
1404,
1405,
1406,


In [312]:
df['Pos2'] = df['Pos2'].fillna('None')

In [313]:
df.isnull().sum()

Unnamed: 0,0
Rk,0
Player.x,0
Player_ID,0
Pos1,0
Pos2,0
Age,0
Tm,0
G,0
GS,0
MP,0


In [314]:
df.isnull().mean()*100 # Percentage of missing values

Unnamed: 0,0
Rk,0.0
Player.x,0.0
Player_ID,0.0
Pos1,0.0
Pos2,0.0
Age,0.0
Tm,0.0
G,0.0
GS,0.0
MP,0.0


In [315]:
df['Salary']

Unnamed: 0,Salary
0,
1,2700000.0
2,4351320.0
3,2022240.0
4,7680965.0
...,...
1403,3628920.0
1404,19500000.0
1405,77250.0
1406,2393887.0


In [316]:
df = df.fillna(0)

### **Check for Duplicates**

In [317]:
df.duplicated().sum()

np.int64(0)

In [318]:
df.drop_duplicates(inplace =True) # drop all the duplicate rows

In [319]:
# df.drop_duplicates(subset=['Pos2', 'Role', 'Pvot'])

In [320]:
df.drop(columns=['Player.x','Player_ID'], inplace=True)

### **Encode the Categorical columns values**

In [321]:
obj_cols = df.select_dtypes('object').columns

In [322]:
le = LabelEncoder()

for  col in obj_cols:
  df[col] = le.fit_transform(df[col])

In [323]:
df['Play'].value_counts()

Unnamed: 0_level_0,count
Play,Unnamed: 1_level_1
0,1328
1,73


### **Machine Learning Process**

In [324]:
X = df.drop(columns= 'Play')
y = df['Play']

In [325]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2,stratify=y , random_state=34)

### **Standardization**


In [326]:
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

### **Apply Logistic Regression on the data**

In [327]:
log_reg = LogisticRegression()
log_reg.fit(X_train_scaled, y_train)

In [328]:
y_pred = log_reg.predict(X_test_scaled)

In [329]:
accuracy_score(y_test,y_pred)*100

98.57651245551602

In [330]:
roc_auc_score(y_test, y_pred)*100

np.float64(86.66666666666667)

### **Now Let's transfom the data using PCA**

In [331]:
pca = PCA(n_components=0.95)

In [332]:
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

In [333]:
X_train_pca.shape

(1120, 22)

In [334]:
pd.DataFrame(X_train_pca)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,1.743592,-2.962779,1.275286,-2.144956,-1.356296,-1.767555,0.617016,0.443513,0.291809,0.712095,0.439202,-0.300581,-0.387854,-0.854399,-0.932270,-0.714240,0.380290,-0.113331,-0.894713,0.604208,0.229382,-0.698032
1,-1.720394,2.632226,-0.588966,-2.523427,0.899243,0.856896,1.107480,-1.559127,0.197596,-0.884529,0.259799,0.380768,-0.284205,-0.141527,0.034378,0.135299,0.563018,-0.207073,-0.069672,-0.011686,1.370912,0.109538
2,-3.207607,2.427227,0.771961,-0.750912,-0.557805,0.303317,0.270859,-0.370691,1.016623,-1.097607,0.266412,-0.099761,-1.745636,0.772155,0.819823,-0.499726,-0.007913,-0.717638,0.411349,0.525933,-0.036536,0.909073
3,-1.095497,-1.300507,0.204654,-3.130417,1.380971,0.593637,-0.694676,1.479025,-1.153294,0.988956,-0.252806,1.298878,0.645045,-0.194782,1.535337,-0.002117,-1.237932,0.223093,-0.690182,0.441277,-0.807869,0.434454
4,2.829147,-3.748800,0.595624,-2.359235,-1.455519,-0.485318,0.249235,0.906769,-0.322096,-0.115170,-0.857834,1.156728,1.168809,0.681318,-0.485190,0.480073,-0.355783,0.287400,-0.635689,-0.057565,-0.065620,-0.369492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1115,2.059740,1.001131,-0.322681,-1.872604,-0.934275,0.359340,-1.745301,-0.842281,0.536548,-0.460131,-0.106421,-0.631828,0.664896,1.585242,-0.831777,0.135593,-0.063579,0.400679,0.321901,1.383374,0.012538,-0.466676
1116,-4.724008,1.657865,1.915426,0.436659,-1.227938,0.793730,1.783531,-0.390004,0.485434,-1.114174,0.940638,0.234893,-0.546840,-0.714257,0.866837,-0.703750,-0.295037,-0.495044,0.381727,-0.181213,0.245849,-0.353612
1117,-2.272954,0.716595,-0.974287,0.954028,0.120327,-0.478525,0.233415,-0.487869,-0.518456,0.964609,0.006862,0.457332,-1.744528,-0.287922,-1.122352,-0.104895,0.355996,0.879286,-0.589149,0.432826,0.368559,0.927371
1118,-1.392386,-0.351281,-1.686259,0.454432,0.872558,0.098578,0.730712,-1.833962,0.609412,-0.871443,-0.557313,-0.745661,-1.742329,0.512685,-1.254044,-0.999474,0.016982,1.434386,-0.255540,0.523921,0.630223,0.780769


### **Applying Logistic Regression on the PCA transformed Data**

In [335]:
log_reg2 = LogisticRegression()
log_reg2.fit(X_train_pca, y_train)

In [336]:
y_pred2 = log_reg2.predict(X_test_pca)

In [337]:
roc_auc_score(y_test, y_pred2)*100

np.float64(80.0)