2. Consider the dataset at the link Santander Customer Satisfaction | Kaggle.
Do the following:
Find out how many principal components explain more than 90% variation taking 
all the variables except ID and target. 
Try the following models with PCA transform (Pipeline):
- Random Forest
- X G Boost
- Cat Boost
- Light GBM
Mention the leaderboard scores if possible

In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import os
from warnings import filterwarnings

In [2]:
filterwarnings('ignore')

os.chdir(r"D:\March 2024\PML\Assignment\Assignment 2 - Santander Customer Satisfaction")

In [3]:
train = pd.read_csv("train.csv", index_col=0)
print(train.isnull().sum().sum())

test = pd.read_csv("test.csv", index_col=0)
print(test.isnull().sum().sum())

0
0


In [4]:
X_train = train.drop(['TARGET'], axis = 1)
y_train = train['TARGET']
X_test = test.copy()

In [5]:
scaler = StandardScaler().set_output(transform='pandas')
train_scl = scaler.fit_transform(train)

In [6]:
pca = PCA().set_output(transform='pandas')
principalComponents = pca.fit_transform(train_scl)

Principal components explain more than 90% variation taking all the variables except ID and target.

In [7]:
cumsum = pd.Series(np.cumsum(pca.explained_variance_ratio_*100))
print(cumsum)

0        8.057630
1       13.272155
2       16.905268
3       20.373755
4       23.778130
          ...    
365    100.000000
366    100.000000
367    100.000000
368    100.000000
369    100.000000
Length: 370, dtype: float64


In [8]:
pca_90 = (cumsum<90).sum()+1
print("Principal components needed to explain more than 90% variation",pca_90)

Principal components needed to explain more than 90% variation 76


In [9]:
prcomp = PCA(n_components=pca_90).set_output(transform='pandas')

## RandomForestClassifier

In [10]:
rfc = RandomForestClassifier(random_state=24)

pipe_rfc = Pipeline([('SCL', scaler), ('PCA', prcomp), ('TREE', rfc)]) 

pipe_rfc.fit(X_train, y_train)

In [None]:
y_pred_rfc = pipe_rfc.predict(X_test)
y_pred_prob_rfc = pipe_rfc.predict_proba(X_test)
y_pred_prob_rfc = y_pred_prob_rfc [:,1]

In [None]:
submit_rfr = pd.DataFrame({'ID': list(test.index), 'TARGET': y_pred_prob_rfc})

print("\nFor RandomForest Regressor")
print(submit_rfr)


For RandomForest Regressor
           ID    TARGET
0           2  0.060000
1           5  0.030000
2           6  0.010000
3           7  0.060000
4           9  0.039762
...       ...       ...
75813  151831  0.320000
75814  151832  0.190000
75815  151833  0.030000
75816  151834  0.310000
75817  151837  0.024000

[75818 rows x 2 columns]


In [None]:
submit_rfr.to_csv('Santander-RandomForestClassifier.csv', index=False)

## XGBClassifier

In [None]:
xgb = XGBClassifier()

pipe_xgb = Pipeline([('SCL', scaler), ('PCA', prcomp), ('XGB', xgb)]) 

pipe_xgb.fit(X_train, y_train)

In [None]:
y_pred_xgb = pipe_xgb.predict(X_test)
y_pred_prob_xgb = pipe_xgb.predict_proba(X_test)
y_pred_prob_xgb = y_pred_prob_xgb [:,1]

In [None]:
submit_xgb = pd.DataFrame({'ID': list(test.index), 'TARGET': y_pred_prob_xgb})

print("\nFor XGB Classifier")
print(submit_xgb)


For XGB Classifier
           ID    TARGET
0           2  0.035947
1           5  0.020006
2           6  0.001776
3           7  0.006436
4           9  0.000420
...       ...       ...
75813  151831  0.059518
75814  151832  0.034790
75815  151833  0.001660
75816  151834  0.048362
75817  151837  0.000947

[75818 rows x 2 columns]


In [None]:
submit_xgb.to_csv('Santander-XGBClassifier.csv', index=False)

## CatBoostClassifier

In [None]:
cat = CatBoostClassifier()

pipe_cat = Pipeline([('SCL', scaler), ('PCA', prcomp), ('CAT', cat)]) 

pipe_cat.fit(X_train, y_train)

Learning rate set to 0.065477
0:	learn: 0.5925999	total: 218ms	remaining: 3m 37s
1:	learn: 0.5108762	total: 242ms	remaining: 2m
2:	learn: 0.4433857	total: 266ms	remaining: 1m 28s
3:	learn: 0.3902014	total: 289ms	remaining: 1m 11s
4:	learn: 0.3507235	total: 311ms	remaining: 1m 1s
5:	learn: 0.3129792	total: 335ms	remaining: 55.5s
6:	learn: 0.2823807	total: 359ms	remaining: 50.9s
7:	learn: 0.2593898	total: 382ms	remaining: 47.4s
8:	learn: 0.2400831	total: 405ms	remaining: 44.6s
9:	learn: 0.2247434	total: 428ms	remaining: 42.4s
10:	learn: 0.2120966	total: 452ms	remaining: 40.7s
11:	learn: 0.2006892	total: 476ms	remaining: 39.2s
12:	learn: 0.1912323	total: 500ms	remaining: 38s
13:	learn: 0.1832906	total: 523ms	remaining: 36.8s
14:	learn: 0.1773016	total: 545ms	remaining: 35.8s
15:	learn: 0.1722191	total: 569ms	remaining: 35s
16:	learn: 0.1676791	total: 592ms	remaining: 34.3s
17:	learn: 0.1642097	total: 615ms	remaining: 33.6s
18:	learn: 0.1610736	total: 638ms	remaining: 33s
19:	learn: 0.1584

In [None]:
y_pred_cat = pipe_cat.predict(X_test)
y_pred_prob_cat = pipe_cat.predict_proba(X_test)
y_pred_prob_cat = y_pred_prob_cat [:,1]

In [None]:
submit_cat = pd.DataFrame({'ID': list(test.index), 'TARGET': y_pred_prob_cat})

print("\nFor CatBoost Classifier")
print(submit_cat)


For CatBoost Classifier
           ID    TARGET
0           2  0.034197
1           5  0.023592
2           6  0.004566
3           7  0.012528
4           9  0.001949
...       ...       ...
75813  151831  0.108376
75814  151832  0.019316
75815  151833  0.010565
75816  151834  0.054278
75817  151837  0.001652

[75818 rows x 2 columns]


In [None]:
submit_cat.to_csv('Santander-CatBoostClassifier.csv', index=False)

## LGBMClassifier

In [None]:
lgmb = LGBMClassifier()

pipe_lgmb = Pipeline([('SCL', scaler), ('PCA', prcomp), ('LGBM', lgmb)]) 

pipe_lgmb.fit(X_train, y_train)

  File "c:\Users\Administrator.DAI-PC2\.conda\envs\New_Env\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


[LightGBM] [Info] Number of positive: 3008, number of negative: 73012
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.028041 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 19380
[LightGBM] [Info] Number of data points in the train set: 76020, number of used features: 76
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.039569 -> initscore=-3.189348
[LightGBM] [Info] Start training from score -3.189348


In [None]:
y_pred_lgmb = pipe_lgmb.predict(X_test)
y_pred_prob_lgmb = pipe_lgmb.predict_proba(X_test)
y_pred_prob_lgmb = y_pred_prob_lgmb [:,1]

In [None]:
submit_lgbm = pd.DataFrame({'ID': list(test.index), 'TARGET': y_pred_prob_lgmb})

print("\nFor LGBMClassifier")
print(submit_lgbm)


For LGBMClassifier
           ID    TARGET
0           2  0.031369
1           5  0.047384
2           6  0.004798
3           7  0.010803
4           9  0.001997
...       ...       ...
75813  151831  0.089510
75814  151832  0.017213
75815  151833  0.006006
75816  151834  0.059209
75817  151837  0.002150

[75818 rows x 2 columns]


In [None]:
submit_lgbm.to_csv('Santander-LGBMClassifier.csv', index=False)