In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt



In [3]:
!unzip /content/playground-series-s4e1.zip
!unzip /content/archive.zip

Archive:  /content/playground-series-s4e1.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               
Archive:  /content/archive.zip
  inflating: Churn_Modelling.csv     


In [4]:
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")
orig_df = pd.read_csv("/content/Churn_Modelling.csv")

test_ids = test_df.id

In [5]:
orig_df.rename(columns = {"RowNumber": "id"}, inplace=True)

In [6]:
data = pd.concat([orig_df, train_df], axis=0, ignore_index=True)
data.drop_duplicates()
target = data.Exited

data.drop(["id", "Exited", "Surname", "CustomerId"], axis=1, inplace=True)
test_df.drop(["id", "Surname", "CustomerId"], axis=1, inplace=True)
data_concat = pd.concat([data, test_df], axis=0)

In [10]:
assert len(test_df) + len(data) == len(data_concat)

In [11]:
data_concat.shape, data.shape, test_df.shape, target.shape

((285057, 10), (175034, 10), (110023, 10), (175034,))

In [7]:
categ_cols = data_concat.select_dtypes(include="object").columns.tolist()
num_cols = data_concat.select_dtypes(exclude="object").columns.tolist()


In [8]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

encoder = LabelEncoder()
for col in categ_cols:
  data_concat[col] = encoder.fit_transform(data_concat[col])

for col in num_cols:
  encoder = MinMaxScaler()
  encoder.fit(np.array(data_concat[col]).reshape(-1, 1))
  data_concat[col] = encoder.transform(np.array(data_concat[col]).reshape(-1, 1))

data_concat.columns = data_concat.columns.astype(str)

In [9]:
import scipy

skew_df = pd.DataFrame(data_concat.select_dtypes(np.number).columns, columns=['Feature'])
skew_df['Skew'] = skew_df['Feature'].apply(lambda feature: scipy.stats.skew(data_concat[feature]))
skew_df['Absolute skew'] = skew_df['Skew'].apply(abs)
skew_df['Skewed'] = skew_df['Absolute skew'].apply(lambda x: True if x >= 0.5 else False                                                                              )
skew_df

Unnamed: 0,Feature,Skew,Absolute skew,Skewed
0,CreditScore,-0.064673,0.064673,False
1,Geography,0.722024,0.722024,True
2,Gender,-0.254914,0.254914,False
3,Age,0.97715,0.97715,True
4,Tenure,0.012382,0.012382,False
5,Balance,0.363909,0.363909,False
6,NumOfProducts,0.376797,0.376797,False
7,HasCrCard,-1.16646,1.16646,True
8,IsActiveMember,0.010405,0.010405,False
9,EstimatedSalary,-0.299008,0.299008,False


In [10]:
data_concat.describe()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
count,285057.0,285057.0,285057.0,285057.0,285057.0,285057.0,285057.0,285057.0,285057.0,285057.0
mean,0.612552,0.649905,0.563217,0.27233,0.501093,0.223833,0.184389,0.751902,0.497399,0.560179
std,0.161659,0.816555,0.495988,0.120649,0.280923,0.250743,0.182493,0.43191,0.499994,0.253068
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.494,0.0,0.0,0.189189,0.3,0.0,0.0,1.0,0.0,0.368327
50%,0.618,0.0,1.0,0.256757,0.5,0.0,0.333333,1.0,0.0,0.585154
75%,0.72,1.0,1.0,0.324324,0.7,0.48067,0.333333,1.0,1.0,0.773606
max,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [11]:
for col in skew_df.query("Skewed == True")["Feature"].values:
  data_concat[col] = np.log1p(data_concat[col])

In [12]:
!pip install -q pycaret

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.7/484.7 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.4/73.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m81.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m160.5/160.5 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.8/106.8 kB[0m [31m13.7 M

In [13]:
train_final = data_concat.loc[:data.index.max(), :].copy()
test_final = data_concat.loc[data.index.max():, :].reset_index(drop=True).copy()

In [15]:
from pycaret.regression import setup, compare_models

_ = setup(data=pd.concat([data, target], axis=1)[:6000], target='Exited')

KeyboardInterrupt: 

In [38]:
compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,0.2229,0.108,0.3285,0.3567,0.2305,0.5158,0.588
lightgbm,Light Gradient Boosting Machine,0.217,0.1125,0.3352,0.3296,0.2357,0.4998,0.661
rf,Random Forest Regressor,0.2224,0.1157,0.3399,0.31,0.2413,0.4905,1.261
et,Extra Trees Regressor,0.2213,0.1202,0.3465,0.2833,0.2461,0.4869,0.615
ada,AdaBoost Regressor,0.2847,0.1246,0.3529,0.2578,0.2543,0.5476,0.263
xgboost,Extreme Gradient Boosting,0.2431,0.1287,0.3586,0.2323,0.2525,0.5146,0.235
br,Bayesian Ridge,0.2935,0.143,0.378,0.1486,0.2656,0.667,0.153
lr,Linear Regression,0.2935,0.143,0.378,0.1486,0.2657,0.6651,0.897
lar,Least Angle Regression,0.2935,0.143,0.378,0.1486,0.2657,0.6649,0.102
ridge,Ridge Regression,0.2935,0.143,0.378,0.1486,0.2657,0.6651,0.1


Processing:   0%|          | 0/81 [00:00<?, ?it/s]

In [16]:
from sklearn.model_selection import train_test_split, StratifiedKFold

# Initializing Stratified K-Fold with 5 folds
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [17]:
import lightgbm as lgb

# Parameters for the LightGBM model
params = {
    'max_depth': 15,
    'min_child_samples': 13,
    'learning_rate': 0.05285597081335651,
    'n_estimators': 584,
    'min_child_weight': 5,
    'subsample': 0.7717873512945741,
    'colsample_bytree': 0.10012816493265511,
    'reg_alpha': 0.8767668608061822,
    'reg_lambda': 0.8705834466355764,
    'random_state': 42,
    'verbose': -1
}

# Training multiple LightGBM models using Stratified K-Fold
LGBModel = lgb.LGBMClassifier(**params)

In [18]:
import xgboost as xgb


# Parameters for the XGBoost model
params = {
    'n_estimators': 767,
    'learning_rate': 0.03820381248841593,
    'max_depth': 9,
    'subsample': 0.5717706003972762,
    'colsample_bytree': 0.1386492762520236,
    'min_child_weight': 6,
    'random_state': 42,
    'verbose': -1
}

xgb_model = xgb.XGBClassifier(**params)

In [19]:
from sklearn.ensemble import GradientBoostingClassifier

params = {
    'n_estimators': 867,
    'learning_rate': 0.03420381248841593,
    'subsample': 0.5717706003972762,
}

gbc_model = GradientBoostingClassifier(**params)

In [20]:
!pip install -q catboost

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [21]:
from catboost import CatBoostClassifier


# Parameters for the CatBoost model
params = {
    'logging_level': 'Silent',
    'random_seed': 42,
    'iterations': 593,
    'depth': 43,
    'min_data_in_leaf': 42,
    'learning_rate': 0.025456006693305914,
    'subsample': 0.8018560299887264,
    'random_strength': 0.04176274518438195,
    'grow_policy': 'Lossguide',
    'bootstrap_type' : 'Bernoulli',
    # 'bootstrap_type': 'Poisson'
}

cat_model = CatBoostClassifier(**params)

In [22]:
from sklearn.ensemble import RandomForestClassifier

params = {
    'n_estimators': 800,
    'max_depth': 19,
    'min_samples_split': 3,
    'min_samples_leaf': 20,
    'max_features': 'auto',
    'random_state': 42  # Set a random state for reproducibility
}


rf_model = RandomForestClassifier(**params)

In [23]:
from sklearn.ensemble import StackingClassifier
from sklearn.neural_network import MLPClassifier

# Initializing an MLPClassifier
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32),
    max_iter=1000,
    random_state=42,
    activation='relu',
    learning_rate_init=0.001,
    solver='adam',
    validation_fraction=0.1,
    momentum=0.9,
    nesterovs_momentum=True,
    batch_size=32,
    beta_1=0.9,
    beta_2=0.999
)

# Creating a StackingClassifier
stacking_model = StackingClassifier(
    estimators=[
        ('LGBM', LGBModel),
        ('XGB', xgb_model),
        ('CAT', cat_model),
        ('GBC', gbc_model),
        ('RF', rf_model),
    ],
    final_estimator=mlp,
    cv=skf
)

In [24]:
%%capture

stacking_model.fit(train_final, target)

In [25]:
preds = stacking_model.predict_proba(test_final[1:])

In [149]:
test_ids.shape, test_final.shape

((110023,), (110024, 10))

In [26]:
preds = pd.DataFrame(preds[:,1:])
preds.columns = ['Exited']

In [27]:
preds_final = pd.concat([test_ids, preds], axis=1)

In [163]:
preds_final.tail

<bound method NDFrame.tail of             id    Exited
0       165034  0.020965
1       165035  0.763849
2       165036  0.041352
3       165037  0.180972
4       165038  0.407128
...        ...       ...
110018  275052  0.045466
110019  275053  0.164214
110020  275054  0.019496
110021  275055  0.105748
110022  275056  0.187407

[110023 rows x 2 columns]>

In [84]:
preds_final.head

<bound method NDFrame.head of             id       0
0       165034  Exited
0       165034       0
1       165035       1
2       165036       0
3       165037       0
...        ...     ...
110017  275051       0
110018  275052       0
110019  275053       0
110020  275054       0
110021  275055       0

[110023 rows x 2 columns]>

In [28]:
preds_final.to_csv("submission.csv", index=False, header=True)

In [None]:
output_df.to_csv('submission.csv', index=False)