# Import Libraries

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

from catboost import CatBoostClassifier, Pool

In [17]:
# Single place to repoint the notebook at the folder that holds train.csv and test.csv.
DATA_DIR = 'C:/Users/16010/Desktop/Deep learning from Scratch/kaggle-S4E7/data'

# Load both splits; the 'id' column is dropped from each frame right after reading.
trainset = pd.read_csv(f'{DATA_DIR}/train.csv').drop('id', axis=1)
testset = pd.read_csv(f'{DATA_DIR}/test.csv').drop('id', axis=1)

In [18]:
# Partition the training columns by dtype in one pass:
# object-typed columns go to cat_cols, every other column goes to num_cols.
cat_cols, num_cols = [], []
for col, col_dtype in trainset.dtypes.items():
    if col_dtype == 'object':
        cat_cols.append(col)
    else:
        num_cols.append(col)

In [19]:
# Downcast the training columns: Annual_Premium is stored as int32,
# every remaining non-object column as int16.
trainset['Annual_Premium'] = trainset['Annual_Premium'].astype('int32')

# Rebind num_cols through a filter instead of pop(index(...)): list.index raises
# ValueError once the name is gone, so the original cell failed when executed twice.
num_cols = [col for col in num_cols if col != 'Annual_Premium']
trainset[num_cols] = trainset[num_cols].astype('int16')

In [20]:
# Concatenate train and test dataframes so the label encoders and the factorized
# interaction features use one shared mapping across both splits.
# The train row count is captured first because `trainset` is rebound further down,
# and ignore_index=True keeps the combined frame free of duplicate row labels.
n_train = trainset.shape[0]
df = pd.concat([trainset, testset], ignore_index=True)

categorical_features = ['Gender','Vehicle_Age','Vehicle_Damage']

#Label encode categorical features
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

# Compact integer dtypes for the raw columns. Columns are cast one at a time so the
# whole frame is never copied at once.
column_dtypes = {
    'Vehicle_Age': 'int8',
    'Gender': 'int8',
    'Vehicle_Damage': 'int8',
    'Age': 'int8',
    'Driving_License': 'int8',
    'Region_Code': 'int8',
    'Previously_Insured': 'int8',
    'Annual_Premium': 'int32',
    'Policy_Sales_Channel': 'int16',
    'Vintage': 'int16',
}
for col, col_dtype in column_dtypes.items():
    df[col] = df[col].astype(col_dtype)

# Create the new features by factorizing the concatenated string columns:
# each feature is the integer code of the pair (Previously_Insured, <other column>).
interaction_dtypes = {
    'Annual_Premium': 'int32',
    'Vehicle_Age': 'int8',
    'Vehicle_Damage': 'int8',
    'Vintage': 'int16',
}
for other, code_dtype in interaction_dtypes.items():
    paired = df['Previously_Insured'].astype(str) + df[other].astype(str)
    df[f'Previously_Insured_{other}'] = pd.factorize(paired.to_numpy())[0].astype(code_dtype)
del paired

# Split the combined dataframe back into train and test.
trainset = df.iloc[:n_train].reset_index(drop=True)
# The test rows only carry 'Response' as an all-NaN column created by the concat;
# drop it so the test frame holds exactly the model's feature columns.
testset = df.iloc[n_train:].drop(columns='Response').reset_index(drop=True)
# The concat with label-less test rows upcasts the target to float; restore an integer dtype.
trainset['Response'] = trainset['Response'].astype('int8')

del df

In [21]:
# trainset.info()

In [22]:
# Separate the target column from the predictor columns.
y = trainset['Response']
X = trainset.drop(columns=['Response'])

In [23]:
# Apply the same downcasting to the test frame: Annual_Premium as int32,
# the remaining columns listed in num_cols as int16.
testset['Annual_Premium'] = testset['Annual_Premium'].astype('int32')

# The target column is excluded from the test-side cast. Filtering (instead of
# pop(index(...))) keeps this cell re-runnable: list.index raises ValueError once
# 'Response' has already been removed.
num_cols = [col for col in num_cols if col != 'Response']
testset[num_cols] = testset[num_cols].astype('int16')

In [24]:
# testset.info()

In [25]:
# 5-fold stratified cross-validation of a CatBoost classifier.
# Produces:
#   fold_roc_auc - validation ROC AUC of each fold
#   test_preds   - per-fold positive-class probabilities for the test set
#   val_preds    - out-of-fold class predictions with columns 'index', 'true', 'pred'
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = []
fold_roc_auc = []
fold_val_preds = []

# The hyper-parameters and the test pool do not depend on the fold, so they are built once.
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 100000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False
}
# Every feature column is passed to CatBoost as a categorical feature.
X_test_pool = Pool(testset, cat_features=X.columns.values)

for train_idx, val_idx in skf.split(X, y):
    # skf.split yields positional indices, so both X and y are sliced with .iloc.
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    X_tr_pool = Pool(X_tr, y_tr, cat_features=X.columns.values)
    X_val_pool = Pool(X_val, y_val, cat_features=X.columns.values)

    model = CatBoostClassifier(**cat_params)
    model.fit(X=X_tr_pool, eval_set=X_val_pool, verbose=1000, early_stopping_rounds=200)

    # Class predictions for this fold, kept for the error analysis that follows.
    y_pred = model.predict(X_val_pool)
    fold_val_preds.append(
        pd.DataFrame({'index': val_idx, 'true': y_val.to_numpy(), 'pred': y_pred})
    )

    # Probabilities of the positive class drive the AUC and the test-set predictions.
    y_pred = model.predict_proba(X_val_pool)[:, 1]
    fold_roc_auc.append(roc_auc_score(y_val, y_pred))
    test_preds.append(model.predict_proba(X_test_pool)[:, 1])

# Concatenate once after the loop. Growing an initially empty DataFrame inside the loop
# re-copies earlier folds on every pass, and the recorded run shows a warning pointing
# at that concat line.
val_preds = pd.concat(fold_val_preds, ignore_index=True)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8757758	best: 0.8757758 (0)	total: 673ms	remaining: 18h 41m 28s
1000:	test: 0.8950536	best: 0.8950536 (1000)	total: 9m 33s	remaining: 15h 45m 26s
2000:	test: 0.8952756	best: 0.8952757 (1999)	total: 19m 1s	remaining: 15h 31m 36s
bestTest = 0.895295918
bestIteration = 2320
Shrink model to first 2321 iterations.


  val_preds = pd.concat([val_preds, fold_preds], ignore_index=True)
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8751968	best: 0.8751968 (0)	total: 682ms	remaining: 18h 55m 49s
1000:	test: 0.8946249	best: 0.8946249 (1000)	total: 9m 32s	remaining: 15h 43m 27s
2000:	test: 0.8948678	best: 0.8948702 (1985)	total: 18m 55s	remaining: 15h 27m 15s
bestTest = 0.8948943317
bestIteration = 2492
Shrink model to first 2493 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8754984	best: 0.8754984 (0)	total: 763ms	remaining: 21h 11m 15s
1000:	test: 0.8949112	best: 0.8949112 (1000)	total: 9m 28s	remaining: 15h 37m 32s
2000:	test: 0.8951159	best: 0.8951165 (1984)	total: 18m 52s	remaining: 15h 24m 7s
bestTest = 0.8951507211
bestIteration = 2529
Shrink model to first 2530 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8751477	best: 0.8751477 (0)	total: 628ms	remaining: 17h 26m 16s
1000:	test: 0.8947049	best: 0.8947049 (1000)	total: 9m 30s	remaining: 15h 40m 33s
2000:	test: 0.8949487	best: 0.8949491 (1989)	total: 18m 57s	remaining: 15h 28m 11s
bestTest = 0.894970417
bestIteration = 2246
Shrink model to first 2247 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8762399	best: 0.8762399 (0)	total: 497ms	remaining: 13h 48m 45s
1000:	test: 0.8954354	best: 0.8954354 (1000)	total: 9m 29s	remaining: 15h 38m 6s
2000:	test: 0.8956285	best: 0.8956287 (1967)	total: 19m 1s	remaining: 15h 31m 51s
bestTest = 0.8956375718
bestIteration = 2188
Shrink model to first 2189 iterations.


In [26]:
# Out-of-fold rows whose predicted class differs from the true label.
false_preds = val_preds.loc[val_preds['true'].ne(val_preds['pred'])]
false_indices = false_preds['index'].values
false_rows = X.iloc[false_indices]

# Copy of those feature rows with the true and predicted labels attached positionally.
false_rows_with_info = false_rows.assign(
    true=false_preds['true'].values,
    pred=false_preds['pred'].values,
)

In [27]:
# Cross-validated score: unweighted mean of the per-fold ROC AUC values.
final_roc_auc = np.asarray(fold_roc_auc).mean()
print(final_roc_auc)

0.8951897979469642


In [28]:
# External submissions to blend with, plus the sample submission used as the output template.
# NOTE(review): these are Kaggle input paths, whereas train/test are read from a local
# directory; the recorded FileNotFoundError comes from this mismatch. Repoint as needed.
blend1 = pd.read_csv('/kaggle/input/s04e07-insurance-cross-selling-stacking/sub_stacking_0.89521.csv')
blend2 = pd.read_parquet('/kaggle/input/stacking-xgb-lgbm-catb-ann/submission.parquet')
sub = pd.read_csv("/kaggle/input/playground-series-s4e7/sample_submission.csv")

# Average the per-fold test probabilities, then blend them 50/50 with an external submission.
sub['Response'] = np.mean(test_preds, axis=0)
# Fix: the original referenced the undefined name `blend` here, which raises NameError.
# blend1 is assumed to be the intended frame - TODO confirm.
sub['Response'] = np.mean([blend1['Response'], sub['Response']], axis=0)
sub.to_csv('submission.csv', index=False)

# Average of the two external submissions only.
# NOTE(review): this writes to the same 'submission.csv' and therefore replaces the
# model-based blend written just above - confirm which file is the intended submission.
blend1['Response'] = np.mean([blend1['Response'], blend2['Response']], axis=0)
blend1.to_csv('submission.csv', index=False)

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/input/s04e07-insurance-cross-selling-stacking/sub_stacking_0.89521.csv'