# Import Libraries

In [14]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

from catboost import CatBoostClassifier, Pool

In [15]:
# Read the raw train and test tables; the 'id' column is discarded right away.
trainset = pd.read_csv('/root/autodl-tmp/train.csv').drop(columns='id')
testset = pd.read_csv('/root/autodl-tmp/test.csv').drop(columns='id')

In [16]:
# Partition the training columns by dtype: object-typed columns are treated
# as categorical, everything else as numeric.
cat_cols = [name for name, dtype in trainset.dtypes.items() if dtype == 'object']
num_cols = [name for name in trainset.columns if name not in cat_cols]

In [17]:
# 'Annual_Premium' gets its own int32 cast (presumably its values do not fit
# in int16 -- confirm against the data) and is taken out of num_cols so the
# bulk int16 cast below skips it. Note that num_cols is mutated in place and
# is reused later in the notebook.
trainset['Annual_Premium'] = trainset['Annual_Premium'].astype('int32')
num_cols.remove('Annual_Premium')
trainset[num_cols] = trainset[num_cols].astype('int16')

In [18]:
# Stack train and test so the label encoders and the factorized feature
# crosses share one vocabulary across both sets.
# The train row count is captured *before* trainset is rebound, and
# ignore_index=True gives the combined frame unique row labels.
n_train = trainset.shape[0]
df = pd.concat([trainset, testset], ignore_index=True)

categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

# Label encode categorical features
for col in categorical_features:
    df[col] = LabelEncoder().fit_transform(df[col].astype(str))

# Downcast every base feature in one pass
df = df.astype({
    'Vehicle_Age': 'int8',
    'Gender': 'int8',
    'Vehicle_Damage': 'int8',
    'Age': 'int8',
    'Driving_License': 'int8',
    'Region_Code': 'int8',
    'Previously_Insured': 'int8',
    'Annual_Premium': 'int32',
    'Policy_Sales_Channel': 'int16',
    'Vintage': 'int16',
})

# Create the new features by factorizing the concatenated string columns:
# each cross is 'Previously_Insured' joined with one other column, encoded as
# integer codes in order of first appearance and stored with the listed dtype.
cross_dtypes = {
    'Annual_Premium': 'int32',
    'Vehicle_Age': 'int8',
    'Vehicle_Damage': 'int8',
    'Vintage': 'int16',
}
previously_insured_str = df['Previously_Insured'].astype(str)
for col, dtype in cross_dtypes.items():
    combined = (previously_insured_str + df[col].astype(str)).to_numpy()
    df[f'Previously_Insured_{col}'] = pd.factorize(combined)[0].astype(dtype)

# Split the combined dataframe back into train and test
trainset = df.iloc[:n_train].reset_index(drop=True)
# Test rows have no label: drop the all-NaN 'Response' column that the concat
# introduced so it is never handed to the model as a feature.
testset = df.iloc[n_train:].drop(columns='Response').reset_index(drop=True)
# The NaNs in the test rows upcast 'Response' to float; restore the integer
# dtype it had before the concat.
trainset['Response'] = trainset['Response'].astype('int16')

del df

In [19]:
# trainset.info()

In [20]:
# Separate the target column from the feature matrix.
y = trainset['Response']
X = trainset.drop(columns='Response')

In [21]:
# Apply the same dtype scheme to the test frame: int32 for 'Annual_Premium',
# int16 for the remaining numeric columns. 'Response' is removed from the
# shared num_cols list first so it is excluded from the int16 cast.
testset['Annual_Premium'] = testset['Annual_Premium'].astype('int32')
num_cols.remove('Response')
testset[num_cols] = testset[num_cols].astype('int16')

In [22]:
# testset.info()

In [23]:
# 5-fold stratified cross-validation. Each fold trains a CatBoost classifier,
# scores the held-out split with ROC AUC and predicts the test set.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = []     # per-fold positive-class probabilities for the test set
fold_roc_auc = []   # per-fold validation ROC AUC
fold_frames = []    # per-fold out-of-fold class predictions, concatenated once below

# Every feature column is handed to CatBoost as a categorical feature.
cat_feature_names = X.columns.values

# Hyperparameters are the same for every fold, so they are defined once.
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 100000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False,
}

# The test pool does not depend on the fold, so it is built a single time.
X_test_pool = Pool(testset, cat_features=cat_feature_names)

for train_idx, val_idx in skf.split(X, y):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    # skf.split yields row positions, so the target is indexed positionally too.
    y_tr, y_val = y.iloc[train_idx], y.iloc[val_idx]

    X_tr_pool = Pool(X_tr, y_tr, cat_features=cat_feature_names)
    X_val_pool = Pool(X_val, y_val, cat_features=cat_feature_names)

    model = CatBoostClassifier(**cat_params)
    # 'iterations' is only an upper bound: training stops once the validation
    # metric has not improved for 200 rounds.
    model.fit(X=X_tr_pool, eval_set=X_val_pool, verbose=1000, early_stopping_rounds=200)

    # Hard class labels, kept for the misclassification analysis.
    y_pred = model.predict(X_val_pool)
    fold_frames.append(pd.DataFrame({'index': val_idx, 'true': y_val, 'pred': y_pred}))

    # Positive-class probabilities drive the AUC and the test predictions.
    y_pred = model.predict_proba(X_val_pool)[:, 1]
    fold_roc_auc.append(roc_auc_score(y_val, y_pred))
    test_preds.append(model.predict_proba(X_test_pool)[:, 1])

# Concatenate once instead of growing an initially empty frame inside the
# loop: this avoids pandas' FutureWarning about concatenating empty entries
# and keeps the 'index' / 'true' / 'pred' columns in their natural dtypes.
val_preds = pd.concat(fold_frames, ignore_index=True)

Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8757758	best: 0.8757758 (0)	total: 209ms	remaining: 5h 49m 7s
1000:	test: 0.8948372	best: 0.8948372 (1000)	total: 3m 44s	remaining: 6h 9m 18s
2000:	test: 0.8950548	best: 0.8950548 (2000)	total: 7m 27s	remaining: 6h 4m 55s
bestTest = 0.8950665891
bestIteration = 2215
Shrink model to first 2216 iterations.


  val_preds = pd.concat([val_preds, fold_preds], ignore_index=True)
Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8751968	best: 0.8751968 (0)	total: 206ms	remaining: 5h 42m 43s
1000:	test: 0.8944306	best: 0.8944306 (1000)	total: 3m 44s	remaining: 6h 10m 37s
2000:	test: 0.8946346	best: 0.8946407 (1963)	total: 7m 27s	remaining: 6h 4m 54s
bestTest = 0.8946615458
bestIteration = 2521
Shrink model to first 2522 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8754973	best: 0.8754973 (0)	total: 208ms	remaining: 5h 45m 53s
1000:	test: 0.8947486	best: 0.8947487 (993)	total: 3m 44s	remaining: 6h 10m 38s
bestTest = 0.8949151635
bestIteration = 1770
Shrink model to first 1771 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8751477	best: 0.8751477 (0)	total: 244ms	remaining: 6h 47m 18s
1000:	test: 0.8945301	best: 0.8945301 (1000)	total: 3m 44s	remaining: 6h 10m 40s
2000:	test: 0.8947668	best: 0.8947668 (2000)	total: 7m 26s	remaining: 6h 4m 11s
3000:	test: 0.8947908	best: 0.8948044 (2814)	total: 11m 8s	remaining: 6h 1s
bestTest = 0.8948043585
bestIteration = 2814
Shrink model to first 2815 iterations.


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8762399	best: 0.8762399 (0)	total: 210ms	remaining: 5h 50m 8s
1000:	test: 0.8952098	best: 0.8952098 (1000)	total: 3m 43s	remaining: 6h 7m 46s
2000:	test: 0.8954019	best: 0.8954073 (1900)	total: 7m 25s	remaining: 6h 3m 20s
bestTest = 0.8954072595
bestIteration = 1900
Shrink model to first 1901 iterations.


In [24]:
# Out-of-fold rows whose predicted class disagrees with the true label.
false_preds = val_preds[val_preds['true'] != val_preds['pred']]

# Cast the stored row positions to a real integer dtype: .iloc rejects
# non-numeric (e.g. object-typed) indexers, which is what this column can
# become if val_preds was accumulated starting from an empty frame.
false_indices = false_preds['index'].to_numpy(dtype='int64')
false_rows = X.iloc[false_indices]

# Feature rows of the misclassified samples, annotated with label and prediction.
false_rows_with_info = false_rows.assign(
    true=false_preds['true'].to_numpy(),
    pred=false_preds['pred'].to_numpy(),
)

In [25]:
# Average the per-fold validation AUCs into a single cross-validation score.
final_roc_auc = np.asarray(fold_roc_auc).mean()
print(final_roc_auc)

0.8949709896696504


In [26]:
# Blend two previously saved submissions by averaging their 'Response' columns.
blend1 = pd.read_csv('/root/autodl-tmp/sub_stacking_0.89521.csv')
blend2 = pd.read_parquet('/root/autodl-tmp/submission.parquet')

# The average below is purely positional, so verify that both files describe
# the same rows in the same order before mixing them.
if len(blend1) != len(blend2):
    raise ValueError(
        f"Cannot blend submissions of different lengths: {len(blend1)} vs {len(blend2)}"
    )
if 'id' in blend1.columns and 'id' in blend2.columns:
    if not np.array_equal(blend1['id'].to_numpy(), blend2['id'].to_numpy()):
        raise ValueError("Cannot blend submissions: 'id' columns are not aligned row by row")

blend1['Response'] = np.mean([blend1['Response'], blend2['Response']], axis=0)
blend1.to_csv('submission_final.csv', index=False)