# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
file:///home/grealish/Documents/xwechat_files/wxid_l1jgqvkdovso12_f7f9/msg/file/2024-07/main.py

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold

from catboost import CatBoostClassifier, Pool

In [2]:
trainset = pd.read_csv('/home/zhenghao/kaggle/train.csv').drop('id', axis=1)
testset = pd.read_csv('/home/zhenghao/kaggle/test.csv').drop('id', axis=1)

In [3]:
cat_cols = [col for col in trainset.columns if trainset[col].dtype=='object']
num_cols = [col for col in trainset.columns if col not in cat_cols]

In [4]:
trainset['Annual_Premium'] = trainset['Annual_Premium'].astype('int32')
num_cols.pop(num_cols.index('Annual_Premium'))
trainset[num_cols] = trainset[num_cols].astype('int16')

In [5]:
#Concatenate train and test dataframes
df = pd.concat([trainset, testset])

categorical_features = ['Gender','Vehicle_Age','Vehicle_Damage']

#Label encode categorical features
for col in categorical_features:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

df["Vehicle_Age"] = df["Vehicle_Age"].astype('int8')
df['Gender'] = df['Gender'].astype('int8')
df['Vehicle_Damage'] = df['Vehicle_Damage'].astype('int8')
df['Age'] = df['Age'].astype('int8')
df['Driving_License'] = df['Driving_License'].astype('int8')
df['Region_Code'] = df['Region_Code'].astype('int8')
df['Previously_Insured'] = df['Previously_Insured'].astype('int8')
df['Annual_Premium'] = df['Annual_Premium'].astype('int32')
df['Policy_Sales_Channel'] = df['Policy_Sales_Channel'].astype('int16')
df['Vintage'] = df['Vintage'].astype('int16')

# Create the new features by factorizing the concatenated string columns
df['Previously_Insured_Annual_Premium'] = (pd.factorize((df['Previously_Insured'].astype(str) + df['Annual_Premium'].astype(str)).to_numpy())[0]).astype('int32')
df['Previously_Insured_Vehicle_Age'] = (pd.factorize((df['Previously_Insured'].astype(str) + df['Vehicle_Age'].astype(str)).to_numpy())[0]).astype('int8')
df['Previously_Insured_Vehicle_Damage'] = (pd.factorize((df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str)).to_numpy())[0]).astype('int8')
df['Previously_Insured_Vintage'] = (pd.factorize((df['Previously_Insured'].astype(str) + df['Vintage'].astype(str)).to_numpy())[0]).astype('int16')

# Split the combined dataframe back into train and test
trainset = df.iloc[:trainset.shape[0]].reset_index(drop=True)
testset = df.iloc[trainset.shape[0]:].reset_index(drop=True)

del df

In [6]:
# trainset.info()

In [7]:
X = trainset.drop('Response', axis=1)
y = trainset['Response']

In [8]:
# testset = pd.read_csv("/kaggle/input/playground-series-s4e7/test.csv").drop('id', axis=1)
testset['Annual_Premium'] = testset['Annual_Premium'].astype('int32')
num_cols.pop(num_cols.index('Response'))
testset[num_cols] = testset[num_cols].astype('int16')

In [24]:
# testset.info()

In [10]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
test_preds = []
fold_roc_auc = []
val_preds=pd.DataFrame(columns=['index', 'true', 'pred'])

for train_idx, val_idx in skf.split(X, y):
    X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_tr, y_val = y[train_idx], y[val_idx]
    
    X_tr_pool = Pool(X_tr, y_tr, cat_features=X.columns.values)
    X_val_pool = Pool(X_val, y_val, cat_features=X.columns.values)
    X_test_pool = Pool(testset, cat_features=X.columns.values)

    cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'Logloss',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 100000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False
}
    
    model = CatBoostClassifier(**cat_params)
    model.fit(X=X_tr_pool, eval_set=X_val_pool, verbose=1000, early_stopping_rounds=200)
    y_pred = model.predict(X_val_pool)
    fold_preds = pd.DataFrame({'index': val_idx, 'true': y_val, 'pred': y_pred})
    val_preds = pd.concat([val_preds, fold_preds], ignore_index=True)
    
    y_pred = model.predict_proba(X_val_pool)[:, 1]
    fold_roc_auc.append(roc_auc_score(y_val, y_pred))
    test_preds.append(model.predict_proba(X_test_pool)[:, 1])

0:	learn: 0.5646826	test: 0.5647473	best: 0.5647473 (0)	total: 480ms	remaining: 13h 20m 6s
1000:	learn: 0.2395255	test: 0.2403852	best: 0.2403852 (1000)	total: 9m 22s	remaining: 15h 27m 38s
2000:	learn: 0.2366319	test: 0.2402143	best: 0.2402138 (1928)	total: 18m 42s	remaining: 15h 16m 14s
bestTest = 0.2402039258
bestIteration = 2180
Shrink model to first 2181 iterations.


  val_preds = pd.concat([val_preds, fold_preds], ignore_index=True)


0:	learn: 0.5677809	test: 0.5677639	best: 0.5677639 (0)	total: 478ms	remaining: 13h 15m 54s
1000:	learn: 0.2394519	test: 0.2407799	best: 0.2407797 (997)	total: 9m 24s	remaining: 15h 30m 19s
2000:	learn: 0.2365456	test: 0.2405782	best: 0.2405767 (1960)	total: 18m 48s	remaining: 15h 20m 49s


In [None]:
false_preds = val_preds[val_preds['true'] != val_preds['pred']]
false_indices = false_preds['index'].values
false_rows = X.iloc[false_indices]

false_rows_with_info = false_rows.copy()
false_rows_with_info['true'] = false_preds['true'].values
false_rows_with_info['pred'] = false_preds['pred'].values

In [None]:
final_roc_auc = np.mean(fold_roc_auc)
print(final_roc_auc)

In [None]:
blend1 = pd.read_csv('/kaggle/input/s04e07-insurance-cross-selling-stacking/sub_stacking_0.89521.csv')
blend2 = pd.read_parquet('/kaggle/input/stacking-xgb-lgbm-catb-ann/submission.parquet')
sub = pd.read_csv("/kaggle/input/playground-series-s4e7/sample_submission.csv")
sub['Response'] = np.mean(test_preds, axis=0)
sub['Response'] = np.mean([blend['Response'], sub['Response']], axis=0)
sub.to_csv('submission.csv', index=False)

blend1['Response'] = np.mean([blend1['Response'], blend2['Response']], axis=0)
blend1.to_csv('submission.csv', index=False)