In [1]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error

import warnings
warnings.filterwarnings("ignore")

import joblib

In [2]:
# Load the competition files. The 'id' column is removed from the train and
# test frames right after reading; the sample submission is left untouched
# because it is reused later as the template for the output file.
train = pd.read_csv("/kaggle/input/playground-series-s4e12/train.csv").drop(columns='id')
test = pd.read_csv("/kaggle/input/playground-series-s4e12/test.csv").drop(columns='id')

sample = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')

In [3]:
def date(Df):
    """Expand the 'Policy Start Date' column into calendar features.

    The input frame is NOT modified: all work happens on a copy, so the
    caller's frame keeps its original 'Policy Start Date' column.

    Parameters
    ----------
    Df : pandas.DataFrame
        Frame containing a 'Policy Start Date' column that
        ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without 'Policy Start Date' and with these columns
        appended, in this order: Year, Day, Month, Month_name, Day_of_week,
        Week, Year_sin, Year_cos, Month_sin, Month_cos, Day_sin, Day_cos,
        Group.

    Raises
    ------
    KeyError
        If 'Policy Start Date' is not a column of ``Df``.
    """
    # Copy first so the caller's frame is never mutated; popping the raw date
    # column from the copy also means the new features end up appended after
    # the remaining original columns.
    Df = Df.copy()
    start = pd.to_datetime(Df.pop('Policy Start Date'))
    parts = start.dt

    # Plain calendar components.
    Df['Year'] = parts.year
    Df['Day'] = parts.day
    Df['Month'] = parts.month
    Df['Month_name'] = parts.month_name()
    Df['Day_of_week'] = parts.day_name()
    Df['Week'] = parts.isocalendar().week

    two_pi = 2 * np.pi

    # NOTE(review): Year holds whole numbers, so sin(2*pi*Year) and
    # cos(2*pi*Year) are ~0 and ~1 up to floating-point round-off. They are
    # kept unchanged so the produced feature set stays the same.
    Df['Year_sin'] = np.sin(two_pi * Df['Year'])
    Df['Year_cos'] = np.cos(two_pi * Df['Year'])

    # Cyclical encodings with a period of 12 for months and 31 for days.
    Df['Month_sin'] = np.sin(two_pi * Df['Month'] / 12)
    Df['Month_cos'] = np.cos(two_pi * Df['Month'] / 12)
    Df['Day_sin'] = np.sin(two_pi * Df['Day'] / 31)
    Df['Day_cos'] = np.cos(two_pi * Df['Day'] / 31)

    # Coarse time bucket counted from 2020: 48 per year, 4 per month, plus
    # Day // 7.
    # NOTE(review): Day // 7 ranges over 0..4, so days 28-31 of one month get
    # the same Group value as days 1-6 of the following month. Left as is to
    # keep the feature values unchanged; confirm whether this is intended.
    Df['Group'] = (Df['Year'] - 2020) * 48 + Df['Month'] * 4 + Df['Day'] // 7

    return Df

In [4]:
# Replace the policy start date with derived calendar features in both splits.
train, test = date(train), date(test)

# An earlier variant (removed here) cast only the object-dtype columns to
# 'category'; the approach below converts every non-target column instead.

# Every column except the target gets the same treatment: missing values
# become the literal text 'None' and the column is cast to pandas' string
# dtype.
columns_to_convert = train.columns.difference(['Premium Amount'])

for frame in (train, test):
    frame[columns_to_convert] = frame[columns_to_convert].fillna('None').astype('string')

In [5]:
# Separate the target from the predictors. Every predictor column name is
# passed to the model as a categorical feature.
y = train['Premium Amount']
X = train.drop(columns='Premium Amount')
cat_features = X.columns.to_numpy()

In [6]:
def rmsle(y_true, y_pred):
    """Return the root mean squared logarithmic error of ``y_pred`` vs ``y_true``.

    Computed as the square root of scikit-learn's ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [7]:
def train():
    """Fit one CatBoost regressor per fold of a shuffled 5-fold split.

    Reads the notebook-level ``X``, ``y`` and ``cat_features`` and scores each
    fold with ``rmsle``; it takes no arguments.

    Returns
    -------
    tuple
        ``(models, oof)``: the list of fitted per-fold models, and an array of
        length ``len(X)`` holding, for every row, the prediction of the model
        whose validation fold contained that row (floored at 0).
    """
    # NOTE(review): this function shares its name with the training DataFrame
    # loaded earlier in the notebook, so defining it rebinds `train`. Cells
    # that expect the DataFrame cannot be re-run afterwards without reloading.
    catboost_params = dict(
        iterations=1000,
        learning_rate=0.1,
        depth=6,
        l2_leaf_reg=0.7,
        eval_metric="RMSE",
        task_type='GPU',
        random_seed=42,
        verbose=200,
    )

    splitter = KFold(n_splits=5, shuffle=True, random_state=42)
    models = []
    oof = np.zeros(len(X))

    for fold, (fit_idx, holdout_idx) in enumerate(splitter.split(X)):
        print(f"Fold {fold + 1}")
        X_fit, y_fit = X.iloc[fit_idx], y.iloc[fit_idx]
        X_holdout, y_holdout = X.iloc[holdout_idx], y.iloc[holdout_idx]

        model = CatBoostRegressor(**catboost_params)
        # The held-out fold doubles as the evaluation set that drives early
        # stopping.
        model.fit(
            X_fit,
            y_fit,
            eval_set=(X_holdout, y_holdout),
            cat_features=cat_features,
            early_stopping_rounds=300,
        )
        models.append(model)

        # Floor predictions at zero before scoring (presumably because the
        # log-based metric needs non-negative inputs -- TODO confirm).
        holdout_pred = np.maximum(0, model.predict(X_holdout))
        oof[holdout_idx] = holdout_pred

        fold_rmsle = rmsle(y_holdout, oof[holdout_idx])
        print(f"Fold {fold + 1} RMSLE: {fold_rmsle}")

    return models, oof

In [8]:
# Run the cross-validated training; keeps the per-fold models and the
# out-of-fold predictions for the cells below.
models, oof = train()

Fold 1
0:	learn: 863.0665179	test: 862.0076188	best: 862.0076188 (0)	total: 2.44s	remaining: 40m 33s
200:	learn: 844.2642746	test: 839.9112690	best: 839.9111877 (199)	total: 24.4s	remaining: 1m 37s
400:	learn: 842.8756264	test: 839.3871034	best: 839.3858428 (396)	total: 47.5s	remaining: 1m 11s
600:	learn: 841.8764362	test: 839.1342129	best: 839.1339688 (599)	total: 1m 10s	remaining: 47.1s
800:	learn: 840.8740291	test: 839.0453292	best: 839.0448004 (793)	total: 1m 34s	remaining: 23.4s
999:	learn: 840.0187934	test: 839.0566385	best: 839.0319450 (854)	total: 1m 57s	remaining: 0us
bestTest = 839.031945
bestIteration = 854
Shrink model to first 855 iterations.
Fold 1 RMSLE: 1.1407747931996586
Fold 2
0:	learn: 863.3809140	test: 861.3106193	best: 861.3106193 (0)	total: 107ms	remaining: 1m 47s
200:	learn: 844.4909738	test: 838.6962937	best: 838.6960902 (198)	total: 22.1s	remaining: 1m 27s
400:	learn: 843.0542366	test: 838.1310510	best: 838.1253087 (372)	total: 45s	remaining: 1m 7s
600:	learn: 

In [9]:
# Overall RMSLE of the out-of-fold predictions against the full target.
print(rmsle(y_true=y, y_pred=oof))

1.1408728857820265


In [10]:
# Average the fold models' test predictions, flooring each model's output at
# zero before it is added to the running mean.
n_models = len(models)
test_predictions = np.zeros(len(test))

for fold_model in models:
    test_predictions += np.maximum(0, fold_model.predict(test)) / n_models

# Write the submission file using the sample submission as the template.
sample['Premium Amount'] = test_predictions
sample.to_csv('submission.csv', index=False)

# Persist the out-of-fold and test predictions together in one pickle.
joblib.dump([oof, test_predictions], "cat_non_loged.pkl")

['cat_non_loged.pkl']