In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import Lasso, Ridge
import lightgbm as lgb

import joblib

# Switch for the AutoML cell further down: True runs H2OAutoML and saves the
# resulting predictions; False loads previously saved predictions instead.
train_ag = False

import warnings
# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings("ignore")

In [2]:
import h2o
from h2o.automl import H2OAutoML

# Connect to an H2O instance, starting a local server when none is found
# (the log below shows a local server being launched).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Ubuntu-1ubuntu320.04); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Ubuntu-1ubuntu320.04, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmprub4m9d4
  JVM stdout: /tmp/tmprub4m9d4/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmprub4m9d4/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,03 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,3 months and 5 days
H2O_cluster_name:,H2O_from_python_unknownUser_5fmcsj
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.500 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [3]:
# Read the competition files; the 'id' column is dropped from the modelling
# frames, while the sample submission is kept as-is.
train = pd.read_csv("/kaggle/input/playground-series-s4e12/train.csv").drop(columns='id')
test = pd.read_csv("/kaggle/input/playground-series-s4e12/test.csv").drop(columns='id')

sample = pd.read_csv('/kaggle/input/playground-series-s4e12/sample_submission.csv')

In [4]:
def date(Df):
    """Derive calendar features from 'Policy Start Date' and drop the raw column.

    Parameters
    ----------
    Df : pandas.DataFrame
        Frame with a 'Policy Start Date' column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every other input column followed by the derived
        features (Year, Day, Month, Month_name, Day_of_week, Week, the sin/cos
        encodings and Group). The frame passed in is not modified.

    Raises
    ------
    KeyError
        If 'Policy Start Date' is not a column of ``Df``.
    """
    start = pd.to_datetime(Df['Policy Start Date'])

    # Build the result on a fresh frame so the caller's DataFrame keeps its
    # original columns (the previous version edited its argument in place).
    out = Df.drop(columns='Policy Start Date')

    out['Year'] = start.dt.year
    out['Day'] = start.dt.day
    out['Month'] = start.dt.month
    out['Month_name'] = start.dt.month_name()
    out['Day_of_week'] = start.dt.day_name()
    out['Week'] = start.dt.isocalendar().week

    # NOTE(review): Year is an integer, so sin(2*pi*Year) is ~0 and
    # cos(2*pi*Year) is ~1 for every row (up to float rounding). These two
    # columns carry essentially no information; kept unchanged so the feature
    # set stays the same.
    out['Year_sin'] = np.sin(2 * np.pi * out['Year'])
    out['Year_cos'] = np.cos(2 * np.pi * out['Year'])
    out['Month_sin'] = np.sin(2 * np.pi * out['Month'] / 12)
    out['Month_cos'] = np.cos(2 * np.pi * out['Month'] / 12)
    out['Day_sin'] = np.sin(2 * np.pi * out['Day'] / 31)
    out['Day_cos'] = np.cos(2 * np.pi * out['Day'] / 31)

    # Coarse "week-of-history" bucket counted from 2020.
    # NOTE(review): Day // 7 reaches 4 for days 28-31, so those days share an
    # id with days 1-6 of the following month (Month*4 + 4 == (Month+1)*4 + 0).
    out['Group'] = (out['Year'] - 2020) * 48 + out['Month'] * 4 + out['Day'] // 7

    return out

In [5]:
# Expand the policy start date into calendar features on both frames.
train, test = date(train), date(test)

# Object-dtype columns are treated as categoricals; every test column is a feature.
cat_cols = [name for name, dtype in train.dtypes.items() if dtype == 'object']
feature_cols = test.columns.tolist()

In [6]:
class CategoricalEncoder:
    """Frequency-encode categorical columns of a train/test pair.

    The two frames are stored by reference; encoding adds ``<col>_freq``
    columns to those same objects.
    """

    def __init__(self, train, test):
        self.train = train
        self.test = test

    def frequency_encode(self, cat_cols, feature_cols, drop_org=False):
        """Add a ``<col>_freq`` categorical column for every column in ``cat_cols``.

        Each value is replaced by its occurrence count in the *training* frame;
        values never seen in training (and missing values) become NaN.

        Parameters
        ----------
        cat_cols : iterable of str
            Columns to encode; each must exist in both frames.
        feature_cols : list of str
            Current feature list. It is copied, not modified.
        drop_org : bool, default False
            When True, the original column name is removed from the returned
            feature list (the column itself stays in the frames).

        Returns
        -------
        tuple
            ``(train, test, new_cat_cols, feature_cols)`` where ``new_cat_cols``
            lists the created column names and ``feature_cols`` is the updated
            copy of the feature list.
        """
        # Work on a copy so the caller's list is not changed behind its back.
        feature_cols = list(feature_cols)

        new_cat_cols = []
        for col in cat_cols:
            new_col_name = f"{col}_freq"
            freq_encoding = self.train[col].value_counts().to_dict()

            self.train[new_col_name] = self.train[col].map(freq_encoding).astype('category')

            # Reuse train's categorical dtype for test. Inferring categories
            # separately would give the two frames different category sets
            # (and therefore different integer codes) whenever a training
            # count does not occur in test.
            shared_dtype = self.train[new_col_name].dtype
            self.test[new_col_name] = self.test[col].map(freq_encoding).astype(shared_dtype)

            new_cat_cols.append(new_col_name)
            feature_cols.append(new_col_name)
            # Guarded: a categorical column that is not a listed feature is
            # simply skipped instead of raising ValueError.
            if drop_org and col in feature_cols:
                feature_cols.remove(col)

        return self.train, self.test, new_cat_cols, feature_cols

In [7]:
encoder = CategoricalEncoder(train, test)
train, test, cat_cols, feature_cols = encoder.frequency_encode(cat_cols, feature_cols, drop_org=True)

# Take an explicit copy of the selected columns: the target is overwritten just
# below, and assigning into a frame obtained by column selection is the
# SettingWithCopy pattern (silent here only because warnings are ignored).
train = train[feature_cols + ['Premium Amount']].copy()
test = test[feature_cols]

# Model the target on the log1p scale; predictions are mapped back with expm1.
train['Premium Amount'] = np.log1p(train['Premium Amount'])

In [8]:
# Preview the first rows of the encoded training frame (target is on the log1p scale).
train.head()

Unnamed: 0,Age,Annual Income,Number of Dependents,Health Score,Previous Claims,Vehicle Age,Credit Score,Insurance Duration,Year,Day,...,Occupation_freq,Location_freq,Policy Type_freq,Customer Feedback_freq,Smoking Status_freq,Exercise Frequency_freq,Property Type_freq,Month_name_freq,Day_of_week_freq,Premium Amount
0,19.0,10049.0,1.0,22.598761,2.0,17.0,372.0,5.0,2023,23,...,282645.0,397511,401846,375518.0,598127,306179,400349,97522,171232,7.962067
1,39.0,31678.0,3.0,15.569731,1.0,12.0,694.0,2.0,2023,12,...,,400947,399600,377905.0,601873,299830,400349,98500,172495,7.302496
2,23.0,25602.0,3.0,47.177549,1.0,14.0,,3.0,2023,30,...,282645.0,401542,401846,368753.0,601873,306179,400349,99377,171232,6.342121
3,21.0,141855.0,2.0,10.938144,1.0,0.0,367.0,1.0,2024,12,...,,400947,398554,375518.0,601873,294571,399978,98500,172546,6.641182
4,21.0,39651.0,1.0,20.376094,0.0,8.0,598.0,4.0,2021,1,...,282645.0,400947,401846,375518.0,601873,306179,400349,97522,172546,7.612337


In [9]:
def rmsle(y_true, y_pred):
    """Return the root of scikit-learn's mean squared logarithmic error."""
    msle = mean_squared_log_error(y_true, y_pred)
    return np.sqrt(msle)

In [10]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# KFold.split yields *positional* row indices. Fill a plain array by position
# and attach it as a column, rather than going through label-based `.loc`,
# which is only equivalent while the frame has a default 0..n-1 index.
fold_ids = np.full(len(train), np.nan)
for i, (_, val_index) in enumerate(kf.split(train)):
    fold_ids[val_index] = i
train['fold'] = fold_ids

h_train = h2o.H2OFrame(train)
h_test = h2o.H2OFrame(test)

# Predictors are every column except the target and the fold assignment.
x = [col for col in h_train.columns if col not in ['Premium Amount', 'fold']]
y = 'Premium Amount'
fold_column = 'fold'

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [11]:
if not train_ag:
    # Load a saved [oofs, preds] pair instead of training.
    # NOTE(review): presumably produced by an earlier run of the branch below - confirm.
    # NOTE(review): joblib.load unpickles arbitrary objects; only load files you trust.
    oofs, preds = joblib.load("/kaggle/input/h2o-automl/h2o_automl_2.pkl")

else:
    aml = H2OAutoML(
        max_runtime_secs=5 * 3600,
        include_algos=["GBM", "DRF", "XGBoost", "DeepLearning"],
        keep_cross_validation_predictions=True,
        seed=42,
        verbosity="info",
    )
    aml.train(x=x, y=y, training_frame=h_train, fold_column=fold_column)

    leaderboard = aml.leaderboard.as_data_frame()
    print(leaderboard)

    model_ids = list(leaderboard['model_id'])

    # One column of cross-validation holdout predictions per leaderboard model.
    oofs = pd.DataFrame({
        model_id: h2o.get_model(model_id)
        .cross_validation_holdout_predictions()
        .as_data_frame()['predict']
        for model_id in model_ids
    })

    # One column of test-set predictions per leaderboard model.
    preds = pd.DataFrame({
        model_id: h2o.get_model(model_id).predict(h_test).as_data_frame()['predict']
        for model_id in model_ids
    })

    joblib.dump([oofs, preds], "h2o_automl.pkl")

In [12]:
models = list(oofs.columns)

# The target column holds log1p values, so map both sides back with expm1
# before scoring. The target is converted once, outside the loop, and passed
# first to match rmsle's (y_true, y_pred) signature.
actual_premium = np.expm1(train['Premium Amount'])
for model in models:
    print(f"{model}: {rmsle(actual_premium, np.expm1(oofs[model]))}")

GBM_3_AutoML_1_20241204_143413: 1.0320014297226836
GBM_grid_1_AutoML_1_20241204_143413_model_5: 1.0321768694470312
GBM_2_AutoML_1_20241204_143413: 1.0325475642308148
GBM_4_AutoML_1_20241204_143413: 1.0325850144687516
GBM_grid_1_AutoML_1_20241204_143413_model_8: 1.0326429468128449
GBM_grid_1_AutoML_1_20241204_143413_model_13: 1.0327319049016155
GBM_5_AutoML_1_20241204_143413: 1.0329097423704767
GBM_grid_1_AutoML_1_20241204_143413_model_22: 1.0329897098749885
GBM_grid_1_AutoML_1_20241204_143413_model_6: 1.0330098489551862
XGBoost_grid_1_AutoML_1_20241204_143413_model_20: 1.0331724947254493
XGBoost_grid_1_AutoML_1_20241204_143413_model_43: 1.0332206890379585
XGBoost_grid_1_AutoML_1_20241204_143413_model_25: 1.0332744459198833
GBM_grid_1_AutoML_1_20241204_143413_model_15: 1.0334064188394942
XGBoost_grid_1_AutoML_1_20241204_143413_model_38: 1.0334706090313233
GBM_grid_1_AutoML_1_20241204_143413_model_2: 1.033577885597724
XGBoost_grid_1_AutoML_1_20241204_143413_model_56: 1.0335950784282746
X

In [13]:
# Linear stacker over the per-model out-of-fold predictions (log1p scale).
ridge = Ridge(alpha=0.1)

ridge.fit(oofs, train['Premium Amount'])
oof_preds = ridge.predict(oofs)
# NOTE(review): the ridge is scored on the same rows it was fitted on, so this
# figure is an in-sample estimate for the stacker itself.
# Arguments follow rmsle's (y_true, y_pred) order.
print(rmsle(np.expm1(train['Premium Amount']), np.expm1(oof_preds)))

1.030959465086669


In [14]:
# Combine the per-model test predictions with the fitted ridge stacker.
test_predictions = ridge.predict(preds)

# The stacker was fitted on log1p targets, so invert with expm1 before saving.
sample['Premium Amount'] = np.expm1(test_predictions)
sample.to_csv('submission.csv', index=False)