In [1]:
import numpy as np
import pandas as pd
import optuna
import seaborn as sns
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the competition train/test tables; the `id` column becomes the row index.
train_df = pd.read_csv(
    'playground-series-s4e12/train.csv',
    index_col='id',
)
test_df = pd.read_csv(
    'playground-series-s4e12/test.csv',
    index_col='id',
)

In [3]:
from datetime import timedelta
# Normalise column names to snake_case: lower-case them and turn spaces into underscores.
train_df.columns = [name.lower().replace(' ', '_') for name in train_df.columns]
test_df.columns = [name.lower().replace(' ', '_') for name in test_df.columns]
def feature_transformer(df):
    """Derive date, ratio and categorical-interaction features from a raw policy frame.

    The input frame is not modified; a transformed copy is returned with the
    ``policy_start_date`` column removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``policy_start_date`` (parseable by ``pd.to_datetime``),
        the numeric columns ``previous_claims``, ``insurance_duration``,
        ``annual_income`` and ``number_of_dependents``, and the columns
        ``marital_status``, ``customer_feedback``, ``property_type``,
        ``exercise_frequency``, ``smoking_status``, ``policy_type``,
        ``gender``, ``education_level``, ``occupation`` and ``location``,
        which are concatenated with ``+``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the engineered columns appended.

    Notes
    -----
    ``days_passed``, ``Days Passed``, the fiscal-year features and
    ``time_from_first_policy_seconds`` are measured against the earliest/latest
    ``policy_start_date`` of the frame passed in, so two frames with different
    date ranges get different reference points.
    """
    # Work on a copy so the caller's frame keeps its raw columns and the
    # function can be called again on the same input.
    df = df.copy()

    start = pd.to_datetime(df['policy_start_date'])
    first_start = start.min()
    last_start = start.max()

    # --- calendar parts -------------------------------------------------
    df['year'] = start.dt.year
    df['day'] = start.dt.day
    df['month'] = start.dt.month
    df['day_of_year'] = start.dt.dayofyear
    df['day_of_week'] = start.dt.weekday
    # Cyclic encoding of the weekday.
    df['sin_day_of_week'] = np.sin(2 * np.pi * df['day_of_week'] / 7)
    df['cos_day_of_week'] = np.cos(2 * np.pi * df['day_of_week'] / 7)

    # Whole seconds since the Unix epoch, computed through timedelta
    # arithmetic so it does not depend on the datetime storage resolution.
    df["seconds since 1970"] = (start - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)

    days_passed = (last_start - start).dt.days
    df['days_passed'] = days_passed

    # String versions of the calendar parts, used as categorical features.
    for part in ('day_of_week', 'year', 'day', 'day_of_year', 'month'):
        df[f'cat_{part}'] = df[part].astype('string')

    # --- fiscal year (starts on 1 August) -------------------------------
    # Pick the latest 1 August that is not after the earliest policy start.
    fiscal_year_start = pd.Timestamp(year=first_start.year, month=8, day=1)
    if first_start < fiscal_year_start:
        fiscal_year_start = pd.Timestamp(year=first_start.year - 1, month=8, day=1)

    since_fiscal_start = start - fiscal_year_start
    df['time_from_fiscal_year'] = since_fiscal_start.dt.days
    df['seconds_from_fiscal_year'] = since_fiscal_start.dt.total_seconds()

    # Measured from the day before the first policy so the earliest row is non-zero.
    day_before_first = first_start - pd.Timedelta(days=1)
    df['time_from_first_policy_seconds'] = (start - day_before_first).dt.total_seconds()

    # Same values as `days_passed`; kept so the output column set is unchanged.
    df['Days Passed'] = days_passed

    # --- numeric ratios -------------------------------------------------
    df['claims_vs_duration'] = df['previous_claims'] / df['insurance_duration']
    df['days_from_2019_crisis'] = (start - pd.Timestamp('2019-01-01')).dt.days
    # `+ 1` belongs in the denominator: it keeps rows with zero dependents finite.
    df['revenue_per_dependent'] = df['annual_income'] / (df['number_of_dependents'] + 1)
    df['ratio_of_doubts'] = (df['previous_claims'] + 1) / df['annual_income']

    # --- pairwise categorical interactions ------------------------------
    # Each pair becomes one concatenated column named "<left>_<right>",
    # with the "cat_" prefix stripped from the name.
    interaction_pairs = (
        ('marital_status', 'customer_feedback'),
        ('customer_feedback', 'property_type'),
        ('customer_feedback', 'cat_year'),
        ('marital_status', 'cat_year'),
        ('exercise_frequency', 'cat_year'),
        ('customer_feedback', 'smoking_status'),
        ('cat_year', 'cat_day_of_week'),
        ('property_type', 'cat_year'),
        ('policy_type', 'cat_year'),
        ('gender', 'marital_status'),
        ('education_level', 'customer_feedback'),
        ('gender', 'cat_year'),
        ('marital_status', 'cat_day_of_week'),
        ('occupation', 'customer_feedback'),
        ('occupation', 'cat_year'),
        ('location', 'customer_feedback'),
        ('location', 'cat_day_of_week'),
        ('education_level', 'cat_year'),
        ('customer_feedback', 'cat_month'),
        ('marital_status', 'cat_day'),
        ('exercise_frequency', 'property_type'),
        ('education_level', 'cat_day_of_week'),
    )
    for left, right in interaction_pairs:
        left_name = left[4:] if left.startswith('cat_') else left
        right_name = right[4:] if right.startswith('cat_') else right
        df[f'{left_name}_{right_name}'] = df[left] + df[right]

    return df.drop(columns=['policy_start_date'])

In [4]:
def columns_imputer(df):
    """Append an ``is_<column>_na`` 0/1 missing-value flag for every column of ``df``.

    The frame is modified in place and also returned. Only the columns present
    when the function is called are flagged; the flags themselves are not.
    No values are imputed here.
    """
    # Snapshot the column list first: new flag columns are added while looping.
    for name in list(df.columns):
        missing_mask = df[name].isna()
        df[f"is_{name}_na"] = missing_mask.astype(int)

    return df

In [5]:
# Numeric source columns; also used later to build categorical copies.
num_cols = [
    'age',
    'annual_income',
    'number_of_dependents',
    'health_score',
    'previous_claims',
    'vehicle_age',
    'credit_score',
    'insurance_duration',
]

# Report how many rows hold an exact zero in each numeric column.
for col in num_cols:
    zero_count = (train_df[col] == 0).sum()
    print(f"Column {col} has {zero_count} zeros")

Column age has 0 zeros
Column annual_income has 0 zeros
Column number_of_dependents has 218124 zeros
Column health_score has 0 zeros
Column previous_claims has 305433 zeros
Column vehicle_age has 61615 zeros
Column credit_score has 0 zeros
Column insurance_duration has 0 zeros


In [6]:
def num_to_cat(df, cols=None):
    """Add a ``categorial_<col>`` column of dtype ``category`` for each selected column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    cols : iterable of str, optional
        Columns to duplicate as categoricals. When omitted, the module-level
        ``num_cols`` list is used.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the new columns appended.
    """
    if cols is None:
        # Reading a module-level name needs no `global` statement.
        cols = num_cols

    for col in cols:
        df[f"categorial_{col}"] = df[col].astype('category')

    return df

In [7]:
# Feature matrix: drop the target, then run the three feature-building steps in order.
X = (
    train_df
    .drop(columns='premium_amount')
    .pipe(feature_transformer)
    .pipe(columns_imputer)
    .pipe(num_to_cat)  # adds the categorial_<col> columns
)

# Target, raw and log1p-transformed.
y = train_df['premium_amount']
y_log1p = np.log1p(y)

# Column groups by dtype: text-like columns vs. numeric columns.
cats = X.select_dtypes(include=['string', 'category', 'object']).columns.tolist()
nums = X.select_dtypes(include=['number']).columns.tolist()

In [8]:
# Text-like columns: cast to the pandas string dtype first (a `category` column
# cannot be filled with a value outside its categories), then mark gaps as 'None'.
for col in cats:
    X[col] = X[col].astype('string').fillna('None')

# Numeric columns: -999 is the missing-value sentinel; everything becomes float.
for col in nums:
    X[col] = X[col].fillna(-999).astype(float)

In [9]:
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from catboost import CatBoostRegressor, Pool

class CatboostWithOptuna:
    """Optuna hyperparameter search for ``CatBoostRegressor`` scored by K-fold CV.

    Every trial trains one model per fold on ``X``/``y``, stores the
    out-of-fold predictions and reports the mean fold RMSE, which the study
    minimises.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; non-numeric columns are passed to CatBoost as
        categorical features.
    y : pandas.Series
        Target aligned with ``X`` by position.
    n_folds : int
        Number of ``KFold`` splits.
    random_state : int
        Seed for the fold shuffling and for each CatBoost model.
    verbose : int
        Forwarded to ``CatBoostRegressor(verbose=...)``.
    task_type : str
        Forwarded to ``CatBoostRegressor(task_type=...)``.
    optuna_params : optional
        Stored on the instance; not consumed by any method of this class.
    """

    def __init__(self, X, y, n_folds=10, random_state=42, verbose=0, task_type='GPU', optuna_params=None):
        self.X = X
        self.y = y
        self.n_folds = n_folds
        self.random_state = random_state
        self.verbose = verbose
        self.task_type = task_type
        self.optuna_params = optuna_params
        self.best_model_params = {}

        # Placeholders
        self.oof_predictions = []  # One out-of-fold prediction array per trial
        self.catboost_params_history = []  # Hyperparameters and score of every trial
        self.best_score = float('inf')  # Best (lowest) objective value so far

    @staticmethod
    def _rmse(y_true, y_pred):
        """Root-mean-squared error between two equally sized 1-D sequences."""
        residuals = np.asarray(y_true, dtype=float) - np.asarray(y_pred, dtype=float)
        return float(np.sqrt(np.mean(residuals ** 2)))

    def _suggest_hyperparams(self, trial):
        """Sample one CatBoost hyperparameter set from ``trial``."""
        return {
            'iterations' : trial.suggest_int('iterations', 400, 1000),
            'learning_rate' : trial.suggest_float('learning_rate', 0.01, 0.4, log=True),
            'depth' : trial.suggest_int('depth', 4, 9),
            'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg', 0.1, 1, log=True),
            'loss_function' : trial.suggest_categorical('loss_function', ['RMSE']),
            'random_strength' : trial.suggest_float('random_strength', 1e-3, 1, log=True),
            'bagging_temperature' : trial.suggest_float('bagging_temperature', 1e-2, 1, log=True)
        }

    def objective(self, trial):
        """Optuna objective: mean RMSE over the K validation folds.

        Side effects: appends this trial's out-of-fold predictions to
        ``oof_predictions`` and its parameters plus score to
        ``catboost_params_history``.
        """
        params = self._suggest_hyperparams(trial)

        cat_features = self.X.select_dtypes(include=['category', 'string', 'object']).columns.tolist()

        # K-fold CV
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=self.random_state)
        fold_rmse = []
        oof_preds = np.zeros((self.X.shape[0]))

        for fold, (train_index, val_index) in enumerate(kf.split(self.X)):
            print(f"Fold {fold + 1} out of {self.n_folds}")

            X_train, X_val = self.X.iloc[train_index], self.X.iloc[val_index]
            y_train, y_val = self.y.iloc[train_index], self.y.iloc[val_index]

            # `params` holds exactly the sampled CatBoost keyword arguments.
            model = CatBoostRegressor(**params,
                                      random_seed=self.random_state,
                                      verbose=self.verbose,
                                      task_type=self.task_type)

            model.fit(X_train, y_train, cat_features=cat_features)

            val_pred = model.predict(X_val)

            oof_preds[val_index] = val_pred

            # Computed with NumPy: `mean_squared_error(..., squared=False)` is
            # not accepted by recent scikit-learn releases.
            fold_rmse.append(self._rmse(y_val, val_pred))

        # save oof predictions
        self.oof_predictions.append(oof_preds.copy())

        mean_rmse = np.mean(fold_rmse)
        self.catboost_params_history.append({**params, 'RMSE': mean_rmse})

        return mean_rmse

    def optimize(self,n_trials=10):
        """Run ``n_trials`` Optuna trials and store the best parameters and score.

        Returns the ``optuna`` study object.
        """
        study = optuna.create_study(direction='minimize')
        study.optimize(self.objective, n_trials=n_trials)
        self.best_model_params = study.best_params
        self.best_score = study.best_value

        return study

    @property
    def oof(self):
        """List of out-of-fold prediction arrays, one per completed trial.

        Raises ``ValueError`` when no trial has been run yet.
        """
        if not self.oof_predictions:
            raise ValueError("No OOF preds available")
        return self.oof_predictions

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the prepared training rows, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# log1p-transformed versions of both target splits.
y_train_log1p, y_test_log1p = np.log1p(y_train), np.log1p(y_test)

In [11]:
# Fixed CatBoost hyperparameters used for the final fold models.
catboost_params = dict(
    iterations=719,
    learning_rate=0.050771585717022505,
    depth=9,
    l2_leaf_reg=0.35647655856918115,
    loss_function='RMSE',
    random_strength=0.06546237246369487,
    bagging_temperature=0.011774942898945484,
)

In [12]:
# Per-fold test-set predictions (these are not out-of-fold predictions: every
# fold model predicts the full X_test).
def cv_test_preds(X, y, X_test, n_folds=5, params=None):
    """Fit one CatBoost model per K-fold training split and predict ``X_test`` with each.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; its non-numeric columns are treated as categorical.
    y : pandas.Series
        Training target aligned with ``X`` by position.
    X_test : pandas.DataFrame
        Rows to predict, passed unchanged to every fold model.
    n_folds : int
        Number of shuffled ``KFold`` splits (seed 42).
    params : dict, optional
        ``CatBoostRegressor`` keyword arguments. When omitted, the
        module-level ``catboost_params`` dict is used.

    Returns
    -------
    list
        ``n_folds`` prediction arrays for ``X_test``, one per fold model.

    Notes
    -----
    A later cell defines a function with this same name; once that cell has
    run, the later definition is the one that gets called.
    """
    if params is None:
        params = catboost_params

    # Categorical columns must describe the frame the model is fitted on, and
    # they are the same for every fold, so resolve them once up front.
    cat_features = X.select_dtypes(exclude=['number']).columns.tolist()

    # Entries in `params` take precedence over these defaults.
    model_kwargs = {'random_state': 42, 'task_type': 'CPU', 'verbose': 0, **params}

    folds = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    test_preds = []

    # The validation indices are not used: each model is only fitted on its
    # training part and then asked to predict X_test.
    for train_idx, _ in folds.split(X, y):
        model = CatBoostRegressor(**model_kwargs)
        model.fit(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_features)
        test_preds.append(model.predict(X_test))

    return test_preds

In [13]:
# Work on a copy: the feature steps below modify the frame they receive, and
# aliasing `test_df` directly would strip its raw columns and make this cell
# fail when run a second time.
X_test = test_df.copy()

X_test = feature_transformer(X_test)
X_test = columns_imputer(X_test)
X_test = num_to_cat(X_test)  # adds the categorial_<col> columns

# Column groups by dtype, recomputed for the test frame.
cats = X_test.select_dtypes(include=['string', 'category', 'object']).columns.tolist()
nums = X_test.select_dtypes(include=['number']).columns.tolist()

In [14]:
# Text-like columns: cast to the pandas string dtype first (a `category` column
# cannot be filled with a value outside its categories), then mark gaps as 'None'.
for col in cats:
    X_test[col] = X_test[col].astype('string').fillna('None')

# Numeric columns: -999 is the missing-value sentinel; everything becomes float.
for col in nums:
    X_test[col] = X_test[col].fillna(-999).astype(float)

In [15]:
def cv_test_preds(X, y, X_test, n_folds=5, params=None):
    """Fit one CatBoost model per K-fold training split and predict ``X_test`` with each.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features; its non-numeric columns are treated as categorical.
    y : pandas.Series
        Training target aligned with ``X`` by position.
    X_test : pandas.DataFrame
        Rows to predict, passed unchanged to every fold model.
    n_folds : int
        Number of shuffled ``KFold`` splits (seed 42).
    params : dict, optional
        ``CatBoostRegressor`` keyword arguments. When omitted, the
        module-level ``catboost_params`` dict is used.

    Returns
    -------
    list
        ``n_folds`` prediction arrays for ``X_test``, one per fold model.

    Notes
    -----
    An earlier cell defines a function with this same name; this definition
    replaces it once the cell has run.
    """
    if params is None:
        params = catboost_params

    # The categorical columns are the same for every fold, so resolve them once.
    cat_features = X.select_dtypes(exclude=['number']).columns.tolist()

    # Entries in `params` take precedence over these defaults.
    model_kwargs = {'random_state': 42, 'task_type': 'CPU', 'verbose': 0, **params}

    folds = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    test_preds = []

    # The validation indices are not used: each model is only fitted on its
    # training part and then asked to predict X_test.
    for train_idx, _ in folds.split(X, y):
        model = CatBoostRegressor(**model_kwargs)
        # Passing cat_features to fit() alone is enough.
        model.fit(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_features)
        test_preds.append(model.predict(X_test))

    return test_preds

In [16]:
# One prediction array per fold model, on the log1p scale of the training target.
X_final_preds = cv_test_preds(X=X, y=y_log1p, X_test=X_test, n_folds=5)

# Undo the log1p transform for each fold, then take the plain average across folds.
preds = sum(np.expm1(X_final_preds)) / len(X_final_preds)

In [17]:
# Fill the sample submission's target column with the averaged predictions
# (assigned by row position) and write the result next to the notebook.
sample_submission = pd.read_csv(
    'playground-series-s4e12/sample_submission.csv'
).assign(**{'Premium Amount': preds})

sample_submission.to_csv("submission.csv", index=False)

print('DONE!')

DONE!
