**Importing required libraries**

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold
from sklearn.metrics import r2_score
import lightgbm as lgb
from lightgbm import LGBMRegressor 
from bayes_opt import BayesianOptimization

import warnings
warnings.filterwarnings("ignore")

**Reading train, test and submission files**

In [2]:
# Load the train dataset, the test dataset and the sample submission from the working directory
train, test, submission = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

**Data Analysis and Pre-processing**

In [3]:
train.head(2)

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum,64308
1,2,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum,515400


In [4]:
test.head(2)

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy
0,89393,Female,Rural,High School,5L-10L,0,6,2134,More than 1,B,Silver
1,89394,Female,Urban,High School,2L-5L,0,4,4102,More than 1,A,Platinum


In [5]:
submission.head(2)

Unnamed: 0,id,cltv
0,89393,97952.828978
1,89394,97952.828978


In [6]:
train.shape, test.shape, submission.shape

((89392, 12), (59595, 11), (59595, 2))

Separating dependent and independent features from dataframes

In [7]:
# Select features and target by column name instead of by position, so the split does not
# silently pick the wrong columns if the column order of the CSV files ever changes
train_x = train.drop(columns=['id', 'cltv'])  # Removing 'id' and target column from the train dataset
train_y = train[['cltv']]  # Creating a dataframe with only target column
test_x = test.drop(columns=['id'])  # Removing 'id' column from the test dataset

In [8]:
train_x.shape, test_x.shape

((89392, 10), (59595, 10))

In [9]:
train_x.head(2)

Unnamed: 0,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy
0,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum
1,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum


In [10]:
# Stack the train features on top of the test features so that feature engineering
# is applied to both in a single pass
X = pd.concat(objs=[train_x, test_x], axis="index")

In [11]:
X.head(2)

Unnamed: 0,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy
0,Male,Urban,Bachelor,5L-10L,1,5,5790,More than 1,A,Platinum
1,Male,Rural,High School,5L-10L,0,8,5080,More than 1,A,Platinum


In [12]:
X.shape

(148987, 10)

In [13]:
# One hot encode every categorical column; drop_first=True leaves out the first level
# of each column
one_hot_columns = [
    'gender',
    'area',
    'qualification',
    'income',
    'num_policies',
    'policy',
    'type_of_policy',
]
X_ohe = pd.get_dummies(X, columns=one_hot_columns, drop_first=True)

In [14]:
# Rebind X to the one-hot encoded frame. After this the un-encoded frame is no longer
# reachable through X, so the encoding cell cannot be re-run without rebuilding X first.
X = X_ohe

In [15]:
X.head(2)

Unnamed: 0,marital_status,vintage,claim_amount,gender_Male,area_Urban,qualification_High School,qualification_Others,income_5L-10L,income_<=2L,income_More than 10L,num_policies_More than 1,policy_B,policy_C,type_of_policy_Platinum,type_of_policy_Silver
0,1,5,5790,1,1,0,0,1,0,0,1,0,0,1,0
1,0,8,5080,1,0,1,0,1,0,0,1,0,0,1,0


In [16]:
X.shape

(148987, 15)

In [17]:
# Separating back train and test datasets: the first len(train) rows of X came from the
# train dataset and the remaining rows from the test dataset, so the boundary is derived
# from the data instead of being hard-coded
n_train_rows = len(train)
train_x = X.iloc[:n_train_rows, :]
test_x = X.iloc[n_train_rows:, :]

In [18]:
train_x.shape, test_x.shape

((89392, 15), (59595, 15))

In [19]:
train_y.head(2)

Unnamed: 0,cltv
0,64308
1,515400


In [20]:
# Take the target as a 1-D Series straight from the raw train dataframe so this cell can be
# re-run safely (indexing the already converted Series with 'cltv' would raise a KeyError)
train_y = train['cltv']
train_y.shape

(89392,)

In [21]:
train_x.head(2)

Unnamed: 0,marital_status,vintage,claim_amount,gender_Male,area_Urban,qualification_High School,qualification_Others,income_5L-10L,income_<=2L,income_More than 10L,num_policies_More than 1,policy_B,policy_C,type_of_policy_Platinum,type_of_policy_Silver
0,1,5,5790,1,1,0,0,1,0,0,1,0,0,1,0
1,0,8,5080,1,0,1,0,1,0,0,1,0,0,1,0


**Model Building**

In [22]:
# 'vintage' and 'claim_amount' are the only columns treated as non-categorical;
# every other column is declared to LightGBM as a categorical feature
column_names = train_x.columns.tolist()
cat_features = [name for name in column_names if name not in ('vintage', 'claim_amount')]

In [23]:
len(cat_features)

13

In [1]:
pwd

'C:\\Users\\Hp\\Downloads'

In [24]:
# This function will use bayesian optimization for searching the best parameters for our dataset
def search_best_param(X,y,cat_features):
    """Search LightGBM hyper-parameters with Bayesian optimization.

    Each candidate parameter set is scored with a 5-fold ``lgb.cv`` run on ``X``/``y``.
    The optimizer maximizes the negated best mean RMSE, i.e. it minimizes RMSE.

    Parameters
    ----------
    X : feature data passed to ``lgb.Dataset``.
    y : target values passed to ``lgb.Dataset`` as the label.
    cat_features : list of column names declared as categorical features.

    Returns
    -------
    dict
        The best parameters found (``max_depth``, ``num_leaves`` and ``n_estimators``
        rounded to int) plus the fixed settings ``verbose``, ``metric``,
        ``boosting_type``, ``objective`` and ``seed``. The ``early_stopping_round``
        value used during the search is not part of the returned dict.
    """
    trainXY = lgb.Dataset(data=X, label=y, categorical_feature=cat_features, free_raw_data=False)

    def lightGBM_CV(max_depth, num_leaves, n_estimators, learning_rate, subsample, colsample_bytree,
                    lambda_l1, lambda_l2, min_child_weight):
        """Objective for the optimizer: negated best mean RMSE of a 5-fold CV run."""
        params = {
            'boosting_type': 'gbdt',
            'objective': 'regression',
            'metric': 'rmse',
            'verbose': -1,
            'early_stopping_round': 500,
            # the optimizer proposes floats; these three must be integers
            'max_depth': int(round(max_depth)),
            'num_leaves': int(round(num_leaves)),
            'n_estimators': int(round(n_estimators)),
            'learning_rate': learning_rate,
            'subsample': subsample,
            'colsample_bytree': colsample_bytree,
            # regularization terms are clamped so they can never be negative
            'lambda_l1': max(lambda_l1, 0),
            'lambda_l2': max(lambda_l2, 0),
            'min_child_weight': min_child_weight,
        }

        # NOTE(review): the `verbose_eval` argument and the 'rmse-mean' result key look tied
        # to the LightGBM version this notebook was run with - confirm before upgrading
        score = lgb.cv(params, trainXY, nfold=5, seed=1, stratified=False, verbose_eval=False,
                       metrics=['rmse'])

        # negated because the optimizer maximizes its target
        return -np.min(score['rmse-mean'])

    # search space: (lower bound, upper bound) for every tuned hyper-parameter
    search_bounds = {
        'max_depth': (5, 100),
        'num_leaves': (20, 200),
        'n_estimators': (50, 10000),
        'learning_rate': (0.005, 0.3),
        'subsample': (0.1, 1),
        'colsample_bytree': (0.1, 0.99),
        'lambda_l1': (0, 5),
        'lambda_l2': (0, 3),
        'min_child_weight': (2, 100),
    }

    # using bayesian optimization for the best hyper-parameters
    lightGBM_Bo = BayesianOptimization(lightGBM_CV, search_bounds, random_state=1, verbose=0)
    np.random.seed(1)

    lightGBM_Bo.maximize(init_points=5, n_iter=25)

    # `max` already holds the evaluation with the highest target, so no manual scan of the
    # optimizer results is needed; copy the dict so the edits below stay local
    params_set = dict(lightGBM_Bo.max['params'])

    params_set.update({
        'verbose': -1,
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'objective': 'regression',
    })

    # the optimizer works on floats; cast the integer-valued parameters back
    for int_param in ('max_depth', 'num_leaves', 'n_estimators'):
        params_set[int_param] = int(round(params_set[int_param]))
    params_set['seed'] = 1 #set seed

    return params_set

In [25]:
# This function will apply 5 fold cross validation with early stopping
def K_Fold_LightGBM(X_train, y_train , cat_features, num_folds = 5):
    """Train one LightGBM model per fold of a shuffled K-fold split.

    For every fold the hyper-parameters are searched on the fold's training part with
    ``search_best_param``. A model is then trained with early stopping against the
    fold's validation part, and the train/validation R2 scores are printed.

    Parameters
    ----------
    X_train : feature dataframe (rows are selected with ``.iloc``).
    y_train : target values aligned with ``X_train`` (rows are selected with ``.iloc``).
    cat_features : list of column names declared as categorical features.
    num_folds : number of folds of the split, 5 by default.

    Returns
    -------
    list
        The trained models, one per fold, in fold order.
    """
    models = []
    folds = KFold(n_splits=num_folds, shuffle=True, random_state=0)

    r2_total = 0
    # the enumerate index is the fold/model number, so no separate counter is needed
    for n_fold, (train_idx, valid_idx) in enumerate(folds.split(X_train, y_train)):
        print(f"model : {n_fold}")
        train_X, train_y = X_train.iloc[train_idx], y_train.iloc[train_idx]
        valid_X, valid_y = X_train.iloc[valid_idx], y_train.iloc[valid_idx]

        train_data = lgb.Dataset(train_X, label=train_y, categorical_feature=cat_features,
                                 free_raw_data=False)
        valid_data = lgb.Dataset(valid_X, label=valid_y, categorical_feature=cat_features,
                                 free_raw_data=False)

        # the search only sees the training part of the fold
        params_set = search_best_param(train_X, train_y, cat_features)
        # NOTE(review): params_set carries 'n_estimators'; LightGBM presumably treats it as an
        # alias of the number of boosting rounds, in which case it competes with
        # num_boost_round below - confirm which one takes effect
        # increase early_stopping_rounds can lead to overfitting
        CV_LGBM = lgb.train(params_set,
                            train_data,
                            num_boost_round = 2500,
                            valid_sets = valid_data,
                            early_stopping_rounds = 200,
                            verbose_eval = 100
                           )
        models.append(CV_LGBM)

        tr_r2 = r2_score(train_y, CV_LGBM.predict(train_X))
        test_r2 = r2_score(valid_y, CV_LGBM.predict(valid_X))
        print("Train set r2:", tr_r2)
        print("Test set r2:", test_r2)
        print("\n")
        r2_total = r2_total + test_r2

    mean_r2 = r2_total/num_folds
    print("mean r2", mean_r2)
    return models



In [26]:
# Run the 5 fold training; the model fitted on each fold is kept for prediction
lgbm_models = K_Fold_LightGBM(
    X_train=train_x,
    y_train=train_y,
    cat_features=cat_features,
    num_folds=5,
)

model : 0
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 82647.5
[200]	valid_0's rmse: 82738.3
Early stopping, best iteration is:
[35]	valid_0's rmse: 82577.9
Train set r2: 0.16677277043765792
Test set r2: 0.16555104241203855


model : 1
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 84155.2
[200]	valid_0's rmse: 84309.7
Early stopping, best iteration is:
[44]	valid_0's rmse: 84044.8
Train set r2: 0.17032026070683548
Test set r2: 0.16055229835875384


model : 2
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 84924.9
[200]	valid_0's rmse: 85061.9
Early stopping, best iteration is:
[40]	valid_0's rmse: 84846.6
Train set r2: 0.16968436964826572
Test set r2: 0.15922416033566922


model : 3
Training until validation scores don't improve for 200 rounds
[100]	valid_0's rmse: 80906
[200]	valid_0's rmse: 81026.3
Early stopping, best iteration is:
[26]	valid_0's rmse: 80752.8
Train set r

**Inference**

In [27]:
# This function will take model and test data and will predict our test data with that model.
def predict_cv(model,X):
    """Return whatever ``model.predict`` produces for the data ``X``."""
    return model.predict(X)

In [28]:
# Inputs of the ensemble: predictions on the test dataset from the first fold model
# and from the last (fifth) fold model
y_prediction_1 = predict_cv(model=lgbm_models[0], X=test_x)
y_prediction_5 = predict_cv(model=lgbm_models[-1], X=test_x)

In [29]:
# Unweighted average of the two models' predictions
y_prediction_final = 0.5 * (y_prediction_1 + y_prediction_5)

**Generating Submission File**

In [30]:
submission.head(2)

Unnamed: 0,id,cltv
0,89393,97952.828978
1,89394,97952.828978


In [31]:
# replacing submission column with predicted values
# NOTE(review): the predictions are assigned by position, so this assumes the rows of
# sample_submission.csv are in the same order as the rows of test.csv - confirm
submission['cltv'] = y_prediction_final

In [32]:
submission.head()

Unnamed: 0,id,cltv
0,89393,93010.23202
1,89394,131938.330744
2,89395,94304.040763
3,89396,87152.184771
4,89397,130948.746454


In [33]:
# Saving the submission dataframe to a csv file in the working directory;
# index=False keeps the dataframe's row index out of the file
submission.to_csv("final_prediction_J.csv",index=False)