In [5]:
import numpy as np
import pandas as pd
import seaborn as sns
import pickle
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler

In [6]:
#importing the training data set
traindf1=pd.read_csv('train_BRC.csv')

In [7]:
#data preprocessing
#checking for null values
traindf1.info() #we can confirm that there is no null values in the dataset 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89392 entries, 0 to 89391
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              89392 non-null  int64 
 1   gender          89392 non-null  object
 2   area            89392 non-null  object
 3   qualification   89392 non-null  object
 4   income          89392 non-null  object
 5   marital_status  89392 non-null  int64 
 6   vintage         89392 non-null  int64 
 7   claim_amount    89392 non-null  int64 
 8   num_policies    89392 non-null  object
 9   policy          89392 non-null  object
 10  type_of_policy  89392 non-null  object
 11  cltv            89392 non-null  int64 
dtypes: int64(5), object(7)
memory usage: 8.2+ MB


In [8]:
#Data is having lot of categorical type variables, hence we need to convert it into numerical
#label encoding 
category_col=['gender','area','qualification','income','num_policies','policy','type_of_policy']
for column in category_col:
    traindf1[column]=LabelEncoder().fit_transform(traindf1[column])
traindf1.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,1,1,1,0,1,1,5,5790,1,0,1,64308
1,2,1,0,1,1,0,8,5080,1,0,1,515400
2,3,1,1,0,1,1,8,2599,1,0,1,64212
3,4,0,0,1,1,0,7,0,1,0,1,97920
4,5,1,1,1,3,1,6,3508,1,0,0,59736


In [34]:
#assigning the independant and dependant variables to x and y respectively and further splitting our train dataset into 70:30 propotion to validate model
x=traindf1.drop(['id','cltv'],axis=1)
y=traindf1['cltv']
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)

In [35]:
GBmodel=GradientBoostingRegressor(learning_rate=0.1,n_estimators=150)
GBmodel.fit(x_train,y_train)

In [36]:
train_predict=GBmodel.predict(x_train)
test_predict=GBmodel.predict(x_test)

In [37]:
#training data prediction scores
print("****training data prediction********")
print("R2 score :", r2_score(y_train,train_predict))

****training data prediction********
R2 score : 0.17077420929471643


In [38]:
#testing data prediction scores
print("****test data prediction********")
print("R2 score :", r2_score(y_test,test_predict))

****test data prediction********
R2 score : 0.15840071256029553


In [39]:
pickle.dump(GBmodel,open('my_model_GB2regression.sav',"wb"))

In [40]:
myregressionGB=pickle.load(open("my_model_GB2regression.sav","rb")) #calling our model .sav file as a new model parameter

In [41]:
testdf=pd.read_csv('test_koRSKBP.csv') #importing the given csv test file\

#categorizing variables which is in object format 
category_col=['gender','area','qualification','income','num_policies','policy','type_of_policy'] 
for column in category_col:
    testdf[column]=LabelEncoder().fit_transform(testdf[column])
    
x_pred_test=testdf.drop(['id'],axis=1) 
x_pred_test

Unnamed: 0,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy
0,0,0,1,1,0,6,2134,1,1,2
1,0,1,1,0,0,4,4102,1,0,1
2,1,0,1,1,1,7,2925,1,1,0
3,0,0,0,3,1,2,0,1,1,2
4,0,1,1,0,0,5,14059,1,1,2
...,...,...,...,...,...,...,...,...,...,...
59590,1,0,0,1,1,6,0,0,0,2
59591,1,1,1,1,0,6,0,0,2,0
59592,0,1,1,0,1,4,3323,1,1,1
59593,1,0,0,1,0,1,2886,1,0,2


In [42]:
cltv=myregressionGB.predict(x_pred_test)
testdf['cltv']=cltv
testdf

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,89393,0,0,1,1,0,6,2134,1,1,2,91597.386193
1,89394,0,1,1,0,0,4,4102,1,0,1,130031.920280
2,89395,1,0,1,1,1,7,2925,1,1,0,96308.692255
3,89396,0,0,0,3,1,2,0,1,1,2,86509.629324
4,89397,0,1,1,0,0,5,14059,1,1,2,125662.145021
...,...,...,...,...,...,...,...,...,...,...,...,...
59590,148983,1,0,0,1,1,6,0,0,0,2,47936.792378
59591,148984,1,1,1,1,0,6,0,0,2,0,54225.492348
59592,148985,0,1,1,0,1,4,3323,1,1,1,111210.801999
59593,148986,1,0,0,1,0,1,2886,1,0,2,108046.150142


In [43]:
sample_submission=testdf[['id','cltv']] # creating a sample dataframe with only the predicted values and respective customer id.
sample_submission.to_csv("Mogith_sample_submissionGB2.csv",index=False) #storing the file locally as CSV."""

In [None]:
gb = GradientBoostingRegressor(learning_rate=0.1,n_estimators=150)
scores = cross_val_score(gb, x_train, y_train, scoring='r2', cv=10)
scores  