## 1. Importing libraries

In [1]:
import warnings
warnings.filterwarnings(action="ignore")
import os
import ast
import pandas as pd
import numpy as np
from matplotlib import pyplot
import matplotlib.patches as mpatches
import seaborn as sn
from tqdm.std import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [2]:
class Config:
    data_dir = '../data/'
    working_dir = '../src/'
    models_dir = '../models/'
    submissions_dir = '../submissions'

## 2. Importing files

In [3]:
train = pd.read_csv(filepath_or_buffer=os.path.join(Config.data_dir, "Train.csv"))
metadata = pd.read_csv(filepath_or_buffer=os.path.join(Config.data_dir, "metadata.csv"))
test = pd.read_csv(filepath_or_buffer=os.path.join(Config.data_dir, "Test.csv"))
ss = pd.read_csv(filepath_or_buffer=os.path.join(Config.data_dir, "SampleSubmission.csv"))

## 3. Preprocessing

In [4]:
print(metadata.isna().sum())

ID                         0
RegistrationDate           0
Deposit                    0
UpsellDate             36370
AccessoryRate              0
PaymentMethod              0
rateTypeEntity             0
RatePerUnit                0
DaysOnDeposit              0
MainApplicantGender        0
Age                     6939
Region                  1934
Town                       0
Occupation                 0
SupplierName               0
Term                       0
TotalContractValue         0
ExpectedTermDate           0
FirstPaymentDate           0
LastPaymentDate            0
dtype: int64


In [5]:
metadata[metadata.isna().Region].head(10)

Unnamed: 0,ID,RegistrationDate,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,Term,TotalContractValue,ExpectedTermDate,FirstPaymentDate,LastPaymentDate
3,ID_HXBJFHB,2015-11-25 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Female,43.0,,UNKNOWN,Teacher,d_light,364,14740.0,2016-11-23 00:00:00,2015-11-25 13:25:57,2017-05-22 16:46:54
17,ID_AYIBQUP,2015-12-12 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Male,44.0,,UNKNOWN,Other,d_light,364,14740.0,2016-12-10 00:00:00,2015-12-12 07:00:38,2020-09-02 20:30:53
51,ID_35MNQL1,2015-12-06 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Male,42.0,,UNKNOWN,Farmer,d_light,364,14740.0,2016-12-04 00:00:00,2015-12-07 06:21:45,2017-11-18 11:07:25
84,ID_RWA0Q3U,2017-07-12 14:44:48,2000,,0.0,FINANCED,DAILY,40,7,Female,52.0,,UNKNOWN,Government Employee,d_light,364,16560.0,2018-07-11 14:44:48,2017-07-12 14:45:19,2020-09-02 20:31:12
85,ID_QWI7WWN,2015-11-24 00:00:00,2000,,0.0,FINANCED,DAILY,35,7,Male,40.0,,UNKNOWN,Other,d_light,364,14740.0,2016-11-22 00:00:00,2015-11-24 14:02:51,2016-11-25 14:58:45
89,ID_UOT1MF3,2017-07-13 08:56:05,2000,,0.0,FINANCED,DAILY,40,7,Male,54.0,,UNKNOWN,Business,d_light,364,16560.0,2018-07-12 08:56:05,2017-07-13 08:56:23,2020-09-02 20:31:28
109,ID_V9Z4RFZ,2016-11-16 17:36:24,2000,2019-08-28 13:14:11,10.0,FINANCED,DAILY,40,7,Male,33.0,,UNKNOWN,Business,d_light,311,17560.0,2017-09-23 17:36:24,2016-11-18 04:38:30,2020-09-02 20:32:34
118,ID_M0OORNB,2016-12-17 15:34:24,2000,2019-08-28 12:58:00,40.0,FINANCED,DAILY,40,14,Male,39.0,,UNKNOWN,Labourer,d_light,207,18560.0,2017-07-12 15:34:24,2016-12-17 15:45:36,2021-02-14 08:23:31
125,ID_FQ1F7WY,2016-12-05 15:32:35,2000,2019-09-04 11:09:50,70.0,FINANCED,DAILY,40,7,Male,38.0,,UNKNOWN,Farmer,d_light,168,20560.0,2017-05-22 15:32:35,2016-12-05 15:39:15,2020-09-02 20:33:28
151,ID_1FSXREN,2017-07-11 10:34:42,2000,,0.0,FINANCED,DAILY,40,7,Male,34.0,,UNKNOWN,Business,d_light,364,16560.0,2018-07-10 10:34:42,2017-07-11 10:22:41,2020-09-02 20:33:31


In [6]:
metadata.Region.value_counts()

Nyanza                6998
North Rift            5344
Nairobi Region        5056
South Rift            4759
Coast Region          4671
Western               4508
Mount Kenya Region    4073
Name: Region, dtype: int64

In [7]:
pd.merge(left=metadata[metadata.isna().Region], right=test, on="ID").head(2)
#test data also contains empty values in region therefore we need to treat as a category.

Unnamed: 0,ID,RegistrationDate,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,Term,TotalContractValue,ExpectedTermDate,FirstPaymentDate,LastPaymentDate,TransactionDates,PaymentsHistory
0,ID_UOT1MF3,2017-07-13 08:56:05,2000,,0.0,FINANCED,DAILY,40,7,Male,54.0,,UNKNOWN,Business,d_light,364,16560.0,2018-07-12 08:56:05,2017-07-13 08:56:23,2020-09-02 20:31:28,"['07-2017', '08-2017', '09-2017', '10-2017', '11-2017', '12-2017', '01-2018', '02-2018', '03-2018', '04-2018', '05-2018', '06-2018', '07-2018', '08-2018', '10-2018', '11-2018', '12-2018', '01-2019', '02-2019', '03-2019', '04-2019', '07-2019']","[2720.0, 1080.0, 1200.0, 1000.0, 960.0, 1160.0, 1120.0, 600.0, 1040.0, 840.0, 680.0, 640.0, 680.0, 520.0, 80.0, 80.0, 40.0, 40.0, 240.0, 40.0, 80.0, 40.0]"
1,ID_290EB8N,2017-07-05 09:06:30,2000,,0.0,FINANCED,DAILY,40,7,Male,31.0,,UNKNOWN,Labourer,d_light,364,16560.0,2018-07-04 09:06:30,2017-07-05 08:54:37,2020-09-02 20:35:26,"['07-2017', '08-2017', '12-2017', '01-2018', '02-2018', '03-2018', '04-2018', '05-2018', '06-2018', '07-2018', '08-2018', '09-2018', '10-2018']","[2710.0, 120.0, 200.0, 300.0, 50.0, 250.0, 550.0, 100.0, 500.0, 350.0, 350.0, 950.0, 200.0]"


In [8]:
metadata.Region.fillna(value="Other", inplace=True)

In [9]:
metadata.UpsellDate.fillna(value=0, inplace=True)

In [10]:
pd.merge(left=metadata[metadata.isna().Age], right=test, on="ID").head(2)
##now just using mean here to remove 0 in metadata.


Unnamed: 0,ID,RegistrationDate,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,Term,TotalContractValue,ExpectedTermDate,FirstPaymentDate,LastPaymentDate,TransactionDates,PaymentsHistory
0,ID_VJ80SX2,2015-12-14 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Female,,Mount Kenya Region,Embu,Business,d_light,364,14740.0,2016-12-12 00:00:00,2015-12-14 14:20:26,2016-12-12 11:59:20,"['12-2015', '01-2016', '02-2016', '03-2016', '04-2016', '05-2016', '06-2016']","[3000.0, 850.0, 750.0, 1500.0, 650.0, 1250.0, 1000.0]"
1,ID_ZLW8XIB,2015-12-05 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Male,,Mount Kenya Region,Embu,Farmer,d_light,364,14740.0,2016-12-03 00:00:00,2015-12-05 15:28:45,2017-04-01 19:19:19,"['12-2015', '01-2016', '02-2016', '03-2016', '04-2016', '05-2016', '06-2016', '07-2016', '08-2016', '09-2016', '10-2016']","[3200.0, 735.0, 1320.0, 1000.0, 1170.0, 866.0, 730.0, 100.0, 100.0, 450.0, 700.0]"


In [11]:
metadata.Age.fillna(value=round(metadata.Age.mean()), inplace=True)

In [12]:
metadata.head(4)

Unnamed: 0,ID,RegistrationDate,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,Term,TotalContractValue,ExpectedTermDate,FirstPaymentDate,LastPaymentDate
0,ID_K00S4N4,2015-12-10 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Male,41.0,Mount Kenya Region,Embu,Other,d_light,364,14740.0,2016-12-08 00:00:00,2015-12-10 09:52:35,2016-10-23 04:52:30
1,ID_6L67PAA,2015-12-09 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Male,33.0,Coast Region,Kilifi,Other,d_light,364,14740.0,2016-12-07 00:00:00,2015-12-09 13:14:03,2020-05-24 15:32:18
2,ID_102CV85,2015-12-18 00:00:00,2000,2018-03-29 10:14:58,35.0,FINANCED,DAILY,35,7,Female,48.0,Nairobi Region,Makueni,Business,d_light,392,29480.0,2017-01-13 00:00:00,2015-12-18 06:22:34,2017-02-01 15:23:44
3,ID_HXBJFHB,2015-11-25 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Female,43.0,Other,UNKNOWN,Teacher,d_light,364,14740.0,2016-11-23 00:00:00,2015-11-25 13:25:57,2017-05-22 16:46:54


In [16]:
merged = pd.merge(left=metadata, right=pd.concat(objs=[train, test]).fillna(value=0), on="ID")
merged.head(5)

Unnamed: 0,ID,RegistrationDate,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,Term,TotalContractValue,ExpectedTermDate,FirstPaymentDate,LastPaymentDate,TransactionDates,PaymentsHistory,m1,m2,m3,m4,m5,m6
0,ID_K00S4N4,2015-12-10 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Male,41.0,Mount Kenya Region,Embu,Other,d_light,364,14740.0,2016-12-08 00:00:00,2015-12-10 09:52:35,2016-10-23 04:52:30,"['12-2015', '01-2016', '02-2016', '03-2016', '04-2016']","[3050.0, 1050.0, 910.0, 1050.0, 1050.0]",1225.0,1050.0,1190.0,525.0,1750.0,1890.0
1,ID_6L67PAA,2015-12-09 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Male,33.0,Coast Region,Kilifi,Other,d_light,364,14740.0,2016-12-07 00:00:00,2015-12-09 13:14:03,2020-05-24 15:32:18,"['12-2015', '01-2016', '02-2016', '03-2016', '05-2016', '07-2016']","[4000.0, 1050.0, 1050.0, 1050.0, 1050.0, 400.0]",0.0,0.0,0.0,0.0,0.0,0.0
2,ID_102CV85,2015-12-18 00:00:00,2000,2018-03-29 10:14:58,35.0,FINANCED,DAILY,35,7,Female,48.0,Nairobi Region,Makueni,Business,d_light,392,29480.0,2017-01-13 00:00:00,2015-12-18 06:22:34,2017-02-01 15:23:44,"['12-2015', '01-2016', '02-2016', '03-2016', '04-2016', '05-2016', '06-2016', '07-2016', '08-2016']","[4245.0, 980.0, 735.0, 735.0, 1470.0, 735.0, 980.0, 915.0, 735.0]",980.0,980.0,1225.0,980.0,935.0,355.0
3,ID_HXBJFHB,2015-11-25 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Female,43.0,Other,UNKNOWN,Teacher,d_light,364,14740.0,2016-11-23 00:00:00,2015-11-25 13:25:57,2017-05-22 16:46:54,"['11-2015', '12-2015', '01-2016', '02-2016', '03-2016', '04-2016', '05-2016', '06-2016', '07-2016', '08-2016', '10-2016']","[2245.0, 980.0, 980.0, 1225.0, 980.0, 980.0, 980.0, 1225.0, 735.0, 490.0, 250.0]",250.0,1000.0,250.0,500.0,560.0,1150.0
4,ID_3K9VZ5J,2015-12-02 00:00:00,2000,0,0.0,FINANCED,DAILY,35,7,Female,56.0,Mount Kenya Region,Kirinyaga,Other,d_light,364,14740.0,2016-11-30 00:00:00,2015-12-05 10:34:32,2017-05-12 16:50:52,"['12-2015', '01-2016', '02-2016', '03-2016', '04-2016', '05-2016', '06-2016', '07-2016', '08-2016', '09-2016', '10-2016', '11-2016']","[2750.0, 1000.0, 750.0, 1000.0, 955.0, 880.0, 280.0, 665.0, 770.0, 420.0, 525.0, 735.0]",630.0,805.0,700.0,855.0,245.0,775.0


In [19]:
merged.drop(["FirstPaymentDate"],axis=1,inplace=True)

In [20]:
def diff(a,b):
    start=b.split('-')
    end=a.split('-')
    return int(end[1])-int(start[1])+12*(int(end[0])-int(start[0]))

In [21]:
#merged['month']=merged['RegistrationDate']
#merged['year']=merged['RegistrationDate']
merged['time']=merged['RegistrationDate']
merged['PaidAmount']=merged['TotalContractValue']
merged['PeriodDiff']=merged['TotalContractValue']
for i in tqdm(range(0,merged.shape[0]),desc='Row'):
    year=int(merged.RegistrationDate.iloc[i].split('-')[0])
    month=int(merged.RegistrationDate.iloc[i].split('-')[1])
    time=np.ceil(merged.loc[i,'Term']/30)
    merged.loc[i,'time']=time
    if merged.UpsellDate.iloc[i]!=0:
        merged.loc[i,'UpsellDate']=diff(merged.UpsellDate.iloc[i],merged.RegistrationDate.iloc[i])/time
    merged.loc[i,'LastPaymentDate']=diff(merged.LastPaymentDate.iloc[i],merged.RegistrationDate.iloc[i])/time
    merged.loc[i,'ExpectedTermDate']=diff(merged.ExpectedTermDate.iloc[i],merged.RegistrationDate.iloc[i])/time
    merged.loc[i,'m1']=merged.loc[i,'m1']/merged.loc[i,'TotalContractValue']
    merged.loc[i,'m2']=merged.loc[i,'m2']/merged.loc[i,'TotalContractValue']
    merged.loc[i,'m3']=merged.loc[i,'m3']/merged.loc[i,'TotalContractValue']
    merged.loc[i,'m4']=merged.loc[i,'m4']/merged.loc[i,'TotalContractValue']
    merged.loc[i,'m5']=merged.loc[i,'m5']/merged.loc[i,'TotalContractValue']
    merged.loc[i,'m6']=merged.loc[i,'m6']/merged.loc[i,'TotalContractValue']
    dates=[]
    for j in merged.loc[i,'TransactionDates'].split("'"):
        if '-' in j:
            date=j.split('-')
            dates.append((int(date[0])-month+12*(int(date[1])-year))/time)
    merged.at[i,'TransactionDates']=dates
    history=[]
    for j in merged.loc[i,'PaymentsHistory'].split("[")[1].split("]")[0].split(","):
        if '.' in j:
            history.append(float(j)/merged.loc[i,'TotalContractValue'])
    merged.at[i,'PaymentsHistory']=history
    merged.loc[i,'PaidAmount']=sum(history)
    merged.loc[i,'PeriodDiff']=merged.loc[i,'LastPaymentDate']-dates[-1]
merged.drop(["RegistrationDate","Term"],axis=1,inplace=True)

Row: 100%|██████████| 37343/37343 [01:57<00:00, 317.02it/s]


In [22]:
merged.tail(2)

Unnamed: 0,ID,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,TotalContractValue,ExpectedTermDate,LastPaymentDate,TransactionDates,PaymentsHistory,m1,m2,m3,m4,m5,m6,time,PaidAmount,PeriodDiff
37341,ID_GHHAQ9D,2400,0,0.0,FINANCED,DAILY,50,3,Male,27.0,Other,UNKNOWN,Labourer,d_light,14400.0,1.0,1.75,"[0.0, 0.125, 0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]","[0.18194444444444444, 0.09027777777777778, 0.08333333333333333, 0.07291666666666667, 0.0763888888888889, 0.04513888888888889, 0.010416666666666666, 0.0038194444444444443, 0.021875]",0.034722,0.059028,0.045139,0.048611,0.041667,0.048611,8.0,0.586111,0.75
37342,ID_PAI1FJK,2400,0,0.0,FINANCED,DAILY,50,3,Male,36.0,Coast Region,Mombasa,Farmer,d_light,14400.0,1.0,1.875,"[0.0, 0.125, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0, 1.125]","[0.1701388888888889, 0.003472222222222222, 0.003472222222222222, 0.09375, 0.03125, 0.07291666666666667, 0.04861111111111111, 0.0798611111111111, 0.05277777777777778]",0.100694,0.055556,0.017361,0.003472,0.010417,0.045139,8.0,0.55625,0.75


In [23]:
from sklearn.preprocessing import LabelEncoder
lb_make = LabelEncoder()
category_list=['PaymentMethod','rateTypeEntity','MainApplicantGender','Region','Town','Occupation','SupplierName']
for s in category_list:
    merged[s] = lb_make.fit_transform(merged[s])
    print(lb_make.transform(lb_make.classes_))
    print(lb_make.classes_)
merged.head(2)

[0]
['FINANCED']
[0 1 2]
['DAILY' 'MONTHLY' 'WEEKLY']
[0 1]
['Female' 'Male']
[0 1 2 3 4 5 6 7]
['Coast Region' 'Mount Kenya Region' 'Nairobi Region' 'North Rift'
 'Nyanza' 'Other' 'South Rift' 'Western']
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47]
['Baringo' 'Bomet' 'Bungoma' 'Busia' 'Elgeyo/Marakwet' 'Embu' 'Garissa'
 'Homa Bay' 'Isiolo' 'Kajiado' 'Kakamega' 'Kericho' 'Kiambu' 'Kilifi'
 'Kirinyaga' 'Kisii' 'Kisumu' 'Kitui' 'Kwale' 'Laikipia' 'Lamu' 'Machakos'
 'Makueni' 'Mandera' 'Marsabit' 'Meru' 'Migori' 'Mombasa' 'Muranga'
 'Nairobi City' 'Nakuru' 'Nandi' 'Narok' 'Nyamira' 'Nyandarua' 'Nyeri'
 'Samburu' 'Siaya' 'Taita/Taveta' 'Tana River' 'Tharaka-Nithi'
 'Trans Nzoia' 'Turkana' 'UNKNOWN' 'Uasin Gishu' 'Vihiga' 'Wajir'
 'West Pokot']
[0 1 2 3 4 5 6]
['Business' 'Driver/Motorbike Rider' 'Farmer' 'Government Employee'
 'Labourer' 'Other' 'Teacher']
[0]
['d_light']


Unnamed: 0,ID,Deposit,UpsellDate,AccessoryRate,PaymentMethod,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,SupplierName,TotalContractValue,ExpectedTermDate,LastPaymentDate,TransactionDates,PaymentsHistory,m1,m2,m3,m4,m5,m6,time,PaidAmount,PeriodDiff
0,ID_K00S4N4,2000,0,0.0,0,0,35,7,1,41.0,1,5,5,0,14740.0,0.923077,0.769231,"[0.0, 0.07692307692307693, 0.15384615384615385, 0.23076923076923078, 0.3076923076923077]","[0.20691994572591588, 0.07123473541383989, 0.06173677069199457, 0.07123473541383989, 0.07123473541383989]",0.083107,0.071235,0.080733,0.035617,0.118725,0.128223,13.0,0.482361,0.461538
1,ID_6L67PAA,2000,0,0.0,0,0,35,7,1,33.0,0,13,5,0,14740.0,0.923077,4.076923,"[0.0, 0.07692307692307693, 0.15384615384615385, 0.23076923076923078, 0.38461538461538464, 0.5384615384615384]","[0.27137042062415195, 0.07123473541383989, 0.07123473541383989, 0.07123473541383989, 0.07123473541383989, 0.027137042062415198]",0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.583446,3.538462


In [24]:
## As these two variable has only one class.
merged.drop(['PaymentMethod','SupplierName'],axis=1,inplace=True)

In [25]:
merged.head(2)

Unnamed: 0,ID,Deposit,UpsellDate,AccessoryRate,rateTypeEntity,RatePerUnit,DaysOnDeposit,MainApplicantGender,Age,Region,Town,Occupation,TotalContractValue,ExpectedTermDate,LastPaymentDate,TransactionDates,PaymentsHistory,m1,m2,m3,m4,m5,m6,time,PaidAmount,PeriodDiff
0,ID_K00S4N4,2000,0,0.0,0,35,7,1,41.0,1,5,5,14740.0,0.923077,0.769231,"[0.0, 0.07692307692307693, 0.15384615384615385, 0.23076923076923078, 0.3076923076923077]","[0.20691994572591588, 0.07123473541383989, 0.06173677069199457, 0.07123473541383989, 0.07123473541383989]",0.083107,0.071235,0.080733,0.035617,0.118725,0.128223,13.0,0.482361,0.461538
1,ID_6L67PAA,2000,0,0.0,0,35,7,1,33.0,0,13,5,14740.0,0.923077,4.076923,"[0.0, 0.07692307692307693, 0.15384615384615385, 0.23076923076923078, 0.38461538461538464, 0.5384615384615384]","[0.27137042062415195, 0.07123473541383989, 0.07123473541383989, 0.07123473541383989, 0.07123473541383989, 0.027137042062415198]",0.0,0.0,0.0,0.0,0.0,0.0,13.0,0.583446,3.538462


## 4. Feature Extraction

In [21]:
from tsfresh import extract_features
df_final=pd.DataFrame(columns=["time","money","id"])
for k in tqdm(range(0,merged.shape[0])):
    df_temp=pd.DataFrame({"time":merged.loc[k,"TransactionDates"],"money":merged.loc[k,"PaymentsHistory"]})
    df_temp['id']=merged.loc[k,"ID"]
    df_final=pd.concat([df_final, df_temp], ignore_index=True)
print(df_final.shape)

100%|██████████| 37343/37343 [01:46<00:00, 351.41it/s]

(607851, 3)





In [118]:
df_final.head(10)

Unnamed: 0,time,money,id
0,0.0,0.20692,ID_K00S4N4
1,0.076923,0.071235,ID_K00S4N4
2,0.153846,0.061737,ID_K00S4N4
3,0.230769,0.071235,ID_K00S4N4
4,0.307692,0.071235,ID_K00S4N4
5,0.0,0.27137,ID_6L67PAA
6,0.076923,0.071235,ID_6L67PAA
7,0.153846,0.071235,ID_6L67PAA
8,0.230769,0.071235,ID_6L67PAA
9,0.384615,0.071235,ID_6L67PAA


In [23]:
df_p1= df_final.loc[0:100006]
df_p2= df_final.loc[100007:200016]
df_p3= df_final.loc[200017:300023]
df_p4= df_final.loc[300024:400005]
df_p5= df_final.loc[400006:500017]
df_p6= df_final.loc[500018:607851]

In [24]:
extracted_features1= extract_features(df_p1,column_id='id',column_sort="time").reset_index().rename(columns={'index':'ID'})

Feature Extraction: 100%|██████████| 20/20 [00:39<00:00,  2.00s/it]


In [25]:
extracted_features2= extract_features(df_p2,column_id='id',column_sort="time").reset_index().rename(columns={'index':'ID'})

Feature Extraction: 100%|██████████| 20/20 [00:41<00:00,  2.10s/it]


In [26]:
extracted_features3= extract_features(df_p3,column_id='id',column_sort="time").reset_index().rename(columns={'index':'ID'})

Feature Extraction: 100%|██████████| 20/20 [00:37<00:00,  1.88s/it]


In [27]:
extracted_features4= extract_features(df_p4,column_id='id',column_sort="time").reset_index().rename(columns={'index':'ID'})

Feature Extraction: 100%|██████████| 20/20 [00:32<00:00,  1.61s/it]


In [28]:
extracted_features5= extract_features(df_p5,column_id='id',column_sort="time").reset_index().rename(columns={'index':'ID'})

Feature Extraction: 100%|██████████| 20/20 [00:35<00:00,  1.77s/it]


In [29]:
extracted_features6= extract_features(df_p6,column_id='id',column_sort="time").reset_index().rename(columns={'index':'ID'})

Feature Extraction: 100%|██████████| 20/20 [01:18<00:00,  3.93s/it]


In [249]:
frames = [extracted_features1,extracted_features2,extracted_features3,extracted_features4,extracted_features5,extracted_features6]
  
merged_extraction = pd.concat(frames)

In [253]:
merged_extraction.shape

(37343, 780)

In [250]:
new_merged= merged[['ID','m1','m2','m3','m4','m5','m6']]

In [265]:
final_merge=pd.merge(new_merged, merged_extraction)

In [266]:
import re
final_merge = final_merge.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [267]:
final_merge= final_merge.drop(['money__value_count__value_1','money__number_crossing_m__m_1'],axis=1)

In [269]:
testl=[]
trainl=[]
t= len(final_merge)
for i in range(t):
    if (final_merge.loc[i,'m1']==0.000000):
        testl.append(final_merge.iloc[i,:])
    else:
        trainl.append(final_merge.iloc[i,:])

In [270]:
test_df= pd.DataFrame(testl)
train_df= pd.DataFrame(trainl)

In [271]:
test_df.shape, train_df.shape

((9336, 782), (28007, 782))

## 5. Test/Val Split

In [331]:
from sklearn.model_selection import train_test_split

var1_cols= [c for c in train_df.columns if c not in ['ID','m1']]
X1=train_df.loc[:,var_cols]
y1=train_df.loc[:,'m1']
X1_train, X1_valid, y1_train,y1_valid = train_test_split(X1,y1,test_size=0.1, random_state=20)

var2_cols= [c for c in train_df.columns if c not in ['ID','m2']]
X2=train_df.loc[:,var_cols]
y2=train_df.loc[:,'m2']
X2_train, X2_valid, y2_train, y2_valid = train_test_split(X2,y2,test_size=0.1, random_state=20)

var3_cols= [c for c in train_df.columns if c not in ['ID','m3']]
X3=train_df.loc[:,var_cols]
y3=train_df.loc[:,'m3']
X3_train, X3_valid, y3_train,y3_valid = train_test_split(X3,y3,test_size=0.1, random_state=20)

var4_cols= [c for c in train_df.columns if c not in ['ID','m4']]
X4=train_df.loc[:,var_cols]
y4=train_df.loc[:,'m4']
X4_train, X4_valid, y4_train,y4_valid = train_test_split(X4,y4,test_size=0.1, random_state=20)

var5_cols= [c for c in train_df.columns if c not in ['ID','m5']]
X5=train_df.loc[:,var_cols]
y5=train_df.loc[:,'m5']
X5_train, X5_valid, y5_train,y5_valid = train_test_split(X5,y5,test_size=0.1, random_state=20)

var6_cols= [c for c in train_df.columns if c not in ['ID','m6']]
X6=train_df.loc[:,var_cols]
y6=train_df.loc[:,'m6']
X6_train, X6_valid, y6_train,y6_valid = train_test_split(X6,y6,test_size=0.1, random_state=20)

## 6. Training model

In [332]:
import lightgbm as lgb

In [333]:
params= {
    'metric':'rmse',
    'boosting': 'gbdt',
    'num_leaves': 45,
    'feature_fraction': 0.5,
    'bagging_fraction': 0.5,
    'bagging_frequency': 20,
    'learning_rate': 0.01
}

In [334]:
train_data1= lgb.Dataset(X1_train, label= y1_train)
valid_data1= lgb.Dataset(X1_valid, label= y1_valid)

In [335]:
train_data2= lgb.Dataset(X2_train, label= y2_train)
valid_data2= lgb.Dataset(X2_valid, label= y2_valid)

In [336]:
train_data3= lgb.Dataset(X3_train, label= y3_train)
valid_data3= lgb.Dataset(X3_valid, label= y3_valid)

In [337]:
train_data4= lgb.Dataset(X4_train, label= y4_train)
valid_data4= lgb.Dataset(X4_valid, label= y4_valid)

In [338]:
train_data5= lgb.Dataset(X5_train, label= y5_train)
valid_data5= lgb.Dataset(X5_valid, label= y5_valid)

In [339]:
train_data6= lgb.Dataset(X6_train, label= y6_train)
valid_data6= lgb.Dataset(X6_valid, label= y6_valid)

In [319]:
model1= lgb.train(params,
                train_data1,
                valid_sets= valid_data1,
                num_boost_round= 4000,
                early_stopping_rounds= 50)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91301
[LightGBM] [Info] Number of data points in the train set: 25206, number of used features: 405
[LightGBM] [Info] Start training from score 816.491788
[1]	valid_0's rmse: 607.288
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 603.449
[3]	valid_0's rmse: 599.781
[4]	valid_0's rmse: 593.783
[5]	valid_0's rmse: 587.871
[6]	valid_0's rmse: 584.214
[7]	valid_0's rmse: 578.375
[8]	valid_0's rmse: 572.625
[9]	valid_0's rmse: 569.524
[10]	valid_0's rmse: 563.858
[11]	valid_0's rmse: 558.247
[12]	valid_0's rmse: 554.999
[13]	valid_0's rmse: 551.527
[14]	valid_0's rmse: 546.051
[15]	valid_0's rmse: 540.632
[16]	valid_0's rmse: 535.236
[17]	valid_0's rmse: 532.015
[18]	valid_0's rmse: 529.07
[19]	valid_0's rmse: 525.985
[20]	valid_0's rmse: 522.961
[21]	valid_0's rmse: 517.751
[22]	valid_0's rmse: 512.593
[23]	valid_0's rmse: 507.517
[24]	valid_0's rmse: 504.664
[25]	valid_

[256]	valid_0's rmse: 135.038
[257]	valid_0's rmse: 134.115
[258]	valid_0's rmse: 134.031
[259]	valid_0's rmse: 133.097
[260]	valid_0's rmse: 132.113
[261]	valid_0's rmse: 132.04
[262]	valid_0's rmse: 131.967
[263]	valid_0's rmse: 131.07
[264]	valid_0's rmse: 131.009
[265]	valid_0's rmse: 130.948
[266]	valid_0's rmse: 130.135
[267]	valid_0's rmse: 129.296
[268]	valid_0's rmse: 128.451
[269]	valid_0's rmse: 128.372
[270]	valid_0's rmse: 128.286
[271]	valid_0's rmse: 128.215
[272]	valid_0's rmse: 127.346
[273]	valid_0's rmse: 127.25
[274]	valid_0's rmse: 127.175
[275]	valid_0's rmse: 126.351
[276]	valid_0's rmse: 125.538
[277]	valid_0's rmse: 125.492
[278]	valid_0's rmse: 124.616
[279]	valid_0's rmse: 124.554
[280]	valid_0's rmse: 124.457
[281]	valid_0's rmse: 123.605
[282]	valid_0's rmse: 123.563
[283]	valid_0's rmse: 123.563
[284]	valid_0's rmse: 123.493
[285]	valid_0's rmse: 122.652
[286]	valid_0's rmse: 122.612
[287]	valid_0's rmse: 122.608
[288]	valid_0's rmse: 122.586
[289]	valid_0

[533]	valid_0's rmse: 71.4868
[534]	valid_0's rmse: 71.2716
[535]	valid_0's rmse: 71.0303
[536]	valid_0's rmse: 71.0359
[537]	valid_0's rmse: 70.9991
[538]	valid_0's rmse: 71.0494
[539]	valid_0's rmse: 70.7792
[540]	valid_0's rmse: 70.5612
[541]	valid_0's rmse: 70.6018
[542]	valid_0's rmse: 70.627
[543]	valid_0's rmse: 70.5982
[544]	valid_0's rmse: 70.3352
[545]	valid_0's rmse: 70.3295
[546]	valid_0's rmse: 70.3407
[547]	valid_0's rmse: 70.1216
[548]	valid_0's rmse: 70.2946
[549]	valid_0's rmse: 70.3536
[550]	valid_0's rmse: 70.1344
[551]	valid_0's rmse: 69.8871
[552]	valid_0's rmse: 69.91
[553]	valid_0's rmse: 69.6663
[554]	valid_0's rmse: 69.6248
[555]	valid_0's rmse: 69.4059
[556]	valid_0's rmse: 69.4352
[557]	valid_0's rmse: 69.1773
[558]	valid_0's rmse: 68.9534
[559]	valid_0's rmse: 68.7419
[560]	valid_0's rmse: 68.8365
[561]	valid_0's rmse: 68.6222
[562]	valid_0's rmse: 68.6487
[563]	valid_0's rmse: 68.4366
[564]	valid_0's rmse: 68.2317
[565]	valid_0's rmse: 68.2327
[566]	valid_0

[808]	valid_0's rmse: 57.9468
[809]	valid_0's rmse: 57.9652
[810]	valid_0's rmse: 57.8428
[811]	valid_0's rmse: 57.9266
[812]	valid_0's rmse: 57.7978
[813]	valid_0's rmse: 57.849
[814]	valid_0's rmse: 57.7113
[815]	valid_0's rmse: 57.6963
[816]	valid_0's rmse: 57.5866
[817]	valid_0's rmse: 57.708
[818]	valid_0's rmse: 57.6939
[819]	valid_0's rmse: 57.6901
[820]	valid_0's rmse: 57.672
[821]	valid_0's rmse: 57.5558
[822]	valid_0's rmse: 57.516
[823]	valid_0's rmse: 57.4957
[824]	valid_0's rmse: 57.4588
[825]	valid_0's rmse: 57.3204
[826]	valid_0's rmse: 57.3819
[827]	valid_0's rmse: 57.269
[828]	valid_0's rmse: 57.2769
[829]	valid_0's rmse: 57.1467
[830]	valid_0's rmse: 57.2105
[831]	valid_0's rmse: 57.3354
[832]	valid_0's rmse: 57.2136
[833]	valid_0's rmse: 57.1082
[834]	valid_0's rmse: 57.1885
[835]	valid_0's rmse: 57.1984
[836]	valid_0's rmse: 57.2179
[837]	valid_0's rmse: 57.226
[838]	valid_0's rmse: 57.0988
[839]	valid_0's rmse: 57.0945
[840]	valid_0's rmse: 56.9749
[841]	valid_0's 

In [340]:
y1_train_pred=model1.predict(X1_train)
y1_valid_pred=model1.predict(X1_valid)

In [341]:
model2= lgb.train(params,
                train_data2,
                valid_sets= valid_data2,
                num_boost_round= 1000,
                early_stopping_rounds= 50)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91301
[LightGBM] [Info] Number of data points in the train set: 25206, number of used features: 405
[LightGBM] [Info] Start training from score 808.399349
[1]	valid_0's rmse: 599.845
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 593.553
[3]	valid_0's rmse: 587.328
[4]	valid_0's rmse: 581.169
[5]	valid_0's rmse: 577.269
[6]	valid_0's rmse: 571.205
[7]	valid_0's rmse: 567.468
[8]	valid_0's rmse: 563.991
[9]	valid_0's rmse: 560.772
[10]	valid_0's rmse: 557.145
[11]	valid_0's rmse: 553.503
[12]	valid_0's rmse: 550.051
[13]	valid_0's rmse: 544.266
[14]	valid_0's rmse: 538.548
[15]	valid_0's rmse: 532.89
[16]	valid_0's rmse: 527.29
[17]	valid_0's rmse: 521.751
[18]	valid_0's rmse: 518.519
[19]	valid_0's rmse: 513.081
[20]	valid_0's rmse: 507.704
[21]	valid_0's rmse: 502.384
[22]	valid_0's rmse: 497.125
[23]	valid_0's rmse: 493.978
[24]	valid_0's rmse: 491.006
[25]	valid_0

[264]	valid_0's rmse: 157.033
[265]	valid_0's rmse: 157.014
[266]	valid_0's rmse: 156.917
[267]	valid_0's rmse: 156.621
[268]	valid_0's rmse: 156.335
[269]	valid_0's rmse: 156.071
[270]	valid_0's rmse: 155.922
[271]	valid_0's rmse: 155.682
[272]	valid_0's rmse: 155.572
[273]	valid_0's rmse: 155.582
[274]	valid_0's rmse: 155.627
[275]	valid_0's rmse: 155.37
[276]	valid_0's rmse: 155.251
[277]	valid_0's rmse: 155.025
[278]	valid_0's rmse: 154.836
[279]	valid_0's rmse: 154.647
[280]	valid_0's rmse: 154.451
[281]	valid_0's rmse: 154.291
[282]	valid_0's rmse: 154.221
[283]	valid_0's rmse: 154.08
[284]	valid_0's rmse: 154.087
[285]	valid_0's rmse: 153.925
[286]	valid_0's rmse: 153.759
[287]	valid_0's rmse: 153.604
[288]	valid_0's rmse: 153.583
[289]	valid_0's rmse: 153.725
[290]	valid_0's rmse: 153.566
[291]	valid_0's rmse: 153.423
[292]	valid_0's rmse: 153.398
[293]	valid_0's rmse: 153.383
[294]	valid_0's rmse: 153.264
[295]	valid_0's rmse: 153.153
[296]	valid_0's rmse: 153.098
[297]	valid_

[546]	valid_0's rmse: 148.338
[547]	valid_0's rmse: 148.335
[548]	valid_0's rmse: 148.34
[549]	valid_0's rmse: 148.45
[550]	valid_0's rmse: 148.553
[551]	valid_0's rmse: 148.548
[552]	valid_0's rmse: 147.995
[553]	valid_0's rmse: 148.145
[554]	valid_0's rmse: 148.044
[555]	valid_0's rmse: 148.175
[556]	valid_0's rmse: 148.385
[557]	valid_0's rmse: 148.537
[558]	valid_0's rmse: 148.317
[559]	valid_0's rmse: 148.212
[560]	valid_0's rmse: 148.363
[561]	valid_0's rmse: 148.565
[562]	valid_0's rmse: 148.473
[563]	valid_0's rmse: 148.625
[564]	valid_0's rmse: 148.729
[565]	valid_0's rmse: 148.962
[566]	valid_0's rmse: 148.907
[567]	valid_0's rmse: 148.825
[568]	valid_0's rmse: 148.73
[569]	valid_0's rmse: 148.743
[570]	valid_0's rmse: 148.89
[571]	valid_0's rmse: 149.079
[572]	valid_0's rmse: 148.857
[573]	valid_0's rmse: 149.054
[574]	valid_0's rmse: 148.957
[575]	valid_0's rmse: 149.131
[576]	valid_0's rmse: 149.036
[577]	valid_0's rmse: 148.92
[578]	valid_0's rmse: 149.077
[579]	valid_0's

In [342]:
y2_train_pred=model2.predict(X2_train)
y2_valid_pred=model2.predict(X2_valid)

In [343]:
model3= lgb.train(params,
                train_data3,
                valid_sets= valid_data3,
                num_boost_round= 4000,
                early_stopping_rounds= 50)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91301
[LightGBM] [Info] Number of data points in the train set: 25206, number of used features: 405
[LightGBM] [Info] Start training from score 799.201857
[1]	valid_0's rmse: 627.513
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 623.55
[3]	valid_0's rmse: 619.727
[4]	valid_0's rmse: 615.936
[5]	valid_0's rmse: 609.885
[6]	valid_0's rmse: 603.896
[7]	valid_0's rmse: 597.969
[8]	valid_0's rmse: 594.935
[9]	valid_0's rmse: 591.698
[10]	valid_0's rmse: 585.905
[11]	valid_0's rmse: 580.17
[12]	valid_0's rmse: 574.485
[13]	valid_0's rmse: 570.995
[14]	valid_0's rmse: 565.405
[15]	valid_0's rmse: 562.037
[16]	valid_0's rmse: 556.551
[17]	valid_0's rmse: 551.116
[18]	valid_0's rmse: 545.743
[19]	valid_0's rmse: 540.413
[20]	valid_0's rmse: 535.149
[21]	valid_0's rmse: 531.98
[22]	valid_0's rmse: 528.768
[23]	valid_0's rmse: 523.626
[24]	valid_0's rmse: 518.54
[25]	valid_0's

[265]	valid_0's rmse: 153.299
[266]	valid_0's rmse: 153.258
[267]	valid_0's rmse: 153.199
[268]	valid_0's rmse: 153.143
[269]	valid_0's rmse: 152.528
[270]	valid_0's rmse: 152.501
[271]	valid_0's rmse: 151.885
[272]	valid_0's rmse: 151.839
[273]	valid_0's rmse: 151.801
[274]	valid_0's rmse: 151.761
[275]	valid_0's rmse: 151.178
[276]	valid_0's rmse: 150.592
[277]	valid_0's rmse: 150.022
[278]	valid_0's rmse: 149.999
[279]	valid_0's rmse: 149.975
[280]	valid_0's rmse: 149.399
[281]	valid_0's rmse: 149.38
[282]	valid_0's rmse: 149.34
[283]	valid_0's rmse: 149.284
[284]	valid_0's rmse: 149.262
[285]	valid_0's rmse: 149.247
[286]	valid_0's rmse: 148.703
[287]	valid_0's rmse: 148.159
[288]	valid_0's rmse: 148.14
[289]	valid_0's rmse: 148.116
[290]	valid_0's rmse: 148.075
[291]	valid_0's rmse: 148.048
[292]	valid_0's rmse: 147.58
[293]	valid_0's rmse: 147.061
[294]	valid_0's rmse: 147.023
[295]	valid_0's rmse: 146.517
[296]	valid_0's rmse: 146.512
[297]	valid_0's rmse: 146.013
[298]	valid_0'

[553]	valid_0's rmse: 117.409
[554]	valid_0's rmse: 117.322
[555]	valid_0's rmse: 117.238
[556]	valid_0's rmse: 117.253
[557]	valid_0's rmse: 117.266
[558]	valid_0's rmse: 117.18
[559]	valid_0's rmse: 117.217
[560]	valid_0's rmse: 117.217
[561]	valid_0's rmse: 117.228
[562]	valid_0's rmse: 117.221
[563]	valid_0's rmse: 117.175
[564]	valid_0's rmse: 117.181
[565]	valid_0's rmse: 117.103
[566]	valid_0's rmse: 117.098
[567]	valid_0's rmse: 117.096
[568]	valid_0's rmse: 117.094
[569]	valid_0's rmse: 117.11
[570]	valid_0's rmse: 117.117
[571]	valid_0's rmse: 117.131
[572]	valid_0's rmse: 117.074
[573]	valid_0's rmse: 116.992
[574]	valid_0's rmse: 116.997
[575]	valid_0's rmse: 117.02
[576]	valid_0's rmse: 117.011
[577]	valid_0's rmse: 117.051
[578]	valid_0's rmse: 116.964
[579]	valid_0's rmse: 116.961
[580]	valid_0's rmse: 116.884
[581]	valid_0's rmse: 116.798
[582]	valid_0's rmse: 116.793
[583]	valid_0's rmse: 116.79
[584]	valid_0's rmse: 116.806
[585]	valid_0's rmse: 116.827
[586]	valid_0'

[831]	valid_0's rmse: 112.901
[832]	valid_0's rmse: 112.878
[833]	valid_0's rmse: 112.823
[834]	valid_0's rmse: 112.817
[835]	valid_0's rmse: 112.838
[836]	valid_0's rmse: 112.876
[837]	valid_0's rmse: 112.836
[838]	valid_0's rmse: 112.817
[839]	valid_0's rmse: 112.769
[840]	valid_0's rmse: 112.74
[841]	valid_0's rmse: 112.698
[842]	valid_0's rmse: 112.73
[843]	valid_0's rmse: 112.69
[844]	valid_0's rmse: 112.634
[845]	valid_0's rmse: 112.62
[846]	valid_0's rmse: 112.634
[847]	valid_0's rmse: 112.644
[848]	valid_0's rmse: 112.61
[849]	valid_0's rmse: 112.571
[850]	valid_0's rmse: 112.577
[851]	valid_0's rmse: 112.552
[852]	valid_0's rmse: 112.542
[853]	valid_0's rmse: 112.505
[854]	valid_0's rmse: 112.539
[855]	valid_0's rmse: 112.518
[856]	valid_0's rmse: 112.476
[857]	valid_0's rmse: 112.482
[858]	valid_0's rmse: 112.437
[859]	valid_0's rmse: 112.463
[860]	valid_0's rmse: 112.45
[861]	valid_0's rmse: 112.41
[862]	valid_0's rmse: 112.387
[863]	valid_0's rmse: 112.366
[864]	valid_0's r

In [344]:
y3_train_pred=model3.predict(X3_train)
y3_valid_pred=model3.predict(X3_valid)

In [345]:
model4= lgb.train(params,
                train_data4,
                valid_sets= valid_data4,
                num_boost_round= 4000,
                early_stopping_rounds= 50)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 91301
[LightGBM] [Info] Number of data points in the train set: 25206, number of used features: 405
[LightGBM] [Info] Start training from score 814.344759
[1]	valid_0's rmse: 890.57
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 884.729
[3]	valid_0's rmse: 878.959
[4]	valid_0's rmse: 876.337
[5]	valid_0's rmse: 870.679
[6]	valid_0's rmse: 865.081
[7]	valid_0's rmse: 862.702
[8]	valid_0's rmse: 860.677
[9]	valid_0's rmse: 855.211
[10]	valid_0's rmse: 852.946
[11]	valid_0's rmse: 847.589
[12]	valid_0's rmse: 845.267
[13]	valid_0's rmse: 840.008
[14]	valid_0's rmse: 837.686
[15]	valid_0's rmse: 835.469
[16]	valid_0's rmse: 830.341
[17]	valid_0's rmse: 828.073
[18]	valid_0's rmse: 823.043
[19]	valid_0's rmse: 820.869
[20]	valid_0's rmse: 815.927
[21]	valid_0's rmse: 811.053
[22]	valid_0's rmse: 806.23
[23]	v

[260]	valid_0's rmse: 508.118
[261]	valid_0's rmse: 508.103
[262]	valid_0's rmse: 507.449
[263]	valid_0's rmse: 507.414
[264]	valid_0's rmse: 506.791
[265]	valid_0's rmse: 506.745
[266]	valid_0's rmse: 506.102
[267]	valid_0's rmse: 505.473
[268]	valid_0's rmse: 504.849
[269]	valid_0's rmse: 504.832
[270]	valid_0's rmse: 504.816
[271]	valid_0's rmse: 504.187
[272]	valid_0's rmse: 504.181
[273]	valid_0's rmse: 503.661
[274]	valid_0's rmse: 503.069
[275]	valid_0's rmse: 503.058
[276]	valid_0's rmse: 502.45
[277]	valid_0's rmse: 502.443
[278]	valid_0's rmse: 502.427
[279]	valid_0's rmse: 502.408
[280]	valid_0's rmse: 502.398
[281]	valid_0's rmse: 502.392
[282]	valid_0's rmse: 502.39
[283]	valid_0's rmse: 501.81
[284]	valid_0's rmse: 501.795
[285]	valid_0's rmse: 501.788
[286]	valid_0's rmse: 501.782
[287]	valid_0's rmse: 501.197
[288]	valid_0's rmse: 501.175
[289]	valid_0's rmse: 501.168
[290]	valid_0's rmse: 500.602
[291]	valid_0's rmse: 500.039
[292]	valid_0's rmse: 500.032
[293]	valid_0

[544]	valid_0's rmse: 463.673
[545]	valid_0's rmse: 463.53
[546]	valid_0's rmse: 463.53
[547]	valid_0's rmse: 463.533
[548]	valid_0's rmse: 463.39
[549]	valid_0's rmse: 463.39
[550]	valid_0's rmse: 463.248
[551]	valid_0's rmse: 463.251
[552]	valid_0's rmse: 463.112
[553]	valid_0's rmse: 462.995
[554]	valid_0's rmse: 462.858
[555]	valid_0's rmse: 462.722
[556]	valid_0's rmse: 462.722
[557]	valid_0's rmse: 462.728
[558]	valid_0's rmse: 462.734
[559]	valid_0's rmse: 462.738
[560]	valid_0's rmse: 462.609
[561]	valid_0's rmse: 462.476
[562]	valid_0's rmse: 462.363
[563]	valid_0's rmse: 462.231
[564]	valid_0's rmse: 462.233
[565]	valid_0's rmse: 462.104
[566]	valid_0's rmse: 462.107
[567]	valid_0's rmse: 461.98
[568]	valid_0's rmse: 461.853
[569]	valid_0's rmse: 461.75
[570]	valid_0's rmse: 461.628
[571]	valid_0's rmse: 461.631
[572]	valid_0's rmse: 461.633
[573]	valid_0's rmse: 461.633
[574]	valid_0's rmse: 461.639
[575]	valid_0's rmse: 461.636
[576]	valid_0's rmse: 461.515
[577]	valid_0's 

[826]	valid_0's rmse: 453.113
[827]	valid_0's rmse: 453.191
[828]	valid_0's rmse: 453.191
[829]	valid_0's rmse: 453.195
[830]	valid_0's rmse: 453.201
[831]	valid_0's rmse: 453.156
[832]	valid_0's rmse: 453.157
[833]	valid_0's rmse: 453.111
[834]	valid_0's rmse: 453.189
[835]	valid_0's rmse: 453.189
[836]	valid_0's rmse: 453.191
[837]	valid_0's rmse: 453.194
[838]	valid_0's rmse: 453.15
[839]	valid_0's rmse: 453.156
[840]	valid_0's rmse: 453.163
[841]	valid_0's rmse: 453.12
[842]	valid_0's rmse: 453.077
[843]	valid_0's rmse: 453.082
[844]	valid_0's rmse: 453.082
[845]	valid_0's rmse: 453.084
[846]	valid_0's rmse: 453.084
[847]	valid_0's rmse: 453.034
[848]	valid_0's rmse: 453.11
[849]	valid_0's rmse: 453.112
[850]	valid_0's rmse: 453.121
[851]	valid_0's rmse: 453.124
[852]	valid_0's rmse: 453.082
[853]	valid_0's rmse: 453.085
[854]	valid_0's rmse: 453.043
[855]	valid_0's rmse: 453.042
[856]	valid_0's rmse: 453.047
[857]	valid_0's rmse: 452.995
[858]	valid_0's rmse: 452.952
[859]	valid_0

[1097]	valid_0's rmse: 452.241
[1098]	valid_0's rmse: 452.24
[1099]	valid_0's rmse: 452.244
[1100]	valid_0's rmse: 452.316
[1101]	valid_0's rmse: 452.28
[1102]	valid_0's rmse: 452.242
[1103]	valid_0's rmse: 452.243
[1104]	valid_0's rmse: 452.205
[1105]	valid_0's rmse: 452.21
[1106]	valid_0's rmse: 452.21
[1107]	valid_0's rmse: 452.174
[1108]	valid_0's rmse: 452.18
[1109]	valid_0's rmse: 452.254
[1110]	valid_0's rmse: 452.257
[1111]	valid_0's rmse: 452.218
[1112]	valid_0's rmse: 452.182
[1113]	valid_0's rmse: 452.184
[1114]	valid_0's rmse: 452.191
[1115]	valid_0's rmse: 452.193
[1116]	valid_0's rmse: 452.265
[1117]	valid_0's rmse: 452.226
[1118]	valid_0's rmse: 452.307
[1119]	valid_0's rmse: 452.306
[1120]	valid_0's rmse: 452.269
[1121]	valid_0's rmse: 452.274
[1122]	valid_0's rmse: 452.28
[1123]	valid_0's rmse: 452.278
[1124]	valid_0's rmse: 452.279
[1125]	valid_0's rmse: 452.24
[1126]	valid_0's rmse: 452.241
[1127]	valid_0's rmse: 452.204
[1128]	valid_0's rmse: 452.277
[1129]	valid_0'

In [346]:
y4_train_pred=model4.predict(X4_train)
y4_valid_pred=model4.predict(X4_valid)

In [347]:
model5= lgb.train(params,
                train_data5,
                valid_sets= valid_data5,
                num_boost_round= 1000,
                early_stopping_rounds= 50)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91301
[LightGBM] [Info] Number of data points in the train set: 25206, number of used features: 405
[LightGBM] [Info] Start training from score 827.659327
[1]	valid_0's rmse: 764.348
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 756.71
[3]	valid_0's rmse: 753.61
[4]	valid_0's rmse: 746.1
[5]	valid_0's rmse: 743.055
[6]	valid_0's rmse: 739.966
[7]	valid_0's rmse: 737.205
[8]	valid_0's rmse: 734.88
[9]	valid_0's rmse: 732.163
[10]	valid_0's rmse: 729.575
[11]	valid_0's rmse: 722.323
[12]	valid_0's rmse: 715.147
[13]	valid_0's rmse: 708.05
[14]	valid_0's rmse: 701.026
[15]	valid_0's rmse: 694.078
[16]	valid_0's rmse: 691.317
[17]	valid_0's rmse: 684.476
[18]	valid_0's rmse: 677.707
[19]	valid_0's rmse: 671.011
[20]	valid_0's rmse: 664.385
[21]	valid_0's rmse: 661.871
[22]	valid_0's rmse: 655.345
[23]	valid_0's rmse: 653.094
[24]	valid_0's rmse: 646.666
[25]	valid_0's r

[266]	valid_0's rmse: 206.948
[267]	valid_0's rmse: 206.904
[268]	valid_0's rmse: 206.864
[269]	valid_0's rmse: 206.126
[270]	valid_0's rmse: 206.076
[271]	valid_0's rmse: 205.405
[272]	valid_0's rmse: 204.694
[273]	valid_0's rmse: 204.674
[274]	valid_0's rmse: 203.973
[275]	valid_0's rmse: 203.979
[276]	valid_0's rmse: 203.281
[277]	valid_0's rmse: 203.249
[278]	valid_0's rmse: 202.577
[279]	valid_0's rmse: 201.978
[280]	valid_0's rmse: 201.329
[281]	valid_0's rmse: 200.69
[282]	valid_0's rmse: 200.095
[283]	valid_0's rmse: 200.067
[284]	valid_0's rmse: 199.54
[285]	valid_0's rmse: 198.932
[286]	valid_0's rmse: 198.915
[287]	valid_0's rmse: 198.311
[288]	valid_0's rmse: 197.768
[289]	valid_0's rmse: 197.763
[290]	valid_0's rmse: 197.735
[291]	valid_0's rmse: 197.185
[292]	valid_0's rmse: 197.202
[293]	valid_0's rmse: 196.635
[294]	valid_0's rmse: 196.637
[295]	valid_0's rmse: 196.651
[296]	valid_0's rmse: 196.651
[297]	valid_0's rmse: 196.642
[298]	valid_0's rmse: 196.632
[299]	valid_

[541]	valid_0's rmse: 173.52
[542]	valid_0's rmse: 173.566
[543]	valid_0's rmse: 173.537
[544]	valid_0's rmse: 173.512
[545]	valid_0's rmse: 173.479
Early stopping, best iteration is:
[495]	valid_0's rmse: 173.135


In [348]:
y5_train_pred=model5.predict(X5_train)
y5_valid_pred=model5.predict(X5_valid)

In [349]:
model6= lgb.train(params,
                train_data6,
                valid_sets= valid_data6,
                num_boost_round= 1000,
                early_stopping_rounds= 50)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 91301
[LightGBM] [Info] Number of data points in the train set: 25206, number of used features: 405
[LightGBM] [Info] Start training from score 693.298976
[1]	valid_0's rmse: 1384.51
Training until validation scores don't improve for 50 rounds
[2]	valid_0's rmse: 1382.3
[3]	valid_0's rmse: 1380.19
[4]	valid_0's rmse: 1379.07
[5]	valid_0's rmse: 1377.77
[6]	valid_0's rmse: 1376.39
[7]	valid_0's rmse: 1375.01
[8]	valid_0's rmse: 1374.23
[9]	valid_0's rmse: 1372.84
[10]	valid_0's rmse: 1371.8
[11]	valid_0's rmse: 1370.65
[12]	valid_0's rmse: 1369.66
[13]	valid_0's rmse: 1367.62
[14]	valid_0's rmse: 1366.21
[15]	valid_0's rmse: 1365.17
[16]	valid_0's rmse: 1363.54
[17]	valid_0's rmse: 1362.87
[18]	valid_0's rmse: 1361.8
[19]	valid_0's rmse: 1360.09
[20]	valid_0's rmse: 1358.31
[21]	valid_0's rmse: 1356.67
[22]	valid_0's rmse: 1355.95
[23]	valid_0's rmse: 1355.04
[24]	valid_0's rmse: 1354.36
[25]	valid_0'

In [350]:
y6_train_pred=model6.predict(X6_train)
y6_valid_pred=model6.predict(X6_valid)

## 7. Updating Sample Submission

In [359]:
ss.head(2)

Unnamed: 0,ID,Target
0,ID_6L67PAA x m1,0.0
1,ID_6L67PAA x m2,0.0


In [363]:
X1_test= test_df.loc[:, var1_cols]
ss1= model1.predict(X1_test)

In [364]:
X2_test= test_df.loc[:, var2_cols]
ss2= model2.predict(X2_test)

In [365]:
X3_test= test_df.loc[:, var3_cols]
ss3= model3.predict(X3_test)

In [366]:
X4_test= test_df.loc[:, var4_cols]
ss4= model4.predict(X4_test)

In [367]:
X5_test= test_df.loc[:, var5_cols]
ss5= model5.predict(X5_test)

In [368]:
X6_test= test_df.loc[:, var6_cols]
ss6= model6.predict(X6_test)

In [373]:
t= len(test_df)
for i in range(0,t):
    ss.loc[i,'Target']=ss1[i]
    ss.loc[i+1,'Target']=ss2[i]
    ss.loc[i+2,'Target']=ss3[i]
    ss.loc[i+3,'Target']=ss4[i]
    ss.loc[i+4,'Target']=ss5[i]
    ss.loc[i+5,'Target']=ss6[i]

In [376]:
ss.to_csv("submission.csv",index=False)