In [54]:
'''Plan:
1. Load libraries, load data.
2. Preliminary EDA, dealing with missing values, merging train and test.
3. EDA, deleting variables.
4. Feature engineering, ohc.
5. Sample formation.
6. Feature scaling.
7. Model fitting.
8. Performance evaluation.
9. Predictions.
'''

### 1. Load libraries ###

import numpy as np
import pandas as pd
import os, warnings, random, time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
#os.getcwd()

In [55]:
### Load data ###

train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
print(train.shape)

test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

(957919, 120)


In [56]:
train = train.sample(n=50000)

In [57]:
### 2. ###

train.shape
train.describe()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
count,50000.0,49239.0,49204.0,49156.0,49172.0,49188.0,49156.0,49173.0,49184.0,49232.0,...,49247.0,49246.0,49239.0,49173.0,49159.0,49195.0,49156.0,49188.0,49188.0,50000.0
mean,478811.77308,0.090148,0.345875,4083.993706,0.201427,0.304232,-0.078651,1605.932553,374073.2,1824343000000000.0,...,-19.87989,2.078207,23.570016,1.700365,63866.922886,1.208409,4.230062e+16,3964.21478,0.559079,0.49948
std,276869.735891,0.043462,0.145976,6453.816083,0.212285,0.14626,2.10298,1274.775041,344079.2,2345513000000000.0,...,18.498894,0.898091,45.278472,10.108747,92835.579408,0.115461,6.688385e+16,3160.828858,0.406363,0.500005
min,18.0,-0.12838,-0.018642,-8370.6,-0.053539,-0.006659,-12.097,-198.59,-24216.0,-1050800000000000.0,...,-102.41,0.30535,-24.187,-26.346,-54197.0,0.90527,-6779000000000000.0,-407.92,-0.12287,0.0
25%,238495.75,0.070059,0.28254,420.4125,0.034772,0.23908,-1.1134,460.17,89663.25,11642250000000.0,...,-28.6575,1.489,-0.616865,-4.5492,2499.5,1.1465,224825000000000.0,1307.725,0.2765,0.0
50%,477330.5,0.090157,0.38885,1287.25,0.13808,0.327085,-0.40146,1426.5,286895.0,524725000000000.0,...,-14.66,1.6629,1.724,0.80771,20131.0,1.1768,1.3053e+16,3226.2,0.47435,0.0
75%,719372.25,0.11683,0.45841,4396.95,0.297975,0.413793,0.901095,2483.6,555125.0,3157325000000000.0,...,-5.37395,2.54655,18.4055,6.8358,90891.5,1.2404,5.2191e+16,6150.8,0.747753,1.0
max,957913.0,0.41004,0.51899,39544.0,1.2856,0.54912,10.55,5337.2,1871000.0,1.0304e+16,...,1.3991,4.5066,211.93,46.983,511400.0,1.7896,3.2176e+17,13148.0,2.632,1.0


In [58]:
# are there numerical features?

un_colval = pd.DataFrame([[x,len(train[x].unique())] for x in train.columns], columns = ['colname', 'n_unique'])
un_colval.loc[un_colval.n_unique < 100]
# all columns contain numerical features

Unnamed: 0,colname,n_unique
119,claim,2


In [59]:
train.describe()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
count,50000.0,49239.0,49204.0,49156.0,49172.0,49188.0,49156.0,49173.0,49184.0,49232.0,...,49247.0,49246.0,49239.0,49173.0,49159.0,49195.0,49156.0,49188.0,49188.0,50000.0
mean,478811.77308,0.090148,0.345875,4083.993706,0.201427,0.304232,-0.078651,1605.932553,374073.2,1824343000000000.0,...,-19.87989,2.078207,23.570016,1.700365,63866.922886,1.208409,4.230062e+16,3964.21478,0.559079,0.49948
std,276869.735891,0.043462,0.145976,6453.816083,0.212285,0.14626,2.10298,1274.775041,344079.2,2345513000000000.0,...,18.498894,0.898091,45.278472,10.108747,92835.579408,0.115461,6.688385e+16,3160.828858,0.406363,0.500005
min,18.0,-0.12838,-0.018642,-8370.6,-0.053539,-0.006659,-12.097,-198.59,-24216.0,-1050800000000000.0,...,-102.41,0.30535,-24.187,-26.346,-54197.0,0.90527,-6779000000000000.0,-407.92,-0.12287,0.0
25%,238495.75,0.070059,0.28254,420.4125,0.034772,0.23908,-1.1134,460.17,89663.25,11642250000000.0,...,-28.6575,1.489,-0.616865,-4.5492,2499.5,1.1465,224825000000000.0,1307.725,0.2765,0.0
50%,477330.5,0.090157,0.38885,1287.25,0.13808,0.327085,-0.40146,1426.5,286895.0,524725000000000.0,...,-14.66,1.6629,1.724,0.80771,20131.0,1.1768,1.3053e+16,3226.2,0.47435,0.0
75%,719372.25,0.11683,0.45841,4396.95,0.297975,0.413793,0.901095,2483.6,555125.0,3157325000000000.0,...,-5.37395,2.54655,18.4055,6.8358,90891.5,1.2404,5.2191e+16,6150.8,0.747753,1.0
max,957913.0,0.41004,0.51899,39544.0,1.2856,0.54912,10.55,5337.2,1871000.0,1.0304e+16,...,1.3991,4.5066,211.93,46.983,511400.0,1.7896,3.2176e+17,13148.0,2.632,1.0


In [60]:
test.describe()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
count,493474.0,485662.0,485583.0,485679.0,485741.0,485597.0,485561.0,485541.0,485619.0,485656.0,...,485461.0,485701.0,485585.0,485449.0,485718.0,485532.0,485497.0,485391.0,485711.0,485589.0
mean,1204656.0,0.090191,0.346143,4047.2024,0.201503,0.304879,-0.071511,1617.036351,376799.8,1808500000000000.0,...,0.39813,-19.84097,2.075146,23.846392,1.762625,63051.287541,1.209049,4.284258e+16,3968.585512,0.558734
std,142453.8,0.043538,0.145889,6383.224961,0.212553,0.145435,2.125074,1272.769177,345014.6,2337721000000000.0,...,0.297971,18.513063,0.895702,45.544851,10.096886,92279.031878,0.115074,6.743966e+16,3161.273268,0.408472
min,957919.0,-0.15319,-0.01906,-9436.3,-0.080716,-0.007007,-12.841,-215.66,-27809.0,-1254900000000000.0,...,-0.042181,-104.74,0.26104,-23.938,-26.311,-81381.0,0.9007,-7749400000000000.0,-416.24,-0.18696
25%,1081287.0,0.070211,0.28366,418.835,0.035169,0.2406,-1.1195,481.82,91424.0,11580000000000.0,...,0.1133,-28.625,1.4885,-0.6247,-4.476575,2474.875,1.1468,231785000000000.0,1310.0,0.27676
50%,1204656.0,0.090173,0.38894,1277.9,0.13735,0.32802,-0.38019,1441.8,289590.0,504825000000000.0,...,0.35796,-14.576,1.6631,1.7304,0.910985,19429.0,1.1772,1.3304e+16,3237.1,0.47379
75%,1328024.0,0.116507,0.45831,4421.4,0.29716,0.41281,0.92324,2490.0,559630.0,3106600000000000.0,...,0.62265,-5.3183,2.5247,18.96,6.852075,88309.5,1.2423,5.2847e+16,6157.3,0.74497
max,1451392.0,0.4129,0.51988,39841.0,1.3102,0.55558,11.046,5413.1,1906700.0,1.0489e+16,...,1.1094,1.4642,4.5705,217.34,47.314,521540.0,1.8779,3.2395e+17,13198.0,2.732


In [61]:
# imputing missing values #

colmnames = train.columns
test['claim'] = np.nan
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(train)
train = pd.DataFrame(imp.transform(train))
test = pd.DataFrame(imp.transform(test))
#train.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,110,111,112,113,114,115,116,117,118,119
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,478811.77308,0.090148,0.346559,4036.784673,0.200378,0.304603,-0.0841,1602.964739,372650.4,1804381000000000.0,...,-19.801278,2.071944,23.237519,1.685601,63131.284663,1.2079,4.180692e+16,3952.229419,0.557703,0.49948
std,276869.735891,0.04313,0.144909,6409.247385,0.210675,0.145097,2.08557,1264.395604,341438.6,2332911000000000.0,...,18.37007,0.892729,45.012105,10.025443,92223.169262,0.114597,6.642388e+16,3136.444882,0.403192,0.500005
min,18.0,-0.12838,-0.018642,-8370.6,-0.053539,-0.006659,-12.097,-198.59,-24216.0,-1050800000000000.0,...,-102.41,0.30535,-24.187,-26.346,-54197.0,0.90527,-6779000000000000.0,-407.92,-0.12287,0.0
25%,238495.75,0.070348,0.28529,431.2275,0.035847,0.24105,-1.102725,474.875,91990.0,12518000000000.0,...,-28.34925,1.4918,-0.582995,-4.4334,2666.325,1.147,238472500000000.0,1337.725,0.280527,0.0
50%,477330.5,0.090157,0.38885,1287.25,0.13808,0.327085,-0.40146,1426.5,286895.0,524725000000000.0,...,-14.66,1.6629,1.724,0.80771,20131.0,1.1768,1.3053e+16,3226.2,0.47435,0.0
75%,719372.25,0.115913,0.45747,4273.275,0.294432,0.412253,0.885263,2462.225,548745.0,3100550000000000.0,...,-5.481025,2.5,17.86375,6.6998,88386.0,1.2386,5.09875e+16,6098.625,0.741663,1.0
max,957913.0,0.41004,0.51899,39544.0,1.2856,0.54912,10.55,5337.2,1871000.0,1.0304e+16,...,1.3991,4.5066,211.93,46.983,511400.0,1.7896,3.2176e+17,13148.0,2.632,1.0


In [63]:
train['sample'] = 'train'
test['sample'] = 'pred'
df = pd.concat([train, test])
df.colnames = colmnames
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,sample
0,287817.0,0.082316,0.487200,851.79,0.498290,0.378830,1.599000,1924.500,56523.0,3.523800e+15,...,1.8735,-3.50130,18.75400,106310.000,1.0817,5.456700e+16,356.00,0.327350,1.0,train
1,780501.0,0.097944,0.416790,4116.90,0.096623,0.492870,-1.549200,3877.100,612420.0,5.843300e+14,...,1.5398,156.46000,-1.51470,-30.191,1.1788,4.072500e+15,3332.00,0.040974,1.0,train
2,278317.0,0.100590,0.453950,26235.00,0.411310,0.419510,2.247100,4052.000,317950.0,9.195500e+15,...,1.4981,1.99030,1.53370,97295.000,1.2842,2.276800e+17,2285.10,0.416590,1.0,train
3,483674.0,0.077372,0.257680,420.55,0.006061,0.332850,-0.122640,2783.000,553000.0,3.082900e+15,...,3.8728,14.59200,0.59872,107120.000,1.5243,1.549000e+16,1915.00,0.662360,1.0,train
4,836873.0,0.132260,0.005187,431.05,0.004482,0.000921,-1.118500,75.313,312930.0,3.960700e+15,...,4.0366,-1.43330,-8.96360,-9278.200,1.1478,2.028100e+16,1144.40,0.501480,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493469,1451388.0,-0.009112,0.308190,637.64,0.778200,0.414150,-1.068500,651.220,985000.0,6.079700e+15,...,2.3325,0.11226,-5.92380,20131.000,1.1559,8.163900e+16,857.09,1.563300,0.0,pred
493470,1451389.0,0.088922,0.482650,6924.10,0.025963,0.355400,-0.870200,2514.200,18004.0,6.073500e+14,...,1.7005,97.81300,4.37930,-2432.000,1.0707,4.691800e+16,7497.10,0.670750,0.0,pred
493471,1451390.0,0.140620,0.484750,1797.10,0.147020,0.288030,-1.407100,434.030,333050.0,2.351000e+15,...,1.6827,1.72400,8.06330,2471.400,1.1725,7.900900e+16,2904.60,0.180050,0.0,pred
493472,1451391.0,0.168000,0.351760,454.79,0.164580,0.169830,0.323850,2331.200,223980.0,-2.795300e+12,...,1.3531,155.21000,13.96300,-11.440,1.1946,-1.770600e+14,6763.10,0.332230,0.0,pred


In [28]:
### 3. ###

# we have many variables, are all of them useful? lets see correlation

cors = df.corr()
cors.loc[((cors > 0.8)&(cors<1)).any(1)]
# there are no pairwise correlations above 80%

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim


In [29]:
### 4. ###

# check skew and possibly transform some variables #


In [30]:
### 5. ###

y_train = df[['claim']]
X_train = df.drop(columns=['claim'])

print(y_train.shape, X_train.shape)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=2)

print(y_train.shape, X_train.shape, X_test.shape)

(37423, 1) (37423, 119)
(33680, 1) (33680, 119) (3743, 119)


In [31]:
### 6. Scaling ###

ss = StandardScaler()

for i in X_train.columns:
    X_train[[i]] = ss.fit_transform(X_train[[i]])
    X_test[[i]] = ss.transform(X_test[[i]])

X_train.describe()  

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
count,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,...,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0
mean,-4.6149410000000003e-17,-1.723367e-16,-2.276847e-16,-6.709136000000001e-17,6.759900000000001e-17,-2.844782e-18,-6.526845999999999e-19,-8.897607000000001e-17,-1.684585e-16,1.585166e-16,...,3.469117e-17,4.0149990000000005e-17,4.683259e-16,2.4539950000000003e-17,1.1656020000000001e-17,1.413491e-17,-1.023653e-15,5.3855950000000003e-17,4.349912e-17,5.3012490000000005e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,...,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.735789,-5.377914,-2.483372,-1.994204,-1.250938,-2.16331,-5.91417,-1.425799,-1.166974,-1.223646,...,-1.450151,-4.443746,-1.926185,-1.131163,-2.738443,-1.138977,-2.559208,-0.7435901,-1.380015,-1.657057
25%,-0.8660363,-0.4612403,-0.4195743,-0.5645006,-0.7877872,-0.4352402,-0.498867,-0.8925065,-0.8301976,-0.767611,...,-0.9606197,-0.4766457,-0.6542054,-0.5381229,-0.6175693,-0.6511689,-0.5308956,-0.6312502,-0.8443441,-0.6910875
50%,0.00465139,-0.00753102,0.2968736,-0.4295383,-0.2977853,0.1566639,-0.1507298,-0.1215875,-0.2466854,-0.5550956,...,-0.1098074,0.2800509,-0.4597107,-0.4861041,-0.08722818,-0.4740574,-0.2710345,-0.4371232,-0.2344897,-0.2052726
75%,0.8684124,0.5965637,0.7664012,0.05870339,0.4611632,0.7397065,0.4727733,0.6836539,0.5259617,0.548351,...,0.7524793,0.788092,0.4845858,-0.1085895,0.5034799,0.2646904,0.2783364,0.1428407,0.6975473,0.456132
max,1.733535,7.155931,1.162921,5.495634,4.972745,1.66674,5.136039,2.962957,4.372166,3.658337,...,2.391472,1.139968,2.794379,4.063584,4.346456,4.770453,5.461607,4.170953,2.891493,5.204625


In [32]:
### 7. Model fitting ###

time1 = time.time()

log_rg = LogisticRegression()

grid_values = {'penalty': ['l2'], 'C': [0.00001, 0.0001, 0.001, 0.01]}

lr = GridSearchCV(log_rg, param_grid = grid_values, cv=4)
lr.fit(X_train, y_train)

print('logistic', lr.best_score_, lr.best_params_, time.time()-time1)


logistic 0.8619358669833729 {'C': 1e-05, 'penalty': 'l2'} 2.049663782119751


In [34]:
time1 = time.time()

svm = svm.SVC(kernel='rbf')

grid_values = {'C':[0.01, 0.1, 1, 5, 10]}

svm = GridSearchCV(svm, param_grid = grid_values, cv=2)
svm.fit(X_train, y_train)

print('SVM', svm.best_score_, svm.best_params_, time.time()-time1)

KeyboardInterrupt: 

In [33]:
time1 = time.time()

xgbcl = XGBClassifier()

grid_values = {'eta':[0.01, 0.1], 'max_depth':[2,3,5]}

xgb = GridSearchCV(xgbcl, param_grid = grid_values, cv=2)
xgb.fit(X_train, y_train)

print('XGBoost', xgb.best_score_, xgb.best_params_, time.time()-time1)



KeyboardInterrupt: 