In [79]:
'''Plan:
1. Load libraries, load data.
2. Preliminary EDA, dealing with missing values, merging train and test.
3. EDA, deleting variables.
4. Feature engineering, ohc.
5. Sample formation.
6. Feature scaling.
7. Model fitting.
8. Performance evaluation.
9. Predictions.
'''

# aside:
# when coding for interview ML purposes or Kaggle, never drop any obervations!
# you will have to make predictions for all obs in test sample.

# correct way to deal with missing obs and merge train and test samples:
# 1. Load both samples.
# 2. Impute missing values in both samples, using train sample to impute missing values.
# 3. Concatentate them into df.


### 1. Load libraries ###

import numpy as np
import pandas as pd
import os, warnings, random, time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
#os.getcwd()

In [87]:
### Load data ###

train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
print(train.shape)

test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

train = train.sample(n=50000)

(957919, 120)


In [88]:
### 2. ###

train.shape
train.describe()

# are there numerical features?

un_colval = pd.DataFrame([[x,len(train[x].unique())] for x in train.columns], columns = ['colname', 'n_unique'])
un_colval.loc[un_colval.n_unique < 100]
# all columns contain numerical features

Unnamed: 0,colname,n_unique
119,claim,2


In [89]:
#train.describe()
#test.describe()

# imputing missing values #

colmnames = train.columns
test['claim'] = np.nan
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(train)
train = pd.DataFrame(imp.transform(train))
test = pd.DataFrame(imp.transform(test))
#train.describe()

In [90]:
train['sample'] = 'train'
test['sample'] = 'pred'
df = pd.concat([train, test])
#df.colnames = list(colmnames)+'sample'
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,111,112,113,114,115,116,117,118,119,sample
0,153815.0,0.101330,0.47947,15385.00,0.246190,0.27297,-0.869000,2042.90,41499.0,3.587700e+15,...,1.5528,-1.75110,2.58050,792.06,1.1450,2.949200e+15,181.13,1.819100,0.0,train
1,664221.0,0.093230,0.26313,3511.10,0.015731,0.47427,-1.424600,1129.00,192410.0,-7.741900e+12,...,4.2677,-1.25610,13.16800,105410.00,1.0892,2.573600e+13,231.59,0.178190,0.0,train
2,666309.0,0.018645,0.32663,886.63,0.013459,0.24345,0.246830,295.61,467920.0,1.999900e+15,...,1.1023,18.14900,-8.60080,172500.00,1.1257,1.748600e+16,248.96,0.084167,1.0,train
3,896941.0,0.083500,0.42187,728.63,0.188360,0.36629,-2.261100,341.64,187180.0,8.379500e+12,...,1.6407,-1.82440,0.89801,238650.00,1.0732,4.401500e+15,10525.00,0.590930,0.0,train
4,33685.0,0.079317,0.29726,-2453.00,0.027737,0.22532,-0.619640,569.76,87058.0,1.317400e+13,...,1.5598,1.35600,0.79779,10758.00,1.2744,3.976300e+16,1445.60,0.474480,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493469,1451388.0,-0.009112,0.30819,637.64,0.778200,0.41415,-1.068500,651.22,985000.0,6.079700e+15,...,2.3325,0.11226,-5.92380,19284.00,1.1559,8.163900e+16,857.09,1.563300,0.0,pred
493470,1451389.0,0.088922,0.48265,6924.10,0.025963,0.35540,-0.870200,2514.20,18004.0,6.073500e+14,...,1.7005,97.81300,4.37930,-2432.00,1.0707,4.691800e+16,7497.10,0.670750,0.0,pred
493471,1451390.0,0.140620,0.48475,1797.10,0.147020,0.28803,-1.407100,434.03,333050.0,2.351000e+15,...,1.6827,1.72910,8.06330,2471.40,1.1725,7.900900e+16,2904.60,0.180050,0.0,pred
493472,1451391.0,0.168000,0.35176,454.79,0.164580,0.16983,0.323850,2331.20,223980.0,-2.795300e+12,...,1.3531,155.21000,13.96300,-11.44,1.1946,-1.770600e+14,6763.10,0.332230,0.0,pred


In [96]:
newcolnames = list(colmnames) + ['sample']
df.columns = newcolnames
df

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f111,f112,f113,f114,f115,f116,f117,f118,claim,sample
0,153815.0,0.101330,0.47947,15385.00,0.246190,0.27297,-0.869000,2042.90,41499.0,3.587700e+15,...,1.5528,-1.75110,2.58050,792.06,1.1450,2.949200e+15,181.13,1.819100,0.0,train
1,664221.0,0.093230,0.26313,3511.10,0.015731,0.47427,-1.424600,1129.00,192410.0,-7.741900e+12,...,4.2677,-1.25610,13.16800,105410.00,1.0892,2.573600e+13,231.59,0.178190,0.0,train
2,666309.0,0.018645,0.32663,886.63,0.013459,0.24345,0.246830,295.61,467920.0,1.999900e+15,...,1.1023,18.14900,-8.60080,172500.00,1.1257,1.748600e+16,248.96,0.084167,1.0,train
3,896941.0,0.083500,0.42187,728.63,0.188360,0.36629,-2.261100,341.64,187180.0,8.379500e+12,...,1.6407,-1.82440,0.89801,238650.00,1.0732,4.401500e+15,10525.00,0.590930,0.0,train
4,33685.0,0.079317,0.29726,-2453.00,0.027737,0.22532,-0.619640,569.76,87058.0,1.317400e+13,...,1.5598,1.35600,0.79779,10758.00,1.2744,3.976300e+16,1445.60,0.474480,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
493469,1451388.0,-0.009112,0.30819,637.64,0.778200,0.41415,-1.068500,651.22,985000.0,6.079700e+15,...,2.3325,0.11226,-5.92380,19284.00,1.1559,8.163900e+16,857.09,1.563300,0.0,pred
493470,1451389.0,0.088922,0.48265,6924.10,0.025963,0.35540,-0.870200,2514.20,18004.0,6.073500e+14,...,1.7005,97.81300,4.37930,-2432.00,1.0707,4.691800e+16,7497.10,0.670750,0.0,pred
493471,1451390.0,0.140620,0.48475,1797.10,0.147020,0.28803,-1.407100,434.03,333050.0,2.351000e+15,...,1.6827,1.72910,8.06330,2471.40,1.1725,7.900900e+16,2904.60,0.180050,0.0,pred
493472,1451391.0,0.168000,0.35176,454.79,0.164580,0.16983,0.323850,2331.20,223980.0,-2.795300e+12,...,1.3531,155.21000,13.96300,-11.44,1.1946,-1.770600e+14,6763.10,0.332230,0.0,pred


In [98]:
### 3. ###

# we have many variables, are all of them useful? lets see correlation

#cors = df.corr()
#cors.loc[((cors > 0.8)&(cors<1)).any(1)]
# there are no pairwise correlations above 80%

In [99]:
### 4. ###

# check skew and possibly transform some variables #


In [101]:
### 5. ###

y_train = df.loc[df['sample']=='train',['claim']]
X_train = df.drop(columns=['claim'])
X_train = X_train.loc[X_train['sample']=='train']
X_pred = X_train.loc[X_train['sample']=='test']

print(y_train.shape, X_train.shape)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=2)

print(y_train.shape, X_train.shape, X_test.shape)

(50000, 1) (50000, 120)
(45000, 1) (45000, 120) (5000, 120)


In [103]:
X_test

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,sample
23656,459922.0,0.000997,0.23616,1837.70,0.167570,0.000381,-1.02190,556.35,358490.0,-8.372400e+12,...,-21.1070,1.57370,0.41146,-4.4966,2123.70,1.2069,2.358500e+16,9038.100,0.37741,train
27442,639112.0,0.061548,0.46893,28950.00,0.276140,0.426450,-0.69772,2430.60,1056300.0,6.154400e+13,...,-33.5290,3.96880,5.54550,1.5450,18032.00,1.2200,6.420600e+14,9401.800,0.83720,train
40162,758328.0,0.081479,0.46552,3706.60,0.203720,0.391790,0.92159,288.47,760380.0,2.136000e+15,...,-4.7030,4.16660,1.36430,1.5258,1925.60,1.0898,-9.813300e+13,680.000,0.20146,train
8459,398113.0,0.078819,0.35543,1259.70,-0.002040,0.187990,3.64370,2943.50,52919.0,3.506100e+15,...,-12.7980,2.97950,-0.00548,-4.7205,6573.40,1.2344,1.182200e+15,1928.500,0.60247,train
8051,681741.0,0.080000,0.46197,446.81,0.037718,0.389370,1.98930,1073.20,279310.0,2.738600e+14,...,-36.4020,0.87978,1.27800,-1.6939,392.70,1.2078,1.937100e+14,164.640,0.12352,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14503,670354.0,0.080693,0.37580,-175.69,0.790830,0.003140,-1.68030,2032.50,9116.3,2.523100e+13,...,-2.4019,2.78220,36.22700,-7.8979,7150.60,1.2008,1.898900e+16,-96.749,0.38777,train
24801,941512.0,0.065555,0.44969,566.16,0.344370,0.259180,-1.07890,117.08,362390.0,2.259500e+15,...,-15.3960,1.27190,88.50500,-2.1206,6335.20,1.1342,1.510200e+16,2858.700,0.80442,train
47054,434644.0,0.075924,0.45264,779.41,0.690320,0.409400,0.12702,630.58,1053600.0,6.251600e+15,...,-14.7750,3.38770,105.71000,3.2481,967.54,1.0892,8.356300e+13,9751.700,0.52492,train
7999,14584.0,0.121370,0.28309,1645.20,0.108400,0.268600,-0.44917,2400.40,530320.0,1.057500e+14,...,-1.2252,1.61080,17.04000,-12.8940,15825.00,1.1609,1.285400e+14,496.020,0.17453,train


In [31]:
### 6. Scaling ###

ss = StandardScaler()

for i in X_train.columns:
    X_train[[i]] = ss.fit_transform(X_train[[i]])
    X_test[[i]] = ss.transform(X_test[[i]])

X_train.describe()  

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
count,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,...,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0
mean,-4.6149410000000003e-17,-1.723367e-16,-2.276847e-16,-6.709136000000001e-17,6.759900000000001e-17,-2.844782e-18,-6.526845999999999e-19,-8.897607000000001e-17,-1.684585e-16,1.585166e-16,...,3.469117e-17,4.0149990000000005e-17,4.683259e-16,2.4539950000000003e-17,1.1656020000000001e-17,1.413491e-17,-1.023653e-15,5.3855950000000003e-17,4.349912e-17,5.3012490000000005e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,...,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.735789,-5.377914,-2.483372,-1.994204,-1.250938,-2.16331,-5.91417,-1.425799,-1.166974,-1.223646,...,-1.450151,-4.443746,-1.926185,-1.131163,-2.738443,-1.138977,-2.559208,-0.7435901,-1.380015,-1.657057
25%,-0.8660363,-0.4612403,-0.4195743,-0.5645006,-0.7877872,-0.4352402,-0.498867,-0.8925065,-0.8301976,-0.767611,...,-0.9606197,-0.4766457,-0.6542054,-0.5381229,-0.6175693,-0.6511689,-0.5308956,-0.6312502,-0.8443441,-0.6910875
50%,0.00465139,-0.00753102,0.2968736,-0.4295383,-0.2977853,0.1566639,-0.1507298,-0.1215875,-0.2466854,-0.5550956,...,-0.1098074,0.2800509,-0.4597107,-0.4861041,-0.08722818,-0.4740574,-0.2710345,-0.4371232,-0.2344897,-0.2052726
75%,0.8684124,0.5965637,0.7664012,0.05870339,0.4611632,0.7397065,0.4727733,0.6836539,0.5259617,0.548351,...,0.7524793,0.788092,0.4845858,-0.1085895,0.5034799,0.2646904,0.2783364,0.1428407,0.6975473,0.456132
max,1.733535,7.155931,1.162921,5.495634,4.972745,1.66674,5.136039,2.962957,4.372166,3.658337,...,2.391472,1.139968,2.794379,4.063584,4.346456,4.770453,5.461607,4.170953,2.891493,5.204625


In [32]:
### 7. Model fitting ###

time1 = time.time()

log_rg = LogisticRegression()

grid_values = {'penalty': ['l2'], 'C': [0.00001, 0.0001, 0.001, 0.01]}

lr = GridSearchCV(log_rg, param_grid = grid_values, cv=4)
lr.fit(X_train, y_train)

print('logistic', lr.best_score_, lr.best_params_, time.time()-time1)


logistic 0.8619358669833729 {'C': 1e-05, 'penalty': 'l2'} 2.049663782119751


In [34]:
time1 = time.time()

svm = svm.SVC(kernel='rbf')

grid_values = {'C':[0.01, 0.1, 1, 5, 10]}

svm = GridSearchCV(svm, param_grid = grid_values, cv=2)
svm.fit(X_train, y_train)

print('SVM', svm.best_score_, svm.best_params_, time.time()-time1)

KeyboardInterrupt: 

In [33]:
time1 = time.time()

xgbcl = XGBClassifier()

grid_values = {'eta':[0.01, 0.1], 'max_depth':[2,3,5]}

xgb = GridSearchCV(xgbcl, param_grid = grid_values, cv=2)
xgb.fit(X_train, y_train)

print('XGBoost', xgb.best_score_, xgb.best_params_, time.time()-time1)



KeyboardInterrupt: 