In [23]:
'''Plan:
1. Load libraries, load data.
2. Preliminary EDA, dealing with missing values.
3. EDA, deleting variables.
4. Feature engineering, ohc.
5. Sample formation.
6. Feature scaling.
7. Model fitting.
8. Performance evaluation.
9. Predictions.
'''

### 1. Load libraries ###

import numpy as np
import pandas as pd
import os, warnings, random, time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
#os.getcwd()

In [24]:
### Load data ###

train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
print(train.shape)
train.head(3)

(957919, 120)


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
0,0,0.10859,0.004314,-37.566,0.017364,0.28915,-10.251,135.12,168900.0,399240000000000.0,...,-12.228,1.7482,1.9096,-7.1157,4378.8,1.2096,861340000000000.0,140.1,1.0177,1
1,1,0.1009,0.29961,11822.0,0.2765,0.4597,-0.83733,1721.9,119810.0,3874100000000000.0,...,-56.758,4.1684,0.34808,4.142,913.23,1.2464,7575100000000000.0,1861.0,0.28359,0
2,2,0.17803,-0.00698,907.27,0.27214,0.45948,0.17327,2298.0,360650.0,12245000000000.0,...,-5.7688,1.2042,0.2629,8.1312,45119.0,1.1764,321810000000000.0,3838.2,0.4069,1


In [25]:
### 2. ###

df = train.sample(n=100000)
#df = train.copy()
df.shape
df.describe()

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim
count,100000.0,98423.0,98436.0,98387.0,98409.0,98416.0,98412.0,98324.0,98331.0,98385.0,...,98392.0,98374.0,98390.0,98419.0,98349.0,98408.0,98389.0,98430.0,98340.0,100000.0
mean,479669.6079,0.090252,0.345467,4045.538415,0.200284,0.30485,-0.06788,1630.61281,376601.8,1804475000000000.0,...,-19.866839,2.072613,23.700881,1.767113,62769.411521,1.208571,4.266137e+16,3961.897576,0.558247,0.49824
std,276304.441389,0.043424,0.146654,6375.592514,0.211903,0.145161,2.104665,1275.251136,345941.1,2332285000000000.0,...,18.536676,0.89339,45.526179,10.086453,92063.638166,0.114896,6.718075e+16,3160.040257,0.406459,0.499999
min,9.0,-0.14991,-0.018172,-8598.6,-0.063067,-0.006924,-12.456,-189.67,-28337.0,-1050800000000000.0,...,-102.39,0.34609,-27.691,-26.276,-60269.0,0.90789,-7347200000000000.0,-407.92,-0.12136,0.0
25%,240558.25,0.070284,0.28217,414.95,0.034455,0.240538,-1.115825,493.595,90164.5,12128000000000.0,...,-28.711,1.4878,-0.66257,-4.43505,2459.2,1.1468,231630000000000.0,1306.55,0.27711,0.0
50%,480140.0,0.09021,0.388875,1280.1,0.13682,0.32742,-0.37579,1468.1,289040.0,505520000000000.0,...,-14.6225,1.663,1.68885,0.89434,19172.0,1.1771,1.3359e+16,3230.9,0.472325,0.0
75%,719156.0,0.11648,0.45826,4435.5,0.29624,0.41311,0.916892,2506.0,559935.0,3102000000000000.0,...,-5.289925,2.5029,18.52025,6.80885,87905.0,1.2413,5.252e+16,6149.5,0.742993,1.0
max,957911.0,0.40177,0.51669,38518.0,1.2856,0.54884,10.692,5388.3,1886900.0,1.0424e+16,...,1.3101,4.5659,212.17,46.715,510430.0,1.834,3.238e+17,13105.0,2.6901,1.0


In [26]:
# are there numerical features?

un_colval = pd.DataFrame([[x,len(df[x].unique())] for x in df.columns], columns = ['colname', 'n_unique'])
un_colval.loc[un_colval.n_unique < 100]
# all columns contain numerical features

Unnamed: 0,colname,n_unique
119,claim,2


In [27]:
# omitting missing values #

#df.info(verbose=True, null_counts=True)
df = df.dropna()
df.shape

(37423, 120)

In [28]:
### 3. ###

# we have many variables, are all of them useful? lets see correlation

cors = df.corr()
cors.loc[((cors > 0.8)&(cors<1)).any(1)]
# there are no pairwise correlations above 80%

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110,f111,f112,f113,f114,f115,f116,f117,f118,claim


In [29]:
### 4. ###

# check skew and possibly transform some variables #


In [30]:
### 5. ###

y_train = df[['claim']]
X_train = df.drop(columns=['claim'])

print(y_train.shape, X_train.shape)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=2)

print(y_train.shape, X_train.shape, X_test.shape)

(37423, 1) (37423, 119)
(33680, 1) (33680, 119) (3743, 119)


In [31]:
### 6. Scaling ###

ss = StandardScaler()

for i in X_train.columns:
    X_train[[i]] = ss.fit_transform(X_train[[i]])
    X_test[[i]] = ss.transform(X_test[[i]])

X_train.describe()  

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109,f110,f111,f112,f113,f114,f115,f116,f117,f118
count,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,...,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0,33680.0
mean,-4.6149410000000003e-17,-1.723367e-16,-2.276847e-16,-6.709136000000001e-17,6.759900000000001e-17,-2.844782e-18,-6.526845999999999e-19,-8.897607000000001e-17,-1.684585e-16,1.585166e-16,...,3.469117e-17,4.0149990000000005e-17,4.683259e-16,2.4539950000000003e-17,1.1656020000000001e-17,1.413491e-17,-1.023653e-15,5.3855950000000003e-17,4.349912e-17,5.3012490000000005e-17
std,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,...,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015,1.000015
min,-1.735789,-5.377914,-2.483372,-1.994204,-1.250938,-2.16331,-5.91417,-1.425799,-1.166974,-1.223646,...,-1.450151,-4.443746,-1.926185,-1.131163,-2.738443,-1.138977,-2.559208,-0.7435901,-1.380015,-1.657057
25%,-0.8660363,-0.4612403,-0.4195743,-0.5645006,-0.7877872,-0.4352402,-0.498867,-0.8925065,-0.8301976,-0.767611,...,-0.9606197,-0.4766457,-0.6542054,-0.5381229,-0.6175693,-0.6511689,-0.5308956,-0.6312502,-0.8443441,-0.6910875
50%,0.00465139,-0.00753102,0.2968736,-0.4295383,-0.2977853,0.1566639,-0.1507298,-0.1215875,-0.2466854,-0.5550956,...,-0.1098074,0.2800509,-0.4597107,-0.4861041,-0.08722818,-0.4740574,-0.2710345,-0.4371232,-0.2344897,-0.2052726
75%,0.8684124,0.5965637,0.7664012,0.05870339,0.4611632,0.7397065,0.4727733,0.6836539,0.5259617,0.548351,...,0.7524793,0.788092,0.4845858,-0.1085895,0.5034799,0.2646904,0.2783364,0.1428407,0.6975473,0.456132
max,1.733535,7.155931,1.162921,5.495634,4.972745,1.66674,5.136039,2.962957,4.372166,3.658337,...,2.391472,1.139968,2.794379,4.063584,4.346456,4.770453,5.461607,4.170953,2.891493,5.204625


In [32]:
### 7. Model fitting ###

time1 = time.time()

log_rg = LogisticRegression()

grid_values = {'penalty': ['l2'], 'C': [0.00001, 0.0001, 0.001, 0.01]}

lr = GridSearchCV(log_rg, param_grid = grid_values, cv=4)
lr.fit(X_train, y_train)

print('logistic', lr.best_score_, lr.best_params_, time.time()-time1)


logistic 0.8619358669833729 {'C': 1e-05, 'penalty': 'l2'} 2.049663782119751


In [None]:
time1 = time.time()

svm = svm.SVC(kernel='rbf')

grid_values = {'C':[0.01, 0.1, 1, 5, 10]}

svm = GridSearchCV(svm, param_grid = grid_values, cv=2)
svm.fit(X_train, y_train)

print('SVM', svm.best_score_, svm.best_params_, time.time()-time1)

In [33]:
time1 = time.time()

xgbcl = XGBClassifier()

grid_values = {'eta':[0.01, 0.1], 'max_depth':[2,3,5]}

xgb = GridSearchCV(xgbcl, param_grid = grid_values, cv=2)
xgb.fit(X_train, y_train)

print('XGBoost', xgb.best_score_, xgb.best_params_, time.time()-time1)



KeyboardInterrupt: 