In [1]:
'''Plan:
0. Load libraries, 
1. load data.
2. Preliminary EDA.
3. Dealing with missing values, merging train and test.
4. Feature engineering, ohc.
5. Sample formation.
6. Feature scaling.
7. Model fitting.
8. Performance evaluation.
9. [opt] Feature importance, error analysis.
10. Predictions.
'''

# aside:
# when coding for interview ML purposes or Kaggle, never drop any obervations!
# you will have to make predictions for all obs in test sample.

# correct way to deal with missing obs and merge train and test samples:
# 1. Load both samples.
# 2. Impute missing values in both samples, using train sample to impute missing values.
# 3. Concatentate them into df.


### 1. Load libraries ###

import numpy as np
import pandas as pd
import os, warnings, random, time
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import svm
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.impute import SimpleImputer

from xgboost import XGBClassifier

warnings.filterwarnings("ignore")
#os.getcwd()

In [2]:
### 1. Load data ###

train = pd.read_csv('../input/tabular-playground-series-sep-2021/train.csv')
print(train.shape)

test = pd.read_csv('../input/tabular-playground-series-sep-2021/test.csv')

train = train.sample(n=200000)

(957919, 120)


In [3]:
### 2. pEDA ###

train.shape
train.describe()

# are there numerical features?

un_colval = pd.DataFrame([[x,len(train[x].unique())] for x in train.columns], columns = ['colname', 'n_unique'])
un_colval.loc[un_colval.n_unique < 100]
# all columns contain numerical features

Unnamed: 0,colname,n_unique
119,claim,2


In [4]:
### 3. Missing values ###

# given huge difference in accuracy with and without missing values, i suggest creating dummies for them.
# to write more general code, will have to create union of mis_cols for train and test.

train_mis_cols = [col for col in train.columns if train[col].isnull().any()]
test_mis_cols = [col for col in test.columns if test[col].isnull().any()]

for col in train_mis_cols:
    train[col + '_miss'] = (train[col].isnull()).astype(int)

for col in test_mis_cols:
    test[col + '_miss'] = (test[col].isnull()).astype(int)

In [5]:
#train.describe()
#test.describe()

# imputing missing values #

colmnames = train.columns
test['claim'] = np.nan
imp = SimpleImputer(missing_values=np.nan, strategy='median')
imp.fit(train)
train = pd.DataFrame(imp.transform(train))
test = pd.DataFrame(imp.transform(test))
#train.describe()

In [6]:
train['sample'] = 'train'
test['sample'] = 'pred'
df = pd.concat([train, test])
#df.colnames = list(colmnames)+'sample'
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,229,230,231,232,233,234,235,236,237,sample
0,117075.0,0.081362,0.473380,-291.92,-0.003355,0.356400,-0.740850,1663.60,239770.0,-3.200400e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
1,822015.0,0.060645,0.465470,22233.00,0.044649,0.373000,-1.153800,2182.20,636730.0,4.636400e+15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
2,477450.0,0.100570,0.003177,343.62,0.005835,-0.001988,-0.772670,1016.50,437150.0,-7.275500e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
3,631308.0,0.082687,0.162730,1475.00,0.052222,0.389130,1.481100,1569.60,1200800.0,3.701800e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
4,907459.0,0.062238,0.240750,152.68,0.098153,0.503840,0.001133,1815.20,237780.0,1.623700e+15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693469,1451388.0,-0.009112,0.308190,637.64,0.778200,0.414150,-1.068500,651.22,985000.0,6.079700e+15,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,pred
693470,1451389.0,0.088922,0.482650,6924.10,0.025963,0.355400,-0.870200,2514.20,18004.0,6.073500e+14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pred
693471,1451390.0,0.140620,0.484750,1797.10,0.147020,0.288030,-1.407100,434.03,333050.0,2.351000e+15,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pred
693472,1451391.0,0.168000,0.351760,454.79,0.164580,0.169830,0.323850,2331.20,223980.0,-2.795300e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pred


In [7]:
newcolnames = list(colmnames) + ['sample']
df.columns = newcolnames
df

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f110_miss,f111_miss,f112_miss,f113_miss,f114_miss,f115_miss,f116_miss,f117_miss,f118_miss,sample
0,117075.0,0.081362,0.473380,-291.92,-0.003355,0.356400,-0.740850,1663.60,239770.0,-3.200400e+09,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
1,822015.0,0.060645,0.465470,22233.00,0.044649,0.373000,-1.153800,2182.20,636730.0,4.636400e+15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
2,477450.0,0.100570,0.003177,343.62,0.005835,-0.001988,-0.772670,1016.50,437150.0,-7.275500e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
3,631308.0,0.082687,0.162730,1475.00,0.052222,0.389130,1.481100,1569.60,1200800.0,3.701800e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
4,907459.0,0.062238,0.240750,152.68,0.098153,0.503840,0.001133,1815.20,237780.0,1.623700e+15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693469,1451388.0,-0.009112,0.308190,637.64,0.778200,0.414150,-1.068500,651.22,985000.0,6.079700e+15,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,pred
693470,1451389.0,0.088922,0.482650,6924.10,0.025963,0.355400,-0.870200,2514.20,18004.0,6.073500e+14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pred
693471,1451390.0,0.140620,0.484750,1797.10,0.147020,0.288030,-1.407100,434.03,333050.0,2.351000e+15,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pred
693472,1451391.0,0.168000,0.351760,454.79,0.164580,0.169830,0.323850,2331.20,223980.0,-2.795300e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,pred


In [8]:
# we have many variables, are all of them useful? lets see correlation

#cors = df.corr()
#cors.loc[((cors > 0.8)&(cors<1)).any(1)]
# there are no pairwise correlations above 80%

In [9]:
### 4. Feature engineering ###

# check skew and possibly transform some variables #

temp = (df.dtypes == np.float64)
num_cols = df.columns[temp]
skew_vals = df[num_cols].skew() 
skew_limit = 1
    
skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0:'Skew'})
             .query('abs(Skew) > {}'.format(skew_limit)))

print(skew_cols)

                Skew
f118_miss  14.568339
f2_miss     7.797908
f13_miss    7.773938
f80_miss    7.771329
f112_miss   7.766118
...              ...
f13        -1.295357
f110       -1.330498
f58        -1.357247
f46        -1.542950
f91        -1.557878

[187 rows x 1 columns]


In [13]:
### 5. ###

y_train = df.loc[df['sample']=='train',['claim']]
X_train_ = df.drop(columns=['claim'])
X_train = X_train_.loc[X_train_['sample']=='train']
X_pred = X_train_.loc[X_train_['sample']=='pred']

print(y_train.shape, X_train.shape)

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.1, random_state=2)

print(y_train.shape, X_train.shape, X_pred.shape)

(200000, 1) (200000, 238)
(180000, 1) (180000, 238) (493474, 238)


In [14]:
X_train.drop(columns = ['sample'], inplace=True)
X_test.drop(columns = ['sample'], inplace=True)
X_pred.drop(columns = ['sample'], inplace=True)

In [15]:
X_pred

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109_miss,f110_miss,f111_miss,f112_miss,f113_miss,f114_miss,f115_miss,f116_miss,f117_miss,f118_miss
200000,957919.0,0.165850,0.487050,1295.00,0.023100,0.31900,0.901880,573.29,3743.7,2.705700e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200001,957920.0,0.129650,0.373480,1763.00,0.728840,0.33247,-1.263100,875.55,554370.0,5.955700e+14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200002,957921.0,0.120190,0.445210,736.26,0.046150,0.29605,0.316650,2659.50,317140.0,3.977800e+14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200003,957922.0,0.054008,0.395960,996.14,0.859340,0.36678,-0.170600,386.56,325680.0,-3.432200e+13,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200004,957923.0,0.079947,-0.006919,10574.00,0.348450,0.45008,-1.842000,3027.00,428150.0,9.291500e+11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
693469,1451388.0,-0.009112,0.308190,637.64,0.778200,0.41415,-1.068500,651.22,985000.0,6.079700e+15,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
693470,1451389.0,0.088922,0.482650,6924.10,0.025963,0.35540,-0.870200,2514.20,18004.0,6.073500e+14,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693471,1451390.0,0.140620,0.484750,1797.10,0.147020,0.28803,-1.407100,434.03,333050.0,2.351000e+15,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
693472,1451391.0,0.168000,0.351760,454.79,0.164580,0.16983,0.323850,2331.20,223980.0,-2.795300e+12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
### 6. Scaling ###

ss = StandardScaler()

#for i in X_train.columns[0:119]:
for i in X_train.columns:
    X_train[[i]] = ss.fit_transform(X_train[[i]])
    X_test[[i]] = ss.transform(X_test[[i]])

X_train.describe()  

Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109_miss,f110_miss,f111_miss,f112_miss,f113_miss,f114_miss,f115_miss,f116_miss,f117_miss,f118_miss
count,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,...,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0,180000.0
mean,4.856177e-17,5.117993e-18,-1.955584e-16,2.022478e-16,-6.465544000000001e-17,2.479239e-16,4.1420260000000006e-17,3.589073e-17,2.762499e-16,1.2724390000000002e-17,...,1.563633e-15,1.047261e-15,-2.893582e-15,2.495735e-16,-3.813125e-15,-3.509097e-16,5.098667e-15,2.325263e-16,1.170454e-14,1.187001e-16
std,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,...,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003,1.000003
min,-1.730933,-5.363099,-2.511483,-2.071116,-1.236648,-2.161083,-5.686915,-1.438933,-1.178798,-1.239681,...,-0.1280766,-0.127987,-0.1279197,-0.1278075,-0.1260223,-0.1286133,-0.1288586,-0.1291479,-0.1297469,-0.1278524
25%,-0.8677114,-0.4570095,-0.4198685,-0.5630091,-0.7783965,-0.4351244,-0.4897308,-0.8881558,-0.8227977,-0.7612088,...,-0.1280766,-0.127987,-0.1279197,-0.1278075,-0.1260223,-0.1286133,-0.1288586,-0.1291479,-0.1297469,-0.1278524
50%,0.002135542,-0.001754528,0.293221,-0.429401,-0.2991274,0.155776,-0.1441352,-0.1366565,-0.2509927,-0.5530376,...,-0.1280766,-0.127987,-0.1279197,-0.1278075,-0.1260223,-0.1286133,-0.1288586,-0.1291479,-0.1297469,-0.1278524
75%,0.8640951,0.5842382,0.7617377,0.04715925,0.4455957,0.7373053,0.46447,0.6743857,0.523069,0.5376001,...,-0.1280766,-0.127987,-0.1279197,-0.1278075,-0.1260223,-0.1286133,-0.1288586,-0.1291479,-0.1297469,-0.1278524
max,1.730478,7.156427,1.166943,5.596314,5.251886,1.690007,5.222199,2.955129,4.47343,3.725684,...,7.807825,7.813294,7.817404,7.824267,7.935105,7.775244,7.760444,7.743059,7.707314,7.821519


In [18]:
### 7. Model fitting ###

time1 = time.time()

log_rg = LogisticRegression()

grid_values = {'penalty': ['l2'], 'C': [0.1, 0.2, 0.5, 1, 2, 3]}

lr = GridSearchCV(log_rg, param_grid = grid_values, cv=4)
lr.fit(X_train, y_train)

print('logistic', lr.best_score_, lr.best_params_, time.time()-time1)
# after dropping missing I had 86%. while imputing them with median, i had 52%. with miss dummies, 73%.

logistic 0.7341888888888889 {'C': 0.2, 'penalty': 'l2'} 21.016772747039795


In [None]:
time1 = time.time()

svm = svm.SVC(kernel='rbf')

grid_values = {'C':[0.01, 0.1, 1, 5, 10]}

svm = GridSearchCV(svm, param_grid = grid_values, cv=2)
svm.fit(X_train, y_train)

print('SVM', svm.best_score_, svm.best_params_, time.time()-time1)

In [19]:
time1 = time.time()

xgbcl = XGBClassifier(tree_method='gpu_hist', gpu_id=0)

grid_values = {'n_estimators':[100,200,300,400],'eta':[0.05, 0.8, 0.11], 'max_depth':[2,3]}

xgb = GridSearchCV(xgbcl, param_grid = grid_values, cv=2)
xgb.fit(X_train, y_train)

print('XGBoost', xgb.best_score_, xgb.best_params_, time.time()-time1)

XGBoost 0.7315333333333334 {'eta': 0.11, 'max_depth': 3, 'n_estimators': 300} 113.68186163902283


In [20]:
### 8. performance evaluation ###

yhat_lr = lr.predict(X_test)
yhat_bt = xgb.predict(X_test)

print('Accuracy of logistic regression is ', 1-(np.abs(yhat_lr-np.array(y_test.claim))).mean())
print('Accuracy of XGBoost is ', 1-(np.abs(yhat_bt-np.array(y_test.claim))).mean())


Accuracy of logistic regression is  0.7323999999999999
Accuracy of XGBoost is  0.73735


In [25]:
### Export results ###
yhat_lr = lr.predict(X_pred).astype(int)
yhat_bt = xgb.predict(X_pred).astype(int)

submission_df_lr = pd.DataFrame({'id': (X_pred.id).astype(int), 'claim': yhat_lr}, columns=['id', 'claim'])
submission_df_bt = pd.DataFrame({'id': (X_pred.id).astype(int), 'claim': yhat_bt}, columns=['id', 'claim'])

submission_df_lr.to_csv('submissions_Sep21_lr1.csv',index=False)
submission_df_bt.to_csv('submissions_Sep21_bt1.csv',index=False)

In [27]:
os.chdir(r'/kaggle/working')

from IPython.display import FileLink
FileLink(r'submissions_Sep21_lr1.csv')

In [62]:
# something has gone terribly wrong here. my submissions gte scored as pure random guess...


Unnamed: 0,id,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f109_miss,f110_miss,f111_miss,f112_miss,f113_miss,f114_miss,f115_miss,f116_miss,f117_miss,f118_miss
