In [48]:
# Data loading

import pandas as pd
import numpy as np
import os

# Input CSV locations and the path the submission file will be written to.
train_path, test_path = (
    os.path.join('data', file_name) for file_name in ('train.csv', 'test.csv')
)
output_path = os.path.join('kaggle_output', 'sample_submission_2.csv')

# Read both datasets and use the 'id' column as the row index.
train_data = pd.read_csv(train_path).set_index('id')
test_data = pd.read_csv(test_path).set_index('id')

# Feature column names: cat0..cat18 and cont0..cont10.
categorical_columns = ['cat' + str(number) for number in range(19)]
continous_columns = ['cont' + str(number) for number in range(11)]

# Show the training frame's dimensions, then preview its first rows.
print(train_data.shape)
train_data.head()

(300000, 31)


Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,cat10,cat11,cat12,cat13,cat14,cat15,cat16,cat17,cat18,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1
0,A,I,A,B,B,BI,A,S,Q,A,LO,A,A,A,A,B,D,D,B,0.629858,0.855349,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,A,I,A,A,E,BI,K,W,AD,F,HJ,A,B,A,B,D,B,D,B,0.370727,0.328929,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,A,K,A,A,E,BI,A,E,BM,L,DJ,A,B,A,A,B,D,D,B,0.502272,0.322749,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,A,K,A,C,E,BI,A,Y,AD,F,KV,A,A,A,A,B,D,D,B,0.934242,0.707663,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,A,I,G,B,E,BI,C,G,Q,A,DP,A,A,A,B,B,B,D,B,0.254427,0.274514,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [49]:
# Some useful libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Label-encode each categorical column. The encoder is fitted on the train and
# test values together, so every category seen in either frame gets a code.
for column_name in categorical_columns:
    combined_values = pd.concat([train_data[column_name], test_data[column_name]], axis=0)
    encoder = LabelEncoder()
    encoder.fit(combined_values)
    train_data[column_name] = encoder.transform(train_data[column_name])
    test_data[column_name] = encoder.transform(test_data[column_name])

# Turn the target into a Boolean column (True where the target equals 1).
train_data['target'] = train_data['target'].eq(1)


# Sanity check before the split: shapes of the categorical part, the continuous
# part and the target.
print(
    train_data[categorical_columns].shape,
    train_data[continous_columns].shape,
    train_data.target.shape,
)

# Hold out 25% of the labelled rows as a validation set (fixed seed for repeatability).
feature_columns = categorical_columns + continous_columns
X_train, X_test, y_train, y_test = train_test_split(
    train_data[feature_columns],
    train_data.target,
    test_size=0.25,
    random_state=2138,
)

# Sanity check after the split: shapes of the four resulting pieces.
print(*(piece.shape for piece in (X_train, X_test, y_train, y_test)))

(300000, 19) (300000, 11) (300000,)
(225000, 30) (75000, 30) (225000,) (75000,)


In [50]:
# import predicting model's class
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# max_iter is documented as an int; newer scikit-learn releases validate
# constructor parameters and reject a float such as 1e5, so an integer literal
# is used here.
log_reg = LogisticRegression(random_state=3821, max_iter=100_000).fit(X_train, y_train)

# Print the model's mean accuracy on the held-out validation split.
print(log_reg.score(X_test, y_test))

0.83684


In [52]:
# >0.83  accuracy with simple logistic regression? FR

# Dump the predictions for test_data to `output_path` as a two-column CSV
# ('id', 'target').
test_indices = test_data.index.to_list()

# Select the feature columns explicitly, in the order the model was fitted on,
# instead of relying on test.csv containing exactly those columns in that order.
test_features = test_data[categorical_columns + continous_columns]
prediction = list(log_reg.predict(test_features).astype(int))

# to_csv does not create missing parent directories, so make sure the output
# directory exists before writing.
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# index=False keeps the DataFrame's own row index out of the file.
pd.DataFrame({'id': test_indices, 'target': prediction}).to_csv(output_path, index=False)

In [None]:
# Voila! : )