In [1]:
# Data loading

import pandas as pd
import numpy as np
import os

# Locations of the input CSV files and of the submission file written later on
train_path = os.path.join('data', 'train.csv')
test_path = os.path.join('data', 'test.csv')
output_path = os.path.join('kaggle_output', 'sample_submission_2.csv')

# Read both CSV files and index each frame by its 'id' column
train_data = pd.read_csv(train_path).set_index('id')
test_data = pd.read_csv(test_path).set_index('id')

# Feature column names: cat0..cat18 and cont0..cont10
# (the 'continous' spelling is kept as-is because other cells use this name)
categorical_columns = ['cat' + str(idx) for idx in range(19)]
continous_columns = ['cont' + str(idx) for idx in range(11)]

# Quick look at the training frame: its shape, then its first rows
print(train_data.shape)
train_data.head()

(240000, 31)


Unnamed: 0_level_0,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,cat9,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
294672,A,N,A,A,H,BI,A,F,AF,A,...,0.825906,0.566271,0.834821,0.418962,0.786024,0.517031,0.726133,0.26005,0.528997,0
378663,A,J,C,F,G,BI,A,AH,AV,A,...,0.775983,0.854265,0.236272,0.804565,0.555801,0.73597,0.487202,0.770561,0.650874,1
243382,A,L,F,B,E,BI,A,E,BJ,A,...,0.34928,0.625846,0.771506,0.737403,0.720549,0.372884,0.413075,0.282077,0.334873,0
420112,B,O,A,A,E,AB,A,AH,N,A,...,0.357225,0.667713,0.576015,0.290814,0.245854,0.573437,0.420561,0.369397,0.397687,0
396134,A,I,A,B,D,AE,A,N,M,A,...,0.249429,0.271644,0.226205,0.788825,0.507065,0.204093,0.351165,0.349229,0.30936,0


In [2]:
# Some useful libraries
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Integer-encode every categorical column. Each encoder is fitted on the
# train and test values together, so both frames share a single mapping and
# transform() never meets a label it was not fitted on.
for col in categorical_columns:
    all_values = pd.concat((train_data[col], test_data[col]), axis=0)
    label_encoder = LabelEncoder()
    label_encoder.fit(all_values)
    train_data[col] = label_encoder.transform(train_data[col])
    test_data[col] = label_encoder.transform(test_data[col])

# Turn the target into a Boolean column (True where the target equals 1)
train_data['target'] = train_data['target'].eq(1)

# Sanity check before the split: shapes of both feature groups and of the target
print(
    train_data[categorical_columns].shape,
    train_data[continous_columns].shape,
    train_data.target.shape,
)

# Hold out 25% of the rows for validation; the fixed seed keeps the split reproducible
feature_columns = categorical_columns + continous_columns
X_train, X_test, y_train, y_test = train_test_split(
    train_data[feature_columns],
    train_data.target,
    test_size=0.25,
    random_state=2138,
)

# Sanity check after the split
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(240000, 19) (240000, 11) (240000,)
(180000, 30) (60000, 30) (180000,) (60000,)


In [3]:
# import predicting model's class
from sklearn.linear_model import LogisticRegression

# Fit the model on the training set.
# max_iter must be an integer: newer scikit-learn releases validate estimator
# parameters and reject a float such as 1e5, so the limit is written as an int.
log_reg = LogisticRegression(random_state=3821, max_iter=100_000).fit(X_train, y_train)

# Print the model's prediction score (mean accuracy) on the validation set
print(log_reg.score(X_test, y_test))

0.8339833333333333


In [4]:
# >0.83  accuracy with simple logistic regression? FR

# Dump the predictions for test_data to a file in the submission format.
# The feature columns are selected explicitly, with the same column list that
# built the training matrix, so the prediction does not rely on test_data
# happening to have exactly those columns in exactly that order.
test_features = test_data[categorical_columns + continous_columns]
test_indices = test_data.index.to_list()
prediction = list(log_reg.predict(test_features).astype(int))

# to_csv does not create missing directories, so make sure the target folder
# exists first (skipped when output_path has no directory component).
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

pd.DataFrame({'id': test_indices, 'Category': prediction}).to_csv(output_path, index=False)

In [None]:
# Voila! : )