# 02 - Baseline Model

## Setup

In [182]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import yaml

# Pandas display options: no limit on the number of columns shown and a wide
# console width, so the wide frames in this notebook are not truncated.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# pd.set_option('display.max_rows', None)

# Seaborn plotting theme.
sns.set_style("darkgrid")

from IPython.display import display, Markdown
from pprint import pprint 

# NOTE(review): DEBUG is set here but not read by any cell visible in this notebook.
DEBUG = True
# Seed passed as random_state to the baseline model so runs are repeatable.
SEED = 12092000

In [183]:
# Name of the dataset; on Colab it also names the per-dataset folder on Drive.
DATASET = "train.csv"

import os, sys
# True only when the notebook runs inside a Google Colab kernel.
COLAB = 'google.colab' in sys.modules
# Base directory (with trailing slash) that every data/output path is appended to.
ROOT = "./"

if COLAB:
  from google.colab import drive
  # Mount Google Drive only if it is not mounted yet.
  if not os.path.isdir("/content/gdrive"):
    drive.mount("/content/gdrive")
  # Make sure the shared datasets folder exists on Drive.
  # The path is absolute, so it must NOT be prefixed with ROOT ("./"): doing so
  # created a stray relative ".//content/gdrive/..." tree in the working
  # directory. It also must run on every execution, not only on first mount.
  d = "/content/gdrive/MyDrive/datasets"
  os.makedirs(d, exist_ok=True)
  # Per-dataset working directory; spaces in the dataset name become underscores.
  ROOT = f"/content/gdrive/MyDrive/datasets/{DATASET.replace(' ','_')}/"
  os.makedirs(ROOT, exist_ok=True)


def makedirs(d):
  """Create the directory ``ROOT + d`` (including missing parents) if needed.

  Args:
    d: Directory path relative to ``ROOT``; ``ROOT`` is prepended by plain
      string concatenation.

  Raises:
    FileExistsError: if ``ROOT + d`` already exists but is not a directory.

  The original had separate Colab / non-Colab branches that did the same
  thing (0o777 is already the default mode of ``os.makedirs``) and checked
  ``isdir`` before creating. A single ``exist_ok=True`` call is equivalent
  and avoids the check-then-create race.
  """
  os.makedirs(ROOT+d, mode=0o777, exist_ok=True)

# Make sure the working sub-folders under ROOT exist before anything is read or written.
for d in ('orig', 'data', 'output'):
  makedirs(d)

## Import Data

In [184]:
# Load the pickled training frame from the data folder under ROOT.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df_train = pd.read_pickle(f"{ROOT}data/train.pkl")
print(df_train.shape)
df_train.head(5)

(125803, 165)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,Target
74077,1.141575,8,-4.359983,0.450825,0.84555,x005_000,3.007819,1,x008_000,0.850699,3.025945,x011_001,0.264184,0.622826,-0.919733,-0.072886,x016_048,1.254801,x018_000,-0.09243,-0.919733,3,0.344038,3.025945,0.545339,0.843092,3.237924,0.430913,0.236528,-2.063537,33,1.163704,0.797879,0.84555,x034_005,x035_014,3.237924,-6.688506,0.553086,0.430913,5,3.075214,0,0.618561,x044_003,5.760403,x046_044,0,-0.09243,2.712984,1.679671,-1.450752,0.620773,x053_001,0.575879,3.579433,0.505691,0.0,0.684463,1.254801,0.604527,0.160393,x062_009,0,-1.931227,0.834569,0.620773,-1.76764,1,0.684463,0,x071_194,x072_000,x073_019,x074_000,7,-4.449852,0.620807,0.115102,2,-0.382699,0.649372,x082_000,0.476814,0.216968,0.184741,0.151682,0.190724,3,-3.213441,2.642693,-3.213441,x092_004,0.618561,2.135107,-8.276485,1.517087,x097_002,1.115526,0.480386,1.141575,0,0.816572,0.483895,0.834569,0.280958,x106_009,x107_006,-0.488003,x109_001,x110_055,x111_000,0.236528,0.199391,0.155979,-2.063773,0.813007,0.596687,-0.791877,0.483895,-0.488003,0.0,0.399602,5,x124_244,x125_002,x126_001,0.797879,2.712984,x129_050,0.393239,-0.773984,x132_021,1,x134_000,2.667655,1.145426,x137_001,0,5.132649,0.649372,0.604527,0.816572,-0.670385,5.132649,5.760403,0.216968,-1.450752,0.151682,-8.276485,x150_001,-1.931227,135,0.596687,0.843092,1.679671,-1.76764,x157_006,33,0.444652,-0.382699,0.381588,-0.829724,x163_010,Class_5
49922,1.143751,0,-3.438526,0.805108,-1.619605,x005_000,-0.366589,0,x008_000,-0.87576,4.290102,x011_001,0.878541,0.613965,-0.074597,1.528265,x016_048,-3.035078,x018_000,2.293614,-0.074597,1,0.280777,4.290102,-8.714742,0.281836,0.602296,0.310266,0.308363,-0.289196,33,4.698847,0.721183,-1.619605,x034_010,x035_014,0.602296,-2.999018,0.2382,0.310266,7,-4.134411,0,0.289259,x044_003,-0.753635,x046_043,0,2.293614,-0.084513,,-4.425191,-0.247047,x053_000,0.214749,7.7933,0.345678,0.0,0.984625,-3.035078,0.646445,0.898021,x062_011,0,11.079512,0.671892,-0.247047,-1.175019,1,0.984625,9,x071_123,x072_000,x073_002,x074_000,4,2.966075,0.420835,0.197124,1,0.77379,0.551325,x082_000,0.710899,-0.817111,-1.064741,1.722823,0.78198,3,1.796974,3.720548,1.796974,x092_001,0.289259,-1.15062,4.091824,-0.650828,x097_016,1.115781,0.356202,1.143751,1,-1.818711,0.278035,0.671892,0.311779,x106_002,x107_012,1.485849,x109_000,x110_057,x111_001,0.308363,0.719172,1.147369,0.927291,0.362872,0.549868,-0.235604,0.278035,1.485849,0.0,0.210574,11,x124_022,x125_002,x126_001,0.721183,-0.084513,x129_002,0.267381,4.433305,x132_038,1,x134_000,-0.667662,0.101721,x137_001,0,4.598879,0.551325,0.646445,-1.818711,-3.151749,4.598879,-0.753635,-0.817111,-4.425191,1.722823,4.091824,x150_001,11.079512,256,0.549868,0.281836,0.282718,-1.175019,x157_037,33,0.315624,0.77379,0.321262,-1.191636,x163_010,Class_5
25511,-1.636225,0,0.120206,0.828269,0.414759,x005_003,0.284236,0,x008_000,1.466211,3.070626,x011_000,0.81958,0.729508,1.250118,-0.398999,x016_004,0.288877,x018_000,2.313134,1.250118,3,0.830299,3.070626,-2.932911,0.807711,2.698524,0.564777,0.659052,1.711316,33,6.474173,0.594748,0.414759,x034_009,x035_014,2.698524,-0.07819,0.282701,0.564777,4,3.935656,0,0.280576,x044_003,-1.288236,x046_040,0,2.313134,-2.629138,-1.206731,5.015136,-0.453017,x053_001,0.231826,-1.520408,0.254926,0.0,0.755787,0.288877,0.348295,0.947436,x062_011,0,-4.297205,0.301084,-0.453017,0.639507,1,0.755787,0,x071_212,x072_000,x073_002,x074_000,5,-3.104869,0.240706,0.239497,1,-2.812359,0.96081,x082_000,-0.049562,-0.248426,2.195266,-1.01966,0.554087,1,1.661734,1.103417,1.661734,x092_004,0.280576,1.019636,0.777689,1.145941,x097_002,-1.186097,0.343451,-1.636225,1,-0.931803,0.861431,0.301084,0.51322,x106_010,x107_006,2.889598,x109_001,x110_057,x111_000,0.659052,0.689917,4.359074,-1.099268,0.326716,0.643528,0.827223,0.861431,2.889598,0.0,0.810328,5,x124_243,x125_002,x126_001,0.594748,-2.629138,x129_003,0.268972,0.026138,x132_029,1,x134_000,2.692507,-0.065706,x137_000,0,-1.520504,0.96081,0.348295,-0.931803,-3.073593,-1.520504,-1.288236,-0.248426,5.015136,-1.01966,0.777689,x150_003,-4.297205,257,0.643528,0.807711,-1.206731,0.639507,x157_037,33,0.804911,-2.812359,0.49242,0.451035,x163_010,Class_3
72423,-1.034375,0,-1.456873,0.512814,0.632431,x005_000,-1.03909,1,x008_000,-0.843426,-8.120932,x011_000,0.359375,0.642282,0.490389,0.558809,x016_048,-0.58147,x018_000,-2.509418,0.490389,1,0.906862,-8.120932,8.727484,0.974849,3.591948,0.31712,0.868934,4.604243,33,-2.727148,0.238162,0.632431,x034_009,x035_010,3.591948,-4.249842,0.910096,0.31712,4,2.275054,0,0.09065,x044_003,-3.232234,x046_008,5,-2.509418,0.160926,-0.840417,6.143932,-0.721325,x053_001,0.71733,-8.238396,0.862987,1.0,0.331907,-0.58147,0.300992,0.031345,x062_009,0,1.098858,0.392404,-0.721325,-5.036252,1,0.331907,0,x071_194,x072_000,x073_002,x074_000,4,-3.319348,0.114805,0.562302,0,-2.121333,0.392799,x082_000,0.50394,1.725635,9.161009,-3.319507,0.740113,3,-3.708099,3.155372,-3.708099,x092_001,0.09065,-0.380517,-7.622551,-0.207712,x097_002,6.131838,0.968477,-1.034375,0,2.557438,0.731049,0.392404,0.283193,x106_009,x107_006,-3.590808,x109_000,x110_057,x111_000,0.868934,0.263916,1.272092,-0.184509,0.857396,0.957596,-0.638152,0.731049,-3.590808,0.0,0.731102,10,x124_244,x125_002,x126_000,0.238162,0.160926,x129_028,0.569074,2.051501,x132_018,1,x134_000,-0.247365,0.611291,x137_001,0,0.107207,0.392799,0.300992,2.557438,4.21822,0.107207,-3.232234,1.725635,6.143932,-3.319507,-7.622551,x150_001,1.098858,54,0.957596,0.974849,-0.840417,-5.036252,x157_041,14,0.355066,-2.121333,0.265035,0.158196,x163_005,Class_1
32772,-1.567972,4,3.261447,0.348514,2.361654,x005_001,-0.531134,0,x008_000,2.268094,-0.525095,x011_000,0.312771,0.280416,-0.089698,-0.289579,x016_013,1.924013,x018_000,1.167478,-0.089698,3,0.942663,-0.525095,1.288834,0.47662,-2.443952,0.305123,0.232785,-0.117791,33,0.894147,0.710837,2.361654,x034_009,x035_010,-2.443952,0.612824,0.615116,0.305123,4,0.160757,0,0.620717,x044_005,-0.827508,x046_013,0,1.167478,4.819498,-0.024034,0.592569,1.361153,x053_000,0.478275,-0.428333,0.488107,0.0,0.729928,1.924013,0.828306,0.530872,x062_012,0,-3.132878,0.368548,1.361153,3.762578,1,0.729928,0,x071_226,x072_000,x073_002,x074_000,9,2.362394,0.359014,0.635795,0,-1.687309,0.370548,x082_000,0.706157,0.547086,-2.590781,0.096456,0.283956,3,2.263447,-0.693665,2.263447,x092_001,0.620717,-1.045366,0.092176,2.315404,x097_002,-1.567661,0.492705,-1.567972,1,-0.3136,0.775503,0.368548,0.281477,x106_000,x107_012,0.927707,x109_000,x110_057,x111_000,0.232785,0.196126,-3.542416,0.495603,0.606498,0.55447,-2.645867,0.775503,0.927707,0.0,0.378416,9,x124_094,x125_001,x126_001,0.710837,4.819498,x129_030,0.387673,0.317126,x132_023,2,x134_000,-1.792664,0.327718,x137_000,0,-0.73716,0.370548,0.828306,-0.3136,-1.08861,-0.73716,-0.827508,0.547086,0.592569,0.096456,0.092176,x150_002,-3.132878,75,0.55447,0.47662,-0.024034,3.762578,x157_051,74,0.397866,-1.687309,0.265508,0.315376,x163_010,Class_5


In [185]:
# Load the pickled test frame from the data folder under ROOT.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df_test = pd.read_pickle(f"{ROOT}data/test.pkl")
print(df_test.shape)
df_test.head(5)

(83869, 165)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163,Target
122232,-3.344156,4,2.172569,0.539517,-0.23231,x005_002,-0.347396,0,x008_001,-0.158356,3.66033,x011_000,0.295093,0.767444,0.349099,1.134375,x016_006,1.694801,x018_000,1.283826,0.349099,3,0.288078,3.66033,-0.576699,0.701344,1.84578,0.720627,0.424103,1.373123,33,6.051348,0.552332,-0.23231,x034_009,x035_011,1.84578,-2.733806,0.29502,0.720627,7,-4.82723,0,0.737022,x044_003,1.772232,x046_005,0,1.283826,1.702993,2.000092,5.646412,0.960414,x053_001,0.525748,-3.773189,0.347604,0.0,0.196037,1.694801,0.332869,0.471527,x062_004,3,1.544974,0.554083,0.960414,-0.834783,1,0.196037,0,x071_235,x072_000,x073_014,x074_000,4,2.506398,0.354329,0.119601,0,-1.432546,0.32379,x082_003,0.925484,-0.754708,-4.157989,1.993699,0.42555,1,4.000094,-1.020161,4.000094,x092_010,0.737022,-0.779117,-0.219093,0.004463,x097_002,-2.531398,0.496891,-3.344156,1,-3.164401,0.794238,0.554083,0.747685,x106_010,x107_012,0.642362,x109_001,x110_057,x111_000,0.424103,0.465203,-5.717978,-1.301262,0.346044,0.864574,1.441809,0.794238,0.642362,0.0,0.383641,7,x124_282,x125_002,x126_001,0.552332,1.702993,x129_015,0.354346,-1.348996,x132_039,2,x134_000,-2.655006,2.517596,x137_000,0,-2.593382,0.32379,0.332869,-3.164401,-0.690828,-2.593382,1.772232,-0.754708,5.646412,1.993699,-0.219093,x150_003,1.544974,23,0.864574,0.701344,2.000092,-0.834783,x157_018,33,0.707962,-1.432546,0.887848,-0.079848,x163_010,Class_3
170143,3.231669,0,5.213101,0.319086,0.563226,x005_002,-1.936628,1,x008_000,-0.485216,1.174234,x011_000,0.934481,0.282337,1.330622,-1.395256,x016_013,0.495893,x018_000,0.398433,1.330622,1,0.319295,1.174234,0.802635,0.913934,2.563651,0.458742,0.342443,-1.4541,33,2.706768,0.235929,0.563226,x034_009,x035_004,2.563651,2.29586,0.512712,0.458742,6,1.656521,1,0.071356,x044_003,-2.010164,x046_050,0,0.398433,-2.28996,5.413164,0.23001,2.035408,x053_001,0.805215,3.957178,0.701032,0.0,0.788725,0.495893,0.21537,0.362263,x062_009,0,1.319552,0.347096,2.035408,1.569314,2,0.788725,14,x071_090,x072_002,x073_001,x074_000,7,6.514606,0.619366,0.764356,1,0.63342,0.524824,x082_003,0.899786,0.67655,-0.612299,-0.554123,0.355339,3,-2.173246,-1.131016,-2.173246,x092_006,0.071356,-0.381436,1.670339,-1.074664,x097_004,-2.825852,0.509803,3.231669,0,-5.910125,0.342076,0.347096,0.617534,x106_002,x107_006,2.737555,x109_001,x110_057,x111_000,0.342443,0.971688,-1.585377,-2.785407,0.762415,0.431396,-1.560924,0.342076,2.737555,0.0,0.679771,8,x124_018,x125_002,x126_001,0.235929,-2.28996,x129_003,0.595006,1.126768,x132_048,3,x134_000,-0.307294,0.159046,x137_001,0,1.062362,0.524824,0.21537,-5.910125,-0.62554,1.062362,-2.010164,0.67655,0.23001,-0.554123,1.670339,x150_003,1.319552,271,0.431396,0.913934,5.413164,1.569314,x157_029,33,0.312975,0.63342,0.319985,-0.15252,x163_010,Class_1
196172,0.796761,0,0.31417,0.501443,0.485478,x005_000,-0.231908,1,x008_000,-1.218573,-1.200363,x011_000,0.300099,0.423472,-0.80929,-0.594823,x016_013,1.040214,x018_000,-0.286775,-0.80929,3,0.577801,-1.200363,2.966585,0.428781,0.015534,0.361838,0.887167,-0.013237,33,4.990464,0.778387,0.485478,x034_009,x035_014,0.015534,-0.337644,0.339875,0.361838,7,1.114214,0,0.075828,x044_003,4.4801,x046_011,5,-0.286775,0.429953,2.699606,-3.081257,-1.331587,x053_000,0.437044,-2.930677,0.549783,0.0,0.309077,1.040214,0.261238,0.923268,x062_012,0,-3.78625,0.338409,-1.331587,2.485082,1,0.309077,15,x071_212,x072_000,x073_014,x074_000,5,-1.408164,0.554252,0.511043,1,0.875131,0.325462,x082_000,0.438151,-1.535747,-2.376908,2.05202,0.157479,3,1.503772,-0.025202,1.503772,x092_001,0.075828,0.69784,-5.622837,-1.925541,x097_016,0.605691,0.376819,0.796761,0,1.956162,0.320726,0.338409,0.282522,x106_010,x107_012,-1.797388,x109_000,x110_057,x111_002,0.887167,0.238081,1.292654,0.393314,0.652742,0.567658,2.868987,0.320726,-1.797388,0.0,0.472724,0,x124_262,x125_002,x126_001,0.778387,0.429953,x129_021,0.421966,1.616369,x132_027,1,x134_000,0.062966,1.618971,x137_000,0,0.692446,0.325462,0.261238,1.956162,-2.375856,0.692446,4.4801,-1.535747,-3.081257,2.05202,-5.622837,x150_003,-3.78625,66,0.567658,0.428781,2.699606,2.485082,x157_042,33,0.669629,0.875131,0.251521,1.299233,x163_010,Class_4
31721,-0.940662,0,4.65983,0.677801,-1.722532,x005_000,1.252999,0,x008_000,-0.520129,-1.552745,x011_000,0.545332,0.685122,-0.418816,0.250904,x016_048,0.251979,x018_000,1.345765,-0.418816,3,0.845477,-1.552745,-4.398986,0.778886,1.809106,0.502454,0.819679,2.624565,33,-8.732356,0.283804,-1.722532,x034_000,x035_014,1.809106,6.731387,1.018483,0.502454,5,-1.006625,2,0.487885,x044_003,-0.017913,x046_035,0,1.345765,3.395784,-0.624719,-1.321331,-0.633829,x053_001,0.850287,6.638624,0.683891,0.0,0.367056,0.251979,0.548329,0.526623,x062_009,0,2.59271,0.389881,-0.633829,0.16395,2,0.367056,0,x071_238,x072_000,x073_014,x074_000,4,2.185387,0.487868,0.769417,0,-0.178286,0.303503,x082_000,0.613859,0.194949,0.159477,2.713088,0.736727,3,-1.859302,-0.176615,-1.859302,x092_001,0.487885,-2.146089,0.926816,0.041784,x097_004,-9.500941,0.874258,-0.940662,0,1.475228,0.774788,0.389881,0.399223,x106_014,x107_012,0.389409,x109_001,x110_057,x111_000,0.819679,0.679288,-2.818092,0.6616,0.933714,0.357891,-3.367345,0.774788,0.389409,,0.28446,14,x124_141,x125_002,x126_001,0.283804,3.395784,x129_049,0.890942,2.1883,x132_009,1,x134_000,-3.935951,-0.189062,x137_001,0,4.582675,0.303503,0.548329,1.475228,-1.64357,4.582675,-0.017913,0.194949,-1.321331,2.713088,0.926816,x150_003,2.59271,163,0.357891,0.778886,-0.624719,0.16395,x157_022,33,0.771162,-0.178286,0.783956,-0.821668,x163_010,Class_2
166779,1.329454,4,1.073245,0.543279,-2.129426,x005_002,-1.604272,0,x008_000,-0.297818,-0.330086,x011_001,0.515342,0.674548,0.239012,-1.003687,x016_048,0.707302,x018_000,4.719237,0.239012,3,0.67037,-0.330086,-3.247619,0.63926,-0.316476,0.444896,0.282482,0.433645,33,-3.609534,0.596834,-2.129426,x034_005,x035_014,-0.316476,-1.837028,0.374812,0.444896,5,-3.058536,0,0.175809,x044_005,-2.238103,x046_025,0,4.719237,-4.286446,-1.969261,-5.135769,-0.829366,x053_001,0.439646,1.12619,0.291992,0.0,0.886997,0.707302,0.245443,0.510783,x062_012,2,1.208289,0.360329,-0.829366,-0.024828,1,0.886997,6,x071_262,x072_000,x073_006,x074_000,4,-4.182466,0.799783,0.842957,2,0.387436,0.406051,x082_000,0.52131,-2.779414,-4.536928,-0.315002,0.12975,3,-0.876002,-0.021163,-0.876002,x092_001,0.175809,-0.410774,-1.082198,2.690973,x097_016,-1.715228,0.465236,1.329454,0,-2.649902,0.346921,0.360329,0.439892,x106_010,x107_012,0.494651,x109_001,x110_057,x111_002,0.282482,0.497887,1.784379,4.830307,0.332225,0.239082,-1.791514,0.346921,0.494651,0.0,0.821797,11,x124_262,x125_002,x126_001,0.596834,-4.286446,x129_024,0.408994,-0.421698,x132_044,1,x134_001,-0.587493,-0.862213,x137_000,0,-1.508511,0.406051,0.245443,-2.649902,-5.220707,-1.508511,-2.238103,-2.779414,-5.135769,-0.315002,-1.082198,x150_001,1.208289,4,0.239082,0.63926,-1.969261,-0.024828,x157_060,33,0.267645,0.387436,0.785771,0.995956,x163_010,Class_3


In [186]:
# Load the pickled scoring frame from the data folder under ROOT.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
df_score = pd.read_pickle(f"{ROOT}data/score.pkl")
print(df_score.shape)
df_score.head(5)

(90000, 164)


Unnamed: 0,x000,x001,x002,x003,x004,x005,x006,x007,x008,x009,x010,x011,x012,x013,x014,x015,x016,x017,x018,x019,x020,x021,x022,x023,x024,x025,x026,x027,x028,x029,x030,x031,x032,x033,x034,x035,x036,x037,x038,x039,x040,x041,x042,x043,x044,x045,x046,x047,x048,x049,x050,x051,x052,x053,x054,x055,x056,x057,x058,x059,x060,x061,x062,x063,x064,x065,x066,x067,x068,x069,x070,x071,x072,x073,x074,x075,x076,x077,x078,x079,x080,x081,x082,x083,x084,x085,x086,x087,x088,x089,x090,x091,x092,x093,x094,x095,x096,x097,x098,x099,x100,x101,x102,x103,x104,x105,x106,x107,x108,x109,x110,x111,x112,x113,x114,x115,x116,x117,x118,x119,x120,x121,x122,x123,x124,x125,x126,x127,x128,x129,x130,x131,x132,x133,x134,x135,x136,x137,x138,x139,x140,x141,x142,x143,x144,x145,x146,x147,x148,x149,x150,x151,x152,x153,x154,x155,x156,x157,x158,x159,x160,x161,x162,x163
0,2.319538,0,5.020483,0.448529,-0.852546,x005_000,-0.998906,0,x008_000,1.288086,4.673962,x011_001,0.289747,0.284042,0.572018,-1.336891,x016_006,-1.13796,x018_002,2.442883,0.572018,3,0.298722,4.673962,1.958812,0.282664,4.85593,0.290681,0.480461,-0.091239,33,-0.047759,0.288419,-0.852546,x034_011,x035_014,4.85593,2.191471,0.302269,0.290681,4,-2.573493,1,0.789406,x044_000,-2.778244,x046_011,0,2.442883,-3.219411,-1.538807,0.634541,0.358364,x053_001,0.410863,4.025498,0.31478,1.0,0.885479,-1.13796,0.647797,0.475543,x062_001,0,-0.084794,0.36046,0.358364,-1.810576,1,0.885479,3,x071_018,x072_000,x073_002,x074_000,5,4.5169,0.73564,0.477497,0,0.084926,0.510748,x082_000,0.310362,-3.29531,2.907393,0.377336,0.680271,1,-1.684449,-1.089001,-1.684449,x092_004,0.789406,-1.758931,-0.920925,-1.816968,x097_002,-3.853039,0.33526,2.319538,1,1.21015,0.823916,0.36046,0.422275,x106_010,x107_006,-2.845971,x109_001,x110_059,x111_000,0.480461,0.221567,1.620115,-3.280507,0.619155,0.612774,-1.956931,0.823916,-2.845971,0.0,0.431122,12,x124_022,x125_000,x126_000,0.288419,-3.219411,x129_023,0.153943,3.630768,x132_009,1,x134_000,-0.712996,1.072984,x137_001,0,0.708755,0.510748,0.647797,1.21015,4.905962,0.708755,-2.778244,-3.29531,0.634541,0.377336,-0.920925,x150_003,-0.084794,159,0.612774,0.282664,-1.538807,-1.810576,x157_051,33,0.307058,0.084926,0.27934,0.626329,x163_007
1,0.596632,0,6.648641,0.508439,0.967977,x005_002,-2.897833,1,x008_001,0.376745,-4.264011,x011_000,0.365204,0.285102,0.219571,-0.313663,x016_048,0.629423,x018_000,-1.675292,0.219571,3,0.279568,-4.264011,-1.11149,0.283594,-0.828395,0.351714,0.625054,1.848905,2,-2.0332,0.805128,0.967977,x034_001,x035_014,-0.828395,-0.321451,0.142981,0.351714,5,-0.629202,1,0.119783,x044_005,-3.858438,x046_005,0,-1.675292,2.657546,0.117252,-1.301271,0.605371,x053_001,0.171433,-2.538732,0.287723,0.0,0.90471,0.629423,0.274973,0.463415,x062_012,2,-0.470625,0.491269,0.605371,0.881008,3,0.90471,0,x071_157,x072_001,x073_002,x074_000,5,1.517297,0.490456,0.466524,0,0.58353,0.533045,x082_003,0.685248,-0.248232,-5.086252,2.109288,0.277475,3,1.862004,-2.096803,1.862004,x092_004,0.119783,0.103561,-0.830837,-0.850213,x097_002,-4.353762,0.166441,0.596632,1,0.07492,0.147904,0.491269,0.672344,x106_009,x107_002,-1.922535,x109_001,x110_057,x111_000,0.625054,0.317563,2.38791,-2.485752,0.419489,0.91086,-1.030031,0.147904,-1.922535,0.0,0.828945,7,x124_022,x125_002,x126_001,0.805128,2.657546,x129_010,0.163536,-3.465025,x132_041,1,x134_000,-1.683605,0.320888,x137_001,0,-5.786395,0.533045,0.274973,0.07492,1.15216,-5.786395,-3.858438,-0.248232,-1.301271,2.109288,-0.830837,x150_003,-0.470625,69,0.91086,0.283594,0.117252,0.881008,x157_019,33,0.303058,0.58353,0.483201,0.309898,x163_010
2,1.181403,7,-0.278018,0.269135,0.470354,x005_000,-1.125396,1,x008_000,-0.435548,4.971689,x011_001,0.922037,0.876832,0.792961,-0.719348,x016_048,1.056223,x018_000,1.31097,0.792961,3,0.651189,4.971689,-2.513337,0.28373,-2.852422,0.122145,0.312218,0.711132,33,-3.13632,0.794412,0.470354,x034_009,x035_014,-2.852422,-4.229822,0.297356,0.122145,4,-1.342726,1,0.149601,x044_005,1.935087,x046_041,0,1.31097,-0.849471,0.256365,-4.095485,-2.383043,x053_001,0.446565,-2.134531,0.235314,0.0,0.45841,1.056223,0.610031,0.333816,x062_009,0,1.520913,0.332621,-2.383043,0.508806,2,0.45841,9,x071_262,x072_000,x073_014,x074_000,7,-1.99115,0.117028,0.559418,2,0.398494,0.698028,x082_000,0.679684,-0.16424,-6.032394,0.473949,0.125347,3,0.131833,-2.113557,0.131833,x092_001,0.149601,1.170761,-2.760197,0.078652,x097_002,-3.660065,0.472669,1.181403,0,0.663495,0.618753,0.332621,0.284048,x106_010,x107_012,-3.865667,x109_000,x110_057,x111_000,0.312218,0.907233,-4.085762,2.236776,0.549247,0.622028,3.221274,0.618753,-3.865667,0.0,0.472203,5,x124_213,x125_000,x126_001,0.794412,-0.849471,x129_037,0.381851,0.166208,x132_001,2,x134_000,2.884738,1.141532,x137_001,0,-3.494744,0.698028,0.610031,0.663495,10.9771,-3.494744,1.935087,-0.16424,-4.095485,0.473949,-2.760197,x150_001,1.520913,155,0.622028,0.28373,0.256365,0.508806,x157_029,33,0.837742,0.398494,0.550593,0.551815,x163_005
3,1.062521,0,4.681273,0.351513,0.895732,x005_002,-1.939696,0,x008_000,-1.215164,-6.970703,x011_001,0.746144,0.424481,3.068141,-0.166192,x016_048,0.277012,x018_000,-2.31757,3.068141,1,0.369813,-6.970703,4.362935,0.284249,-0.354353,0.561398,0.358915,-2.54051,33,1.706659,0.245466,0.895732,x034_006,x035_014,-0.354353,1.431991,0.328452,0.561398,1,-1.437536,0,0.784052,x044_014,-1.260662,x046_003,0,-2.31757,0.563919,-3.088806,1.453814,-0.405619,x053_001,0.218636,4.540099,0.436622,0.0,0.636592,0.277012,0.329475,0.336606,x062_009,9,7.545427,0.405871,-0.405619,3.174633,1,0.636592,0,x071_236,x072_000,x073_014,x074_000,4,4.746699,0.621325,0.34275,0,0.350563,0.316946,x082_003,0.286075,0.603912,-1.280063,-0.40833,0.776094,3,0.111652,-2.181893,0.111652,x092_001,0.784052,0.224501,-3.815884,0.13166,x097_002,-1.602668,0.026942,1.062521,1,-1.717823,0.442232,0.405871,0.561598,x106_009,x107_003,-1.240114,x109_001,x110_057,x111_000,0.358915,0.722143,-0.082655,-1.449469,0.599188,0.551079,-2.568187,0.442232,-1.240114,0.0,0.680741,6,x124_238,x125_002,x126_001,0.245466,0.563919,x129_012,0.280368,-2.082442,x132_059,1,x134_001,1.18135,0.328639,x137_001,0,2.217366,0.316946,0.329475,-1.717823,3.906872,2.217366,-1.260662,0.603912,1.453814,-0.40833,-3.815884,x150_001,7.545427,7,0.551079,0.284249,-3.088806,3.174633,x157_042,2,0.79795,0.350563,0.368082,-0.010097,x163_010
4,3.202631,0,4.535848,0.278066,0.0602,x005_000,-0.841294,1,x008_000,-0.194629,-6.880155,x011_000,0.641076,0.283357,0.14495,-0.701763,x016_048,1.832194,x018_000,1.551679,0.14495,2,0.624406,-6.880155,-1.880178,0.616512,0.392636,0.550122,0.543176,0.526731,33,4.477607,0.679963,0.0602,x034_009,x035_014,0.392636,-6.342937,0.322065,0.550122,6,2.295347,0,0.080758,x044_003,0.695561,x046_007,10,1.551679,-0.045503,1.164021,-1.237624,0.769096,x053_001,0.437865,-0.657845,0.39872,0.0,0.763592,1.832194,0.299039,0.308614,x062_004,0,-2.531185,0.492353,0.769096,2.717765,1,0.763592,0,x071_170,x072_000,x073_006,x074_000,5,-0.329671,0.357912,0.572746,5,-0.247686,0.318795,x082_000,0.676677,-1.834202,-3.971667,1.795463,0.488578,1,-2.365519,-1.089131,-2.365519,x092_001,0.080758,-0.925009,3.743664,0.525361,x097_002,-3.974501,0.270718,3.202631,0,-0.97281,0.487134,0.492353,0.306553,x106_010,x107_012,0.193769,x109_001,x110_057,x111_001,0.543176,0.639594,4.340596,0.623056,0.706431,0.361288,0.415698,0.487134,0.193769,0.0,0.218783,8,x124_201,x125_002,x126_000,0.679963,-0.045503,x129_049,0.174329,0.358485,x132_011,1,x134_000,0.284686,0.921958,x137_001,0,-1.14165,0.318795,0.299039,-0.97281,1.1601,-1.14165,0.695561,-1.834202,-1.237624,1.795463,3.743664,x150_001,-2.531185,163,0.361288,0.616512,1.164021,2.717765,x157_022,33,0.226336,-0.247686,0.447736,-0.012285,x163_005


## Generate Submission Files

In [187]:
# Distinct class labels in sorted order; used as the column order of every submission file.
target_labels = sorted(df_train['Target'].unique())

# Dummy probability matrix used only to sanity-check the submission layout.
# The original constant row summed to 1.1, so it is normalised here to be a
# valid probability distribution, and the row count follows df_score instead
# of a hard-coded 90000.
dummy_row = np.array([0.1, 0.3, 0.33, 0.27, 0.1])
pred = np.array([dummy_row / dummy_row.sum()] * len(df_score))
# Building the frame raises if the number of columns does not match the
# number of labels; the result is not displayed (not the last expression).
pd.DataFrame(pred, columns=target_labels)

def submission(y_pred_prob, name=None):
    """Write class probabilities to ``{ROOT}output/{name}.csv``.

    Args:
        y_pred_prob: 2-D array-like with one column per entry of
            ``target_labels`` (in that order) and one row per sample.
        name: File name without extension. When omitted, a UTC timestamp
            ``submission-YYYYmmddHHMMSS`` is generated at call time.

    The timestamp used to be built in the default argument, which Python
    evaluates only once when the function is defined; every default-named
    call therefore reused the same file name and overwrote the previous
    submission.
    """
    if name is None:
        name = f"submission-{pd.to_datetime('now', utc=True).strftime('%Y%m%d%H%M%S')}"
    df_sub = pd.DataFrame(y_pred_prob, columns=target_labels)
    # The row index is written as the leading 'id' column of the CSV.
    df_sub.index.name = 'id'
    path = ROOT+f'output/{name}.csv'
    df_sub.to_csv(path, index=True)
    print(f"Saved ({df_sub.shape[0]} rows) to: {path}")

## Baseline Model

* Treat NA values: drop or impute? Imputing is faster, but faster != correct.

In [188]:
# Total number of NaN cells in the whole training frame
na_vals = df_train.isna().to_numpy().sum()
print(f"Missing values: {na_vals}")

Missing values: 9974


In [189]:
# Number of training rows that contain one or more NaN
nan_rows = df_train.isna().any(axis=1).sum()
print(f"Rows with at least one NaN: {nan_rows} ({nan_rows/df_train.shape[0]:.2%})")

Rows with at least one NaN: 9639 (7.66%)


In [190]:
# Crude but quick baseline: replace every NaN with 0 in all three frames.
# The names are rebound to the filled copies, so re-running the load cells is
# required to get the raw frames back.
df_train, df_test, df_score = [
    frame.fillna(0) for frame in (df_train, df_test, df_score)
]

### Don't do the above!

* Numerical features: impute with the mean
* Categorical features: impute with the mode
* Skewed numerical features: impute with the median instead

target = 'Target'
features = df.columns.drop(target)

In [191]:
# Name of the label column.
target = 'Target'
# Every column except the label (raises KeyError if the label column is absent).
features = df_train.columns.delete(df_train.columns.get_loc(target))

In [192]:
# Narrow the feature list to the int/float columns only; this replaces the
# previous value of `features`. Object columns are left out of the baseline.
features = df_train.select_dtypes(include=["int", "float"]).columns.tolist()
df_train.dtypes.value_counts()

float64    112
object      34
int64       19
dtype: int64

In [193]:
# Number of distinct values per integer column, smallest first.
(
    df_train
    .select_dtypes(include="int")
    .nunique()
    .sort_values()
)

x007      2
x138      2
x101      2
x021      4
x133      4
x068      4
x088      4
x042     13
x079     13
x123     15
x001     19
x070     19
x063     19
x047     19
x040     20
x075     20
x030     84
x158     84
x152    286
dtype: int64

In [194]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training frame only, then apply the same fitted
# transform to all three frames.
ss = StandardScaler().fit(df_train[features])

X_train, X_test, X_score = [
    ss.transform(frame[features]) for frame in (df_train, df_test, df_score)
]

In [195]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `le` is created but not used by any cell visible here; the
# label columns are passed to the model as-is.
le = LabelEncoder()

# Label columns of the training and test frames.
y_train, y_test = df_train[target], df_test[target]

In [196]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier on the scaled numeric features; seeded for repeatability.
model = LogisticRegression(
    max_iter=1000,
    random_state=SEED,
)
model.fit(X_train, y_train)

In [197]:
from sklearn.metrics import confusion_matrix, classification_report, log_loss

# In-sample evaluation: hard predictions and class probabilities on the training features.
y_pred = model.predict(X_train)
y_pred_prob = model.predict_proba(X_train)

# Confusion matrix, per-class report and log loss, separated by blank lines.
print(
    confusion_matrix(y_train, y_pred),
    classification_report(y_train, y_pred),
    f"Log Loss: {log_loss(y_train, y_pred_prob):.4f}",
    sep="\n\n",
)

[[19491  2797  1409  1009  3828]
 [ 6005  6576  1322   684 13584]
 [11486  2482  1714   357  7534]
 [ 9256  1069  1391  1333  2308]
 [ 4413  6146  1642   223 17744]]

              precision    recall  f1-score   support

     Class_1       0.38      0.68      0.49     28534
     Class_2       0.34      0.23      0.28     28171
     Class_3       0.23      0.07      0.11     23573
     Class_4       0.37      0.09      0.14     15357
     Class_5       0.39      0.59      0.47     30168

    accuracy                           0.37    125803
   macro avg       0.34      0.33      0.30    125803
weighted avg       0.35      0.37      0.33    125803


Log Loss: 1.4334


In [198]:
# Evaluation on the labelled test frame: hard predictions and class probabilities.
y_pred = model.predict(X_test)
y_pred_prob = model.predict_proba(X_test)

# The columns of predict_proba follow model.classes_. Passing that order
# explicitly keeps the confusion matrix and log loss aligned with the
# probability columns even if some class does not occur in y_test; without
# it the labels are inferred from y_test and log_loss can fail on a mismatch.
print(confusion_matrix(y_test, y_pred, labels=model.classes_))
print()
print(classification_report(y_test, y_pred))
print()
print(f"Log Loss: {log_loss(y_test, y_pred_prob, labels=model.classes_):.4f}")

[[13022  1831   931   642  2596]
 [ 4074  4286   897   429  9095]
 [ 7689  1711  1124   276  4915]
 [ 6132   716   940   870  1580]
 [ 2974  4205  1143   135 11656]]

              precision    recall  f1-score   support

     Class_1       0.38      0.68      0.49     19022
     Class_2       0.34      0.23      0.27     18781
     Class_3       0.22      0.07      0.11     15715
     Class_4       0.37      0.08      0.14     10238
     Class_5       0.39      0.58      0.47     20113

    accuracy                           0.37     83869
   macro avg       0.34      0.33      0.30     83869
weighted avg       0.34      0.37      0.32     83869


Log Loss: 1.4349


In [199]:
# Class probabilities for the scoring features, written out as a named submission file.
y_pred_prob = model.predict_proba(X_score)
print(y_pred_prob.shape)
submission(y_pred_prob, name="LR_num_features_only")

(90000, 5)
Saved (90000 rows) to: ./output/LR_num_features_only.csv
