# Tabular Playground Series - Mar 2021

## Importing Modules

In [1]:
import pandas as pd
import seaborn as sns
from scipy import stats
import numpy as np
import matplotlib.pyplot as plt

## Data Exploration

### Data Import 

In [2]:
data = pd.read_csv('train.csv')

In [3]:
float_cols = [c for c in data if data[c].dtype == "float64"]
float16_cols = {c: np.float16 for c in float_cols}

data = pd.read_csv('train.csv', engine='c', dtype=float16_cols)

In [4]:
data.dtypes

id          int64
cat0       object
cat1       object
cat2       object
cat3       object
cat4       object
cat5       object
cat6       object
cat7       object
cat8       object
cat9       object
cat10      object
cat11      object
cat12      object
cat13      object
cat14      object
cat15      object
cat16      object
cat17      object
cat18      object
cont0     float16
cont1     float16
cont2     float16
cont3     float16
cont4     float16
cont5     float16
cont6     float16
cont7     float16
cont8     float16
cont9     float16
cont10    float16
target      int64
dtype: object

In [5]:
data.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759277,0.79541,0.682129,0.621582,0.592285,0.791992,0.81543,0.964844,0.666016,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857,1


In [6]:
data

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759277,0.795410,0.682129,0.621582,0.592285,0.791992,0.815430,0.964844,0.666016,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,A,N,F,A,E,BU,A,AS,K,...,0.662598,0.671875,0.390625,0.145874,0.262695,0.514160,0.519531,0.617676,0.687988,0
299996,499995,A,K,A,A,G,BI,A,K,AE,...,0.821777,0.620117,0.385010,0.735840,0.547852,0.726562,0.470459,0.275635,0.639160,0
299997,499996,A,G,M,A,H,BI,C,L,F,...,0.406982,0.232422,0.832520,0.810547,0.597168,0.308838,0.374023,0.518066,0.452148,1
299998,499997,B,H,A,D,B,BI,A,AA,AX,...,0.808105,0.630859,0.346924,0.735352,0.563477,0.609863,0.680664,0.318359,0.335938,0


### X & Y

In [7]:
X = data.drop(columns = 'target')

In [8]:
X

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,0,A,I,A,B,B,BI,A,S,Q,...,0.855469,0.759277,0.795410,0.682129,0.621582,0.592285,0.791992,0.815430,0.964844,0.666016
1,1,A,I,A,A,E,BI,K,W,AD,...,0.328857,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652
2,2,A,K,A,A,E,BI,A,E,BM,...,0.322754,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.707520,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082
4,4,A,I,G,B,E,BI,C,G,Q,...,0.274414,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,A,N,F,A,E,BU,A,AS,K,...,0.500488,0.662598,0.671875,0.390625,0.145874,0.262695,0.514160,0.519531,0.617676,0.687988
299996,499995,A,K,A,A,G,BI,A,K,AE,...,0.790527,0.821777,0.620117,0.385010,0.735840,0.547852,0.726562,0.470459,0.275635,0.639160
299997,499996,A,G,M,A,H,BI,C,L,F,...,0.522461,0.406982,0.232422,0.832520,0.810547,0.597168,0.308838,0.374023,0.518066,0.452148
299998,499997,B,H,A,D,B,BI,A,AA,AX,...,0.812988,0.808105,0.630859,0.346924,0.735352,0.563477,0.609863,0.680664,0.318359,0.335938


In [9]:
y = data['target']
y

0         0
1         0
2         0
3         0
4         1
         ..
299995    0
299996    0
299997    1
299998    0
299999    0
Name: target, Length: 300000, dtype: int64

### Checking for duplicates

In [10]:
len(X) # Check number of rows before removing duplicates

300000

In [11]:
X = X.drop_duplicates() # Remove duplicates
len(X)# Check new number of rows

300000

### Null Values

In [12]:
X.isnull().sum().sort_values(ascending=False) #NaN count for each column

id        0
cat15     0
cont9     0
cont8     0
cont7     0
cont6     0
cont5     0
cont4     0
cont3     0
cont2     0
cont1     0
cont0     0
cat18     0
cat17     0
cat16     0
cat14     0
cat0      0
cat13     0
cat12     0
cat11     0
cat10     0
cat9      0
cat8      0
cat7      0
cat6      0
cat5      0
cat4      0
cat3      0
cat2      0
cat1      0
cont10    0
dtype: int64

### Checking Cat and Num values

In [13]:
num_cols = X._get_numeric_data().columns

In [14]:
len(num_cols)

12

In [15]:
num_cols

Index(['id', 'cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6',
       'cont7', 'cont8', 'cont9', 'cont10'],
      dtype='object')

In [16]:
num_cols = num_cols.drop('id')

In [17]:
num_cols

Index(['cont0', 'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
       'cont8', 'cont9', 'cont10'],
      dtype='object')

In [18]:
cat_cols = list(set(X) - set(num_cols))

In [19]:
cat_cols.remove('id')

In [20]:
cat_cols

['cat10',
 'cat9',
 'cat6',
 'cat7',
 'cat18',
 'cat17',
 'cat4',
 'cat2',
 'cat14',
 'cat5',
 'cat11',
 'cat8',
 'cat1',
 'cat13',
 'cat16',
 'cat0',
 'cat15',
 'cat3',
 'cat12']

### Removing  outliers for numerical values

In [21]:
data_num = data[num_cols]

In [22]:
data_num

Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,0.629883,0.855469,0.759277,0.795410,0.682129,0.621582,0.592285,0.791992,0.815430,0.964844,0.666016
1,0.370605,0.328857,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652
2,0.502441,0.322754,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316
3,0.934082,0.707520,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082
4,0.254395,0.274414,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857
...,...,...,...,...,...,...,...,...,...,...,...
299995,0.681641,0.500488,0.662598,0.671875,0.390625,0.145874,0.262695,0.514160,0.519531,0.617676,0.687988
299996,0.489258,0.790527,0.821777,0.620117,0.385010,0.735840,0.547852,0.726562,0.470459,0.275635,0.639160
299997,0.487793,0.522461,0.406982,0.232422,0.832520,0.810547,0.597168,0.308838,0.374023,0.518066,0.452148
299998,0.331787,0.812988,0.808105,0.630859,0.346924,0.735352,0.563477,0.609863,0.680664,0.318359,0.335938


In [23]:
data_num_outliers = data_num[(np.abs(stats.zscore(data_num)) < 3).all(axis=1)]
data_num_outliers

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  arrmean = umr_sum(arr, axis, dtype, keepdims=True)


Unnamed: 0,cont0,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,0.629883,0.855469,0.759277,0.795410,0.682129,0.621582,0.592285,0.791992,0.815430,0.964844,0.666016
1,0.370605,0.328857,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652
2,0.502441,0.322754,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316
3,0.934082,0.707520,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082
4,0.254395,0.274414,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857
...,...,...,...,...,...,...,...,...,...,...,...
299995,0.681641,0.500488,0.662598,0.671875,0.390625,0.145874,0.262695,0.514160,0.519531,0.617676,0.687988
299996,0.489258,0.790527,0.821777,0.620117,0.385010,0.735840,0.547852,0.726562,0.470459,0.275635,0.639160
299997,0.487793,0.522461,0.406982,0.232422,0.832520,0.810547,0.597168,0.308838,0.374023,0.518066,0.452148
299998,0.331787,0.812988,0.808105,0.630859,0.346924,0.735352,0.563477,0.609863,0.680664,0.318359,0.335938


### Merging New Dataframe without rows

In [24]:
data_cols = data[cat_cols]
data_cols

Unnamed: 0,cat10,cat9,cat6,cat7,cat18,cat17,cat4,cat2,cat14,cat5,cat11,cat8,cat1,cat13,cat16,cat0,cat15,cat3,cat12
0,LO,A,A,S,B,D,B,A,A,BI,A,Q,I,A,D,A,B,B,A
1,HJ,F,K,W,B,D,E,A,B,BI,A,AD,I,A,B,A,D,A,B
2,DJ,L,A,E,B,D,E,A,A,BI,A,BM,K,A,D,A,B,A,B
3,KV,F,A,Y,B,D,E,A,A,BI,A,AD,K,A,D,A,B,C,A
4,DP,A,C,G,B,D,E,G,B,BI,A,Q,I,A,B,A,B,B,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,HG,A,A,AS,B,D,E,F,B,BU,A,K,N,A,B,A,D,A,A
299996,HK,E,A,K,B,D,G,A,B,BI,A,AE,K,A,D,A,B,A,B
299997,HC,A,C,L,D,D,H,M,B,BI,B,F,G,A,B,A,D,A,A
299998,BF,A,A,AA,A,D,B,A,A,BI,A,AX,H,A,A,B,B,D,A


In [25]:
data_no_outliers = pd.merge(data_cols, data_num_outliers, left_index=True, right_index=True)

In [26]:
X_no_out = data_no_outliers
X_no_out

Unnamed: 0,cat10,cat9,cat6,cat7,cat18,cat17,cat4,cat2,cat14,cat5,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,LO,A,A,S,B,D,B,A,A,BI,...,0.855469,0.759277,0.795410,0.682129,0.621582,0.592285,0.791992,0.815430,0.964844,0.666016
1,HJ,F,K,W,B,D,E,A,B,BI,...,0.328857,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652
2,DJ,L,A,E,B,D,E,A,A,BI,...,0.322754,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316
3,KV,F,A,Y,B,D,E,A,A,BI,...,0.707520,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082
4,DP,A,C,G,B,D,E,G,B,BI,...,0.274414,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,HG,A,A,AS,B,D,E,F,B,BU,...,0.500488,0.662598,0.671875,0.390625,0.145874,0.262695,0.514160,0.519531,0.617676,0.687988
299996,HK,E,A,K,B,D,G,A,B,BI,...,0.790527,0.821777,0.620117,0.385010,0.735840,0.547852,0.726562,0.470459,0.275635,0.639160
299997,HC,A,C,L,D,D,H,M,B,BI,...,0.522461,0.406982,0.232422,0.832520,0.810547,0.597168,0.308838,0.374023,0.518066,0.452148
299998,BF,A,A,AA,A,D,B,A,A,BI,...,0.812988,0.808105,0.630859,0.346924,0.735352,0.563477,0.609863,0.680664,0.318359,0.335938


In [27]:
X_no_out.index

RangeIndex(start=0, stop=300000, step=1)

### Histogram Cat Values

In [28]:
"""plt.figure(figsize=(30,50))
for index, cat in enumerate(cat_cols):
    index += 1
    plt.subplot(10,2,index)
    x = data[cat]
    sns.histplot(x)"""

'plt.figure(figsize=(30,50))\nfor index, cat in enumerate(cat_cols):\n    index += 1\n    plt.subplot(10,2,index)\n    x = data[cat]\n    sns.histplot(x)'

## One Hot Encoding

In [29]:
X_no_out[cat_cols]

Unnamed: 0,cat10,cat9,cat6,cat7,cat18,cat17,cat4,cat2,cat14,cat5,cat11,cat8,cat1,cat13,cat16,cat0,cat15,cat3,cat12
0,LO,A,A,S,B,D,B,A,A,BI,A,Q,I,A,D,A,B,B,A
1,HJ,F,K,W,B,D,E,A,B,BI,A,AD,I,A,B,A,D,A,B
2,DJ,L,A,E,B,D,E,A,A,BI,A,BM,K,A,D,A,B,A,B
3,KV,F,A,Y,B,D,E,A,A,BI,A,AD,K,A,D,A,B,C,A
4,DP,A,C,G,B,D,E,G,B,BI,A,Q,I,A,B,A,B,B,A
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,HG,A,A,AS,B,D,E,F,B,BU,A,K,N,A,B,A,D,A,A
299996,HK,E,A,K,B,D,G,A,B,BI,A,AE,K,A,D,A,B,A,B
299997,HC,A,C,L,D,D,H,M,B,BI,B,F,G,A,B,A,D,A,A
299998,BF,A,A,AA,A,D,B,A,A,BI,A,AX,H,A,A,B,B,D,A


In [30]:
from sklearn.preprocessing import OneHotEncoder

In [31]:
ohe = OneHotEncoder()

In [32]:
new_X = ohe.fit_transform(X_no_out[cat_cols]).toarray()

In [33]:
df_cat_ohe = pd.DataFrame(new_X)

In [34]:
df_cat_ohe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,613,614,615,616,617,618,619,620,621,622
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
299996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
299997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
299998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [35]:
X_no_out.index

RangeIndex(start=0, stop=300000, step=1)

In [36]:
df_cat_ohe.set_index(X_no_out.index, inplace=True)

In [37]:
df_cat_ohe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,613,614,615,616,617,618,619,620,621,622
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
299996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
299997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
299998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [38]:
data_no_outliers

Unnamed: 0,cat10,cat9,cat6,cat7,cat18,cat17,cat4,cat2,cat14,cat5,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,LO,A,A,S,B,D,B,A,A,BI,...,0.855469,0.759277,0.795410,0.682129,0.621582,0.592285,0.791992,0.815430,0.964844,0.666016
1,HJ,F,K,W,B,D,E,A,B,BI,...,0.328857,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652
2,DJ,L,A,E,B,D,E,A,A,BI,...,0.322754,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316
3,KV,F,A,Y,B,D,E,A,A,BI,...,0.707520,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082
4,DP,A,C,G,B,D,E,G,B,BI,...,0.274414,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,HG,A,A,AS,B,D,E,F,B,BU,...,0.500488,0.662598,0.671875,0.390625,0.145874,0.262695,0.514160,0.519531,0.617676,0.687988
299996,HK,E,A,K,B,D,G,A,B,BI,...,0.790527,0.821777,0.620117,0.385010,0.735840,0.547852,0.726562,0.470459,0.275635,0.639160
299997,HC,A,C,L,D,D,H,M,B,BI,...,0.522461,0.406982,0.232422,0.832520,0.810547,0.597168,0.308838,0.374023,0.518066,0.452148
299998,BF,A,A,AA,A,D,B,A,A,BI,...,0.812988,0.808105,0.630859,0.346924,0.735352,0.563477,0.609863,0.680664,0.318359,0.335938


### Merging OHE with Num Features

In [39]:
new_data = df_cat_ohe.join(data_num_outliers)

In [40]:
new_data.shape

(300000, 634)

## Baseline model

### Adding Target to New Dataframe

In [41]:
X_no_out.index

RangeIndex(start=0, stop=300000, step=1)

In [42]:
new_data['target'] = y[y.index.isin(X_no_out.index)]

### X & Y with new DataFrame

In [43]:
X = new_data.drop(columns='target')

In [44]:
X

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,cont1,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.855469,0.759277,0.795410,0.682129,0.621582,0.592285,0.791992,0.815430,0.964844,0.666016
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.328857,0.386475,0.541504,0.388916,0.357666,0.600098,0.408691,0.399414,0.927246,0.493652
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.322754,0.343262,0.616211,0.793457,0.552734,0.352051,0.388916,0.412354,0.292725,0.549316
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.707520,0.831055,0.807617,0.799805,0.619141,0.221802,0.897461,0.633789,0.760254,0.934082
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.274414,0.338867,0.277344,0.610352,0.128296,0.578613,0.279053,0.351074,0.357178,0.328857
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.500488,0.662598,0.671875,0.390625,0.145874,0.262695,0.514160,0.519531,0.617676,0.687988
299996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.790527,0.821777,0.620117,0.385010,0.735840,0.547852,0.726562,0.470459,0.275635,0.639160
299997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.522461,0.406982,0.232422,0.832520,0.810547,0.597168,0.308838,0.374023,0.518066,0.452148
299998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.812988,0.808105,0.630859,0.346924,0.735352,0.563477,0.609863,0.680664,0.318359,0.335938


In [45]:
y = new_data['target']

In [46]:
y.shape

(300000,)

### Baseline Model

In [47]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression # explicit class import from module

In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [49]:
model = LogisticRegression()

In [50]:
X.shape

(300000, 634)

In [56]:
cv_results = cross_validate(model, X, y, cv=2)

Traceback (most recent call last):
  File "/home/sisto/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 593, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/sisto/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1344, in fit
    X, y = self._validate_data(X, y, accept_sparse='csr', dtype=_dtype,
  File "/home/sisto/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/base.py", line 433, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "/home/sisto/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/utils/validation.py", line 63, in inner_f
    return f(*args, **kwargs)
  File "/home/sisto/.pyenv/versions/3.8.6/envs/lewagon/lib/python3.8/site-packages/sklearn/utils/validation.py", line 814, in check_X_y
    X = check_array(X, accept_sparse=accept_sparse,
  File "/home/sisto/.pyenv/vers

In [52]:
# Scores
cv_results['test_score']

array([nan, nan, nan, nan, nan])

In [53]:
# Mean of scores
cv_results['test_score'].mean()

nan