In [1]:
import pandas as pd
from sklearn import preprocessing

# Load the labelled training split and the unlabelled test split.
train, test = (
    pd.read_csv("../input/train.csv"),
    pd.read_csv("../input/test.csv"),
)

In [3]:
# Show (rows, columns) for the train frame, then the test frame.
for frame in (train, test):
    print(frame.shape)

(600000, 25)
(400000, 24)


In [4]:
# The test split has no "target" column, so add a placeholder one.
# -1 acts as a marker that lets us tell test rows apart from train rows
# once both frames are stacked together.
test["target"] = -1

In [5]:
# Stack train on top of test and renumber the rows 0..n-1 so the
# combined frame has a single, clean index.
data = pd.concat([train, test], ignore_index=True)


In [6]:
# make a list of features we are interested in
# id and target is something we should not encode
features = [col for col in train.columns if col not in ("id", "target")]

# NOTE: this cell overwrites the feature columns of `data`, so it is not
# idempotent. running it a second time would encode the integer codes again
# (as strings), which can renumber them. rebuild `data` before re-running.
for feat in features:
    # create a new instance of LabelEncoder for each feature
    # so every column gets its own label -> code mapping
    lbl_enc = preprocessing.LabelEncoder()
    # note the trick here
    # since its categorical data, missing values become their own
    # category, the string "NONE"
    # assign the result back instead of calling fillna(..., inplace=True)
    # on data[feat]: that is chained assignment, which pandas warns about
    # and which does not update `data` when copy-on-write is enabled
    data[feat] = data[feat].fillna("NONE")
    print(data[feat].value_counts())
    # convert all the data to string type
    # so, no matter its int or float, its treated as a category
    temp_col = data[feat].astype(str).values
    # we can use fit_transform here as we do not
    # have any extra test data that we need to
    # transform on separately
    # plain column assignment replaces the column, so it takes the dtype
    # of the integer codes; data.loc[:, feat] = ... writes into the
    # existing column and can leave the codes stored as objects
    data[feat] = lbl_enc.fit_transform(temp_col)



0.0     880154
1.0      90051
NONE     29795
Name: bin_0, dtype: int64
0.0     789371
1.0     180588
NONE     30041
Name: bin_1, dtype: int64
0.0     699843
1.0     270255
NONE     29902
Name: bin_2, dtype: int64
F       610304
T       359731
NONE     29965
Name: bin_3, dtype: int64
N       520771
Y       449231
NONE     29998
Name: bin_4, dtype: int64
Red      539738
Blue     342453
Green     87495
NONE      30314
Name: nom_0, dtype: int64
Triangle     273832
Polygon      253952
Trapezoid    199463
Circle       175071
Square        43901
NONE          30103
Star          23678
Name: nom_1, dtype: int64
Hamster    274196
Axolotl    254155
Lion       199206
Dog        174752
Cat         43917
NONE        30214
Snake       23560
Name: nom_2, dtype: int64
India         274546
Costa Rica    253274
Russia        199933
Finland       174188
Canada         44044
NONE           30297
China          23718
Name: nom_3, dtype: int64
Theremin    514158
Bassoon     328104
Oboe         83328
Piano  

In [8]:
# Re-check the category frequencies now that every feature has been
# label encoded; the counts should match the ones printed before encoding,
# only the labels are now integer codes.
for feat in features:
    encoded_counts = data[feat].value_counts()
    print(encoded_counts)

0    880154
1     90051
2     29795
Name: bin_0, dtype: int64
0    789371
1    180588
2     30041
Name: bin_1, dtype: int64
0    699843
1    270255
2     29902
Name: bin_2, dtype: int64
0    610304
2    359731
1     29965
Name: bin_3, dtype: int64
0    520771
2    449231
1     29998
Name: bin_4, dtype: int64
3    539738
0    342453
1     87495
2     30314
Name: nom_0, dtype: int64
6    273832
2    253952
5    199463
0    175071
3     43901
1     30103
4     23678
Name: nom_1, dtype: int64
3    274196
0    254155
4    199206
2    174752
1     43917
5     30214
6     23560
Name: nom_2, dtype: int64
4    274546
2    253274
6    199933
3    174188
0     44044
5     30297
1     23718
Name: nom_3, dtype: int64
4    514158
0    328104
2     83328
3     44382
1     30028
Name: nom_4, dtype: int64
757     29690
257      1633
1083     1631
846      1597
1209     1592
        ...  
227         8
546         6
1024        5
12          4
853         1
Name: nom_5, Length: 1221, dtype: int64
982   

In [9]:
# Sanity check on the combined frame: the row count should equal
# train rows + test rows, and encoding must not have changed the column count.
data.shape

(1000000, 25)

In [10]:
# Undo the concatenation: rows carrying the -1 placeholder target came
# from the test file, everything else is training data. Each part gets
# a fresh 0-based index.
is_test_row = data["target"] == -1
train = data.loc[~is_test_row].reset_index(drop=True)
test = data.loc[is_test_row].reset_index(drop=True)

In [11]:
# Preview the first rows of the re-split test frame; its target column
# should still hold the -1 placeholder.
test.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,600000,0,0,0,0,2,0,2,0,3,...,2197,2,5,0,6,21,147,2,11,-1
1,600001,0,0,0,0,2,3,0,4,6,...,1107,0,5,1,14,13,46,1,10,-1
2,600002,0,0,0,0,2,0,0,0,6,...,812,0,1,6,9,13,12,1,8,-1
3,600003,1,0,0,0,0,3,2,0,2,...,996,0,1,3,13,1,0,0,8,-1
4,600004,0,0,1,0,2,3,0,5,3,...,371,0,0,4,15,9,14,2,5,-1
