### Word Embeddings for Categorical Features

This snippet of bash script will link your kaggle api key to the library and download the data

In [None]:
!echo "Installing dependencies"
!pip3 install --upgrade --force-reinstall --no-deps kaggle
# This snippet will install kaggle api and connect your api-key to it
!mkdir -p ~/.kaggle
!echo "Setting up your Kaggle key to API..."
!cp '/content/drive/MyDrive/Kaggle/kaggle.json' ~/.kaggle/
!cat ~/.kaggle/kaggle.json
!chmod 600 ~/.kaggle/kaggle.json
!echo "Kaggle API Key successfully linked !!!"
!kaggle competitions download -c cat-in-the-dat-ii
!cd ~/.
!unzip cat-in-the-dat-ii.zip

Installing dependencies
Processing /root/.cache/pip/wheels/a1/6a/26/d30b7499ff85a4a4593377a87ecf55f7d08af42f0de9b60303/kaggle-1.5.12-cp37-none-any.whl
Installing collected packages: kaggle
  Found existing installation: kaggle 1.5.12
    Uninstalling kaggle-1.5.12:
      Successfully uninstalled kaggle-1.5.12
Successfully installed kaggle-1.5.12
Setting up your Kaggle key to API...
{"username":"atharvaingle","key":"2d3cebc4a4637012b24ba6d11e73f5ce"}Kaggle API Key successfully linked !!!
cat-in-the-dat-ii.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  cat-in-the-dat-ii.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


#### Let' start

In [None]:
import numpy as np
import pandas as pd
import os
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model

In [None]:
train = pd.read_csv("/content/train.csv")
test = pd.read_csv("/content/test.csv")
sample = pd.read_csv("/content/sample_submission.csv")

In [None]:
train.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


In [None]:
test.loc[:, 'target'] = -1

In [None]:
data = pd.concat([train, test]).reset_index(drop=True)

In [None]:
train.shape, test.shape, data.shape

((600000, 25), (400000, 25), (1000000, 25))

In [None]:
features = [f for f in train.columns if f not in ["id", "target"]]

In [None]:
features

['bin_0',
 'bin_1',
 'bin_2',
 'bin_3',
 'bin_4',
 'nom_0',
 'nom_1',
 'nom_2',
 'nom_3',
 'nom_4',
 'nom_5',
 'nom_6',
 'nom_7',
 'nom_8',
 'nom_9',
 'ord_0',
 'ord_1',
 'ord_2',
 'ord_3',
 'ord_4',
 'ord_5',
 'day',
 'month']

In [None]:
for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    data.loc[:, feat] = lbl_enc.fit_transform(data[feat].astype(str).fillna("-1").values)

In [None]:
data.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,0,0,2,4,3,5,0,1059,1013,87,1,27,2,0,3,2,20,56,5,5,0
1,1,1,1,0,0,1,2,3,0,6,3,210,359,27,69,2112,2,2,5,4,23,151,6,9,0
2,2,0,1,0,0,0,2,6,3,0,0,860,694,90,102,2218,2,5,2,13,15,105,4,11,0
3,3,2,0,0,0,0,2,0,3,3,3,477,241,51,170,2167,0,4,4,0,2,140,2,5,0
4,4,0,2,0,1,0,2,5,3,2,4,556,361,182,222,1747,2,2,1,7,2,50,4,3,0


In [None]:
train = data[data.target != -1].reset_index(drop=True)
test = data[data.target == -1].reset_index(drop=True)

In [None]:
train.shape, test.shape

((600000, 25), (400000, 25))

In [33]:
def get_model(df, categorical_columns):
    inputs = []
    outputs = []
    for c in categorical_columns:
        num_unique_vals = int(df[c].nunique())
        embed_dim = int(min(np.ceil(num_unique_vals / 2), 50))
        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_vals + 1, embed_dim, name=c)(inp)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)
    x = layers.Concatenate()(outputs)
    x = layers.Dense(300, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    y = layers.Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inputs, outputs=y)

    return model

In [36]:
model = get_model(train, features)

In [40]:
model.compile(loss='binary_crossentropy', optimizer='adam')
model.fit([train.loc[:, f].values for f in features], train.target.values)



<tensorflow.python.keras.callbacks.History at 0x7fcaf7daa910>