In [6]:
import pandas as pd
import numpy as np

In [7]:
# Load only the six columns X1..X6 from the training file and preview the first rows.
data = pd.read_csv(
    'train.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)
data.head(5)

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [8]:
# Report the number of distinct labels found in each loaded column.

for col, values in data.items():
    n_labels = values.unique().size
    print(col, ':', n_labels, 'labels')
    

X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [None]:
# Preview the (rows, columns) shape that plain one-hot encoding of every
# column would produce, dropping the first level of each variable.

pd.get_dummies(data=data, drop_first=True).shape

### Since every single column contains many categories, plain one hot encoding is not appropriate here, so the technique we will be using is ONE HOT ENCODING WITH MULTIPLE CATEGORIES

In [11]:
# Show the 20 most frequent X2 labels with their counts, largest first.
(
    data['X2']
    .value_counts()
    .sort_values(ascending=False)
    .head(20)
)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
i       25
k       25
b       21
ao      20
ag      19
z       19
Name: X2, dtype: int64

In [16]:
# Collect the 10 most frequent X2 labels into a plain Python list.

x2_frequencies = data['X2'].value_counts().sort_values(ascending=False)
top_10 = list(x2_frequencies.head(10).index)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [18]:
# Add one 0/1 indicator column to `data` per label in top_10,
# each column named after the label it flags.

for category in top_10:
    is_match = data['X2'] == category
    data[category] = np.where(is_match, 1, 0)

data[['X2', *top_10]].head(40)

Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0
5,e,0,0,0,0,0,0,0,0,0,1
6,e,0,0,0,0,0,0,0,0,0,1
7,as,1,0,0,0,0,0,0,0,0,0
8,as,1,0,0,0,0,0,0,0,0,0
9,aq,0,0,0,0,0,0,0,0,0,0


In [19]:
# get whole set of dummy variables, for all the categorical variables

def one_hot_top_x(df, variable, top_x_labels):
    """Add 0/1 indicator columns to ``df`` for the given labels of one column.

    For every label in ``top_x_labels`` a new column named
    ``<variable>_<label>`` is written into ``df``; it holds 1 on the rows
    where ``df[variable]`` equals that label and 0 everywhere else.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.
    variable : str
        Name of the column in ``df`` whose labels are encoded.
    top_x_labels : iterable
        Labels to create indicator columns for.

    Returns
    -------
    None
    """
    # Read the column from the frame that was passed in, not from a
    # notebook-level global, so the function works on any DataFrame.
    source = df[variable]

    for label in top_x_labels:
        # str() keeps the column name buildable when a label is not a string.
        df[variable + '_' + str(label)] = np.where(source == label, 1, 0)
        
# Reload the raw columns, which discards the label-named indicator columns
# that were added to `data` earlier.

data = pd.read_csv(
    'train.csv',
    usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'],
)

# Encode X2 using the labels currently held in top_10.

one_hot_top_x(df=data, variable='X2', top_x_labels=top_10)
data.head(5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,X2_ak,X2_r,X2_n,X2_s,X2_f,X2_e
0,v,at,a,d,u,j,0,0,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,0,0,1,0,0,0
3,t,n,f,d,x,l,0,0,0,0,0,0,1,0,0,0
4,v,n,f,d,h,d,0,0,0,0,0,0,1,0,0,0


In [20]:
# Similarly for X1: count how often each label occurs, largest first.
# The counts are computed once and reused for both the preview and the list.

x1_counts = data.X1.value_counts().sort_values(ascending = False)

# Only the last expression of a cell is displayed automatically, so the
# frequency table has to be printed explicitly to be visible.
print(x1_counts.head(20))

# now make a list of the top 10 categories that appear

top_10 = [x for x in x1_counts.head(10).index]
top_10

['aa', 's', 'b', 'l', 'v', 'r', 'i', 'a', 'c', 'o']

In [22]:
# Preview 10 binary indicator columns for X1, one per label in top_10.
# They are built on a separate frame so that `data` does not pick up
# unprefixed columns ('aa', 's', ...) that would sit next to, and duplicate,
# the X1_* columns created by one_hot_top_x.

x1_preview = data[['X1']].copy()
for label in top_10:
    x1_preview[label] = np.where(x1_preview['X1']==label, 1, 0)

x1_preview.head(4)

Unnamed: 0,X1,aa,s,b,l,v,r,i,a,c,o
0,v,0,0,0,0,1,0,0,0,0,0
1,t,0,0,0,0,0,0,0,0,0,0
2,w,0,0,0,0,0,0,0,0,0,0
3,t,0,0,0,0,0,0,0,0,0,0


In [24]:
# Recompute the 10 most frequent X1 labels.
x1_frequencies = data['X1'].value_counts().sort_values(ascending=False)
top_10 = list(x1_frequencies.head(10).index)

# Add the X1_<label> indicator columns to `data` and preview the result.
one_hot_top_x(df=data, variable='X1', top_x_labels=top_10)
data.head(5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X2_as,X2_ae,X2_ai,X2_m,...,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o
0,v,at,a,d,u,j,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0


#### The same steps are repeated for other variables also