In [1]:
import pandas as pd
import numpy as np

In [3]:
# Load only the six categorical feature columns used in this walkthrough
categorical_columns = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6']
data = pd.read_csv('train.csv', usecols=categorical_columns)
data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6
0,v,at,a,d,u,j
1,t,av,e,d,y,l
2,w,n,c,d,x,j
3,t,n,f,d,x,l
4,v,n,f,d,h,d


In [4]:
# Count the distinct labels in every column (a missing value, if present, counts as a label)

for col in data.columns:
    n_labels = data[col].nunique(dropna=False)
    print(f"{col} : {n_labels} labels")

X1 : 27 labels
X2 : 44 labels
X3 : 7 labels
X4 : 4 labels
X5 : 29 labels
X6 : 12 labels


In [5]:
# Shape of the frame when every variable is fully one-hot encoded
# (drop_first=True leaves out one label per variable)
all_dummies = pd.get_dummies(data, drop_first=True)
all_dummies.shape

(4209, 117)

We can see that from just 6 initial categorical variables, we end up with 117 new variables.<br>
What we can do instead is limit one-hot encoding to the 10 most frequent labels of each variable. This means that we make one binary variable for each of the 10 most frequent labels only. This is equivalent to grouping all the other labels under a single new category, which in this case is dropped.

In [6]:
# Twenty most frequent X2 labels, highest count first
x2_counts = data['X2'].value_counts()
x2_counts.sort_values(ascending=False).head(20)

as    1659
ae     496
ai     415
m      367
ak     265
r      153
n      137
s       94
f       87
e       81
aq      63
ay      54
a       47
t       29
k       25
i       25
b       21
ao      20
z       19
ag      19
Name: X2, dtype: int64

In [7]:
# Collect the ten most common X2 labels, ordered by descending frequency

x2_frequencies = data['X2'].value_counts().sort_values(ascending=False)
top_10 = list(x2_frequencies.head(10).index)
top_10

['as', 'ae', 'ai', 'm', 'ak', 'r', 'n', 's', 'f', 'e']

In [9]:
# Add one 0/1 indicator column per frequent label; a row whose X2 value is
# not in top_10 gets 0 in every one of them
for label in top_10:
    is_label = data['X2'] == label
    data[label] = np.where(is_label, 1, 0)

preview_columns = ['X2'] + top_10
data[preview_columns].head()


Unnamed: 0,X2,as,ae,ai,m,ak,r,n,s,f,e
0,at,0,0,0,0,0,0,0,0,0,0
1,av,0,0,0,0,0,0,0,0,0,0
2,n,0,0,0,0,0,0,1,0,0,0
3,n,0,0,0,0,0,0,1,0,0,0
4,n,0,0,0,0,0,0,1,0,0,0


In [10]:
# get whole set of dummy variables, for all the categorical variables

def one_hot_top_x(df, variable, top_x_labels):
    """
    Add one 0/1 indicator column to ``df`` for each label in ``top_x_labels``.

    For every label a column named ``<variable>_<label>`` is written into
    ``df`` in place. It holds 1 on rows where ``df[variable]`` equals the
    label and 0 elsewhere, so a row whose value is not among
    ``top_x_labels`` gets 0 in all of the new columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place; must contain the column ``variable``.
    variable : column label
        Name of the column whose values are compared against the labels.
    top_x_labels : iterable
        Labels to encode. They may be of any type that can be formatted
        into a column name (strings, integers, ...).

    Returns
    -------
    None
        The new columns are added to ``df`` directly.
    """
    for label in top_x_labels:
        # Build the name with an f-string rather than "+": concatenation
        # raises TypeError as soon as a label (or the column name) is not a str.
        df[f"{variable}_{label}"] = np.where(df[variable] == label, 1, 0)

In [11]:
# Reload the raw columns so the encoding starts from an unmodified frame
data = pd.read_csv('train.csv', usecols=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'])

# Capture the original column labels before the loop starts appending dummies
columns = data.columns

for cols in columns:
    label_counts = data[cols].value_counts().sort_values(ascending=False)
    top_10 = list(label_counts.head(10).index)
    one_hot_top_x(data, cols, top_10)

data.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X1_aa,X1_s,X1_b,X1_l,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,v,at,a,d,u,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,t,av,e,d,y,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,w,n,c,d,x,j,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,t,n,f,d,x,l,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,v,n,f,d,h,d,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [13]:
# Remove the six original categorical columns from the frame (in place)
data.drop(columns=['X1', 'X2', 'X3', 'X4', 'X5', 'X6'], inplace=True)

In [14]:
# Preview the first five rows of the frame
data.head(n=5)

Unnamed: 0,X1_aa,X1_s,X1_b,X1_l,X1_v,X1_r,X1_i,X1_a,X1_c,X1_o,...,X6_g,X6_j,X6_d,X6_i,X6_l,X6_a,X6_h,X6_k,X6_c,X6_b
0,0,0,0,0,1,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


### Advantages
1. It is easier to understand and interpret the results.
2. Straightforward to implement.
3. Does not require hours of variable exploration.
4. Does not massively expand the number of variables.
### Disadvantages
1. Does not add any new information that may make the variable more predictive.
2. Does not keep the information of the ignored labels.