In [3]:
import category_encoders as ce
import pandas as pd

# Encoding of Ordinal Categorical Variable

In [8]:
df=pd.DataFrame({'height':['tall','medium','short','tall','medium','short','tall','medium','short',]})

# create object of Ordinal encoding
encoder= ce.OrdinalEncoder(cols=['height'],return_df=True,
                           mapping=[{'col':'height',
'mapping':{'None':0,'tall':1,'medium':2,'short':3}}])
df['transformed'] = encoder.fit_transform(df)
df

Unnamed: 0,height,transformed
0,tall,1
1,medium,2
2,short,3
3,tall,1
4,medium,2
5,short,3
6,tall,1
7,medium,2
8,short,3


# One hot encoding

In [9]:
df=pd.DataFrame({'name':[
'rahul','ashok','ankit','aditya','yash','vipin','amit'
]})
print(df)

one_hot_encoder=ce.OneHotEncoder(cols='name',handle_unknown='return_nan',return_df=True,use_cat_names=True)
df_encoded = one_hot_encoder.fit_transform(df)
print(df_encoded)


     name
0   rahul
1   ashok
2   ankit
3  aditya
4    yash
5   vipin
6    amit
   name_rahul  name_ashok  name_ankit  name_aditya  name_yash  name_vipin  \
0         1.0         0.0         0.0          0.0        0.0         0.0   
1         0.0         1.0         0.0          0.0        0.0         0.0   
2         0.0         0.0         1.0          0.0        0.0         0.0   
3         0.0         0.0         0.0          1.0        0.0         0.0   
4         0.0         0.0         0.0          0.0        1.0         0.0   
5         0.0         0.0         0.0          0.0        0.0         1.0   
6         0.0         0.0         0.0          0.0        0.0         0.0   

   name_amit  
0        0.0  
1        0.0  
2        0.0  
3        0.0  
4        0.0  
5        0.0  
6        1.0  


# Effect encoding

In [10]:
data=pd.DataFrame({'City':['Delhi','Mumbai','Hyderabad','Chennai','Bangalore','Delhi','Hyderabad']}) 
print(data)
effect_encoder=ce.sum_coding.SumEncoder(cols='City',verbose=False)
df=effect_encoder.fit_transform(data)
print(df)

        City
0      Delhi
1     Mumbai
2  Hyderabad
3    Chennai
4  Bangalore
5      Delhi
6  Hyderabad
   intercept  City_0  City_1  City_2  City_3
0          1     1.0     0.0     0.0     0.0
1          1     0.0     1.0     0.0     0.0
2          1     0.0     0.0     1.0     0.0
3          1     0.0     0.0     0.0     1.0
4          1    -1.0    -1.0    -1.0    -1.0
5          1     1.0     0.0     0.0     0.0
6          1     0.0     0.0     1.0     0.0




# Hash Encoder

In [11]:
data=pd.DataFrame({'Month':['January','April','March','April','February','June','July','June','September']})
print(data)
#Create object for hash encoder
encoder=ce.HashingEncoder(cols='Month',n_components=6)#Fit and Transform Data
df = encoder.fit_transform(data)
print(df)

       Month
0    January
1      April
2      March
3      April
4   February
5       June
6       July
7       June
8  September
   col_0  col_1  col_2  col_3  col_4  col_5
0      0      0      0      0      1      0
1      0      0      0      1      0      0
2      0      0      0      0      1      0
3      0      0      0      1      0      0
4      0      0      0      0      1      0
5      0      1      0      0      0      0
6      1      0      0      0      0      0
7      0      1      0      0      0      0
8      0      0      0      0      1      0


# Dummy Encoding

 Dummy encoding is similar to one-hot encoding, but unlike that encoding, it allows for only k-1 degrees of freedom. To put it simply, it eliminates the one extra column that we had in one-hot encoding. The category that is left out is called the reference category.

In [17]:
df=pd.DataFrame({'name':['rahul','ashok','ankit']})
print(df)
df1 = pd.get_dummies(df, drop_first= True)
print(df1)

    name
0  rahul
1  ashok
2  ankit
   name_ashok  name_rahul
0           0           1
1           1           0
2           0           0


In [18]:
df=pd.DataFrame({'name':['rahul','ashok','ankit']})
print(df)
df1 = pd.get_dummies(df)
print(df1)

    name
0  rahul
1  ashok
2  ankit
   name_ankit  name_ashok  name_rahul
0           0           0           1
1           0           1           0
2           1           0           0


# handle categorical variables / handle multocollinearity / The dummy variable trap

In [20]:
df=pd.DataFrame({'name':['rahul','ashok','ankit']})
print(df)
df1 = pd.get_dummies(df)
print(df1)
## the three columns are correlated with each other


    name
0  rahul
1  ashok
2  ankit
   name_ankit  name_ashok  name_rahul
0           0           0           1
1           0           1           0
2           1           0           0


In [22]:
# solution
df=pd.DataFrame({'name':['rahul','ashok']})
# drop_first is set to True , k-1 degrees of freedom
df1 = pd.get_dummies(df, drop_first= True)
print(df1)


   name_rahul
0           1
1           0


# Mismatched column between train and test set

In [24]:
df = pd.DataFrame({
   'Gender' : ['Female', 'Male', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male','Male', 'Female','Male', 'Female'],
   'Age' : [41, 49, 37, 33, 27, 32, 59, 30, 38, 36, 35, 29],
   'EducationField': ['Life Sciences', 'Engineering', 'Life Sciences', 'Life Sciences', 'Medical', 'Life Sciences', 'Life Sciences', 'Life Sciences', 'Engineering', 'Medical', 'Life Sciences', 'Life Sciences'],
   'MonthlyIncome': [5993, 5130, 2090, 2909, 3468, 3068, 2670, 2693, 9526, 5237, 2426, 4193]})
df

Unnamed: 0,Gender,Age,EducationField,MonthlyIncome
0,Female,41,Life Sciences,5993
1,Male,49,Engineering,5130
2,Male,37,Life Sciences,2090
3,Male,33,Life Sciences,2909
4,Male,27,Medical,3468
5,Female,32,Life Sciences,3068
6,Male,59,Life Sciences,2670
7,Male,30,Life Sciences,2693
8,Male,38,Engineering,9526
9,Female,36,Medical,5237


In [26]:
from sklearn.model_selection import train_test_split
X = df.drop('MonthlyIncome', axis=1)
y = df['MonthlyIncome']
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)
print(pd.get_dummies(X_train))
print(pd.get_dummies(X_test))

#There is a column mismatch in the training and test set. This means the number of 
# columns in the training set is not equal to the ones in the test set, and this will introduce an error in the modeling process.

    Age  Gender_Female  Gender_Male  EducationField_Engineering  \
10   35              0            1                           0   
1    49              0            1                           1   
6    59              0            1                           0   
0    41              1            0                           0   
7    30              0            1                           0   
11   29              1            0                           0   
9    36              1            0                           0   
8    38              0            1                           1   
5    32              1            0                           0   

    EducationField_Life Sciences  EducationField_Medical  
10                             1                       0  
1                              0                       0  
6                              1                       0  
0                              1                       0  
7                              1  

In [27]:
# Dummy encoding Training set
X_train_encoded = pd.get_dummies(X_train)
# Saving the columns in a list
cols = X_train_encoded.columns.tolist()
# Viewing the first three rows of the encoded dataframe
X_train_encoded[:3]

Unnamed: 0,Age,Gender_Female,Gender_Male,EducationField_Engineering,EducationField_Life Sciences,EducationField_Medical
10,35,0,1,0,1,0
1,49,0,1,1,0,0
6,59,0,1,0,1,0


In [28]:
X_test_encoded = pd.get_dummies(X_test)
X_test_encoded = X_test_encoded.reindex(columns=cols).fillna(0)
X_test_encoded

Unnamed: 0,Age,Gender_Female,Gender_Male,EducationField_Engineering,EducationField_Life Sciences,EducationField_Medical
2,37,0.0,1,0.0,1,0
3,33,0.0,1,0.0,1,0
4,27,0.0,1,0.0,0,1


One hot encoding also solves the problem


In [31]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(sparse=False, handle_unknown='ignore', drop= 'if_binary')
train_enc = ohe.fit_transform(X_train[['Gender','EducationField']])

pd.DataFrame(train_enc, columns=ohe.get_feature_names())[:3]



Unnamed: 0,x0_Male,x1_Engineering,x1_Life Sciences,x1_Medical
0,1.0,0.0,1.0,0.0
1,1.0,1.0,0.0,0.0
2,1.0,0.0,1.0,0.0


In [32]:
# Transforming the test set
test_enc = ohe.fit_transform(X_test[['Gender','EducationField']])
#Converting back to a dataframe
pd.DataFrame(test_enc,columns=ohe.get_feature_names())



Unnamed: 0,x0_Male,x1_Medical
0,1.0,0.0
1,1.0,0.0
2,1.0,1.0


# Feature hashing (hashing trick)

Feature hashing uses a hash function (a function that maps data of arbitrary size to a fixed size) to transform each category into an integer. These integers will be contained within a prescribed range. Feature hashing is very similar to one-hot encoding in the sense that each category gets an integer but, unlike the one-hot encoding, it gives control over the output dimensions. Therefore, on a high cardinality data set, the user can limit the number of new features that are added due to the encoding. This improves computational efficiency. The price that has to be paid is that, unlike the one-hot encoding, the distance between any pair of categories is not guaranteed to be preserved. In an extreme case, different categories may be mapped into the same integer (a problem known as hash collision). 

One-hot Encoding

In the simplest of all cases, we have low and fixed cardinality. In such a case, we can have a model feature for each possible value that the variable can take on and set all of these features to zero except for the one corresponding to the value of our categorical feature. This works great and is known as one-hot encoding. This might lead to encoding days of the week as Monday =  (1,0,0,0,0,0,0)
 , Tuesday =  (0,1,0,0,0,0,0)
  and so on.

Rare-word Collapse

As the cardinality increases, however, this works less and less well, largely because some values will be much more rare than other values and increasing the number of features to a model beyond a few thousand generally has very bad effects on the ability to build a model. Even worse, high cardinality generally goes hand in hand with indefinite cardinality. Even so, it is common in natural language models to simply group all but the  ùëò
  most common values of a categorical variable as a single ‚ÄúRARE-WORD‚Äù value. This reduction allows us to have a  ùëò+1
 -hot encoding. If  ùëò
  is big enough, this will work pretty well because the ‚ÄúRARE-WORD‚Äù value will itself be pretty rare.

Frequency Binning

We can take this idea of collapsing to a radical and surprisingly effective extreme. This is done by reducing a high cardinality categorical feature to a single number that represents the frequency of the value of the feature. Alternately, you might use the quantile of the rank of the frequency, or bin the frequency of the value. In any case, this works in applications where a specific value isn‚Äôt as important as the fact that you have seen a surprisingly rare value. Consider, network intrusion detection where suddenly seeing lots of data going to a previously almost unknown external network address could be very informative. It doesn‚Äôt really matter which previously unknown address is being used, just that it is previously unknown or nearly so. Note that you can combine this kind of frequency feature with other features as well so that you not only get these desirable novelty effects, but you can keep the precise resolution about exactly which categorical value was seen.

Random Embedding

Another way to keep a fixed sized encoding with values of large or unknown cardinality without collapsing rare values together is to use a random embedding or projection. One simple way to do this is convert each possible value to a 50‚Äì300 dimensional vector. Commonly, these vectors will be constrained to have unit length You can actually do this in a consistent way without knowing the categorical values ahead of time by using the actual value as a seed for a random number generator and then using that generator to sample a ‚Äúrandom‚Äù unit vector. If the dimension of the vector is high enough (say 100 to 500 dimensions or more) then the vectors corresponding to any two categorical values will be nearly orthogonal with high probability. This quasi-orthogonality of random vectors is very handy since it makes each different value be sufficiently different from all other values so that machine learning algorithms can pick out important structure.

These random vectors can also be tuned somewhat using simple techniques to build a semantic space, or using more advanced techniques to get some very fancy results. Such random projections can be used to do linear algebraic decompositions as well.

The Hash Trick

We can use different random projections to get something much more like the one-hot encoding as well without having to collapse rare features or, indeed, without having to even know which features are rare. For each distinct value, we can encode that value using a  ùëõ
  binary values of which exactly  ùëò
  randomly chosen values are set to 1 with the rest set to 0 using the same seeding trick as before. Commonly  ùëõ
  is taken to be a few thousand while  ùëò
  can be relatively small, typically less than 20. When  ùëò=1
 , we get one-hot encoding again. This technique works because of the same mathematical techniques as random projection, but is generally described more in terms of analogies to Bloom filters.

Luduan Features

Finally, you can derive a numerical features by grouping values that have anomalous correlation with some objective observation and then weighting by the underlying frequency of the feature value (or the inverse log of that frequency). This reduction is known as a Luduan feature and is based on the use of log-likelihood ratio tests for finding interesting cooccurrence. I gave a talk on using these techniques for transaction mining some time ago that described how to do this.