In [1]:
# NIRMALYA_THAKURTA_2052_DM_LAB_7
# Categorical Encoding Assignment 1 - One-hot encoding


In [2]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split

# for one hot encoding with sklearn
from sklearn.preprocessing import OneHotEncoder

# for one hot encoding with feature-engine
from feature_engine.encoding import OneHotEncoder as fe_OneHotEncoder

In [3]:

# Read only the three categorical variables and the `survived` column
# from the Titanic CSV.
columns_to_load = ['sex', 'embarked', 'cabin', 'survived']
data = pd.read_csv('titanic.csv', usecols=columns_to_load)

# Preview the first rows to confirm the load.
data.head()

Unnamed: 0,survived,sex,cabin,embarked
0,1,female,B5,S
1,1,male,C22,S
2,0,female,C22,S
3,0,male,C22,S
4,0,female,C22,S


In [4]:
# Reduce each cabin value to its first character (e.g. 'C22' -> 'C').
# Missing cabins are left as NaN.
data['cabin'] = data['cabin'].str.slice(stop=1)

data.head()


Unnamed: 0,survived,sex,cabin,embarked
0,1,female,B,S
1,1,male,C,S
2,0,female,C,S
3,0,male,C,S
4,0,female,C,S


In [7]:
# Separate into training and testing sets: target = survived, test size = 30%.
# The target is passed alongside the predictors so its rows are split with the
# same shuffle and stay aligned with `train` / `test`. The shuffle depends only
# on the number of rows and `random_state`, so `train` and `test` contain the
# same rows as a predictors-only split would.
train, test, y_train, y_test = train_test_split(
    data[["sex", "cabin", "embarked"]],  # predictors
    data["survived"],                    # target
    test_size=0.3,
    random_state=0,                      # fixed seed for a reproducible split
)
print(train.shape, test.shape)

(916, 3) (393, 3)


In [8]:
# List the distinct labels of `sex` in the training set
# (their count is the variable's cardinality).
sex_labels = train['sex'].unique()
sex_labels

array(['female', 'male'], dtype=object)

In [9]:
# Cross-check `embarked`: three labels are expected, plus missing data (NaN).
embarked_labels = train['embarked'].unique()
embarked_labels


array(['S', 'C', 'Q', nan], dtype=object)

In [10]:
# List the distinct cabin letters in the training set.
# The recorded output shows 8 distinct letters plus NaN for missing data.
cabin_labels = train['cabin'].unique()
cabin_labels

array([nan, 'E', 'C', 'D', 'B', 'A', 'F', 'T', 'G'], dtype=object)

In [11]:
# pandas' built-in get_dummies creates one binary column per label.
# The result is kept in a scratch frame (`tmp`) for inspection.
tmp = pd.get_dummies(data=train['sex'])

tmp.head()

Unnamed: 0,female,male
501,1,0
588,1,0
402,1,0
1193,0,1
686,1,0


In [12]:
# Place the original variable beside its dummy columns so the mapping
# between label and indicator is easy to read.
tmp = tmp.assign(sex=train['sex'])
tmp.head()

Unnamed: 0,female,male,sex
501,1,0,female
588,1,0,female
402,1,0,female
1193,0,1,male
686,1,0,female


In [13]:
# Same exercise for `embarked`: one binary column per label.
embarked_values = train['embarked']
tmp = pd.get_dummies(embarked_values)

tmp.head()

Unnamed: 0,C,Q,S
501,0,0,1
588,0,0,1
402,1,0,0
1193,0,1,0
686,0,1,0


In [14]:
# Append the original `embarked` values next to the dummy columns
# for side-by-side comparison.
original_embarked = train['embarked']
tmp['embarked'] = original_embarked

tmp.head()

Unnamed: 0,C,Q,S,embarked
501,0,0,1,S
588,0,0,1,S
402,1,0,0,C
1193,0,1,0,Q
686,0,1,0,Q


In [15]:
# Repeat for `cabin`. With dummy_na=False (the default, spelled out here),
# no indicator column is created for NaN, so rows with a missing cabin
# get 0 in every dummy column.
tmp = pd.get_dummies(train['cabin'], dummy_na=False)

tmp.head()

Unnamed: 0,A,B,C,D,E,F,G,T
501,0,0,0,0,0,0,0,0
588,0,0,0,0,0,0,0,0
402,0,0,0,0,0,0,0,0
1193,0,0,0,0,0,0,0,0
686,0,0,0,0,0,0,0,0


In [16]:
# Encode every categorical column of the training set in a single call.
# Each dummy column is named <variable>_<label>.
tmp = train.pipe(pd.get_dummies)

print(tmp.shape)
tmp.head()

(916, 13)


Unnamed: 0,sex_female,sex_male,cabin_A,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T,embarked_C,embarked_Q,embarked_S
501,1,0,0,0,0,0,0,0,0,0,0,0,1
588,1,0,0,0,0,0,0,0,0,0,0,0,1
402,1,0,0,0,0,0,0,0,0,0,1,0,0
1193,0,1,0,0,0,0,0,0,0,0,0,1,0
686,1,0,0,0,0,0,0,0,0,0,0,1,0


In [17]:
# Encode every categorical column of the test set in a single call.
# NOTE: the two sets are encoded independently. The recorded output has
# 12 columns here versus 13 for the training set (no `cabin_T` column is
# produced for the test set), so the two feature spaces do not line up.
tmp = pd.get_dummies(data=test)

print(tmp.shape)

tmp.head()

(393, 12)


Unnamed: 0,sex_female,sex_male,cabin_A,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,embarked_C,embarked_Q,embarked_S
1139,0,1,0,0,0,0,0,0,0,0,0,1
533,1,0,0,0,0,0,0,0,0,0,0,1
459,0,1,0,0,0,0,0,0,0,0,0,1
1150,0,1,0,0,0,0,0,0,0,0,0,1
393,0,1,0,0,0,0,0,0,0,0,0,1


In [23]:
# k-1 encoding for `sex`: drop_first=True discards the dummy of the first
# label, leaving a single `male` indicator.
tmp = train['sex'].pipe(pd.get_dummies, drop_first=True)

tmp.head()

Unnamed: 0,male
501,0
588,0
402,0
1193,1
686,0


In [24]:
# k-1 encoding for `embarked`: drop_first=True discards the dummy of the
# first label, so only the `Q` and `S` indicators remain.
tmp = train['embarked'].pipe(pd.get_dummies, drop_first=True)

tmp.head()

Unnamed: 0,Q,S
501,0,1
588,0,1
402,0,0
1193,1,0
686,1,0


In [20]:
# k-1 encoding of all categorical columns at once: training set.
# The first label's dummy is dropped for every variable.
tmp = pd.get_dummies(data=train, drop_first=True)
print(tmp.shape)

tmp.head()

(916, 10)


Unnamed: 0,sex_male,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,cabin_T,embarked_Q,embarked_S
501,0,0,0,0,0,0,0,0,0,1
588,0,0,0,0,0,0,0,0,0,1
402,0,0,0,0,0,0,0,0,0,0
1193,1,0,0,0,0,0,0,0,1,0
686,0,0,0,0,0,0,0,0,1,0


In [21]:
# k-1 encoding of all categorical columns at once: test set.
# NOTE: encoded independently of the training set. The recorded output has
# 9 columns here versus 10 for the training set (no `cabin_T` column is
# produced for the test set).
tmp = pd.get_dummies(data=test, drop_first=True)
print(tmp.shape)

tmp.head()

(393, 9)


Unnamed: 0,sex_male,cabin_B,cabin_C,cabin_D,cabin_E,cabin_F,cabin_G,embarked_Q,embarked_S
1139,1,0,0,0,0,0,0,0,1
533,0,0,0,0,0,0,0,0,1
459,1,0,0,0,0,0,0,0,1
1150,1,0,0,0,0,0,0,0,1
393,1,0,0,0,0,0,0,0,1
