In [13]:
import pandas as pd

# Load the dataset from a path relative to the notebook's working directory.
dummy_df = pd.read_csv('./dataset/dummy_ds.csv')
# Print (rows, columns) as a quick sanity check that the load worked.
print(dummy_df.shape)
# The trailing ';' suppresses the rich display of this preview in the cell output.
dummy_df.head();

(44, 6)


In [9]:
# Separate the label from the features: "employed" is the column to predict,
# every remaining column is kept as an input feature.
target = dummy_df["employed"]
data = dummy_df.drop(columns="employed")

# Preview the feature frame (the trailing ';' hides the table in the output).
data.head();

In [19]:
# Peek at the first few labels as plain text.
print(target.head())

0    y
1    n
2    n
3    y
4    y
Name: employed, dtype: object


In [20]:
# Inspect each feature's dtype to tell the numerical columns apart from the
# object-typed (categorical) ones handled in the sections below.
data.dtypes

age           int64
education    object
region       object
savings       int64
house        object
dtype: object

## Working with Numerical Data

In [31]:
# Names of the integer-typed features reported by `data.dtypes`.
numerical_columns = ["age", "savings"]
# Select with the list defined on the line above so the cell runs on a fresh
# kernel instead of relying on a variable left over from an earlier session.
data_numerical = data[numerical_columns]
data_numerical.head()

Unnamed: 0,age,savings
0,23,13000
1,67,25013
2,34,13000
3,56,4000
4,78,12000


In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) for the age column.
data["age"].describe()

count    44.000000
mean     41.340909
std      17.131425
min      16.000000
25%      30.500000
50%      35.500000
75%      52.250000
max      78.000000
Name: age, dtype: float64

## Working with Categorical Data

In [34]:
# Names of the object-typed features reported by `data.dtypes`.
categorical_columns = ["education", "region", "house"]
# Select with the list defined on the line above so the cell runs on a fresh
# kernel instead of relying on a variable left over from an earlier session.
data_categorical = data[categorical_columns]
data_categorical.head()

Unnamed: 0,education,region,house
0,SC,Africa,rent
1,SC,UK,owner
2,HSC,US,owner
3,Primary,UK,parent
4,Degree,UK,parent


In [25]:
# Count how often each education label occurs, sorted by label rather than by frequency.
# NOTE(review): the '?' label presumably marks missing values - confirm against the data source.
data["education"].value_counts().sort_index()

?           1
Degree      2
HSC         8
Masters     7
PhD         4
Primary    10
SC         12
Name: education, dtype: int64

## Encoding Categorical Values with OrdinalEncoder

In [29]:
from sklearn.preprocessing import OrdinalEncoder

# Map each education label to a single numeric code.
encoder = OrdinalEncoder()
# Double brackets select a one-column DataFrame (2-D) rather than a Series.
# NOTE: the codes follow the position of each label in `encoder.categories_`,
# not a real-world ranking of education levels.
edu_column_encoder = encoder.fit_transform(data[["education"]])
edu_column_encoder

array([[6.],
       [6.],
       [2.],
       [5.],
       [1.],
       [3.],
       [6.],
       [6.],
       [2.],
       [5.],
       [3.],
       [0.],
       [5.],
       [6.],
       [5.],
       [4.],
       [3.],
       [2.],
       [6.],
       [5.],
       [4.],
       [2.],
       [6.],
       [6.],
       [2.],
       [5.],
       [1.],
       [3.],
       [6.],
       [6.],
       [2.],
       [5.],
       [3.],
       [3.],
       [5.],
       [6.],
       [5.],
       [4.],
       [3.],
       [2.],
       [6.],
       [5.],
       [4.],
       [2.]])

In [30]:
# Labels learned during fitting; a label's position in the array is its numeric code.
encoder.categories_

[array(['?', 'Degree', 'HSC', 'Masters', 'PhD', 'Primary', 'SC'],
       dtype=object)]

In [35]:
# Re-fit the same `encoder` on every categorical column at once; this replaces
# the education-only fit, so `encoder.categories_` now holds one array per column.
data_encoded = encoder.fit_transform(data_categorical)
# Show only the first ten rows to keep the output short.
data_encoded[:10]

array([[6., 1., 3.],
       [6., 4., 1.],
       [2., 5., 1.],
       [5., 4., 2.],
       [1., 4., 2.],
       [3., 3., 3.],
       [6., 3., 3.],
       [6., 1., 3.],
       [2., 3., 1.],
       [5., 3., 1.]])

In [36]:
# One array of learned labels per encoded column, in the column order of `data_categorical`.
encoder.categories_

[array(['?', 'Degree', 'HSC', 'Masters', 'PhD', 'Primary', 'SC'],
       dtype=object),
 array(['?', 'Africa', 'Asia', 'Europe', 'UK', 'US'], dtype=object),
 array(['?', 'owner', 'parent', 'rent'], dtype=object)]

## Encoding Categorical Values with OneHotEncoder

In [41]:
from sklearn.preprocessing import OneHotEncoder
# Request dense array output so the encoded rows print readably.
# NOTE(review): scikit-learn 1.2 renamed this argument to `sparse_output` and
# 1.4 removed `sparse`; switch to `sparse_output=False` once the environment
# is on scikit-learn >= 1.2 - confirm the pinned version first.
ohencoder = OneHotEncoder(sparse=False)

In [43]:
# One-hot encode the single education column: it expands into 7 indicator
# columns, one per label, and each row has a 1 only in the column matching
# its label. Example: the first row is "SC" because its last column is 1.
edu_column_ohe = ohencoder.fit_transform(data[["education"]])
edu_column_ohe

array([[0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
 

In [45]:
# Ask the fitted encoder for readable column names ("education_<label>").
feature_names = ohencoder.get_feature_names_out(["education"])
# Wrap the encoded array in a DataFrame so each indicator column is labelled.
edu_column_ohe = pd.DataFrame(data=edu_column_ohe, columns=feature_names)
edu_column_ohe

Unnamed: 0,education_?,education_Degree,education_HSC,education_Masters,education_PhD,education_Primary,education_SC
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,1.0
7,0.0,0.0,0.0,0.0,0.0,0.0,1.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [46]:
# Re-fit the one-hot encoder on all categorical columns; each column expands
# into one indicator column per label (17 columns in total for this dataset).
data_encoded = ohencoder.fit_transform(data_categorical)
# Show only the first five rows to keep the output short.
data_encoded[:5]

array([[0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
        0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0.,
        0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
        0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1.,
        0.]])

In [49]:
# Build "<column>_<label>" names for every indicator column produced above.
columns_encoded = ohencoder.get_feature_names_out(
    input_features=data_categorical.columns
)
# Preview the encoded matrix as a labelled table.
pd.DataFrame(data=data_encoded, columns=columns_encoded).head()

Unnamed: 0,education_?,education_Degree,education_HSC,education_Masters,education_PhD,education_Primary,education_SC,region_?,region_Africa,region_Asia,region_Europe,region_UK,region_US,house_?,house_owner,house_parent,house_rent
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
