In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the customer records from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer='Customers.csv')

In [3]:
# Preview the first five rows of the loaded data
df.head(n=5)

Unnamed: 0,age,income,gender,m_status,buys
0,25.0,high,male,single,no
1,25.0,high,male,married,no
2,35.0,high,male,single,yes
3,35.0,medium,male,single,yes
4,30.0,low,female,single,yes


In [4]:
# Count the missing values in each column
df.isna().sum()

age         2
income      0
gender      0
m_status    1
buys        0
dtype: int64

In [5]:
# Mean of the age column, used later to fill its missing values
age_mean = df['age'].mean()

In [6]:
# Display the computed mean age
age_mean

28.65

In [7]:
# Most frequent value(s) of the m_status column; mode() returns a Series
m_status_mode = df['m_status'].mode()

In [8]:
# Display the mode; note the result is a Series, not a plain string
m_status_mode

0    single
dtype: object

In [9]:
# Fill the missing ages with the mean age computed earlier
df['age'] = df['age'].fillna(age_mean)

In [10]:
# Re-count missing values after filling the age column
df.isna().sum()

age         0
income      0
gender      0
m_status    1
buys        0
dtype: int64

In [11]:
# Fill the missing m_status value with the mode computed earlier instead of a
# hard-coded copy of it, so the fill value always follows the data.
# mode() returns a Series, so take its first entry.
df['m_status'] = df['m_status'].fillna(m_status_mode.iloc[0])

In [12]:
# Confirm that no missing values remain in any column
df.isna().sum()

age         0
income      0
gender      0
m_status    0
buys        0
dtype: int64

### One-Hot Encoding

In [13]:
# List the distinct values in the gender column
df['gender'].unique()

array(['male', 'female'], dtype=object)

In [14]:
# One indicator column per gender value. dtype=int keeps the columns as 0/1
# integers; without it, pandas 2.0+ returns booleans (True/False).
dummy_gender = pd.get_dummies(df['gender'], dtype=int)

In [15]:
# Preview the first five rows of the gender indicator columns
dummy_gender.head(n=5)

Unnamed: 0,female,male
0,0,1
1,0,1
2,0,1
3,0,1
4,1,0


In [16]:
# One indicator column per marital status value. dtype=int keeps the columns
# as 0/1 integers; without it, pandas 2.0+ returns booleans (True/False).
dummy_m_status = pd.get_dummies(df['m_status'], dtype=int)

In [17]:
# Preview the first five rows of the marital status indicator columns
dummy_m_status.head(n=5)

Unnamed: 0,married,single
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1


In [18]:
# Remove the original gender column; its indicator columns replace it
df = df.drop(columns='gender')

In [19]:
# Append the gender indicator columns to the right of the existing columns
df = pd.concat([df, dummy_gender], axis='columns')

In [20]:
# Preview df with the gender column replaced by female/male indicators
df.head(n=5)

Unnamed: 0,age,income,m_status,buys,female,male
0,25.0,high,single,no,0,1
1,25.0,high,married,no,0,1
2,35.0,high,single,yes,0,1
3,35.0,medium,single,yes,0,1
4,30.0,low,single,yes,1,0


In [22]:
# Remove the original m_status column; its indicator columns replace it
df = df.drop(columns='m_status')

In [23]:
# Append the marital status indicator columns to the right of the existing columns
df = pd.concat([df, dummy_m_status], axis='columns')

In [24]:
# Preview df with both categorical columns replaced by indicator columns
df.head(n=5)

Unnamed: 0,age,income,buys,female,male,married,single
0,25.0,high,no,0,1,0,1
1,25.0,high,no,0,1,1,0
2,35.0,high,yes,0,1,0,1
3,35.0,medium,yes,0,1,0,1
4,30.0,low,yes,1,0,0,1


## Ordinal Encoding
### The income column is ordinal: its categories have a natural order

In [27]:
# Collect the distinct values of the income column
persons_income = df['income'].unique()

In [28]:
# Display the distinct income categories
persons_income

array(['high', 'medium', 'low'], dtype=object)

In [29]:
from sklearn.preprocessing import OrdinalEncoder

In [33]:
# Spell out the category order instead of taking it from persons_income, whose
# order is only the order in which the values first appear in the data.
# The list matches the order currently found in the data, so the codes are
# unchanged: high -> 0, medium -> 1, low -> 2 (a larger code means a lower income).
ord_encoding = OrdinalEncoder(categories=[['high', 'medium', 'low']])

In [35]:
# Fit the encoder on the income column (passed as a one-column DataFrame)
# and convert it to numeric codes in the same call
encoded = ord_encoding.fit_transform(df.loc[:, ['income']])

In [39]:
# Display the encoded values: a 2-D array with one code per row
encoded

array([[0.],
       [0.],
       [0.],
       [1.],
       [2.],
       [2.],
       [2.],
       [1.],
       [2.],
       [1.],
       [1.],
       [1.],
       [0.],
       [1.],
       [0.],
       [0.],
       [1.],
       [0.],
       [1.],
       [0.],
       [1.],
       [2.]])

In [40]:
# Wrap the encoded array in a one-column DataFrame named 'income'
encoded_income = pd.DataFrame(data=encoded, columns=['income'])

In [42]:
# Preview the first five encoded income values
encoded_income.head(n=5)

Unnamed: 0,income
0,0.0
1,0.0
2,0.0
3,1.0
4,2.0


In [44]:
# Preview df before the income column is replaced by its numeric codes
df.head(n=5)

Unnamed: 0,age,income,buys,female,male,married,single
0,25.0,high,no,0,1,0,1
1,25.0,high,no,0,1,1,0
2,35.0,high,yes,0,1,0,1
3,35.0,medium,yes,0,1,0,1
4,30.0,low,yes,1,0,0,1


In [45]:
# Replace the text income column with its numeric codes. Bracket assignment
# with the Series is used instead of attribute assignment with a whole
# DataFrame: attribute assignment only updates a column when that column
# already exists, otherwise it silently sets a plain Python attribute.
df['income'] = encoded_income['income']

In [46]:
# Preview df with the income column now holding numeric codes
df.head(n=5)

Unnamed: 0,age,income,buys,female,male,married,single
0,25.0,0.0,no,0,1,0,1
1,25.0,0.0,no,0,1,1,0
2,35.0,0.0,yes,0,1,0,1
3,35.0,1.0,yes,0,1,0,1
4,30.0,2.0,yes,1,0,0,1


In [53]:
# Reorder the columns so that the output column 'buys' is the last (rightmost) one
df = df.loc[:, ['age', 'income', 'male', 'female', 'single', 'married', 'buys']]

In [49]:
# Preview the final column layout
df.head(n=5)

Unnamed: 0,age,income,male,female,single,married,buys
0,25.0,0.0,1,0,1,0,no
1,25.0,0.0,1,0,0,1,no
2,35.0,0.0,1,0,1,0,yes
3,35.0,1.0,1,0,1,0,yes
4,30.0,2.0,0,1,1,0,yes
