In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the raw market dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('market.csv')

In [10]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [7]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          1
Area               0
Profit             0
dtype: int64

# Handle Null Values

In [14]:
# Candidate fill values for the missing Transport entry.
mean = df['Transport'].mean()
median = df['Transport'].median()

In [15]:
mean

215331.73244897963

In [16]:
median

214634.81

In [18]:
# Impute the missing Transport value with the column median computed earlier.
df['Transport'] = df['Transport'].fillna(median)

In [19]:
df.isnull().sum()

Marketing Spend    0
Administration     0
Transport          0
Area               0
Profit             0
dtype: int64

# Encoding 

In [21]:
df.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

# Manual Encoding

In [23]:
# Hand-picked integer code for each area label.
df['Area'] = df['Area'].replace({'Dhaka': 3, 'Ctg': 2, 'Rangpur': 1})

In [24]:
df.Area.head()

0    3
1    2
2    1
3    3
4    1
Name: Area, dtype: int64

# Label Encoding

In [26]:
from sklearn.preprocessing import LabelEncoder


In [30]:
# Start again from the raw CSV so Area holds its original string labels.
# NOTE: this also discards the median imputation applied to Transport earlier,
# so (assuming market.csv is unchanged on disk) the missing value is back.
df = pd.read_csv('market.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [27]:
# Encoder instance used in the next cell to fit and transform the Area column.
label = LabelEncoder()

In [31]:
# Replace the Area labels with integer codes learned by the encoder
# (in the preview below: Ctg -> 0, Dhaka -> 1, Rangpur -> 2).
df['Area'] = label.fit_transform(df['Area'])

In [32]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [38]:
# Label-encode every non-numeric column, leaving numeric columns untouched.
for column in df.columns:
    # `dtype == np.number` is not a dependable numeric test: np.number is an
    # abstract scalar type rather than a dtype, and comparing a dtype against
    # it is deprecated in NumPy (this is what triggered the warning the
    # original cell emitted). Ask pandas for the column's kind instead, which
    # recognises integer and float columns alike.
    if pd.api.types.is_numeric_dtype(df[column]):
        continue
    # A fresh encoder per column keeps each column's code mapping independent.
    df[column] = LabelEncoder().fit_transform(df[column])

  if df[column].dtype == np.number:


In [40]:
df.Area.head()

0    1
1    0
2    2
3    1
4    2
Name: Area, dtype: int64

# One Hot Encoding

In [45]:
# One-hot encode Area. drop_first=True drops the first level so the remaining
# indicator columns are not redundant. dtype=int keeps the indicators as 0/1
# on pandas >= 2.0, where the default dummy dtype is bool (True/False).
dummy_variable = pd.get_dummies(df['Area'], drop_first=True, dtype=int)

In [46]:
dummy_variable.head()

Unnamed: 0,1,2
0,1,0
1,0,0
2,0,1
3,1,0
4,0,1


In [47]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,1,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,2,191050.39
3,144372.41,118671.85,383199.62,1,182901.99
4,142107.34,91391.77,366168.42,2,166187.94


In [50]:
# Copy of the frame without the Area column; the dummy columns replace it later.
new_df = df.drop(columns='Area')

In [51]:
new_df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit
0,114523.61,136897.8,471784.1,192261.83
1,162597.7,151377.59,443898.53,191792.06
2,153441.51,101145.55,407934.54,191050.39
3,144372.41,118671.85,383199.62,182901.99
4,142107.34,91391.77,366168.42,166187.94


In [52]:
# Put the dummy columns side by side with the remaining features (aligned on the index).
df = pd.concat((new_df, dummy_variable), axis='columns')

In [54]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Profit,1,2
0,114523.61,136897.8,471784.1,192261.83,1,0
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1
3,144372.41,118671.85,383199.62,182901.99,1,0
4,142107.34,91391.77,366168.42,166187.94,0,1


# Ordinal Encoder

In [55]:
# Reload the raw CSV so Area is back to its original string labels for this section.
df = pd.read_csv('market.csv')

In [56]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [57]:
from sklearn.preprocessing import OrdinalEncoder 

In [58]:
df.Area.unique()

array(['Dhaka', 'Ctg', 'Rangpur'], dtype=object)

In [59]:
# Explicit category order for the Area column.
city_list = [
    'Dhaka',
    'Ctg',
    'Rangpur',
]

In [60]:
# `categories` takes one list per encoded column; codes follow the order of
# city_list (position 0 -> 0.0, position 1 -> 1.0, ...).
ordinal = OrdinalEncoder(categories=[city_list])

In [63]:
# Double brackets select a one-column DataFrame: the encoder expects 2-D input
# of shape (n_samples, n_features), not a 1-D Series.
encoded_values = ordinal.fit_transform(df[['Area']])

In [66]:
# Wrap the encoder output in a one-column frame named after the source column.
new_area = pd.DataFrame(data=encoded_values, columns=['Area'])

In [67]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [68]:
# Overwrite the string labels with their ordinal codes (values align on the index).
df['Area'] = new_area['Area']

In [69]:
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,0.0,192261.83
1,162597.7,151377.59,443898.53,1.0,191792.06
2,153441.51,101145.55,407934.54,2.0,191050.39
3,144372.41,118671.85,383199.62,0.0,182901.99
4,142107.34,91391.77,366168.42,2.0,166187.94


# Hashing Encoder

In [70]:
# Reload the raw CSV so Area is back to its original string labels for this section.
df = pd.read_csv('market.csv')
df.head()

Unnamed: 0,Marketing Spend,Administration,Transport,Area,Profit
0,114523.61,136897.8,471784.1,Dhaka,192261.83
1,162597.7,151377.59,443898.53,Ctg,191792.06
2,153441.51,101145.55,407934.54,Rangpur,191050.39
3,144372.41,118671.85,383199.62,Dhaka,182901.99
4,142107.34,91391.77,366168.42,Rangpur,166187.94


In [76]:
import category_encoders as ce


In [79]:
# Hash the Area column into three output columns.
encoders = ce.HashingEncoder(cols=['Area'], n_components=3)

In [80]:
# Display-only: the encoded frame is not assigned to a name, so df is not rebound here.
encoders.fit_transform(df)

Unnamed: 0,col_0,col_1,col_2,Marketing Spend,Administration,Transport,Profit
0,0,1,0,114523.61,136897.8,471784.1,192261.83
1,0,0,1,162597.7,151377.59,443898.53,191792.06
2,1,0,0,153441.51,101145.55,407934.54,191050.39
3,0,1,0,144372.41,118671.85,383199.62,182901.99
4,1,0,0,142107.34,91391.77,366168.42,166187.94
5,0,1,0,131876.9,99814.71,362861.36,156991.12
6,0,0,1,134615.46,147198.87,127716.82,156122.51
7,1,0,0,130298.13,145530.06,323876.68,155752.6
8,0,1,0,120542.52,148718.95,311613.29,152211.77
9,0,0,1,123334.88,108679.17,304981.62,149759.96
