# Dummy Variables and One Hot Encoding

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model

# Load the home-price data; the displayed frame has town, area and price columns
df = pd.read_csv('homeprices.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [14]:
# Build one indicator (dummy) column per distinct town name
dummies = pd.get_dummies(df['town'])
dummies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [15]:
# Place the indicator columns side by side with the original frame
merged = pd.concat([df, dummies], axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [16]:
# 'town' is redundant once the dummies exist, and one dummy has to go as well:
# with all three kept, each indicator is fully determined by the other two
# (the dummy variable trap). Dropping 'west windsor' makes it the baseline,
# i.e. the town represented by both remaining dummies being 0.
final = merged.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [17]:
# Features are every column except the target. The model is fitted on plain
# NumPy arrays (.values), matching the plain lists passed to predict() below.
x = final.drop(columns='price')
y = final['price']
model = linear_model.LinearRegression()
model.fit(x.values, y.values)

In [18]:
# Feature order is [area, monroe township, robinsville]
model.predict([[2800,0,1]]) # House in Robinsville

array([590775.63964739])

In [19]:
# Both dummies set to 0 selects the dropped baseline town, west windsor
model.predict([[3400,0,0]]) # House in West Windsor

array([681241.66845839])

In [20]:
# score() on a LinearRegression returns R² (coefficient of determination), not a
# classification accuracy. It is computed here on the same rows used for fitting,
# so it describes goodness of fit rather than performance on unseen data.

print(100*model.score(x.values,y.values),'%')

95.73929037221873 %


<b> Using scikit-learn's one-hot encoding </b>

In [21]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode the town names. Work on a copy: plain assignment (`dfle = df`)
# only aliases the frame, so encoding through it would overwrite df.town and
# break the get_dummies / drop cells above if they were run again.
le = LabelEncoder()
dfle = df.copy()
dfle['town'] = le.fit_transform(dfle['town'])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [25]:
# Feature matrix as a NumPy array: column 0 = encoded town, column 1 = area
x = dfle[['town', 'area']].values
# Target: sale price
y = dfle['price']

In [30]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Column 0 of x holds the integer town label: one-hot encode it and pass the
# remaining column (area) through unchanged. The encoded columns come first
# in the result, followed by the passthrough column.
ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
)
x = ct.fit_transform(x)
x

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [31]:
# Drop the first dummy column (monroe township, label 0) to avoid the dummy
# variable trap: the three indicators always sum to 1, so any one of them is
# implied by the other two.
# NOTE: this cell reassigns x from itself, so running it a second time would
# drop a further column.
x = x[:,1:]
x
 

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [32]:
# Refit the existing LinearRegression instance on the one-hot features;
# this replaces the coefficients learned from the get_dummies version above
model.fit(x,y)

In [34]:
# Feature order is [robinsville, west windsor, area]; monroe township is the dropped baseline
model.predict([[1,0,2800]]) # House in Robinsville

array([590775.63964739])

## Exercise

In [35]:
# Load the car-price data for the exercise; the displayed frame has
# Car Model, Mileage, Sell Price($) and Age(yrs) columns
df1 = pd.read_csv('carprices.csv')
df1

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [46]:
# Data Preprocessing
# Give the columns identifier-friendly names. df1 is rebound (instead of using
# inplace=True) so later cells can still read df1['Price'] and df1['Age'].
df1 = df1.rename(columns={'Car Model':'CarModel','Sell Price($)':'Price','Age(yrs)':'Age'})

# Use the dedicated encoder created here rather than refitting `le`, which
# would discard the town classes it learned earlier. Encode on a copy so df1
# keeps the original model names.
label = LabelEncoder()
df2 = df1.copy()
df2['CarModel'] = label.fit_transform(df2['CarModel'])

x1 = df2[['CarModel','Mileage','Age']].values
y1 = df2.Price

# 0th column is the categorical feature; fit this cell's own transformer (ct1)
# instead of refitting the town transformer `ct`.
ct1 = ColumnTransformer([('CarModel',OneHotEncoder(), [0])], remainder='passthrough')
x1 = ct1.fit_transform(x1)
# Drop the first dummy column (Audi A5, label 0) to avoid the dummy variable trap
x1 = x1[:,1:]
x1 # 1st column is BMW X5, 2nd column is Mercedez Benz C class; Audi A5 is the all-zero baseline

array([[1.00e+00, 0.00e+00, 6.90e+04, 6.00e+00],
       [1.00e+00, 0.00e+00, 3.50e+04, 3.00e+00],
       [1.00e+00, 0.00e+00, 5.70e+04, 5.00e+00],
       [1.00e+00, 0.00e+00, 2.25e+04, 2.00e+00],
       [1.00e+00, 0.00e+00, 4.60e+04, 4.00e+00],
       [0.00e+00, 0.00e+00, 5.90e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 5.20e+04, 5.00e+00],
       [0.00e+00, 0.00e+00, 7.20e+04, 6.00e+00],
       [0.00e+00, 0.00e+00, 9.10e+04, 8.00e+00],
       [0.00e+00, 1.00e+00, 6.70e+04, 6.00e+00],
       [0.00e+00, 1.00e+00, 8.30e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 7.90e+04, 7.00e+00],
       [0.00e+00, 1.00e+00, 5.90e+04, 5.00e+00]])

In [48]:
# Regression model
# Refit the shared `model` on the car data. Feature order is
# [BMW X5, Mercedez Benz C class, Mileage, Age]: in the x1 array above the
# BMW X5 rows carry the 1 in column 0, and Audi A5 is the all-zero baseline.
model.fit(x1,y1)
model.predict([[0,1,45000,4]]) # Mercedez Benz C class, mileage 45000, 4 years old

array([36991.31721062])

In [49]:
# Both dummies set to 0 selects the baseline model, Audi A5
model.predict([[0,0,86000,7]]) # Audi A5, mileage 86000, 7 years old

array([15365.40972059])

In [51]:
# R² (coefficient of determination) as a percentage, measured on the same rows
# the model was fitted on
print(100*model.score(x1,y1),'%')

94.17050937281081 %


<b> Training and Testing Data </b>

In [55]:
# Use only the numeric features for this section; CarModel is left out.
# x and y are rebound here and no longer refer to the home-price data.
x = df1[['Mileage','Age']]
y = df1['Price']
x

Unnamed: 0,Mileage,Age
0,69000,6
1,35000,3
2,57000,5
3,22500,2
4,46000,4
5,59000,5
6,52000,5
7,72000,6
8,91000,8
9,67000,6


In [59]:
from sklearn.model_selection import train_test_split
# Hold out part of the data so the model can be scored on rows it never saw
# during fitting.
# test_size=0.2 requests 20% of the rows for testing; the rest are used for training.
# A fixed random_state makes the shuffle reproducible, so the same rows land in
# each split on every run; without it the split changes from run to run.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state=10)
len(x_train)

10

In [60]:
# Number of held-out test rows
len(x_test)

3

In [61]:
# Inspect the training rows; the index shows each row's original label, in shuffled order
x_train

Unnamed: 0,Mileage,Age
6,52000,5
8,91000,8
2,57000,5
12,59000,5
5,59000,5
10,83000,7
1,35000,3
0,69000,6
4,46000,4
9,67000,6


In [62]:
# Fit a fresh linear regression on the training split only
# (despite the name `clf`, this is a regressor, not a classifier)
clf = linear_model.LinearRegression()
clf.fit(x_train,y_train)

In [64]:
# Predicted prices for the held-out rows; compare with y_test in the next cell
clf.predict(x_test)

array([41842.49106079, 22531.68057211, 18423.93325387])

In [65]:
# Actual prices of the held-out rows
y_test

3     40000
7     19300
11    21000
Name: Price, dtype: int64

In [68]:
# R² (coefficient of determination) on the held-out test rows, formatted as a percentage
str(clf.score(x_test,y_test)*100) + '%'

'92.24816911971743%'