## **Categorical Variables and One Hot Encoding**

In [None]:
import pandas as pd
df = pd.read_csv('homeprices.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [None]:
dum_town = pd.get_dummies(df.town)
dum_town = dum_town.replace({True: 1, False: 0})
dum_town

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [None]:
merged = pd.concat([df, dum_town], axis='columns')
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [None]:
merged.drop(['town', 'west windsor'], axis='columns', inplace=True)
merged

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [None]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
x = merged.drop('price', axis='columns')
x

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [None]:
y = merged.price
y

Unnamed: 0,price
0,550000
1,565000
2,610000
3,680000
4,725000
5,585000
6,615000
7,650000
8,710000
9,575000


In [None]:
model.fit(x,y)

In [None]:
model.predict([[2800,0,1]])



array([590775.63964739])

In [None]:
model.predict([[3400,0,1]])



array([666914.10449365])

In [None]:
model.score(x,y)

0.9573929037221872

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [None]:
dfle = df
dfle.town = le.fit_transform(dfle.town)
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [None]:
x = dfle[['town', 'area']].values
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [None]:
y = dfle.price
y

Unnamed: 0,price
0,550000
1,565000
2,610000
3,680000
4,725000
5,585000
6,615000
7,650000
8,710000
9,575000


In [None]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder()

x = ohe.fit_transform(x).toarray()
x

array([[1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.]])

In [None]:
x = x[:,9:]
x

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.],
       [0., 1., 0.]])

In [None]:
model.fit(x,y)

In [None]:
model.predict([[1,0, 3400]])

array([4.5965e+08])

# **Exercise**

At the same level as this notebook on github, there is an exercise folder that contains carprice.csv, This car has car sell prices for 3 different models. First plot data points on scatterplot chart to see if linear regression model can be applied. If yes, then build a model that can answer following questions :

**1. Predict the price of a mercedes benz that is 4 yr old with mileage 45000**

**2. Predict price of a BMW X5 that is 7 yr old with mileage 86000**

**3. Tell me the score (Accuracy) of model. (Hint: Use LinearRegression.score())**

In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [27]:
df = pd.read_csv('carprices.csv')
df

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs)
0,BMW X5,69000,18000,6
1,BMW X5,35000,34000,3
2,BMW X5,57000,26100,5
3,BMW X5,22500,40000,2
4,BMW X5,46000,31500,4
5,Audi A5,59000,29400,5
6,Audi A5,52000,32000,5
7,Audi A5,72000,19300,6
8,Audi A5,91000,12000,8
9,Mercedez Benz C class,67000,22000,6


In [28]:
car_model = pd.get_dummies(df['Car Model'])
car_model = car_model.replace({True: 1, False: 0})
car_model

Unnamed: 0,Audi A5,BMW X5,Mercedez Benz C class
0,0,1,0
1,0,1,0
2,0,1,0
3,0,1,0
4,0,1,0
5,1,0,0
6,1,0,0
7,1,0,0
8,1,0,0
9,0,0,1


In [29]:
merged = pd.concat([df, car_model], axis='columns')
merged

Unnamed: 0,Car Model,Mileage,Sell Price($),Age(yrs),Audi A5,BMW X5,Mercedez Benz C class
0,BMW X5,69000,18000,6,0,1,0
1,BMW X5,35000,34000,3,0,1,0
2,BMW X5,57000,26100,5,0,1,0
3,BMW X5,22500,40000,2,0,1,0
4,BMW X5,46000,31500,4,0,1,0
5,Audi A5,59000,29400,5,1,0,0
6,Audi A5,52000,32000,5,1,0,0
7,Audi A5,72000,19300,6,1,0,0
8,Audi A5,91000,12000,8,1,0,0
9,Mercedez Benz C class,67000,22000,6,0,0,1


In [30]:
x = merged.drop(['Sell Price($)',  'Car Model', 'Audi A5'], axis = 1)
x

Unnamed: 0,Mileage,Age(yrs),BMW X5,Mercedez Benz C class
0,69000,6,1,0
1,35000,3,1,0
2,57000,5,1,0
3,22500,2,1,0
4,46000,4,1,0
5,59000,5,0,0
6,52000,5,0,0
7,72000,6,0,0
8,91000,8,0,0
9,67000,6,0,1


In [31]:
y = merged['Sell Price($)']
y

Unnamed: 0,Sell Price($)
0,18000
1,34000
2,26100
3,40000
4,31500
5,29400
6,32000
7,19300
8,12000
9,22000


In [34]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [35]:
model.fit(x,y)

**Price of mercedez benz that is 4 yr old with mileage 45000**

In [36]:
model.predict([[45000, 4, 0, 1]])



array([36991.31721061])

**Price of BMW X5 that is 7 yr old with mileage 86000**

In [37]:
model.predict([[86000, 7, 1, 0]])



array([11080.74313219])

**Accuracy of model**

In [38]:
model.score(x,y)

0.9417050937281083