In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Load Data

In [2]:
df = pd.read_csv('homeprices_ohe.csv')

# Create dummies for the categorical data column using Pandas

In [3]:
#Create the dummies
dummies = pd.get_dummies(df.town)

In [4]:
## Merge the datasets
merged = pd.concat([df,dummies], axis='columns')
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [5]:
## Drop the unwanted features - the categorical feature and the 3rd dummy feature to avoid colinearity.
final = merged.drop(['town', 'monroe township'], axis = 'columns')
final

Unnamed: 0,area,price,robinsville,west windsor
0,2600,550000,0,0
1,3000,565000,0,0
2,3200,610000,0,0
3,3600,680000,0,0
4,4000,725000,0,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,1,0


In [6]:
#Seperate the target feature and drop it from the main dataset
price = final.price
final = final.drop(['price'], axis = 'columns')

In [7]:
#build the LR model and train the dataset
lr= LinearRegression()
lr.fit(final,price)

LinearRegression()

In [8]:
#Do the predictions
lr.predict([[2500,0,1]])

  "X does not have valid feature names, but"


array([567033.97118899])

In [9]:
lr.predict([[2300,1,0]])

  "X does not have valid feature names, but"


array([527326.91894216])

In [10]:
lr.predict([[2300,0,0]])

  "X does not have valid feature names, but"


array([501640.50741776])

In [11]:
## To find the accuracy of the model
lr.score(final,price)

0.9573929037221872

# Encoding using sklearn

In [12]:
## LabelEncoding - to convert categorical data into numbers
le = LabelEncoder()
dfle = df
dfle.town = le.fit_transform(dfle.town)
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [13]:
## Split the data into X and y
X = dfle[['town','area']].values
y = dfle['price'].values
X

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]])

In [14]:
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000])

In [15]:
## Transform the column into one hot encoding
ct=ColumnTransformer([('town',OneHotEncoder(),[0])],remainder = 'passthrough')
X = ct.fit_transform(X)
X

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [16]:
#Drop the 3rd dummy feature to avoid colinearity
X=X[:,1:]
X

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [17]:
## Train the LR model with the new dataset
lr.fit(X,y)

LinearRegression()

In [18]:
## Do the predictions
lr.predict([[0,1,3400]])

array([681241.6684584])

In [19]:
lr.predict([[1,0,3400]])

array([666914.10449366])