In [1]:
import pandas as pd

In [2]:
# Load the home-price data set; the displayed frame has the columns
# town, area and price.
df = pd.read_csv('homeprices_ohe.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [3]:
# One-hot encode the town column: one indicator column per distinct town,
# so the categorical value can be used as a feature in the predictions.

dumies = pd.get_dummies(df['town'])
dumies

Unnamed: 0,monroe township,robinsville,west windsor
0,True,False,False
1,True,False,False
2,True,False,False
3,True,False,False
4,True,False,False
5,False,False,True
6,False,False,True
7,False,False,True
8,False,False,True
9,False,True,False


In [4]:
# Convert the boolean indicators to 0/1 integers (optional: True/False can
# also be used as they are). astype(int) states the conversion explicitly
# instead of relying on replace() to infer the resulting dtype.

dumies = dumies.astype(int)
dumies

Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [5]:
# Place the indicator columns next to the original columns, row by row.

merged = pd.concat((df, dumies), axis=1)
merged

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [6]:
# Drop two columns:
#   * 'town'            - the categorical column the dummies were derived from
#   * 'monroe township' - one of the dummy columns, to avoid the dummy variable trap
#
# When one variable can be derived from the other variables, they are said to
# be multicollinear. Here, knowing the values of 'robinsville' and
# 'west windsor' is enough to infer 'monroe township' (it is 1 exactly when
# both of the others are 0), so the three dummy columns are multicollinear.
# In this situation linear regression may not work as expected, hence one
# dummy column is dropped.
#
# NOTE: sklearn's linear regression still works even if no dummy column is
# dropped, but it is a good habit to take care of the dummy variable trap
# yourself in case the library you are using does not handle it for you.

final = merged.drop(columns=['town', 'monroe township'])
final

Unnamed: 0,area,price,robinsville,west windsor
0,2600,550000,0,0
1,3000,565000,0,0
2,3200,610000,0,0
3,3600,680000,0,0
4,4000,725000,0,0
5,2600,585000,0,1
6,2800,615000,0,1
7,3300,650000,0,1
8,3600,710000,0,1
9,2600,575000,1,0


In [7]:
from sklearn import linear_model

In [8]:
model = linear_model.LinearRegression()

# Fit on the feature columns of `final` (everything except the target 'price').
# An equivalent way to write the same fit:
#   X = final.drop('price', axis='columns')
#   y = final.price
#   model.fit(X, y)
model.fit(final[['area', 'robinsville', 'west windsor']], final.price)

In [9]:
# Predict the price for an area of 5000 sq ft in robinsville.
# robinsville = 1 selects that town; west windsor is not needed, so it is 0.
# The input is a DataFrame carrying the same column names the model was fitted
# on: a bare list would trigger sklearn's feature-name warning and would
# silently depend on getting the column order right.

model.predict(pd.DataFrame(
    [[5000, 1, 0]],
    columns=['area', 'robinsville', 'west windsor'],
))



array([869950.01075037])

In [10]:
# Predict the price for an area of 6000 sq ft in monroe township.
# monroe township is the dropped dummy column, so it is selected by setting
# both remaining town columns to 0.
# The input is a DataFrame carrying the same column names the model was fitted
# on: a bare list would trigger sklearn's feature-name warning and would
# silently depend on getting the column order right.

model.predict(pd.DataFrame(
    [[6000, 0, 0]],
    columns=['area', 'robinsville', 'west windsor'],
))



array([971161.04063642])

In [11]:
# Score of the model (R^2, where 1.0 is a perfect fit), computed on the same
# rows the model was trained on.

model.score(final[['area', 'robinsville', 'west windsor']], final.price)

0.9573929037221872

***

# Using sklearn One Hot Encoding

In [12]:
from sklearn.preprocessing import LabelEncoder

In [13]:
# Encoder used below to turn each distinct town name into an integer label.
le = LabelEncoder()

In [14]:
# Work on a copy: `dfle = df` would only alias the original frame, so the
# assignment below would also overwrite df.town and break a re-run of the
# earlier cells that expect town names in df.
dfle = df.copy()

# Label-encode the town column: each town name is replaced by an integer code
# (these are labels, not dummy columns yet).
dfle['town'] = le.fit_transform(dfle['town'])
dfle

Unnamed: 0,town,area,price
0,0,2600,550000
1,0,3000,565000
2,0,3200,610000
3,0,3600,680000
4,0,4000,725000
5,2,2600,585000
6,2,2800,615000
7,2,3300,650000
8,2,3600,710000
9,1,2600,575000


In [15]:
# Feature matrix as a NumPy array: column 0 is the town label, column 1 the area.
x = dfle[['town', 'area']].to_numpy()
x

array([[   0, 2600],
       [   0, 3000],
       [   0, 3200],
       [   0, 3600],
       [   0, 4000],
       [   2, 2600],
       [   2, 2800],
       [   2, 3300],
       [   2, 3600],
       [   1, 2600],
       [   1, 2900],
       [   1, 3100],
       [   1, 3600]], dtype=int64)

In [16]:
# Target vector: the prices as a NumPy array.
y = dfle['price'].to_numpy()
y

array([550000, 565000, 610000, 680000, 725000, 585000, 615000, 650000,
       710000, 575000, 600000, 620000, 695000], dtype=int64)

In [17]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# 'town' is just the name of this transformer step. [0] selects column 0 of x
# (the town label) as the column to one-hot encode; all remaining columns are
# passed through unchanged.
ct = ColumnTransformer(
    transformers=[('town', OneHotEncoder(), [0])],
    remainder='passthrough',
)

In [18]:
# Transform x so that the town label is replaced by dummy columns, like the
# get_dummies result earlier in the notebook.
# The last column is the area (price is the target y and is not part of x).
# NOTE: x is reassigned here, so re-running this cell without first rebuilding
# x would apply the encoder to an already-encoded array.

x = ct.fit_transform(x)
x

array([[1.0e+00, 0.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.0e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.2e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 0.0e+00, 1.0e+00, 3.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 2.9e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.1e+03],
       [0.0e+00, 1.0e+00, 0.0e+00, 3.6e+03]])

In [19]:
# Remove one dummy column (here the first one) to avoid the dummy variable trap.
# Per the label-encoded table shown earlier, the first dummy corresponds to
# label 0 (monroe township); what remains is two town dummies followed by area.

x = x[:, 1:]
x

array([[0.0e+00, 0.0e+00, 2.6e+03],
       [0.0e+00, 0.0e+00, 3.0e+03],
       [0.0e+00, 0.0e+00, 3.2e+03],
       [0.0e+00, 0.0e+00, 3.6e+03],
       [0.0e+00, 0.0e+00, 4.0e+03],
       [0.0e+00, 1.0e+00, 2.6e+03],
       [0.0e+00, 1.0e+00, 2.8e+03],
       [0.0e+00, 1.0e+00, 3.3e+03],
       [0.0e+00, 1.0e+00, 3.6e+03],
       [1.0e+00, 0.0e+00, 2.6e+03],
       [1.0e+00, 0.0e+00, 2.9e+03],
       [1.0e+00, 0.0e+00, 3.1e+03],
       [1.0e+00, 0.0e+00, 3.6e+03]])

In [20]:
# Refit the same LinearRegression object on the encoded array; this replaces
# the earlier fit on the `final` DataFrame.
model.fit(x, y)

In [21]:
# Column order is [dummy, dummy, area]. Both dummies at 0 select the town whose
# dummy column was dropped; the area is 2500.
model.predict([[0, 0, 2500]])

array([527019.99569985])

In [22]:
# Second remaining dummy set to 1 with an area of 4000. Per the encoded tables
# shown earlier, that dummy corresponds to town label 2 (west windsor).
model.predict([[0,1, 4000]])

array([757380.13330466])

In [23]:
# Score (R^2) of the refitted model, computed on its own training data.
model.score(x, y)

0.9573929037221873