In [24]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

In [4]:
# Load the home price data; the displayed frame has town, area and price columns.
df = pd.read_csv('homeprices2.csv')
df

Unnamed: 0,town,area,price
0,monroe township,2600,550000
1,monroe township,3000,565000
2,monroe township,3200,610000
3,monroe township,3600,680000
4,monroe township,4000,725000
5,west windsor,2600,585000
6,west windsor,2800,615000
7,west windsor,3300,650000
8,west windsor,3600,710000
9,robinsville,2600,575000


In [7]:
# Machine learning models cannot be trained on strings, so the town names must
# be converted to numerical values. pandas' get_dummies does this by creating
# one 0/1 indicator column per distinct town.
# dtype=int keeps the indicators as 0/1 integers; pandas 2.0+ would otherwise
# return True/False.

dummies = pd.get_dummies(df.town, dtype=int)
dummies

# Reading the resulting table (columns: monroe township, robinsville, west windsor):
#   monroe township = 1 0 0
#   robinsville     = 0 1 0
#   west windsor    = 0 0 1



Unnamed: 0,monroe township,robinsville,west windsor
0,1,0,0
1,1,0,0
2,1,0,0
3,1,0,0
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
8,0,0,1
9,0,1,0


In [23]:
# Join the original frame and the dummy columns side by side.
# axis=1 (same as 'columns') appends columns; axis=0 would stack rows instead.

concatenated = pd.concat(objs=[df, dummies], axis=1)
concatenated

Unnamed: 0,town,area,price,monroe township,robinsville,west windsor
0,monroe township,2600,550000,1,0,0
1,monroe township,3000,565000,1,0,0
2,monroe township,3200,610000,1,0,0
3,monroe township,3600,680000,1,0,0
4,monroe township,4000,725000,1,0,0
5,west windsor,2600,585000,0,0,1
6,west windsor,2800,615000,0,0,1
7,west windsor,3300,650000,0,0,1
8,west windsor,3600,710000,0,0,1
9,robinsville,2600,575000,0,1,0


In [9]:
# Drop the columns the model should not see:
#   - 'town': the raw string column, already encoded by the dummy columns
#   - 'west windsor': one dummy column is dropped to avoid the dummy variable
#     trap (a row with both remaining dummies at 0 then means west windsor)

final = concatenated.drop(columns=['town', 'west windsor'])
final

Unnamed: 0,area,price,monroe township,robinsville
0,2600,550000,1,0
1,3000,565000,1,0
2,3200,610000,1,0
3,3600,680000,1,0
4,4000,725000,1,0
5,2600,585000,0,0
6,2800,615000,0,0
7,3300,650000,0,0
8,3600,710000,0,0
9,2600,575000,0,1


In [25]:
# Create the (not yet fitted) linear regression model.
reg = LinearRegression()

In [27]:
# Feature matrix: every column of `final` except the target 'price'.
# (drop(columns=...) removes columns; drop(index=...) would remove rows.)

x = final.drop(columns='price')
x

Unnamed: 0,area,monroe township,robinsville
0,2600,1,0
1,3000,1,0
2,3200,1,0
3,3600,1,0
4,4000,1,0
5,2600,0,0
6,2800,0,0
7,3300,0,0
8,3600,0,0
9,2600,0,1


In [15]:
# Target vector: the 'price' column of `final`.
# NOTE(review): the saved output for this cell lists 13 rows while `final`
# is displayed with 10 -- the output looks stale; re-run on a fresh kernel.
y = final['price']
y

0     550000
1     565000
2     610000
3     680000
4     725000
5     585000
6     615000
7     650000
8     710000
9     575000
10    600000
11    620000
12    695000
Name: price, dtype: int64

In [33]:
# Train the model: x holds the features (area plus the two town dummies),
# y holds the prices.
reg.fit(x,y)

LinearRegression()

In [34]:
# Predict the home price for area 2800, located in robinsville.
# Feature order matches x: [area, monroe township, robinsville].
# The row is wrapped in a DataFrame carrying x's column names because the
# model was fitted on a DataFrame; a bare list makes scikit-learn warn that
# the input has no valid feature names.

reg.predict(pd.DataFrame([[2800, 0, 1]], columns=x.columns))



array([590775.63964739])

In [35]:
# Predict the home price for area 3400, located in west windsor.
# Feature order matches x: [area, monroe township, robinsville]; west windsor
# is the dropped dummy, so it is encoded as both town indicators being 0.
# The row is wrapped in a DataFrame carrying x's column names because the
# model was fitted on a DataFrame; a bare list makes scikit-learn warn that
# the input has no valid feature names.

reg.predict(pd.DataFrame([[3400, 0, 0]], columns=x.columns))



array([681241.66845839])

In [36]:
import joblib as jb

In [37]:
# Persist the fitted model to the file 'model_jb' so it can be reused
# without retraining.
jb.dump(reg, 'model_jb')

['model_jb']

In [38]:
# Reload the persisted model from 'model_jb'.
# NOTE: joblib files are pickle-based -- only load files from a trusted source.
mjb = jb.load('model_jb')

In [39]:
# Sanity check: the reloaded model should give the same prediction as `reg`
# for area 3400 in west windsor (both town indicators 0).
# The row is wrapped in a DataFrame carrying x's column names because the
# model was fitted on a DataFrame; a bare list makes scikit-learn warn that
# the input has no valid feature names.
mjb.predict(pd.DataFrame([[3400, 0, 0]], columns=x.columns))



array([681241.66845839])

In [41]:
# Check how well the trained model fits the data.
# score() returns the coefficient of determination (R^2); a value of 1 means
# the predictions match the targets perfectly.
# Note: this is evaluated on the same x, y the model was fitted on, so it
# measures fit to the training data rather than accuracy on unseen homes.

reg.score(x,y)

0.9573929037221871