In [27]:
import joblib
import numpy as np 
import pandas as pd 
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv("clean_realtor_data.csv")

In [3]:
data.head()

Unnamed: 0,status,price,bed,bath,acre_lot,street,city,state,house_size
0,for_sale,105000.0,3.0,2.0,0.12,1962661.0,Adjuntas,Puerto Rico,920.0
1,for_sale,80000.0,4.0,2.0,0.08,1902874.0,Adjuntas,Puerto Rico,1527.0
2,for_sale,67000.0,2.0,1.0,0.15,1404990.0,Juana Diaz,Puerto Rico,748.0
3,for_sale,145000.0,4.0,2.0,0.1,1947675.0,Ponce,Puerto Rico,1800.0
4,for_sale,179000.0,4.0,3.0,0.46,1850806.0,San Sebastian,Puerto Rico,2520.0


In [4]:
X = data[["bed", "bath", "house_size"]]

In [5]:
y = data["price"]

In [6]:
X

Unnamed: 0,bed,bath,house_size
0,3.0,2.0,920.0
1,4.0,2.0,1527.0
2,2.0,1.0,748.0
3,4.0,2.0,1800.0
4,4.0,3.0,2520.0
...,...,...,...
1355857,4.0,2.0,3600.0
1355858,3.0,2.0,1616.0
1355859,6.0,3.0,3200.0
1355860,2.0,1.0,933.0


In [7]:
y

0          105000.0
1           80000.0
2           67000.0
3          145000.0
4          179000.0
             ...   
1355857    359900.0
1355858    350000.0
1355859    440000.0
1355860    179900.0
1355861    580000.0
Name: price, Length: 1355862, dtype: float64

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2)

In [9]:
len(y_test)

271173

In [10]:
len(y_train)

1084689

In [12]:
scaler = StandardScaler()

In [13]:
X_train

Unnamed: 0,bed,bath,house_size
797963,3.0,2.0,1347.0
1023819,3.0,1.0,1001.0
1071143,2.0,1.0,732.0
827957,3.0,2.0,1092.0
1022748,3.0,1.0,1248.0
...,...,...,...
1190307,3.0,3.0,1883.0
240501,3.0,2.0,1935.0
231795,3.0,2.0,1100.0
118367,4.0,4.0,3260.0


In [14]:
X_train = scaler.fit_transform(X_train)

In [15]:
X_train

array([[-0.26542236, -0.39207783, -0.20761385],
       [-0.26542236, -1.11399931, -0.30072021],
       [-0.95211013, -1.11399931, -0.37310636],
       ...,
       [-0.26542236, -0.39207783, -0.27407995],
       [ 0.42126541,  1.05176515,  0.30716203],
       [-0.95211013, -1.11399931, -0.37364455]])

In [16]:
joblib.dump(scaler,"Scaler.pkl")

['Scaler.pkl']

In [17]:
X_test = scaler.fit_transform(X_test)

In [19]:
lr = LinearRegression()
lr

In [21]:
lr.fit(X_train, y_train)

In [23]:
predictions = lr.predict(X_test)

In [24]:
mean_absolute_error(y_test, predictions)

361636.056635992

In [28]:
data['price'].describe().astype(int)

count       1355862
mean         574225
std         2212087
min               0
25%          239000
50%          379000
75%          600000
max      2147483600
Name: price, dtype: int32

In [29]:
joblib.dump(lr,'Model.pkl')

['Model.pkl']