In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning category for the rest of the session.
# NOTE(review): a blanket ignore also hides library deprecation warnings.
warnings.simplefilter('ignore')

In [2]:
# Read the cleaned dataset from disk into a DataFrame
clean_csv = '../pre_data/clean_data.csv'
df = pd.read_csv(clean_csv)

In [3]:
# Summary statistics of the numeric columns.
# NOTE(review): in the recorded output year_built has median 0 and 75th
# percentile 1 but max 2017 - looks like placeholder values; confirm before
# using that column as a feature.
df.describe()

Unnamed: 0,bed,bath,square,year_built,price
count,1413.0,1413.0,1413.0,1413.0,1413.0
mean,1.714084,1.298655,974.54494,198.393489,2544.073602
std,0.947821,0.459375,393.805067,588.238638,1123.447291
min,0.0,0.5,180.0,0.0,175.0
25%,1.0,1.0,733.0,0.0,1795.0
50%,2.0,1.0,953.0,0.0,2300.0
75%,2.0,1.5,1060.0,1.0,2971.0
max,5.0,4.0,5000.0,2017.0,13000.0


In [4]:
# Column dtypes and non-null counts of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1413 entries, 0 to 1412
Data columns (total 7 columns):
bed           1413 non-null int64
bath          1413 non-null float64
square        1413 non-null int64
city          1413 non-null object
year_built    1413 non-null int64
home_type     1413 non-null object
price         1413 non-null int64
dtypes: float64(1), int64(4), object(2)
memory usage: 77.4+ KB


In [5]:
# Preview five random rows; seeded so a re-run shows the same rows
df.sample(5, random_state=1)

Unnamed: 0,bed,bath,square,city,year_built,home_type,price
1315,2,1.0,1006,Union City,1,MULTI_FAMILY,1850
258,1,1.5,733,Hoboken,1,APARTMENT,2550
232,3,1.0,1100,Hoboken,1921,APARTMENT,3500
182,1,1.0,750,Hoboken,1,APARTMENT,1800
785,2,2.0,1006,Jersey City,1,APARTMENT,2400


In [6]:
# Keep the four predictor columns plus the target column 'price'
model_columns = ['bed', 'bath', 'square', 'city', 'price']
data = df[model_columns]

In [7]:
# Preview ten random rows; seeded so a re-run shows the same rows
data.sample(10, random_state=1)

Unnamed: 0,bed,bath,square,city,price
1123,2,1.5,1006,Jersey City,1347
677,3,1.0,1000,Jersey City,1700
1191,1,1.0,733,Union City,2100
1054,3,1.0,850,Jersey City,1675
550,1,1.0,830,Hoboken,3000
1145,1,1.0,733,Jersey City,2650
722,1,1.0,818,Jersey City,2750
832,1,1.0,798,Jersey City,2300
719,1,1.0,600,Jersey City,1400
545,2,2.0,875,Hoboken,3505


In [8]:
# One-hot encode the categorical 'city' column into city_* indicator columns.
# 'bed' and 'bath' are numeric, so they pass through unchanged.
# NOTE(review): all city levels are kept (no reference level dropped).
categorical_part = data[['bed', 'bath', 'city']]
data_one_hot = pd.get_dummies(categorical_part, columns=['city'])

In [9]:
# Preview ten random encoded rows; seeded so a re-run shows the same rows
data_one_hot.sample(10, random_state=1)

Unnamed: 0,bed,bath,city_Hoboken,city_Jersey City,city_Union City
797,2,1.5,0,1,0
806,0,1.0,0,1,0
735,3,1.5,0,1,0
935,2,2.0,0,1,0
1211,1,1.0,0,0,1
1079,2,1.0,0,1,0
53,2,1.5,1,0,0
997,3,2.0,0,1,0
1064,1,1.0,0,1,0
258,1,1.5,1,0,0


In [10]:
# Append the 'square' and 'price' columns to the encoded features.
# Both frames share the same row index, so an index join lines them up
# row for row and leaves 'price' as the last column.
remaining_columns = data[['square', 'price']]
new_data = data_one_hot.join(remaining_columns)

In [11]:
# Preview ten random rows of the assembled table; seeded so a re-run
# shows the same rows
new_data.sample(10, random_state=1)

Unnamed: 0,bed,bath,city_Hoboken,city_Jersey City,city_Union City,square,price
132,2,1.0,1,0,0,1020,3500
51,2,1.5,1,0,0,1006,3965
316,1,1.0,1,0,0,589,1850
347,1,1.0,1,0,0,780,2800
516,1,1.0,1,0,0,733,1920
191,1,1.0,1,0,0,650,1800
32,1,1.0,1,0,0,875,2975
858,1,1.0,0,1,0,766,2650
277,2,2.0,1,0,0,1261,4325
1122,1,1.0,0,1,0,673,2205


In [12]:
# Min-max scale 'square' into the range [0, 1].
# The bounds are read from the untouched `data` frame and kept as named
# variables, so re-running this cell cannot rescale already-scaled values
# and a new input can be scaled the same way at prediction time:
#     scaled = (raw_square - square_min) / (square_max - square_min)
# NOTE(review): the bounds come from all rows, i.e. before the train/test
# split below.
square_min = data['square'].min()
square_max = data['square'].max()
new_data['square'] = (data['square'] - square_min) / (square_max - square_min)

In [13]:
# Preview ten random rows after scaling 'square'; seeded so a re-run
# shows the same rows
new_data.sample(10, random_state=1)

Unnamed: 0,bed,bath,city_Hoboken,city_Jersey City,city_Union City,square,price
943,2,2.0,0,1,0,0.194191,3450
584,1,1.0,0,1,0,0.11473,2505
1389,1,1.0,0,0,1,0.11473,2150
104,1,1.0,1,0,0,0.11473,2175
156,0,1.0,1,0,0,0.095436,2600
347,1,1.0,1,0,0,0.124481,2800
857,3,1.0,0,1,0,0.170124,1700
583,2,1.5,0,1,0,0.171369,3850
176,1,1.5,1,0,0,0.14834,3400
801,1,1.0,0,1,0,0.15249,2450


In [14]:
# Hold out one fifth of the rows as a test set (train:test = 4:1)
from sklearn.model_selection import train_test_split

# 'price' is the target; every other column of new_data is a feature
y = new_data['price']
X = new_data.drop(columns='price')

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1
)

In [15]:
# Report how many rows ended up in each split
split_summary = 'Total Data：{}，Train Data：{}，Test Data：{}'
print(split_summary.format(len(X), len(X_train), len(X_test)))

Total Data：1413，Train Data：1130，Test Data：283


In [27]:
# Tune the regularisation strength (alpha) of an SGD linear regressor
# with 10-fold cross-validation on the training split.
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import cross_val_score

# NOTE(review): 'bed' and 'bath' are fed in unscaled. If standardisation is
# wanted, fit a scaler on X_train only and apply it to X_test.

alphas = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3]
cv_scores = []
for a in alphas:
    # `loss` is left at its default (squared error): the explicit
    # 'squared_loss' spelling is rejected by newer scikit-learn releases.
    # random_state is fixed so the scores are reproducible across runs.
    SGD_model = SGDRegressor(alpha=a, random_state=1)
    scores = cross_val_score(SGD_model, X_train, y_train, cv=10)
    # Mean of the per-fold scores is the figure of merit for this alpha
    cv_score = np.mean(scores)
    print('alpha={}，R-squared score on train data={:.3f}'.format(a, cv_score))
    cv_scores.append(cv_score)

alpha=0.001，R-squared score on train data=0.419
alpha=0.003，R-squared score on train data=0.424
alpha=0.01，R-squared score on train data=0.419
alpha=0.03，R-squared score on train data=0.420
alpha=0.1，R-squared score on train data=0.409
alpha=0.3，R-squared score on train data=0.367


In [33]:
# Refit on the whole training split with the alpha that had the best mean
# cross-validation score.
best_alpha_value = alphas[np.argmax(cv_scores)]

# The fitted model stays bound to the name `best_alpha` because the save
# cell further down dumps the model under that name.
# `loss` is left at its default (squared error): the explicit
# 'squared_loss' spelling is rejected by newer scikit-learn releases.
best_alpha = SGDRegressor(alpha=best_alpha_value, random_state=1)
best_alpha.fit(X_train, y_train)

# Fitted parameters and R-squared on both splits
print('SGDRegressor model coefficient(w): {}'.format(best_alpha.coef_))
print('SGDRegressor model coefficient constant(b): {}'.format(best_alpha.intercept_ ))
print('Train data R-squared score: {:.3f}'.format(best_alpha.score(X_train, y_train)))
print('Test data R-squared score: {:.3f}'.format(best_alpha.score(X_test, y_test)))

SGDRegressor model coefficient(w): [ 225.15887608  977.97753994  677.66257696  101.54093272 -197.87684175
  112.60220405]
SGDRegressor model coefficient constant(b): [595.30443563]
Train data R-squared score: 0.434
Test data R-squared score: 0.424


In [18]:
# Least-squares linear regression on the same train/test split
from sklearn.linear_model import LinearRegression

# fit() returns the estimator itself, so construct and train in one step
linreg_model = LinearRegression().fit(X_train, y_train)

# Score both splits first, then print the fitted parameters and scores
linreg_train_r2 = linreg_model.score(X_train, y_train)
linreg_test_r2 = linreg_model.score(X_test, y_test)

print('LinearRegression model coefficient(w): {}'.format(linreg_model.coef_))
print('LinearRegression model coefficient constant(b): {:.3f}'.format(linreg_model.intercept_))
print('Train data R-squared score: {:.3f}'.format(linreg_train_r2))
print('Test data R-squared score: {:.3f}'.format(linreg_test_r2))

LinearRegression model coefficient(w): [  72.82536841 1037.968642    527.97354631  -93.09761534 -434.87593097
 2004.01506581]
LinearRegression model coefficient constant(b): 631.555
Train data R-squared score: 0.445
Test data R-squared score: 0.434


In [19]:
# Persist the fitted SGD model to disk.
# Newer scikit-learn releases no longer ship `sklearn.externals.joblib`,
# so prefer the standalone package and fall back to the vendored copy
# only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

model_path = './SGDRegression_model.pkl'
# dump() returns the list of files written, which the notebook displays
joblib.dump(best_alpha, model_path)

['./SGDRegression_model.pkl']

In [26]:
# Reload the persisted SGD model and run one hand-built example through it
model = joblib.load('./SGDRegression_model.pkl')

# Feature order follows the training columns:
# bed, bath, city_Hoboken, city_Jersey City, city_Union City, square
# ('square' is the min-max scaled value, not raw square feet)
ex = np.array([1, 1, 1, 0, 0, 0.9])

# predict() expects a 2-D array, so present the sample as a single row
predict = np.round(model.predict(ex.reshape(1, -1))[0], 1)
print(predict)

2577.4
