In [1]:
import pandas as pd
import numpy as np
import warnings

# Silence every warning for the rest of the session (no category or module
# filter is given, so this applies to all warnings raised by any library).
warnings.filterwarnings('ignore')

In [2]:
# Read the cleaned dataset from the sibling pre_data directory into a DataFrame
clean_data_path = '../pre_data/clean_data.csv'
df = pd.read_csv(clean_data_path)

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

Unnamed: 0,bed,bath,square,year_built,price
count,1413.0,1413.0,1413.0,1413.0,1413.0
mean,1.714084,1.298655,974.54494,198.393489,2544.073602
std,0.947821,0.459375,393.805067,588.238638,1123.447291
min,0.0,0.5,180.0,0.0,175.0
25%,1.0,1.0,733.0,0.0,1795.0
50%,2.0,1.0,953.0,0.0,2300.0
75%,2.0,1.5,1060.0,1.0,2971.0
max,5.0,4.0,5000.0,2017.0,13000.0


In [4]:
# Column dtypes, non-null counts and memory usage of the loaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1413 entries, 0 to 1412
Data columns (total 7 columns):
bed           1413 non-null int64
bath          1413 non-null float64
square        1413 non-null int64
city          1413 non-null object
year_built    1413 non-null int64
home_type     1413 non-null object
price         1413 non-null int64
dtypes: float64(1), int64(4), object(2)
memory usage: 77.4+ KB


In [5]:
# Peek at 5 randomly chosen rows; no random_state is passed, so the rows
# shown differ from run to run.
df.sample(5)

Unnamed: 0,bed,bath,square,city,year_built,home_type,price
299,2,2.0,1285,Hoboken,2011,CONDO,3750
1210,2,1.5,1006,Union City,0,,1495
406,2,2.0,1155,Hoboken,2000,CONDO,3900
1290,2,1.0,1006,Union City,1,MULTI_FAMILY,1450
614,3,1.5,1423,Jersey City,0,,5153


In [6]:
# Keep only the columns used for modelling: four predictors plus the target 'price'
model_columns = ['bed', 'bath', 'square', 'city', 'price']
data = df.loc[:, model_columns]

In [7]:
# Peek at 10 randomly chosen rows of the selected columns (unseeded sample)
data.sample(10)

Unnamed: 0,bed,bath,square,city,price
638,3,1.5,1423,Jersey City,4695
349,1,1.0,551,Hoboken,2100
341,3,2.0,1150,Hoboken,3990
1232,2,1.0,1100,Union City,2000
1114,1,1.0,600,Jersey City,2000
1269,2,1.5,1006,Union City,2250
516,2,2.0,875,Hoboken,3515
780,1,1.0,733,Jersey City,2594
1174,1,1.0,733,Jersey City,1985
504,2,1.5,1006,Hoboken,2500


In [8]:
# One-hot encode the categorical column 'city'; the numeric columns 'bed' and
# 'bath' are passed through unchanged. All city indicator columns are kept
# (no drop_first), so they sum to 1 in every row.
# The result is stored in the new DataFrame data_one_hot.
encoder_input = data[['bed', 'bath', 'city']]
data_one_hot = pd.get_dummies(encoder_input, columns=['city'])

In [9]:
# Peek at 10 randomly chosen rows of the encoded frame (unseeded sample)
data_one_hot.sample(10)

Unnamed: 0,bed,bath,city_Hoboken,city_Jersey City,city_Union City
1410,1,1.0,0,0,1
1391,2,1.5,0,0,1
240,3,2.0,1,0,0
1261,1,1.0,0,0,1
721,2,1.5,0,1,0
507,2,2.0,1,0,0
464,2,2.0,1,0,0
36,2,1.5,1,0,0
1253,1,1.0,0,0,1
1377,2,1.0,0,0,1


In [10]:
# Concatenate data_one_hot with the 'square' and 'price' columns, side by side;
# 'price' ends up as the last column.
square_and_price = data[['square', 'price']]
new_data = pd.concat([data_one_hot, square_and_price], axis='columns')

In [11]:
# Min-max scale 'square' into the range [0, 1].
# NOTE(review): the minimum and maximum are taken over all rows, i.e. before
# the train/test split, so rows later held out as test data also influence
# the scaling.
square_min = new_data['square'].min()
square_max = new_data['square'].max()
new_data['square'] = (new_data['square'] - square_min) / (square_max - square_min)

In [12]:
# Peek at 10 randomly chosen rows after scaling 'square' (unseeded sample)
new_data.sample(10)

Unnamed: 0,bed,bath,city_Hoboken,city_Jersey City,city_Union City,square,price
693,0,1.0,0,1,0,0.095436,2385
565,3,1.5,0,1,0,0.273859,2299
1347,0,1.0,0,0,1,0.325726,1950
1267,1,1.0,0,0,1,0.11473,1375
459,2,2.0,1,0,0,0.253112,3400
272,2,2.0,1,0,0,0.204772,4125
327,2,2.0,1,0,0,0.201245,3800
497,1,1.0,1,0,0,0.075934,2100
274,2,2.0,1,0,0,0.164938,3635
758,2,1.5,0,1,0,0.171369,3495


In [13]:
# Hold out one fifth of the rows for testing (train:test = 4:1)
from sklearn.model_selection import train_test_split

# Features are every column except the target; the target is 'price'
# ('price' is the last column of new_data).
X = new_data.drop(columns='price')
y = new_data['price']

# random_state is fixed so the partition is the same on every run
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

In [14]:
# Report how many rows are in the full set and in each partition
n_total, n_train, n_test = len(X), len(X_train), len(X_test)
print('Total Data：{}，Train Data：{}，Test Data：{}'.format(n_total, n_train, n_test))

Total Data：1413，Train Data：1130，Test Data：283


In [15]:
from sklearn.linear_model import LogisticRegression

# NOTE(review): LogisticRegression is a classifier, while y_train holds the
# numeric 'price' column. Every distinct price is therefore treated as its own
# class label, and score() reports the fraction of test rows whose exact price
# was predicted -- which is why the value printed below is so low. A regressor
# (see the LinearRegression cell that follows) is the appropriate model for
# this target.

# Collected accuracy values
lr_acc = []

# Fit a LogisticRegression with default arguments on the training partition
lr_model = LogisticRegression().fit(X_train, y_train)

# Mean accuracy on the test partition, recorded in lr_acc
lr_test_score = lr_model.score(X_test, y_test)
lr_acc.append(lr_test_score)

print('Accuracy：', lr_acc[0])

Accuracy： 0.04240282685512368


In [16]:
from sklearn.linear_model import LinearRegression

# Fit a LinearRegression with default arguments on the training partition
linreg_model = LinearRegression().fit(X_train, y_train)

# R-squared score on each partition
train_r2 = linreg_model.score(X_train, y_train)
test_r2 = linreg_model.score(X_test, y_test)

# Print the fitted coefficients (one per feature column, in column order),
# the intercept, and both scores
print('LinearRegression model coefficient(w): {}'.format(linreg_model.coef_))
print('LinearRegression model coefficient constant(b): {:.3f}'.format(linreg_model.intercept_))
print('Train data R-squared score: {:.3f}'.format(train_r2))
print('Test data R-squared score: {:.3f}'.format(test_r2))

LinearRegression model coefficient(w): [  94.60395558  969.02700318  508.97803336  -54.1489398  -454.82909357
 1942.64934089]
LinearRegression model coefficient constant(b): 699.363
Train data R-squared score: 0.418
Test data R-squared score: 0.532
