In [86]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [87]:
df = pd.read_csv('housing.csv')

In [88]:
px.scatter(x = df['sqft_living'], y = df['price'], trendline = 'ols')

In [89]:
fig = px.box(df, y="price")
fig.show()

In [90]:
df.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'greenbelt', 'nuisance', 'view', 'condition', 'grade',
       'heat_source', 'sewer_system', 'sqft_above', 'sqft_basement',
       'sqft_garage', 'sqft_patio', 'yr_built', 'address', 'zip_code'],
      dtype='object')

In [91]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,...,grade,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,address,zip_code
0,920000.0,5,2.5,2770,6703,1.0,0,0,1,AVERAGE,...,7 Average,Oil,PUBLIC,1570,1570,0,240,1950,"11231 Greenwood Avenue North, Seattle, Washing...",98133
1,311000.0,6,2.0,2880,6156,1.0,0,0,0,AVERAGE,...,7 Average,Gas,PUBLIC,1580,1580,0,0,1956,"8504 South 113th Street, Seattle, Washington 9...",98178
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,AVERAGE,...,9 Better,Gas,PUBLIC,1090,1070,200,270,2010,"4079 Letitia Avenue South, Seattle, Washington...",98118
3,625000.0,2,1.0,1190,5688,1.0,0,0,1,NONE,...,7 Average,Electricity,PUBLIC,1190,0,300,0,1948,"1602 North 185th Street, Shoreline, Washington...",98133
4,1317227.0,3,3.0,2080,27574,1.0,0,0,0,AVERAGE,...,9 Better,Oil,PRIVATE,2080,0,0,150,1951,"2633 Southwest 164th Place, Burien, Washington...",98166


# Simple Model only Numerical Data

In [92]:
X = df[['sqft_living','bedrooms','floors','waterfront', 'greenbelt']]
y = df["price"]

In [93]:
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()

y_pred = results.predict(sm.add_constant(X)) 


sklearn_model = LinearRegression()
sklearn_model.fit(X, y)





print(f"""R-Squared: {sklearn_model.score(X, y)}
MAE: {mean_absolute_error(y, y_pred)}
MSRE {mean_squared_error(y, y_pred, squared=False)}""")

R-Squared: 0.45072251757982407
MAE: 294268.57949428365
MSRE 538456.5380970082


# Standardize the simple model

In [94]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)


# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

R^2 value: 0.5057989030903021
MAE: 298641.14473871916
Root Mean Squared Error 649916.8899018852


# Adding Ordinal Data

In [139]:
from sklearn.preprocessing import OrdinalEncoder
# ordering the grade column

cat_subset = df[['grade','condition']]

grade_order = ['2 Substandard','3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good', '9 Better', 
               '10 Very Good', '11 Excellent', '12 Luxury','13 Mansion']

con_order = ['Poor','Fair','Average', 'Good','Very Good']

view_order = ['NONE','FAIR','AVERAGE','GOOD','EXCELLENT']

# reg_grade_order = ['7 Average', '9 Better', '8 Good', '6 Low Average', '5 Fair',
#        '4 Low', '10 Very Good', '11 Excellent', '3 Poor', '12 Luxury']

o_enc = OrdinalEncoder(categories = [grade_order,con_order])
o_enc.fit(cat_subset)

OrdinalEncoder(categories=[['2 Substandard', '3 Poor', '4 Low', '5 Fair',
                            '6 Low Average', '7 Average', '8 Good', '9 Better',
                            '10 Very Good', '11 Excellent', '12 Luxury',
                            '13 Mansion'],
                           ['Poor', 'Fair', 'Average', 'Good', 'Very Good']])

In [140]:
X_subset_or = pd.DataFrame(o_enc.transform(cat_subset), columns = cat_subset.columns)
X_subset_or.head()

Unnamed: 0,grade,condition
0,5.0,2.0
1,5.0,2.0
2,7.0,2.0
3,5.0,2.0
4,7.0,3.0


In [141]:
X_all = pd.concat([X.reset_index(),X_subset_or],axis=1)
X_all = X_all.drop('index', axis=1)

In [142]:
X_all

Unnamed: 0,sqft_living,bedrooms,floors,waterfront,greenbelt,grade,condition
0,2770,5,1.0,0,0,5.0,2.0
1,2880,6,1.0,0,0,5.0,2.0
2,2160,3,2.0,0,0,7.0,2.0
3,1190,2,1.0,0,0,5.0,2.0
4,2080,3,1.0,0,0,7.0,3.0
...,...,...,...,...,...,...,...
11962,1270,3,2.0,0,0,6.0,2.0
11963,1910,5,1.5,0,0,6.0,3.0
11964,2020,3,2.0,0,0,5.0,2.0
11965,1620,3,1.0,0,0,5.0,2.0


In [143]:
data = pd.concat([y.reset_index(),X_all],axis=1)
data = data.drop('index', axis=1)
data.head()

Unnamed: 0,price,sqft_living,bedrooms,floors,waterfront,greenbelt,grade,condition
0,920000.0,2770,5,1.0,0,0,5.0,2.0
1,311000.0,2880,6,1.0,0,0,5.0,2.0
2,775000.0,2160,3,2.0,0,0,7.0,2.0
3,625000.0,1190,2,1.0,0,0,5.0,2.0
4,1317227.0,2080,3,1.0,0,0,7.0,3.0


# Modeling with Ordinal Data (Standardize)

In [144]:
Ox = data.drop('price', axis=1)
Oy = data['price']


X_train, X_test, y_train, y_test = train_test_split(Ox, Oy, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')



R^2 value: 0.5472115527469121
MAE: 286407.8465636741
Root Mean Squared Error 625725.4580455429


# Adding Nomial Data

In [159]:
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False)
# heat source
nominal_data = ['zip_code','sewer_system']

# tranforming the nomial subset
X_nom_trans = onehot_enc.fit_transform(df[nominal_data])

X_norm = pd.DataFrame(X_nom_trans, columns = onehot_enc.get_feature_names())
X_norm.head()

Unnamed: 0,x0_98102,x0_98103,x0_98105,x0_98106,x0_98107,x0_98108,x0_98109,x0_98112,x0_98115,x0_98116,...,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199,x1_PRIVATE,x1_PRIVATE RESTRICTED,x1_PUBLIC,x1_PUBLIC RESTRICTED
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [160]:
new_data = pd.concat([data.reset_index(),X_norm],axis=1)
new_data = new_data.drop('index', axis=1)
new_data.head()

Unnamed: 0,price,sqft_living,bedrooms,floors,waterfront,greenbelt,grade,condition,x0_98102,x0_98103,...,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199,x1_PRIVATE,x1_PRIVATE RESTRICTED,x1_PUBLIC,x1_PUBLIC RESTRICTED
0,920000.0,2770,5,1.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,311000.0,2880,6,1.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,775000.0,2160,3,2.0,0,0,7.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,625000.0,1190,2,1.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1317227.0,2080,3,1.0,0,0,7.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


# Modeling with Nomial Data (Standardize)

In [173]:
px = new_data.drop('price', axis=1)
py = new_data['price']


X_train, X_test, y_train, y_test = train_test_split(px, py, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score

lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')



R^2 value: 0.6382659026654737
MAE: 246341.45285638864
Root Mean Squared Error 641984.18235366


In [174]:
new_data

Unnamed: 0,price,sqft_living,bedrooms,floors,waterfront,greenbelt,grade,condition,x0_98102,x0_98103,...,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199,x1_PRIVATE,x1_PRIVATE RESTRICTED,x1_PUBLIC,x1_PUBLIC RESTRICTED
0,920000.0,2770,5,1.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,311000.0,2880,6,1.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,775000.0,2160,3,2.0,0,0,7.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,625000.0,1190,2,1.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,1317227.0,2080,3,1.0,0,0,7.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11962,719000.0,1270,3,2.0,0,0,6.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11963,1555000.0,1910,5,1.5,0,0,6.0,3.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11964,1313000.0,2020,3,2.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
11965,800000.0,1620,3,1.0,0,0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [186]:
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(px) 


names = poly.get_feature_names(input_features=px.columns)
X_poly = pd.DataFrame(X_poly, columns=names, index = px.index)

X_train, X_test, y_train, y_test = train_test_split(X_poly, py, test_size = 0.3)


ss.fit(X_train)
X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr_raw = LinearRegression()
lr_raw.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr_raw.score(X_standardized_train, y_train)}')
y_pred = lr_raw.predict(X_standardized_test)
print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')


R^2 value: 0.7638841627532762
MAE: 4.005676722337118e+17
Root Mean Squared Error 1.7527932084202416e+19
