In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [37]:
# Load the housing dataset from the CSV file (path is relative to the working directory).
df = pd.read_csv("housing.csv")

In [38]:
# Preview the first five rows to check the available columns and value formats.
df.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,...,grade,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,address,zip_code
0,920000.0,5,2.5,2770,6703,1.0,0,0,1,AVERAGE,...,7 Average,Oil,PUBLIC,1570,1570,0,240,1950,"11231 Greenwood Avenue North, Seattle, Washing...",98133
1,311000.0,6,2.0,2880,6156,1.0,0,0,0,AVERAGE,...,7 Average,Gas,PUBLIC,1580,1580,0,0,1956,"8504 South 113th Street, Seattle, Washington 9...",98178
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,AVERAGE,...,9 Better,Gas,PUBLIC,1090,1070,200,270,2010,"4079 Letitia Avenue South, Seattle, Washington...",98118
3,625000.0,2,1.0,1190,5688,1.0,0,0,1,NONE,...,7 Average,Electricity,PUBLIC,1190,0,300,0,1948,"1602 North 185th Street, Shoreline, Washington...",98133
4,1317227.0,3,3.0,2080,27574,1.0,0,0,0,AVERAGE,...,9 Better,Oil,PRIVATE,2080,0,0,150,1951,"2633 Southwest 164th Place, Burien, Washington...",98166


# Modeling

In [39]:
# Predictor columns for the baseline regression; the target is the `price` column.
feature_columns = [
    'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'greenbelt', 'nuisance',
    'sqft_above', 'sqft_basement', 'sqft_garage', 'sqft_patio',
    'yr_built',
]

X = df[feature_columns]
y = df['price']

In [40]:
# Baseline: ordinary least squares on the raw features, evaluated in-sample
# (the metrics below are computed on the same rows the model was fitted on).

# sm.OLS does not add an intercept by itself, so build the design matrix with
# a constant column once and reuse it for both fitting and predicting.
X_const = sm.add_constant(X)
model = sm.OLS(y, X_const)
results = model.fit()
y_pred = results.predict(X_const)

# scikit-learn fit of the same regression; used here only for R^2 via .score().
sklearn_model = LinearRegression()
sklearn_model.fit(X, y)

# Take the square root of the MSE explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so the old call raises TypeError on current versions.
rmse = np.sqrt(mean_squared_error(y, y_pred))

# NOTE: the "MSRE" label is kept verbatim so the printed output is unchanged;
# the value it reports is the root-mean-squared error (RMSE).
print(f"""R-Squared: {sklearn_model.score(X, y)}
MAE: {mean_absolute_error(y, y_pred)}
MSRE {rmse}

""")


R-Squared: 0.48204719866334544
MAE: 281337.7808576937
MSRE 522877.3747663305




# Standardize

In [44]:
# Hold out 30% of the rows for evaluation. The seed is fixed so that the split
# (and therefore every metric printed below) is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardizing: fit the scaler on the training rows only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
ss = StandardScaler()
ss.fit(X_train)

X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)

# Linear regression model on the standardized features.
lr = LinearRegression()
lr.fit(X_standardized_train, y_train)

# NOTE: this R^2 is computed on the training split, whereas the error metrics
# below are computed on the held-out test split.
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')

y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_test, y_pred)}')

# Square root of the MSE taken explicitly: the `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rms = np.sqrt(mean_squared_error(y_test, y_pred))

print(f'Root Mean Squared Error {rms}')

# Feature names side by side with their fitted coefficients.
# Both resulting columns carry the default label 0.
coefficients = pd.concat(
    [pd.DataFrame(X.columns), pd.DataFrame(np.transpose(lr.coef_))],
    axis=1,
)

R^2 value: 0.4726295942195612
MAE: 279265.0967610278
Root Mean Squared Error 409838.90346517833


# Adding Ordinal Data

In [45]:
from sklearn.preprocessing import OrdinalEncoder

# Category orderings, lowest to highest; a value's position in its list
# becomes the integer code the encoder assigns to it.
grade_order = [
    '2 Substandard', '3 Poor', '4 Low', '5 Fair', '6 Low Average',
    '7 Average', '8 Good', '9 Better', '10 Very Good', '11 Excellent',
    '12 Luxury', '13 Mansion',
]
con_order = ['Poor', 'Fair', 'Average', 'Good', 'Very Good']
view_order = ['NONE', 'FAIR', 'AVERAGE', 'GOOD', 'EXCELLENT']

# The ordered categorical columns, in the same order as the category lists
# passed to the encoder below.
cat_subset = df[['grade', 'condition', 'view']]

# fit() returns the encoder itself, so the fitted instance is bound directly.
o_enc = OrdinalEncoder(
    categories=[grade_order, con_order, view_order]
).fit(cat_subset)

# Last expression of the cell: display the fitted encoder.
o_enc

OrdinalEncoder(categories=[['2 Substandard', '3 Poor', '4 Low', '5 Fair',
                            '6 Low Average', '7 Average', '8 Good', '9 Better',
                            '10 Very Good', '11 Excellent', '12 Luxury',
                            '13 Mansion'],
                           ['Poor', 'Fair', 'Average', 'Good', 'Very Good'],
                           ['NONE', 'FAIR', 'AVERAGE', 'GOOD', 'EXCELLENT']])

In [46]:
# Encode the ordered categorical columns as numeric codes and wrap the result
# in a DataFrame that reuses the original column names.
encoded_values = o_enc.transform(cat_subset)
X_subset_or = pd.DataFrame(encoded_values, columns=cat_subset.columns)
X_subset_or.head()

Unnamed: 0,grade,condition,view
0,5.0,2.0,2.0
1,5.0,2.0,2.0
2,7.0,2.0,2.0
3,5.0,2.0,0.0
4,7.0,3.0,2.0
