In [274]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

In [275]:
df = pd.read_csv('clean_house.csv')

In [276]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,...,grade,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,address,zip_code
0,920000.0,5,2.5,2770,6703,1.0,0,0,1,AVERAGE,...,7 Average,Oil,PUBLIC,1570,1570,0,240,1950,"11231 Greenwood Avenue North, Seattle, Washing...",98133
1,311000.0,6,2.0,2880,6156,1.0,0,0,0,AVERAGE,...,7 Average,Gas,PUBLIC,1580,1580,0,0,1956,"8504 South 113th Street, Seattle, Washington 9...",98178
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,AVERAGE,...,9 Better,Gas,PUBLIC,1090,1070,200,270,2010,"4079 Letitia Avenue South, Seattle, Washington...",98118
3,625000.0,2,1.0,1190,5688,1.0,0,0,1,NONE,...,7 Average,Electricity,PUBLIC,1190,0,300,0,1948,"1602 North 185th Street, Shoreline, Washington...",98133
4,1317227.0,3,3.0,2080,27574,1.0,0,0,0,AVERAGE,...,9 Better,Oil,PRIVATE,2080,0,0,150,1951,"2633 Southwest 164th Place, Burien, Washington...",98166


In [277]:
df['log_price'] = np.log(df['price'])

In [278]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,...,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,address,zip_code,log_price
0,920000.0,5,2.5,2770,6703,1.0,0,0,1,AVERAGE,...,Oil,PUBLIC,1570,1570,0,240,1950,"11231 Greenwood Avenue North, Seattle, Washing...",98133,13.732129
1,311000.0,6,2.0,2880,6156,1.0,0,0,0,AVERAGE,...,Gas,PUBLIC,1580,1580,0,0,1956,"8504 South 113th Street, Seattle, Washington 9...",98178,12.647548
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,AVERAGE,...,Gas,PUBLIC,1090,1070,200,270,2010,"4079 Letitia Avenue South, Seattle, Washington...",98118,13.560618
3,625000.0,2,1.0,1190,5688,1.0,0,0,1,NONE,...,Electricity,PUBLIC,1190,0,300,0,1948,"1602 North 185th Street, Shoreline, Washington...",98133,13.345507
4,1317227.0,3,3.0,2080,27574,1.0,0,0,0,AVERAGE,...,Oil,PRIVATE,2080,0,0,150,1951,"2633 Southwest 164th Place, Burien, Washington...",98166,14.091039


In [279]:
fig = px.histogram(df, x='log_price')
fig.show()

# Simple Model only Numerical Data

In [347]:
X = df[['sqft_living','bedrooms','floors','waterfront', 'greenbelt','sqft_above',
        'yr_built','sqft_patio','sqft_basement','bathrooms','sqft_garage']]
y = df["log_price"]

In [348]:
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()

y_pred = results.predict(sm.add_constant(X)) 


sklearn_model = LinearRegression()
sklearn_model.fit(X, y)





print(f"""R-Squared: {sklearn_model.score(X, y)}
MAE: {mean_absolute_error(y, y_pred)}
MSRE {mean_squared_error(y, y_pred, squared=False)}""")
results.summary()

R-Squared: 0.29116984455164263
MAE: 0.23301506651825002
MSRE 0.3188509752856877


0,1,2,3
Dep. Variable:,log_price,R-squared:,0.291
Model:,OLS,Adj. R-squared:,0.29
Method:,Least Squares,F-statistic:,399.9
Date:,"Wed, 31 May 2023",Prob (F-statistic):,0.0
Time:,12:02:42,Log-Likelihood:,-2958.0
No. Observations:,10721,AIC:,5940.0
Df Residuals:,10709,BIC:,6027.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,17.7082,0.250,70.926,0.000,17.219,18.198
sqft_living,0.0001,1.35e-05,7.993,0.000,8.16e-05,0.000
bedrooms,-0.0265,0.004,-6.142,0.000,-0.035,-0.018
floors,0.1283,0.007,17.111,0.000,0.114,0.143
waterfront,0.0975,0.053,1.824,0.068,-0.007,0.202
greenbelt,0.2459,0.160,1.541,0.123,-0.067,0.559
sqft_above,8.59e-05,1.43e-05,5.996,0.000,5.78e-05,0.000
yr_built,-0.0024,0.000,-18.805,0.000,-0.003,-0.002
sqft_patio,0.0001,1.78e-05,7.989,0.000,0.000,0.000

0,1,2,3
Omnibus:,2882.587,Durbin-Watson:,1.986
Prob(Omnibus):,0.0,Jarque-Bera (JB):,13080.812
Skew:,-1.242,Prob(JB):,0.0
Kurtosis:,7.808,Cond. No.,246000.0


# Standardize the simple model

In [349]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)


# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

R^2 value: 0.2832002804103214
MAE: 0.2301399917125462
Root Mean Squared Error 0.3135552080440262


# Adding Ordinal Data

In [350]:
from sklearn.preprocessing import OrdinalEncoder
# ordering the grade column

cat_subset = df[['grade','condition']]

grade_order = ['2 Substandard','3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good', '9 Better', 
               '10 Very Good', '11 Excellent', '12 Luxury','13 Mansion']

con_order = ['Poor','Fair','Average', 'Good','Very Good']

view_order = ['NONE','FAIR','AVERAGE','GOOD','EXCELLENT']

# reg_grade_order = ['7 Average', '9 Better', '8 Good', '6 Low Average', '5 Fair',
#        '4 Low', '10 Very Good', '11 Excellent', '3 Poor', '12 Luxury']

o_enc = OrdinalEncoder(categories = [grade_order,con_order])
o_enc.fit(cat_subset)

OrdinalEncoder(categories=[['2 Substandard', '3 Poor', '4 Low', '5 Fair',
                            '6 Low Average', '7 Average', '8 Good', '9 Better',
                            '10 Very Good', '11 Excellent', '12 Luxury',
                            '13 Mansion'],
                           ['Poor', 'Fair', 'Average', 'Good', 'Very Good']])

In [351]:
X_subset_or = pd.DataFrame(o_enc.transform(cat_subset), columns = cat_subset.columns)
X_subset_or.head()

Unnamed: 0,grade,condition
0,5.0,2.0
1,5.0,2.0
2,7.0,2.0
3,5.0,2.0
4,7.0,3.0


In [352]:
X_all = pd.concat([X.reset_index(),X_subset_or],axis=1)
X_all = X_all.drop('index', axis=1)

In [353]:
data = pd.concat([y.reset_index(),X_all],axis=1)
data = data.drop('index', axis=1)
data.head()

Unnamed: 0,log_price,sqft_living,bedrooms,floors,waterfront,greenbelt,sqft_above,yr_built,sqft_patio,sqft_basement,bathrooms,sqft_garage,grade,condition
0,13.732129,2770,5,1.0,0,0,1570,1950,240,1570,2.5,0,5.0,2.0
1,12.647548,2880,6,1.0,0,0,1580,1956,0,1580,2.0,0,5.0,2.0
2,13.560618,2160,3,2.0,0,0,1090,2010,270,1070,3.0,200,7.0,2.0
3,13.345507,1190,2,1.0,0,0,1190,1948,0,0,1.0,300,5.0,2.0
4,14.091039,2080,3,1.0,0,0,2080,1951,150,0,3.0,0,7.0,3.0


# Modeling with Ordinal Data

 # (Standardize)

In [354]:

Ox = data.drop('log_price', axis=1)
Oy = data['log_price']

X_train, X_test, y_train, y_test = train_test_split(Ox, Oy, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')



R^2 value: 0.3817789396810566
MAE: 0.21734022113671814
Root Mean Squared Error 0.30989567425767783


# Adding Nomial Data

In [355]:
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False)
# heat source
nominal_data = ['zip_code']

# tranforming the nomial subset
X_nom_trans = onehot_enc.fit_transform(df[nominal_data])

X_norm = pd.DataFrame(X_nom_trans, columns = onehot_enc.get_feature_names())
X_norm.head()

Unnamed: 0,x0_98102,x0_98103,x0_98105,x0_98106,x0_98107,x0_98108,x0_98109,x0_98112,x0_98115,x0_98116,...,x0_98146,x0_98148,x0_98155,x0_98166,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [356]:
new_data = pd.concat([data.reset_index(),X_norm],axis=1)
new_data = new_data.drop('index', axis=1)
new_data.head()

Unnamed: 0,log_price,sqft_living,bedrooms,floors,waterfront,greenbelt,sqft_above,yr_built,sqft_patio,sqft_basement,...,x0_98146,x0_98148,x0_98155,x0_98166,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199
0,13.732129,2770,5,1.0,0,0,1570,1950,240,1570,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,12.647548,2880,6,1.0,0,0,1580,1956,0,1580,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,13.560618,2160,3,2.0,0,0,1090,2010,270,1070,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,13.345507,1190,2,1.0,0,0,1190,1948,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,14.091039,2080,3,1.0,0,0,2080,1951,150,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling with Nomial Data 

# Standardize

In [358]:
px = new_data.drop('log_price', axis=1)
py = new_data['log_price']


X_train, X_test, y_train, y_test = train_test_split(px, py, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score

lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')



R^2 value: 0.5479336387145659
MAE: 0.17471310754513286
Root Mean Squared Error 0.26803468731413316
