In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [16]:
# adding the data
df = pd.read_csv('data/kc_house_data.csv')

In [17]:
# dropping the duplicated row
df.drop_duplicates()
# dropping the Nulls
df = df.dropna()

In [18]:
# removoing unnessary data
df = df.drop(['id', 'date', 'lat', 'long','yr_renovated'], axis = 1)

In [19]:
# getting the zip code
code = [x.split()[-3] for x in df['address']]
zip_code = [x.translate(str.maketrans('', '', string.punctuation)) for x in code]
df['zip_code'] = zip_code

In [20]:
# making the waterfront greenbelt and nuisance into binary
binary = ['waterfront','greenbelt','nuisance']
df[binary] = (df[binary] == 'YES').astype('int')

In [21]:

# only getting zip code from kings county
king_county = ['98101', '98102', '98103', '98104', '98105', '98106', '98107' , '98108' , '98109' , '98110', 
        '98111' , '98112', '98114' , '98115', '98116' , '98117', '98118' , '98119' , '98121' , '98122' , 
        '98124' , '98125' , '98126' , '98129' , '98131' , '98132' , '98133' , '98134' , '98136' , '98138' , 
        '98144' , '98145' , '98146' , '98148' , '98151' , '98154' , '98155' , '98158' , '98160' , '98161' , 
        '98164' , '98166' , '98168' , '98170' , '98171' , '98174' , 
        '98177' , '98178' , '98181' , '98184' , '98185' , '98188' , 
        '98190' , '98191' , '98195' , '98198' , '98199' ]

In [22]:
len(king_county)

57

In [23]:
df = df[df['zip_code'].isin(king_county)]

In [24]:
df.shape

(11967, 21)

In [25]:
# creating a housing csv to have separate
df.to_csv('housing.csv', index=False)

In [26]:
# only number columns
df.select_dtypes(exclude=['object']).head().columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'greenbelt', 'nuisance', 'sqft_above', 'sqft_basement',
       'sqft_garage', 'sqft_patio', 'yr_built'],
      dtype='object')

In [27]:
X = df[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'greenbelt', 'nuisance', 'sqft_above', 'sqft_basement',
       'sqft_garage', 'sqft_patio', 'yr_built']]
y = df["price"]

In [30]:
# creating a simple model
import statsmodels.api as sm
model = sm.OLS(y, sm.add_constant(X))
results = model.fit()

print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.482
Model:                            OLS   Adj. R-squared:                  0.481
Method:                 Least Squares   F-statistic:                     855.7
Date:                Tue, 30 May 2023   Prob (F-statistic):               0.00
Time:                        19:11:35   Log-Likelihood:            -1.7455e+05
No. Observations:               11967   AIC:                         3.491e+05
Df Residuals:                   11953   BIC:                         3.492e+05
Df Model:                          13                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const          4.762e+06   3.68e+05     12.933

In [31]:
from sklearn.linear_model import LinearRegression


sklearn_third_model = LinearRegression()
sklearn_third_model.fit(X, y)


y_pred = results.predict(sm.add_constant(X)) 


from sklearn.metrics import mean_absolute_error, mean_squared_error


print(f"""
scikit-learn R-Squared:   {sklearn_third_model.score(X, y)}
MAE: {mean_absolute_error(y, y_pred)}
MSRE {mean_squared_error(y, y_pred, squared=False)}

""")


scikit-learn R-Squared:   0.48204719866334544
MAE: 281337.7808576937
MSRE 522877.3747663305




# Standardize

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

R^2 value: 0.4529306827916003
MAE: 291593.08603264997
Root Mean Squared Error 468913.73216258135


# Adding Ordinal Data

In [33]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,view,...,grade,heat_source,sewer_system,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,address,zip_code
1,920000.0,5,2.5,2770,6703,1.0,0,0,1,AVERAGE,...,7 Average,Oil,PUBLIC,1570,1570,0,240,1950,"11231 Greenwood Avenue North, Seattle, Washing...",98133
2,311000.0,6,2.0,2880,6156,1.0,0,0,0,AVERAGE,...,7 Average,Gas,PUBLIC,1580,1580,0,0,1956,"8504 South 113th Street, Seattle, Washington 9...",98178
3,775000.0,3,3.0,2160,1400,2.0,0,0,0,AVERAGE,...,9 Better,Gas,PUBLIC,1090,1070,200,270,2010,"4079 Letitia Avenue South, Seattle, Washington...",98118
5,625000.0,2,1.0,1190,5688,1.0,0,0,1,NONE,...,7 Average,Electricity,PUBLIC,1190,0,300,0,1948,"1602 North 185th Street, Shoreline, Washington...",98133
6,1317227.0,3,3.0,2080,27574,1.0,0,0,0,AVERAGE,...,9 Better,Oil,PRIVATE,2080,0,0,150,1951,"2633 Southwest 164th Place, Burien, Washington...",98166


In [34]:
from sklearn.preprocessing import OrdinalEncoder
# ordering the grade column

cat_subset = df[['grade','condition','view']]

grade_order = ['2 Substandard','3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good', '9 Better', 
               '10 Very Good', '11 Excellent', '12 Luxury','13 Mansion']

con_order = ['Poor','Fair','Average', 'Good','Very Good']

view_order = ['NONE','FAIR','AVERAGE','GOOD','EXCELLENT']

# reg_grade_order = ['7 Average', '9 Better', '8 Good', '6 Low Average', '5 Fair',
#        '4 Low', '10 Very Good', '11 Excellent', '3 Poor', '12 Luxury']

o_enc = OrdinalEncoder(categories = [grade_order,con_order,view_order])
o_enc.fit(cat_subset)

OrdinalEncoder(categories=[['2 Substandard', '3 Poor', '4 Low', '5 Fair',
                            '6 Low Average', '7 Average', '8 Good', '9 Better',
                            '10 Very Good', '11 Excellent', '12 Luxury',
                            '13 Mansion'],
                           ['Poor', 'Fair', 'Average', 'Good', 'Very Good'],
                           ['NONE', 'FAIR', 'AVERAGE', 'GOOD', 'EXCELLENT']])

In [35]:
X_subset_or = pd.DataFrame(o_enc.transform(cat_subset), columns = cat_subset.columns)
X_subset_or.head()

Unnamed: 0,grade,condition,view
0,5.0,2.0,2.0
1,5.0,2.0,2.0
2,7.0,2.0,2.0
3,5.0,2.0,0.0
4,7.0,3.0,2.0


In [36]:
X_all = pd.concat([X.reset_index(),X_subset_or],axis=1)

In [37]:
X_all = X_all.drop('index', axis=1)

In [38]:
data = pd.concat([y.reset_index(),X_all],axis=1)

In [39]:
data = data.drop('index', axis=1)

In [40]:
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,grade,condition,view
0,920000.0,5,2.5,2770,6703,1.0,0,0,1,1570,1570,0,240,1950,5.0,2.0,2.0
1,311000.0,6,2.0,2880,6156,1.0,0,0,0,1580,1580,0,0,1956,5.0,2.0,2.0
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,1090,1070,200,270,2010,7.0,2.0,2.0
3,625000.0,2,1.0,1190,5688,1.0,0,0,1,1190,0,300,0,1948,5.0,2.0,0.0
4,1317227.0,3,3.0,2080,27574,1.0,0,0,0,2080,0,0,150,1951,7.0,3.0,2.0


In [41]:
Zx = data.drop('price', axis=1)
Zy = data['price']

In [42]:
X_train, X_test, y_train, y_test = train_test_split(Zx, Zy, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

R^2 value: 0.5855901065260967
MAE: 261731.49343466506
Root Mean Squared Error 606028.3194946006


# Nomial Data

In [43]:
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False)
# heat source
nominal_data = ['zip_code']

# tranforming the nomial subset
X_nom_trans = onehot_enc.fit_transform(df[nominal_data])

X_norm = pd.DataFrame(X_nom_trans, columns = onehot_enc.get_feature_names())
X_norm.head()

Unnamed: 0,x0_98102,x0_98103,x0_98105,x0_98106,x0_98107,x0_98108,x0_98109,x0_98112,x0_98115,x0_98116,...,x0_98146,x0_98148,x0_98155,x0_98166,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
new_data = pd.concat([data.reset_index(),X_norm],axis=1)

In [45]:
new_data = new_data.drop('index', axis=1)
new_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,sqft_above,...,x0_98146,x0_98148,x0_98155,x0_98166,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199
0,920000.0,5,2.5,2770,6703,1.0,0,0,1,1570,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,311000.0,6,2.0,2880,6156,1.0,0,0,0,1580,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,775000.0,3,3.0,2160,1400,2.0,0,0,0,1090,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,625000.0,2,1.0,1190,5688,1.0,0,0,1,1190,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1317227.0,3,3.0,2080,27574,1.0,0,0,0,2080,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
px = new_data.drop('price', axis=1)
py = new_data['price']


X_train, X_test, y_train, y_test = train_test_split(px, py, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score

lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

R^2 value: 0.6711326632565588
MAE: 233954.21227745916
Root Mean Squared Error 626339.4175637384


# POLY

In [1138]:
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(px) 


names = poly.get_feature_names(input_features=px.columns)
X_poly = pd.DataFrame(X_poly, columns=names, index = px.index)

X_train, X_test, y_train, y_test = train_test_split(X_poly, py, test_size = 0.3)


ss.fit(X_train)
X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr_raw = LinearRegression()
lr_raw.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr_raw.score(X_standardized_train, y_train)}')
y_pred = lr_raw.predict(X_standardized_test)
print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')


R^2 value: 0.8473263253790224
MAE: 5505240893305737.0
Root Mean Squared Error 2.4123798006690538e+17
