In [573]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

In [574]:
df = pd.read_csv('data/kc_house_data.csv')

# Dropping Nulls and Duplicates

In [575]:
df.drop_duplicates()
# dropping the Nulls
df = df.dropna()

# Adding zip code

In [576]:
# getting the zip code
code = [x.split()[-3] for x in df['address']]
zip_code = [x.translate(str.maketrans('', '', string.punctuation)) for x in code]
df['zip_code'] = zip_code

# Making yes, no columns into Binarys

In [577]:
# making the waterfront greenbelt and nuisance into binary
binary = ['waterfront','greenbelt','nuisance']
df[binary] = (df[binary] == 'YES').astype('int')

In [578]:
df.shape

(30111, 26)

In [579]:
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,...,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,yr_renovated,address,lat,long,zip_code
0,7399300360,5/24/2022,675000.0,4,1.0,1180,7140,1.0,0,0,...,1180,0,0,40,1969,0,"2102 Southeast 21st Court, Renton, Washington ...",47.461975,-122.19052,98055
1,8910500230,12/13/2021,920000.0,5,2.5,2770,6703,1.0,0,0,...,1570,1570,0,240,1950,0,"11231 Greenwood Avenue North, Seattle, Washing...",47.711525,-122.35591,98133
2,1180000275,9/29/2021,311000.0,6,2.0,2880,6156,1.0,0,0,...,1580,1580,0,0,1956,0,"8504 South 113th Street, Seattle, Washington 9...",47.502045,-122.2252,98178
3,1604601802,12/14/2021,775000.0,3,3.0,2160,1400,2.0,0,0,...,1090,1070,200,270,2010,0,"4079 Letitia Avenue South, Seattle, Washington...",47.56611,-122.2902,98118
4,8562780790,8/24/2021,592500.0,2,2.0,1120,758,2.0,0,0,...,1120,550,550,30,2012,0,"2193 Northwest Talus Drive, Issaquah, Washingt...",47.53247,-122.07188,98027


# Getting Kings County zip codes

In [580]:
king_county_zip = [98052,98092,98115,98103,98003,98023,98105,98042,98034,98133,98118,98125,98058,98031,98122,98033,98032,98004,98059,98001,98006,98056,98030,98155,98168,98117,98002,98198,98038,98029,98027,98109,98074,98144,98007,98040,98072,98146,98011,98107,98106,98188,98008,98178,98108,98075,98028,98112,98119,98055,98053,98102,98022,98126,98116,98005,98121,98199,98177,98166,98136,98045,98077,98101,98057,98065,98104,98019,98148,98014,98354,98070,98010,98024,98051,98047,98151,98251,98134,98039,98195,98158,98154,98161,98174,98068,98256,98288,98050,98132,98171,98184,98054,98164,98181,98224,98191,98009,98013,98015,98025,98035,98041,98062,98064,98063,98071,98073,98083,98089,98093,98111,98113,98114,98124,98127,98131,98129,98139,98138,98141,98145,98160,98165,98170,98175,98185,98190,98194,98471,98481]





len(king_county_zip)

131

In [581]:
kings_county = list(map(str, king_county_zip))

In [582]:
df = df[df['zip_code'].isin(kings_county)]

In [583]:
df.shape

(29175, 26)

# Setting our dataset to only 4M and lower

In [584]:
df = df[df['price'] <= 4000000]

In [585]:
df['price'].max()

4000000.0

In [586]:
X = df[['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'greenbelt', 'nuisance', 'sqft_above', 'sqft_basement',
       'sqft_garage', 'sqft_patio', 'yr_built']]
y = df["price"]

# Modeling 

In [587]:
from sklearn.linear_model import LinearRegression


sklearn_third_model = LinearRegression()
sklearn_third_model.fit(X, y)


y_pred = results.predict(sm.add_constant(X)) 


from sklearn.metrics import mean_absolute_error, mean_squared_error


print(f"""
scikit-learn R-Squared:   {sklearn_third_model.score(X, y)}
MAE: {mean_absolute_error(y, y_pred)}
MSRE {mean_squared_error(y, y_pred, squared=False)}

""")


scikit-learn R-Squared:   0.4515752349364974
MAE: 327435.962764726
MSRE 456984.76356014557




# Standardize

In [588]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

R^2 value: 0.4552658495622859
MAE: 326944.30270532804
Root Mean Squared Error 454505.42091060866


# Adding Ordinal Data

In [589]:
from sklearn.preprocessing import OrdinalEncoder
# ordering the grade column

cat_subset = df[['grade','condition','view']]

grade_order = ['2 Substandard','3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good', '9 Better', 
               '10 Very Good', '11 Excellent', '12 Luxury','13 Mansion']

con_order = ['Poor','Fair','Average', 'Good','Very Good']

view_order = ['NONE','FAIR','AVERAGE','GOOD','EXCELLENT']

# reg_grade_order = ['7 Average', '9 Better', '8 Good', '6 Low Average', '5 Fair',
#        '4 Low', '10 Very Good', '11 Excellent', '3 Poor', '12 Luxury']

o_enc = OrdinalEncoder(categories = [grade_order,con_order,view_order])
o_enc.fit(cat_subset)

OrdinalEncoder(categories=[['2 Substandard', '3 Poor', '4 Low', '5 Fair',
                            '6 Low Average', '7 Average', '8 Good', '9 Better',
                            '10 Very Good', '11 Excellent', '12 Luxury',
                            '13 Mansion'],
                           ['Poor', 'Fair', 'Average', 'Good', 'Very Good'],
                           ['NONE', 'FAIR', 'AVERAGE', 'GOOD', 'EXCELLENT']])

In [590]:
X_subset_or = pd.DataFrame(o_enc.transform(cat_subset), columns = cat_subset.columns)
X_subset_or.head()

Unnamed: 0,grade,condition,view
0,5.0,3.0,0.0
1,5.0,2.0,2.0
2,5.0,2.0,2.0
3,7.0,2.0,2.0
4,5.0,2.0,0.0


In [591]:
X_all = pd.concat([X.reset_index(),X_subset_or],axis=1)
X_all = X_all.drop('index', axis=1)
data = pd.concat([y.reset_index(),X_all],axis=1)
data = data.drop('index', axis=1)
data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,sqft_above,sqft_basement,sqft_garage,sqft_patio,yr_built,grade,condition,view
0,675000.0,4,1.0,1180,7140,1.0,0,0,0,1180,0,0,40,1969,5.0,3.0,0.0
1,920000.0,5,2.5,2770,6703,1.0,0,0,1,1570,1570,0,240,1950,5.0,2.0,2.0
2,311000.0,6,2.0,2880,6156,1.0,0,0,0,1580,1580,0,0,1956,5.0,2.0,2.0
3,775000.0,3,3.0,2160,1400,2.0,0,0,0,1090,1070,200,270,2010,7.0,2.0,2.0
4,592500.0,2,2.0,1120,758,2.0,0,0,1,1120,550,550,30,2012,5.0,2.0,0.0


In [592]:
Zx = data.drop('price', axis=1)
Zy = data['price']

In [593]:
X_train, X_test, y_train, y_test = train_test_split(Zx, Zy, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score
lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

R^2 value: 0.5344089705786625
MAE: 302242.25844895537
Root Mean Squared Error 429745.10654043953


# Nomial Data

In [594]:
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False)
# heat source
nominal_data = ['zip_code']

# tranforming the nomial subset
X_nom_trans = onehot_enc.fit_transform(df[nominal_data])

X_norm = pd.DataFrame(X_nom_trans, columns = onehot_enc.get_feature_names())
X_norm.head()

Unnamed: 0,x0_98001,x0_98002,x0_98003,x0_98004,x0_98005,x0_98006,x0_98007,x0_98008,x0_98010,x0_98011,...,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199,x0_98224,x0_98251,x0_98288,x0_98354
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [595]:
new_data = pd.concat([data.reset_index(),X_norm],axis=1)
new_data = new_data.drop('index', axis=1)
new_data.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,greenbelt,nuisance,sqft_above,...,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199,x0_98224,x0_98251,x0_98288,x0_98354
0,675000.0,4,1.0,1180,7140,1.0,0,0,0,1180,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,920000.0,5,2.5,2770,6703,1.0,0,0,1,1570,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,311000.0,6,2.0,2880,6156,1.0,0,0,0,1580,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,775000.0,3,3.0,2160,1400,2.0,0,0,0,1090,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,592500.0,2,2.0,1120,758,2.0,0,0,1,1120,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [609]:
px = new_data.drop('price', axis=1)
py = new_data['price']


X_train, X_test, y_train, y_test = train_test_split(px, py, test_size=0.3)


# standarding 
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score

lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')

coefficients = pd.concat([pd.DataFrame(X.columns),pd.DataFrame(np.transpose(lr.coef_))], axis = 1)

R^2 value: 0.7448337111720784
MAE: 213134.60467809738
Root Mean Squared Error 321050.9454909575


# Plotting our Predit and Actual prices

In [607]:
import plotly.express as px

px.scatter(x=y_pred,y=y_test,
          labels= {'y':'Actual', 'x':'Predicit'}, trendline='ols',opacity=0.6)