In [200]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import string
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

df = pd.read_csv('housing.csv')

In [201]:
# finding the all the price ranges in the box plot
p_med = df['price'].median()
p_max = df['price'].max()
p_min = df['price'].min()
q3, q1 = np.percentile(df['price'], [75 ,25])
iqr = q3 - q1
p_avg = df['price'].mean()
up_fence = q3 + (1.5 * iqr) 
low_fence = abs(q1 - (1.5 * iqr))
print(f' The maximum: {p_max}\n The Upper Fence:{up_fence}\n The 75th percentile: {q3}\n The median: {p_med}\n The 25th percentile: {q1}\n The lower Fence {low_fence}\n The InterQuartile Range: {iqr}\n The minimum: {p_min}\n The mean: {p_avg}')

 The maximum: 30750000.0
 The Upper Fence:1955000.0
 The 75th percentile: 1175000.0
 The median: 850000.0
 The 25th percentile: 655000.0
 The lower Fence 125000.0
 The InterQuartile Range: 520000.0
 The minimum: 27360.0
 The mean: 1014127.0734519929


In [202]:
fig = px.box(df, y="price")
fig.show()

In [203]:
df.shape

(11967, 21)

In [204]:
new_df = df[(df['price'] <= up_fence) & (df['price'] >= low_fence)]
new_df.shape

(11170, 21)

In [205]:
fig = px.box(new_df, y="price")
fig.show()

In [206]:
df.columns

Index(['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'greenbelt', 'nuisance', 'view', 'condition', 'grade',
       'heat_source', 'sewer_system', 'sqft_above', 'sqft_basement',
       'sqft_garage', 'sqft_patio', 'yr_built', 'address', 'zip_code'],
      dtype='object')

# Adding all nessary Variables for the model

In [207]:
X = new_df[['sqft_living','bedrooms','floors','waterfront', 'greenbelt','sqft_above',
        'yr_built','sqft_patio','sqft_basement','bathrooms','sqft_garage']]
y = new_df["price"]


from sklearn.preprocessing import OrdinalEncoder
# ordering the grade column

cat_subset = new_df[['grade','condition','view']]

grade_order = ['2 Substandard','3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good', '9 Better', 
               '10 Very Good', '11 Excellent', '12 Luxury','13 Mansion']

con_order = ['Poor','Fair','Average', 'Good','Very Good']

view_order = ['NONE','FAIR','AVERAGE','GOOD','EXCELLENT']

# reg_grade_order = ['7 Average', '9 Better', '8 Good', '6 Low Average', '5 Fair',
#        '4 Low', '10 Very Good', '11 Excellent', '3 Poor', '12 Luxury']

o_enc = OrdinalEncoder(categories = [grade_order,con_order,view_order])
o_enc.fit(cat_subset)

OrdinalEncoder(categories=[['2 Substandard', '3 Poor', '4 Low', '5 Fair',
                            '6 Low Average', '7 Average', '8 Good', '9 Better',
                            '10 Very Good', '11 Excellent', '12 Luxury',
                            '13 Mansion'],
                           ['Poor', 'Fair', 'Average', 'Good', 'Very Good'],
                           ['NONE', 'FAIR', 'AVERAGE', 'GOOD', 'EXCELLENT']])

In [208]:
X_subset_or = pd.DataFrame(o_enc.transform(cat_subset), columns = cat_subset.columns)
X_subset_or.head()

Unnamed: 0,grade,condition,view
0,5.0,2.0,2.0
1,5.0,2.0,2.0
2,7.0,2.0,2.0
3,5.0,2.0,0.0
4,7.0,3.0,2.0


In [209]:
X_all = pd.concat([X.reset_index(),X_subset_or],axis=1)
X_all = X_all.drop('index', axis=1)

In [210]:
data = pd.concat([y.reset_index(),X_all],axis=1)
data = data.drop('index', axis=1)
data.head()

Unnamed: 0,price,sqft_living,bedrooms,floors,waterfront,greenbelt,sqft_above,yr_built,sqft_patio,sqft_basement,bathrooms,sqft_garage,grade,condition,view
0,920000.0,2770,5,1.0,0,0,1570,1950,240,1570,2.5,0,5.0,2.0,2.0
1,311000.0,2880,6,1.0,0,0,1580,1956,0,1580,2.0,0,5.0,2.0,2.0
2,775000.0,2160,3,2.0,0,0,1090,2010,270,1070,3.0,200,7.0,2.0,2.0
3,625000.0,1190,2,1.0,0,0,1190,1948,0,0,1.0,300,5.0,2.0,0.0
4,1317227.0,2080,3,1.0,0,0,2080,1951,150,0,3.0,0,7.0,3.0,2.0


In [211]:
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False)
# heat source
nominal_data = ['zip_code']

# tranforming the nomial subset
X_nom_trans = onehot_enc.fit_transform(new_df[nominal_data])

X_norm = pd.DataFrame(X_nom_trans, columns = onehot_enc.get_feature_names())
X_norm.head()

Unnamed: 0,x0_98102,x0_98103,x0_98105,x0_98106,x0_98107,x0_98108,x0_98109,x0_98112,x0_98115,x0_98116,...,x0_98146,x0_98148,x0_98155,x0_98166,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [212]:
new_data = pd.concat([data.reset_index(),X_norm],axis=1)
new_data = new_data.drop('index', axis=1)
new_data.head()

Unnamed: 0,price,sqft_living,bedrooms,floors,waterfront,greenbelt,sqft_above,yr_built,sqft_patio,sqft_basement,...,x0_98146,x0_98148,x0_98155,x0_98166,x0_98168,x0_98177,x0_98178,x0_98188,x0_98198,x0_98199
0,920000.0,2770,5,1.0,0,0,1570,1950,240,1570,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,311000.0,2880,6,1.0,0,0,1580,1956,0,1580,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,775000.0,2160,3,2.0,0,0,1090,2010,270,1070,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,625000.0,1190,2,1.0,0,0,1190,1948,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1317227.0,2080,3,1.0,0,0,2080,1951,150,0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


# Modeling with ordianl and nomial data

In [213]:
px = new_data.drop('price', axis=1)
py = new_data['price']

X_train, X_test, y_train, y_test = train_test_split(px, py, test_size=0.3)


# standarding }
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score

lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')



R^2 value: 0.6296182797359406
MAE: 152792.3854451343
Root Mean Squared Error 215730.73433529513


# Getting rid of more outliers

In [214]:
# finding the all the price ranges in the box plot
n_med = new_df['price'].median()
n_max = new_df['price'].max()
n_min = new_df['price'].min()
nq3, nq1 = np.percentile(new_df['price'], [75 ,25])
niqr = nq3 - nq1
n_avg = new_df['price'].mean()
nup_fence = nq3 + (1.5 * niqr) 
nlow_fence = abs(nq1 - (1.5 * niqr))
print(f' The maximum: {n_max}\n The Upper Fence:{nup_fence}\n The 75th percentile: {nq3}\n The median: {n_med}\n The 25th percentile: {nq1}\n The lower Fence {nlow_fence}\n The InterQuartile Range: {niqr}\n The minimum: {n_min}\n The mean: {n_avg}')

 The maximum: 1955000.0
 The Upper Fence:1744375.0
 The 75th percentile: 1087750.0
 The median: 825000.0
 The 25th percentile: 650000.0
 The lower Fence 6625.0
 The InterQuartile Range: 437750.0
 The minimum: 125000.0
 The mean: 896640.6743061773


In [215]:
import plotly.express as px
fig = px.box(new_df, y="price")
fig.show()

In [216]:
ndf = new_df[(new_df['price'] <= nup_fence) & (new_df['price'] >= n_min)]
ndf.shape

(10869, 21)

In [217]:
import plotly.express as px
fig = px.box(ndf, y="price")
fig.show()

In [218]:
i_med = ndf['price'].median()
i_max = ndf['price'].max()
i_min = ndf['price'].min()
iq3, iq1 = np.percentile(ndf['price'], [75 ,25])
iiqr = iq3 - iq1
iup_fence = iq3 + (1.5 * iiqr) 
ilow_fence = abs(nq1 - (1.5 * iiqr))

print(f"""The maximum: {i_max}\n The Upper Fence:{iup_fence}\n The 75th percentile: {iq3}\nThe median: {i_med}\n The 25th percentile: {iq1}\n The lower Fence {ilow_fence}\n The InterQuartile Range: {iiqr}\n The minimum: {i_min}\n """)


The maximum: 1743400.0
 The Upper Fence:1653450.0
 The 75th percentile: 1050000.0
The median: 817500.0
 The 25th percentile: 647700.0
 The lower Fence 46550.0
 The InterQuartile Range: 402300.0
 The minimum: 125000.0
 


In [239]:
dd = ndf[(ndf['price'] <= 1600000) & (ndf['price'] >= n_min)]
dd.shape

(10611, 21)

In [240]:
import plotly.express as px
fig = px.box(dd, y="price")
fig.show()

In [241]:
dd.to_csv('clean_house.csv', index=False)

In [242]:
i_med = dd['price'].median()
i_max = dd['price'].max()
i_min = dd['price'].min()
iq3, iq1 = np.percentile(dd['price'], [75 ,25])
iiqr = iq3 - iq1
iup_fence = iq3 + (1.5 * iiqr) 
ilow_fence = abs(nq1 - (1.5 * iiqr))

print(f"""The maximum: {i_max}\n The Upper Fence:{iup_fence}\n The 75th percentile: {iq3}\nThe median: {i_med}\n The 25th percentile: {iq1}\n The lower Fence {ilow_fence}\n The InterQuartile Range: {iiqr}\n The minimum: {i_min}\n """)


The maximum: 1600000.0
 The Upper Fence:1602500.0
 The 75th percentile: 1025000.0
The median: 809000.0
 The 25th percentile: 640000.0
 The lower Fence 72500.0
 The InterQuartile Range: 385000.0
 The minimum: 125000.0
 


In [259]:
X = dd[['sqft_living','bedrooms','floors','waterfront', 'greenbelt','sqft_above',
        'yr_built','sqft_patio','sqft_basement','nuisance','bathrooms','sqft_garage']]
y = dd["price"]




# CREATING ORDINAL DATA
from sklearn.preprocessing import OrdinalEncoder
# ordering the grade column

cat_subset = dd[['grade','condition','view']]

grade_order = ['2 Substandard','3 Poor', '4 Low', '5 Fair', '6 Low Average', '7 Average', '8 Good', '9 Better', 
               '10 Very Good', '11 Excellent', '12 Luxury','13 Mansion']

con_order = ['Poor','Fair','Average', 'Good','Very Good']

view_order = ['NONE','FAIR','AVERAGE','GOOD','EXCELLENT']

# reg_grade_order = ['7 Average', '9 Better', '8 Good', '6 Low Average', '5 Fair',
#        '4 Low', '10 Very Good', '11 Excellent', '3 Poor', '12 Luxury']

o_enc = OrdinalEncoder(categories = [grade_order,con_order,view_order])
o_enc.fit(cat_subset)
X_subset_or = pd.DataFrame(o_enc.transform(cat_subset), columns = cat_subset.columns)
X_all = pd.concat([X.reset_index(),X_subset_or],axis=1)
X_all = X_all.drop('index', axis=1)
data = pd.concat([y.reset_index(),X_all],axis=1)
data = data.drop('index', axis=1)




# CREATING NOMIAL DATA
from sklearn.preprocessing import OneHotEncoder
onehot_enc = OneHotEncoder(sparse=False)
# heat source
nominal_data = ['zip_code']

# tranforming the nomial subset
X_nom_trans = onehot_enc.fit_transform(dd[nominal_data])

X_norm = pd.DataFrame(X_nom_trans, columns = onehot_enc.get_feature_names())
new_data = pd.concat([data.reset_index(),X_norm],axis=1)
new_data = new_data.drop('index', axis=1)








px = new_data.drop('price', axis=1)
py = new_data['price']

X_train, X_test, y_train, y_test = train_test_split(px, py, test_size=0.3)


# standarding }
ss = StandardScaler()
ss.fit(X_train)

# linear regression model
lr = LinearRegression()


X_standardized_train = ss.transform(X_train)
X_standardized_test = ss.transform(X_test)


lr = LinearRegression()

# Getting the R score

lr.fit(X_standardized_train, y_train)
print(f'R^2 value: {lr.score(X_standardized_train, y_train)}')



y_pred = lr.predict(X_standardized_test)

print(f'MAE: {mean_absolute_error(y_pred, y_test)}')
rms = mean_squared_error(y_test,y_pred, squared=False)

print(f'Root Mean Squared Error {rms}')



R^2 value: 0.6020165711459798
MAE: 137939.29985910258
Root Mean Squared Error 194515.96410450933


In [244]:
import plotly.express as px
fig = px.histogram(dd, x='price')
fig.show()