In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
df = pd.read_csv('data.csv')

In [9]:
df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [10]:
df.shape


(4600, 18)

In [11]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [12]:
df.isnull().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [13]:
df.nunique()

date               70
price            1741
bedrooms           10
bathrooms          26
sqft_living       566
sqft_lot         3113
floors              6
waterfront          2
view                5
condition           5
sqft_above        511
sqft_basement     207
yr_built          115
yr_renovated       60
street           4525
city               44
statezip           77
country             1
dtype: int64

In [None]:
df.unique()

In [24]:
df.columns

Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'street', 'city',
       'statezip', 'country'],
      dtype='object')

In [26]:
columns_to_remove = ['date', 'yr_renovated', 'street','city','statezip','country']

In [30]:
df = df.drop(columns = columns_to_remove)

In [32]:
df.head()

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built
0,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955
1,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921
2,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966
3,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963
4,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976


In [36]:
import scipy.stats as stat
z_scores = stats.zscore(df)
threshold=3
print(df.shape)
outliers_df = df[(z_scores > threshold).any(axis=1)]
df = df[(z_scores < threshold).all(axis=1)]
print(df.shape)

(4600, 12)
(4244, 12)


In [40]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
df_scaled = pd.DataFrame(scaler.transform(df), columns=df.columns)

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [44]:
x = df.drop('price',axis=1)
y = df['price']

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25, random_state = 42)


In [46]:
print(x_train.shape)
print(x_test.shape)

(3183, 11)
(1061, 11)


In [48]:
models = [ 
    ('Random Forest', RandomForestRegressor()),
    ('Linear Regression', LinearRegression()),
    ('Decision Tree', DecisionTreeRegressor()),
    ('KNN', KNeighborsRegressor())
]

In [50]:
from sklearn.metrics import mean_squared_error , r2_score, mean_absolute_error


In [54]:
for name,model in models:
    print(name)
    print()
    model.fit(x_train,y_train)
    y_pred= model.predict(x_test)
    print("mean squared error", mean_squared_error(y_test,y_pred))
    print('\n')
    print("mean absolute error", mean_absolute_error(y_test,y_pred))
    print('\n')
    print("R squared (R2)", r2_score(y_test,y_pred))
    print('\n')

Random Forest

mean squared error 44464786624.27979


mean absolute error 141296.72226748156


R squared (R2) 0.40382552162193863


Linear Regression

mean squared error 41194072741.05764


mean absolute error 139590.95600748394


R squared (R2) 0.4476785633497743


Decision Tree

mean squared error 71736851558.603


mean absolute error 181938.51215297927


R squared (R2) 0.03816742853587718


KNN

mean squared error 54270710864.73361


mean absolute error 155957.7490738805


R squared (R2) 0.2723497581494757


