In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn import linear_model
from sklearn.metrics import r2_score

In [2]:
df = pd.read_csv("KC_housing_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [3]:
df.isnull().sum()

date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
street           0
city             0
statezip         0
country          0
dtype: int64

In [4]:
x = df.drop(["price", "country",  "street", 'date'], axis=1)
y = df["price"]

In [5]:
categorical = x.select_dtypes(include=["object"]).columns
numerical = x.select_dtypes(include=["int64", "float64"]).columns


preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical), # average = 0, variance = 1
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical)
    ])

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [7]:
model = linear_model.LinearRegression()

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", model)
])

model.fit(x_train, y_train)
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)

r2

0.727181709817958

In [8]:
home = pd.DataFrame({
    'bedrooms': [3], 'bathrooms': [1], 'sqft_living': [1400], 'sqft_lot': [9000], 'floors': [2], 'waterfront': [0], 'view': [0],
    'condition': [3], 'sqft_above': [1500], 'sqft_basement': [0], 'yr_built': [1960], 'yr_renovated': [0],'statezip':['WA 98133'] ,'city': ['seattle']
})
predicted_price = model.predict(home)
print(predicted_price)

[233127.87983136]
