# Import Packages

In [61]:
import pandas as pd
import numpy as np

# Import Data

In [62]:
# Read the house-sales CSV into a DataFrame; the relative path means the file
# must sit in the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='kc_house_data.csv')

# Check Data

In [63]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     21597 non-null  int64  
 9   view           21597 non-null  int64  
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  int64  
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   21597 non-null  int64  
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

# Check for Missing Values

In [64]:
# Count missing values per column (the recorded output shows 0 for every column).
df.isnull().sum()

id               0
date             0
price            0
bedrooms         0
bathrooms        0
sqft_living      0
sqft_lot         0
floors           0
waterfront       0
view             0
condition        0
grade            0
sqft_above       0
sqft_basement    0
yr_built         0
yr_renovated     0
zipcode          0
lat              0
long             0
sqft_living15    0
sqft_lot15       0
dtype: int64

# Define Target/X Features

In [65]:
# Candidate predictors: every numeric column except the row identifier and the
# target. 'date' is dropped explicitly as well (it is an object column, so
# select_dtypes would exclude it in any case).
numerical_features = (
    df.drop(columns=['id', 'price', 'date'])
      .select_dtypes(include='number')
      .columns
      .tolist()
)
numerical_features

['bedrooms',
 'bathrooms',
 'sqft_living',
 'sqft_lot',
 'floors',
 'waterfront',
 'view',
 'condition',
 'grade',
 'sqft_above',
 'sqft_basement',
 'yr_built',
 'yr_renovated',
 'zipcode',
 'lat',
 'long',
 'sqft_living15',
 'sqft_lot15']

In [66]:
# Feature matrix (columns in numerical_features order) and the regression
# target taken from the 'price' column.
X, y = df[numerical_features], df['price']

# Train Test Split

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Transformations

In [69]:
# Import Package
from sklearn.preprocessing import StandardScaler

In [70]:
# Create an instance of the scaler (not yet fitted to any data)
scaler = StandardScaler()

In [71]:
# Fit the scaler on the training features only, so no statistics from the
# test split are used when scaling.
scaler.fit(X_train)

StandardScaler()

In [72]:
# Transform the training features with the fitted scaler.
# NOTE: this rebinds X_train, so the cell is not idempotent -- running it a
# second time would scale the already-scaled data. Run once per kernel session.
X_train = scaler.transform(X_train)

In [73]:
# Apply the same train-fitted scaling to the test features (no re-fit).
# NOTE: this rebinds X_test, so run the cell only once per kernel session.
X_test = scaler.transform(X_test)

# Select the top 15 Features

In [74]:
from sklearn.feature_selection import SelectKBest, f_regression

In [75]:
# Univariate selector: scores each feature with f_regression and keeps the 15
# highest-scoring ones.
best_features = SelectKBest(score_func=f_regression, k=15)

In [76]:
# Score the (scaled) training features against the training target; the test
# split is not used for selection.
best_features.fit(X_train, y_train)

SelectKBest(k=15, score_func=<function f_regression at 0x00000137ADC8E670>)

In [77]:
# Keep only the selected columns of the training features.
# NOTE: this rebinds X_train; re-running the cell would pass the reduced
# matrix back into a selector fitted on the full set of columns.
X_train = best_features.transform(X_train)

In [78]:
# Keep the same selected columns in the test features.
# NOTE: this rebinds X_test, so run the cell only once per kernel session.
X_test = best_features.transform(X_test)

# Build Model

In [79]:
# Import Package
from sklearn.linear_model import LinearRegression

In [80]:
# Create an instance of the linear regression model (default settings)
lr = LinearRegression()

In [81]:
# Fit the model on the scaled, feature-selected training data
lr.fit(X_train, y_train)

LinearRegression()

In [82]:
# Make predictions for both splits so train and test performance can be compared
y_pred_train = lr.predict(X_train)
y_pred_test = lr.predict(X_test)

# Performance

In [83]:
# Import metrics packages
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_percentage_error

In [84]:
# Training-set performance, printed in this order: R^2, MSE, RMSE, MAPE.
print(r2_score(y_train, y_pred_train))
print(mean_squared_error(y_train, y_pred_train))
# RMSE is taken as sqrt(MSE) instead of passing squared=False: that flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form gives the
# same value on every version.
print(np.sqrt(mean_squared_error(y_train, y_pred_train)))
print(mean_absolute_percentage_error(y_train, y_pred_train))

0.6635404864344707
45802247580.99997
214014.59665405995
0.26383247355336786


In [85]:
# Test-set performance, printed in this order: R^2, MSE, RMSE, MAPE.
print(r2_score(y_test, y_pred_test))
print(mean_squared_error(y_test, y_pred_test))
# RMSE is taken as sqrt(MSE) instead of passing squared=False: that flag was
# deprecated in scikit-learn 1.4 and removed in 1.6, while this form gives the
# same value on every version.
print(np.sqrt(mean_squared_error(y_test, y_pred_test)))
print(mean_absolute_percentage_error(y_test, y_pred_test))

0.6599369415315701
44281746477.74585
210432.28478003523
0.26740241145944127


# Feature Coefficients

In [86]:
# Intercept term of the fitted linear model.
lr.intercept_

541519.0094345068

In [87]:
# lr was fitted on the columns kept by SelectKBest (k=15), so lr.coef_ holds one
# coefficient per *selected* feature, while numerical_features lists all 18
# candidates. Pairing the two directly raised
# "ValueError: All arrays must be of the same length".
# get_support() returns a boolean mask over the selector's input columns, which
# are in numerical_features order (X was built as df[numerical_features]), so
# it picks out the names that match lr.coef_ position by position.
selected_features = [
    name
    for name, kept in zip(numerical_features, best_features.get_support())
    if kept
]
feat_dict = {'feature_name': selected_features, 'feature_coef': lr.coef_}
feat_importance = pd.DataFrame(feat_dict)

In [None]:
# Display the feature-name / coefficient table.
feat_importance

In [None]:
# Label each bar with its feature name. With no arguments plot.bar() puts the
# DataFrame's integer index on the x-axis, so the bars could not be matched to
# features.
feat_importance.plot.bar(x='feature_name', y='feature_coef')