In [147]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [148]:
# Load the house-sales records from the CSV file into the working DataFrame.
df = pd.read_csv('house_data.csv')

In [149]:
# Preview the first five rows to sanity-check the load.
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [152]:
# List the available column names.
df.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [153]:
# Show each column's dtype and non-null count (quick check for missing values).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21613 entries, 0 to 21612
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21613 non-null  int64  
 1   date           21613 non-null  object 
 2   price          21613 non-null  float64
 3   bedrooms       21613 non-null  int64  
 4   bathrooms      21613 non-null  float64
 5   sqft_living    21613 non-null  int64  
 6   sqft_lot       21613 non-null  int64  
 7   floors         21613 non-null  float64
 8   waterfront     21613 non-null  int64  
 9   view           21613 non-null  int64  
 10  condition      21613 non-null  int64  
 11  grade          21613 non-null  int64  
 12  sqft_above     21613 non-null  int64  
 13  sqft_basement  21613 non-null  int64  
 14  yr_built       21613 non-null  int64  
 15  yr_renovated   21613 non-null  int64  
 16  zipcode        21613 non-null  int64  
 17  lat            21613 non-null  float64
 18  long  

In [None]:
# Derive age-style features from the year columns, then drop the raw columns
# they replace.
from datetime import datetime as dt

# Read the clock once so every derived column is measured against the same year.
current_year = dt.now().year

# Building age in years, from 'yr_built'.
df['age'] = current_year - df['yr_built']

# Years since the last major work on the house. 'yr_renovated' is 0 for rows
# that appear never to have been renovated (see the df.head() preview), so
# subtracting it directly would produce a ~2000-year value. Fall back to the
# construction year for those rows.
last_work_year = df['yr_renovated'].where(df['yr_renovated'] > 0, df['yr_built'])
df['renovation_discount'] = current_year - last_work_year

# Years since the sale, to compensate for time value of money / inflation.
# 'date' strings look like '20141013T000000'; the first four characters are
# the year, which is converted from object dtype to a number.
sale_year = pd.to_numeric(df['date'].str[:4])
df['transaction_age'] = current_year - sale_year

# Drop the raw columns that were turned into features, plus 'lat', 'long', 'id'.
# NOTE(review): 'zipcode' is kept as a raw integer feature even though the
# original comment listed it among the drops — confirm which is intended.
df = df.drop(columns=['yr_built', 'yr_renovated', 'lat', 'long', 'id', 'date'])
df.columns

In [None]:
# Summary statistics, flipped so that each feature occupies one row.
df.describe().T

'''There appears to be a slight problem with the data: some houses are listed with zero bedrooms
and, less seriously, some are listed with zero bathrooms. I have not dropped them because I lack
sufficient information and/or domain knowledge about the data.
There are no missing values.'''

## Data Visualisation

In [None]:
#sns.pairplot(data=df, corner=True)

In [None]:
# Pairwise correlations between the columns, then rank every column by how
# strongly it correlates with the sale price (weakest first).
correlation = df.corr()
correlation.loc[:, 'price'].sort_values(ascending=True)

Judging by their correlation with `price`, 'bathrooms', 'sqft_living', 'grade', 'sqft_above' and 'sqft_living15' are the dominant predictors.

## Data preparation

In [None]:
# separate data into dependent and independent variables
# The target is the sale price; every remaining column is used as a feature.
y = df['price']
X = df.drop(columns=['price'])

In [None]:
# split data into train, test and cross validation sets
# Carve the data into train / test / validation subsets in two passes.
from sklearn.model_selection import train_test_split

# Pass 1: 60% of the rows go to training, the remaining 40% are held out.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, random_state=42
)
# Pass 2: the held-out rows are split again, 60% test and 40% validation.
X_test, X_val, y_test, y_val = train_test_split(
    X_holdout, y_holdout, test_size=0.4, random_state=42
)

In [None]:
# Scale the features with statistics that are robust to outliers.
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
# Learn the centering/scaling statistics from the training split only ...
X_train = scaler.fit_transform(X_train)
# ... and reuse them unchanged on the evaluation splits. Refitting on test or
# validation data would leak information from those splits and put each split
# on a different scale from the one the model is trained on.
X_test = scaler.transform(X_test)
X_val = scaler.transform(X_val)

## Model

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# initialize and train model
# Initialize and train an ordinary least-squares model.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, where passing it raises a TypeError. It is not needed here: the features
# are already scaled in the preprocessing cell, and feature scaling does not
# change the predictions of an unregularized linear regression.
model = LinearRegression()
model.fit(X_train, y_train)

In [None]:
# predict using model
# Predict prices for the held-out test split with the fitted linear model.
y_pred = model.predict(X_test)

In [None]:
# check accuracy of predictions
# Score the test-set predictions with the mean absolute error, which is
# expressed in the same units as the price.
from sklearn.metrics import mean_absolute_error, mean_squared_error
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
mae

In [None]:
# Mean sale price over the whole dataset, as a sense of scale for the MAE.
df['price'].mean()

In [None]:
# Coefficient of determination (R^2) of the linear model on the test split.
model.score(X_test, y_test)

In [None]:
from sklearn.linear_model import Ridge, Lasso

In [None]:
# Fit a Lasso regression with default settings on the training split, predict
# the test split, and report its test-set score.
model3 = Lasso().fit(X_train, y_train)
y_pred3 = model3.predict(X_test)
model3.score(X_test, y_test)