In [77]:
# Imports
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

In [78]:
# Import Data
data_import = pd.read_csv(".//house_predictions.csv")
print(data_import)

        price  area  bedrooms  bathrooms  stories mainroad guestroom basement  \
0    13300000  7420         4          2        3      yes        no       no   
1    12250000  8960         4          4        4      yes        no       no   
2    12250000  9960         3          2        2      yes        no      yes   
3    12215000  7500         4          2        2      yes        no      yes   
4    11410000  7420         4          1        2      yes       yes      yes   
..        ...   ...       ...        ...      ...      ...       ...      ...   
540   1820000  3000         2          1        1      yes        no      yes   
541   1767150  2400         3          1        1       no        no       no   
542   1750000  3620         2          1        1      yes        no       no   
543   1750000  2910         3          1        1       no        no       no   
544   1750000  3850         3          1        2      yes        no       no   

    hotwaterheating aircond

## Prepare Data

In [79]:
# Create pandas dataframe
house_info = pd.DataFrame(data_import)
house_info.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,guestroom,basement,hotwaterheating,airconditioning,parking,prefarea,furnishingstatus
0,13300000,7420,4,2,3,yes,no,no,no,yes,2,yes,furnished
1,12250000,8960,4,4,4,yes,no,no,no,yes,3,no,furnished
2,12250000,9960,3,2,2,yes,no,yes,no,no,2,yes,semi-furnished
3,12215000,7500,4,2,2,yes,no,yes,no,yes,3,yes,furnished
4,11410000,7420,4,1,2,yes,yes,yes,no,yes,2,no,furnished


In [80]:
# Check for nulls
house_info.isnull().sum

<bound method DataFrame.sum of      price   area  bedrooms  bathrooms  stories  mainroad  guestroom  \
0    False  False     False      False    False     False      False   
1    False  False     False      False    False     False      False   
2    False  False     False      False    False     False      False   
3    False  False     False      False    False     False      False   
4    False  False     False      False    False     False      False   
..     ...    ...       ...        ...      ...       ...        ...   
540  False  False     False      False    False     False      False   
541  False  False     False      False    False     False      False   
542  False  False     False      False    False     False      False   
543  False  False     False      False    False     False      False   
544  False  False     False      False    False     False      False   

     basement  hotwaterheating  airconditioning  parking  prefarea  \
0       False            False    

In [81]:
# Drop unnecessary columns
house_df = house_info.drop(columns=["guestroom", "prefarea", "furnishingstatus"], axis=1)
house_df.head()

Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking
0,13300000,7420,4,2,3,yes,no,no,yes,2
1,12250000,8960,4,4,4,yes,no,no,yes,3
2,12250000,9960,3,2,2,yes,yes,no,no,2
3,12215000,7500,4,2,2,yes,yes,no,yes,3
4,11410000,7420,4,1,2,yes,yes,no,yes,2


In [82]:
# Convert Yes/No to 1/0 for scaling
converted_house_df = house_df.copy()
converted_house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]] = converted_house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]].replace({"yes": 1, "no": 0})
converted_house_df.head()

  converted_house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]] = converted_house_df[["mainroad", "basement", "hotwaterheating", "airconditioning"]].replace({"yes": 1, "no": 0})


Unnamed: 0,price,area,bedrooms,bathrooms,stories,mainroad,basement,hotwaterheating,airconditioning,parking
0,13300000,7420,4,2,3,1,0,0,1,2
1,12250000,8960,4,4,4,1,0,0,1,3
2,12250000,9960,3,2,2,1,1,0,0,2
3,12215000,7500,4,2,2,1,1,0,1,3
4,11410000,7420,4,1,2,1,1,0,1,2


In [83]:
# Split X and Y
y = converted_house_df["price"]

x = converted_house_df.copy()
x = x.drop(columns="price")

In [84]:
# Train Test Split
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=4)

In [85]:
# Scale numeric data
scaler = StandardScaler().fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

## Create Model

In [86]:
# Initial Model - Linear Regression
linear_regression_model = LinearRegression()

linear_regression_model.fit(scaled_x_train, y_train)
lr_model_predictions = linear_regression_model.predict(scaled_x_test)


In [87]:
# Score model
lr_model_r2_score = metrics.r2_score(y_test, lr_model_predictions)
lr_model_rmse = metrics.root_mean_squared_error(y_test, lr_model_predictions)
lr_model_mse = lr_model_rmse ** 2

print(f"R2 score: {lr_model_r2_score}  Mean Square Error: {lr_model_mse}  Root Mean Square Error: {lr_model_rmse}")

R2 score: 0.6101332275947471  Mean Square Error: 1252183947657.4944  Root Mean Square Error: 1119010.2535980153


In [88]:
# Alternate model - Random Forest
random_forest_model = RandomForestRegressor(n_estimators=1000, random_state=4)

random_forest_model.fit(scaled_x_train, y_train)
rf_model_predictions = random_forest_model.predict(scaled_x_test)

In [90]:
# Score model
rf_model_r2_score = metrics.r2_score(y_test, rf_model_predictions)
rf_model_rmse = metrics.root_mean_squared_error(y_test, rf_model_predictions)
rf_model_mse = rf_model_rmse ** 2

print(f"R2 score: {rf_model_r2_score}  Mean Square Error: {rf_model_mse}  Root Mean Square Error: {rf_model_rmse}")

R2 score: 0.5253645263629891  Mean Square Error: 1524446203533.5771  Root Mean Square Error: 1234684.6575274097


In [None]:
# Optimization 2

## Visualization

In [None]:
# Visualization