# Predicting Housing Prices

### *1. Importing libraries and data*

In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this Colab runtime.
drive.mount('/content/drive')
# Switch the working directory to the project folder; the CSV files loaded later are referenced by relative name.
# NOTE(review): this absolute path matches one specific Drive layout and must be adjusted to run elsewhere.
%cd /content/drive/My\ Drive/2.\ Career\ Development/Data\ Science/4.\ Data\ Science\ Projects/Predicting\ Housing\ Prices/

Mounted at /content/drive
/content/drive/My Drive/2. Career Development/Data Science/4. Data Science Projects/Predicting Housing Prices


In [None]:
# Columns excluded from the feature matrices: the row identifier, the
# 'Unnamed: 0' column (presumably a saved index column - verify), and the
# prediction target itself.
cols = ['Id', 'Unnamed: 0', 'SalePrice']

# Feature matrix from the scaled dataset.
x_scaled = (
    pd.read_csv('predicting_housing_prices_data_scaled.csv')
    .drop(columns=cols)
)

# Feature matrix from the unscaled dataset.
x_unscaled = (
    pd.read_csv('predicting_housing_prices_data_unscaled.csv')
    .drop(columns=cols)
)

In [None]:
# Target vectors: the SalePrice column of each dataset variant.
y_scaled = pd.read_csv('predicting_housing_prices_data_scaled.csv').loc[:, 'SalePrice']
y_unscaled = pd.read_csv('predicting_housing_prices_data_unscaled.csv').loc[:, 'SalePrice']

__________________

### *2. Splitting Data into Training & Test Set*

**2.1 Scaled**

In [None]:
# Convert the scaled target and features to NumPy arrays for scikit-learn.
# np.asarray is used instead of `.values` so this cell is idempotent: the
# names are reassigned in place, and re-running the cell on an ndarray
# (which has no `.values` attribute) would otherwise raise AttributeError.
y_scaled = np.asarray(y_scaled)
x_scaled = np.asarray(x_scaled)

In [None]:
# Hold out 20 % of the scaled data as a test set; the fixed seed makes the split reproducible.
(x_train_scaled,
 x_test_scaled,
 y_train_scaled,
 y_test_scaled) = train_test_split(
    x_scaled,
    y_scaled,
    test_size=0.2,
    random_state=0,
)

**2.2 Unscaled**

In [None]:
# Convert the unscaled target and features to NumPy arrays for scikit-learn.
# np.asarray is used instead of `.values` so this cell is idempotent: the
# names are reassigned in place, and re-running the cell on an ndarray
# (which has no `.values` attribute) would otherwise raise AttributeError.
y_unscaled = np.asarray(y_unscaled)
x_unscaled = np.asarray(x_unscaled)

In [None]:
# Hold out 20 % of the unscaled data as a test set; the fixed seed makes the split reproducible.
(x_train_unscaled,
 x_test_unscaled,
 y_train_unscaled,
 y_test_unscaled) = train_test_split(
    x_unscaled,
    y_unscaled,
    test_size=0.2,
    random_state=0,
)

______________

### *3. Fitting a Multiple Linear Regression*

**3.1 Scaled**

In [None]:
# Ordinary least-squares regression fitted on every scaled feature.
# NOTE: no feature selection (e.g. backward elimination) is performed in this cell.
linear_regression_scaled = LinearRegression()
linear_regression_scaled.fit(X=x_train_scaled, y=y_train_scaled)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Cross Validation Training Set

In [None]:
# 10-fold cross-validated R2 of the linear model on the scaled training split,
# reported as mean and standard deviation in percent.
r2_values = cross_val_score(
    estimator=linear_regression_scaled,
    X=x_train_scaled,
    y=y_train_scaled,
    scoring='r2',
    cv=10,
)
print("R2: {:.2f} %".format(100 * r2_values.mean()))
print("Standard Deviation: {:.2f} %".format(100 * r2_values.std()))

R2: -2263254710848932282368.00 %
Standard Deviation: 3272011262255839051776.00 %


Test Set Performance

In [None]:
# Linear-model predictions for the held-out scaled test rows.
pred_lr_scaled = linear_regression_scaled.predict(X=x_test_scaled)

In [None]:
# R2 of the linear model on the scaled test set (shown as the cell output).
r2_score(y_true=y_test_scaled, y_pred=pred_lr_scaled)

-1.502545125395107e+19

**3.2 Unscaled**

In [None]:
# Ordinary least-squares regression fitted on every unscaled feature.
# NOTE: no feature selection (e.g. backward elimination) is performed in this cell.
linear_regression_unscaled = LinearRegression()
linear_regression_unscaled.fit(X=x_train_unscaled, y=y_train_unscaled)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Cross Validation Training Set

In [None]:
# 10-fold cross-validated R2 of the linear model on the unscaled training split,
# reported as mean and standard deviation in percent.
r2_values = cross_val_score(
    estimator=linear_regression_unscaled,
    X=x_train_unscaled,
    y=y_train_unscaled,
    scoring='r2',
    cv=10,
)
print("R2: {:.2f} %".format(100 * r2_values.mean()))
print("Standard Deviation: {:.2f} %".format(100 * r2_values.std()))

R2: -47744521341.71 %
Standard Deviation: 66018376649.19 %


Test Set Performance

In [None]:
# Predict with the model that was fitted on the unscaled training data.
# (This cell previously called the scaled-data model, which applied coefficients
# learned on differently scaled features to the unscaled test rows.)
pred_lr_unscaled = linear_regression_unscaled.predict(x_test_unscaled)

In [None]:
# R2 of the linear model on the unscaled test set (shown as the cell output).
r2_score(y_true=y_test_unscaled, y_pred=pred_lr_unscaled)

-5.709701454718341e+17

__________

### *4. Fitting a Decision Tree Regression*

**4.1 Scaled**

In [None]:
# Decision tree regressor on the scaled training data; the fixed seed keeps
# the fitted tree reproducible between runs.
decision_tree_scaled = DecisionTreeRegressor(random_state=0)
decision_tree_scaled.fit(X=x_train_scaled, y=y_train_scaled)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

Cross Validation Training Set

In [None]:
# 10-fold cross-validated R2 of the decision tree on the scaled training split,
# reported as mean and standard deviation in percent.
r2_values = cross_val_score(
    estimator=decision_tree_scaled,
    X=x_train_scaled,
    y=y_train_scaled,
    scoring='r2',
    cv=10,
)
print("R2: {:.2f} %".format(100 * r2_values.mean()))
print("Standard Deviation: {:.2f} %".format(100 * r2_values.std()))

R2: 68.53 %
Standard Deviation: 12.79 %


Test Set Performance

In [None]:
# Decision-tree predictions for the held-out scaled test rows.
pred_dt_scaled = decision_tree_scaled.predict(X=x_test_scaled)

In [None]:
# R2 of the decision tree on the scaled test set (shown as the cell output).
r2_score(y_true=y_test_scaled, y_pred=pred_dt_scaled)

0.774920905307106

**4.2 Unscaled**

In [None]:
# Decision tree regressor on the unscaled training data; the fixed seed keeps
# the fitted tree reproducible between runs.
decision_tree_unscaled = DecisionTreeRegressor(random_state=0)
decision_tree_unscaled.fit(X=x_train_unscaled, y=y_train_unscaled)

DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=None,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=0, splitter='best')

Cross Validation Training Set

In [None]:
# 10-fold cross-validated R2 of the decision tree on the unscaled training split,
# reported as mean and standard deviation in percent.
r2_values = cross_val_score(
    estimator=decision_tree_unscaled,
    X=x_train_unscaled,
    y=y_train_unscaled,
    scoring='r2',
    cv=10,
)
print("R2: {:.2f} %".format(100 * r2_values.mean()))
print("Standard Deviation: {:.2f} %".format(100 * r2_values.std()))

R2: 68.03 %
Standard Deviation: 12.24 %


Test Set Performance

In [None]:
# Decision-tree predictions for the held-out unscaled test rows.
pred_dt_unscaled = decision_tree_unscaled.predict(X=x_test_unscaled)

In [None]:
# R2 of the decision tree on the unscaled test set (shown as the cell output).
r2_score(y_true=y_test_unscaled, y_pred=pred_dt_unscaled)

0.8135851614541421

__________________

### *5. Fitting a Polynomial Regression*

**5.1 Scaled**

In [None]:
# Expand the scaled training features into polynomial feature combinations
# up to degree 2; the fitted transformer is kept for use on the test set.
poly_reg_scaled = PolynomialFeatures(degree=2)
x_poly_scaled = poly_reg_scaled.fit(x_train_scaled).transform(x_train_scaled)

In [None]:
# Linear regression on the degree-2 expanded scaled features (polynomial regression).
pr_2_scaled = LinearRegression()
pr_2_scaled.fit(X=x_poly_scaled, y=y_train_scaled)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Test Set Performance

In [None]:
# Expand the scaled test features with the transformer already fitted on the
# training data, then predict. transform() is used rather than fit_transform()
# so the test set never refits a preprocessing step.
pred_pr_scaled = pr_2_scaled.predict(poly_reg_scaled.transform(x_test_scaled))

In [None]:
# R2 of the polynomial regression on the scaled test set (shown as the cell output).
r2_score(y_true=y_test_scaled, y_pred=pred_pr_scaled)

0.8526993997522644

**5.2 Unscaled**

In [None]:
# Expand the unscaled training features into polynomial feature combinations
# up to degree 2; the fitted transformer is kept for use on the test set.
poly_reg_unscaled = PolynomialFeatures(degree=2)
x_poly_unscaled = poly_reg_unscaled.fit(x_train_unscaled).transform(x_train_unscaled)

In [None]:
# Linear regression on the degree-2 expanded unscaled features (polynomial regression).
pr_2_unscaled = LinearRegression()
pr_2_unscaled.fit(X=x_poly_unscaled, y=y_train_unscaled)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Test Set Performance

In [None]:
# Expand the unscaled test features with the transformer already fitted on the
# training data, then predict. transform() is used rather than fit_transform()
# so the test set never refits a preprocessing step.
pred_pr_unscaled = pr_2_unscaled.predict(poly_reg_unscaled.transform(x_test_unscaled))

In [None]:
# R2 of the polynomial regression on the unscaled test set (shown as the cell output).
r2_score(y_true=y_test_unscaled, y_pred=pred_pr_unscaled)

-2.656682643189701

________________

### *6. Random Forest Regression*

**6.1 Unscaled**

In [None]:
# Random forest of 10 trees on the unscaled training data; the fixed seed
# keeps the fitted ensemble reproducible between runs.
random_forest_unscaled = RandomForestRegressor(random_state=0, n_estimators=10)
random_forest_unscaled.fit(X=x_train_unscaled, y=y_train_unscaled)

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=10, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)

Cross Validation Training Set

In [None]:
# 10-fold cross-validated R2 of the random forest on the unscaled training split,
# reported as mean and standard deviation in percent.
r2_values = cross_val_score(
    estimator=random_forest_unscaled,
    X=x_train_unscaled,
    y=y_train_unscaled,
    scoring='r2',
    cv=10,
)
print("R2: {:.2f} %".format(100 * r2_values.mean()))
print("Standard Deviation: {:.2f} %".format(100 * r2_values.std()))

R2: 83.57 %
Standard Deviation: 6.41 %


Test Set Performance

In [None]:
# Random-forest predictions for the held-out unscaled test rows.
pred_rf_unscaled = random_forest_unscaled.predict(X=x_test_unscaled)

In [None]:
# R2 of the random forest on the unscaled test set (shown as the cell output).
r2_score(y_true=y_test_unscaled, y_pred=pred_rf_unscaled)

0.8411767715751246

__________

### *7. Support Vector Regression*

**7.1 Scaled**

In [None]:
#Input kernel: RBF kernel - recommended
svr = SVR(kernel = 'rbf')
svr.fit(x_train_scaled,y_train_scaled)

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='scale',
    kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

Cross Validation Training Set

In [None]:
# 10-fold cross-validated R2 of the SVR on the scaled training split,
# reported as mean and standard deviation in percent.
r2_values = cross_val_score(
    estimator=svr,
    X=x_train_scaled,
    y=y_train_scaled,
    scoring='r2',
    cv=10,
)
print("R2: {:.2f} %".format(100 * r2_values.mean()))
print("Standard Deviation: {:.2f} %".format(100 * r2_values.std()))

R2: 86.20 %
Standard Deviation: 6.47 %


Test Set Performance

In [None]:
# SVR predictions for the held-out scaled test rows.
pred_svr = svr.predict(X=x_test_scaled)

In [None]:
# R2 of the SVR on the scaled test set (shown as the cell output).
r2_score(y_true=y_test_scaled, y_pred=pred_svr)

0.7489065971489697