In [None]:
import os
import warnings

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse

warnings.filterwarnings('ignore')

# Connection settings for the PostgreSQL instance that hosts the data.
# Each value can be overridden through an environment variable so that real
# credentials never have to be written into the notebook; the fallbacks are
# the values this notebook has always used.
postgres_user = os.environ.get('HOUSEPRICES_DB_USER', 'dsbc_student')
postgres_pw = os.environ.get('HOUSEPRICES_DB_PW', '7*.8G9QH21')
postgres_host = os.environ.get('HOUSEPRICES_DB_HOST', '142.93.121.174')
postgres_port = os.environ.get('HOUSEPRICES_DB_PORT', '5432')
postgres_db = os.environ.get('HOUSEPRICES_DB_NAME', 'houseprices')

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
try:
    # Pull the whole `houseprices` table into a DataFrame.
    house_prices = pd.read_sql_query('select * from houseprices', con=engine)
finally:
    # Release the connection pool even when the query fails.
    engine.dispose()

In [None]:
# Keep only the columns that have no missing values at all.
clean_house_prices = house_prices.dropna(axis='columns', how='any')

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are filtered out explicitly: pandas >= 2.0 no longer
# drops them silently inside corr() and raises on string columns instead.
clean_house_prices.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Derived features:
#   totalsf     - basement + first floor + second floor square footage
#   int_over_sf - interaction term: total square footage x overall quality
clean_house_prices = clean_house_prices.assign(
    totalsf=lambda df: df['totalbsmtsf'] + df['firstflrsf'] + df['secondflrsf'],
    int_over_sf=lambda df: df['totalsf'] * df['overallqual'],
)

In [None]:
# Baseline model: OLS regression of log(1 + sale price) on four features.
baseline_features = ['overallqual', 'garagecars', 'totalsf', 'int_over_sf']

# add_constant supplies the intercept column for the OLS fit.
X = sm.add_constant(clean_house_prices[baseline_features])
Y = np.log1p(clean_house_prices['saleprice'])

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=465)

results = sm.OLS(y_train, X_train).fit()
results.summary()

In [None]:
# Evaluate the fitted model on the held-out rows. Both y_test and y_preds are
# on the log1p(saleprice) scale, so every figure below is in log units.
y_preds = results.predict(X_test)

fig, ax = plt.subplots()
ax.scatter(y_test, y_preds)
# Identity line: points on it are perfect predictions.
ax.plot(y_test, y_test, color="red")
ax.set_xlabel("true values")
ax.set_ylabel("predicted values")
# The title used to say "Charges", a leftover from a different dataset.
ax.set_title("Sale price (log1p): true and predicted values")
plt.show()

print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds) / y_test)) * 100))

In [None]:
# One-hot encode `mszoning`, dropping the first level as the reference category.
# dtype is pinned to uint8 so the dummies stay numeric on every pandas version
# (pandas >= 2.0 defaults to bool, which breaks the OLS design matrix).
mszoning_dummies = pd.get_dummies(
    clean_house_prices['mszoning'], prefix="mszoning", drop_first=True, dtype='uint8')
dummy_column_names = list(mszoning_dummies.columns)

# Drop dummy columns left over from a previous run of this cell before
# concatenating, so re-running never creates duplicate columns.
clean_house_prices = pd.concat(
    [clean_house_prices.drop(columns=dummy_column_names, errors='ignore'),
     mszoning_dummies],
    axis=1)

In [None]:
# Second model: the baseline features plus the mszoning dummy variables.
model_features = ['overallqual', 'garagecars', 'totalsf', 'int_over_sf'] + dummy_column_names

Y = np.log1p(clean_house_prices['saleprice'])
# Cast the design matrix to float: when the dummy columns are bool
# (the pandas >= 2.0 get_dummies default), mixing them with numeric columns
# yields an object-dtype array that statsmodels refuses to fit.
X = clean_house_prices[model_features].astype(float)
X = sm.add_constant(X)

# Same 20% hold-out and seed as the baseline model.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=465)

results = sm.OLS(y_train, X_train).fit()

results.summary()

In [None]:
# Descriptive statistics for the cleaned frame, including the engineered columns.
summary_stats = clean_house_prices.describe()
summary_stats