In [None]:
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

from sklearn import metrics
from sklearn import preprocessing
from sklearn import svm
from sklearn import tree
from sklearn.datasets import make_regression
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

import utils as utils

%reload_ext autoreload
%autoreload 2

In [None]:
def customAccuracy(predicted, actual, accuracy):
    """Return the fraction of predictions within +/- ``accuracy`` percent of the actual value.

    Parameters
    ----------
    predicted : iterable of numbers
        Model predictions.
    actual : iterable of numbers
        Ground-truth values, paired element-wise with ``predicted``.
    accuracy : number
        Tolerance, as a percentage of each actual value (e.g. ``10`` means
        the prediction must lie strictly inside ``actual +/- 10%``).

    Returns
    -------
    float
        Share of pairs whose prediction falls strictly inside the tolerance
        band. ``0.0`` is returned when there are no pairs to compare.

    Notes
    -----
    The band is open (strict inequalities), so an actual value of 0 can never
    be matched. If the inputs differ in length, the extra elements of the
    longer one are ignored (``zip`` semantics).
    """
    total = 0
    hits = 0
    for guess, truth in zip(predicted, actual):
        # abs() keeps the band well-formed for negative actual values; without
        # it the lower bound would exceed the upper bound and nothing matches.
        tolerance = abs(truth * accuracy / 100)
        if truth - tolerance < guess < truth + tolerance:
            hits += 1
        total += 1

    # Avoid ZeroDivisionError on empty input.
    if total == 0:
        return 0.0
    return hits / total

In [None]:
url1 = "https://raw.githubusercontent.com/MikePlekan/Real-Estate-project/Michael/data.csv"
url2 = "https://raw.githubusercontent.com/MikePlekan/Real-Estate-project/Michael/sales.csv"

# Sale amounts come from a second CSV; read it first so the feature table can
# be assembled in a single chain.
sales = pd.read_csv(url2)

# Attach the sale amount as the 'prices' column (pandas matches rows by index),
# then drop every row that has at least one missing value.
df = (
    pd.read_csv(url1)
    .assign(prices=sales['saleamt'])
    .dropna(axis=0)
)
df

In [None]:
# Summary statistics of the sale prices.
price = df['prices']
median = np.median(price)
# np.percentile returns values in the order requested: 75th first, then 25th.
q3, q1 = np.percentile(price, [75, 25])
print(f"The median price for the data is: ${median}")
print(f"The Q1 for the data is: ${q1}")
print(f"The Q3 for the data is: ${q3}")
print(f"The IQR for the data is: ${q3-q1}")

# Histogram of sale prices, drawn explicitly on the axes created here rather
# than relying on pandas picking up the current figure.
fig, ax = plt.subplots(figsize=(12, 6))
price.plot.hist(bins=50, ax=ax)
# Show full dollar amounts on the x axis instead of scientific notation.
ax.ticklabel_format(style='plain', axis='x')
ax.set_title("Housing prices")
ax.set_xlabel("Price ($)")

# Optional zoom on the lower end of the price range:
#ax.set_xlim([0, 200000])

In [None]:
# Lowest sale price among the rows of `df` (i.e. after rows with missing values were dropped).
min(price)

In [None]:
# Simple one-feature linear regression: sale price as a function of home size.
model = LinearRegression()

# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
x = df['universalsize'].values.reshape(-1,1)
y = df['prices'].values

# Perform regression
model.fit(x,y)

# Calculate r^2 (scored on the same data the model was fit on)
r_sq = model.score(x, y)
print(f'Coefficient of determination: {r_sq:.4f}')

# Pearson correlation between size and price. `.iloc[0, 1]` selects row 0 /
# column 1 of the 2x2 correlation matrix purely by position; the previous
# `.iloc[0][1]` relied on integer `[]` lookup on a label-indexed Series, which
# pandas has deprecated.
cc = df[['universalsize','prices']].corr().iloc[0, 1]
print(f'corr coeff:                   {cc:.4f}')
print(f'corr coeff^2:                 {cc**2:.4f}')

# Slope and intercept of the line-of-best-fit
print(f"The intercept for the line-of-best-fit is {model.intercept_:.2f}")
print(f"The slope for the line-of-best-fit is     {model.coef_[0]:.2f}")

# The equation for a line is
# y = mx + b
b = model.intercept_
m = model.coef_  # length-1 array; broadcasts against xpts below
# NOTE(review): 200-7500 is a hard-coded x range for the fitted line --
# presumably chosen to cover typical home sizes; confirm against the data.
xpts = np.linspace(200,7500)
ypts = m*xpts + b

# Plotting: scatter of the raw data with the fitted line on the same axes.
axFit = df.plot.scatter(x = 'universalsize' ,y = 'prices',label=f'corr coef={cc:.2f}', alpha=0.2)
axFit.set_title("Size vs Price")
axFit.set_xlabel("Size (ft^2)", fontsize=14)
axFit.set_ylabel("Price ($)", fontsize=14)
axFit.plot(xpts,ypts,'r-', label='Linear regression line')
# Show full dollar amounts on the y axis instead of scientific notation.
axFit.ticklabel_format(style='plain', axis='y')
axFit.legend(fontsize=14)

# HistGradientBoosting

In [None]:
# Split features / target. The fixed seed keeps the split reproducible.
# NOTE(review): test_size=0.6 holds out 60% of the rows, leaving only 40% for
# training -- confirm this is intentional.
X_trainHist, X_testHist, y_trainHist, y_testHist = train_test_split(
    df[['yearbuilt','universalsize','beds','bathstotal','zipcode']],
    df['prices'],
    test_size=0.6,
    random_state=42,
)

# Sale price is a continuous target, so fit the gradient-boosting *regressor*.
# A classifier would treat every distinct price as its own class. The name
# `clf` is kept so other cells that reference it keep working.
clf = HistGradientBoostingRegressor(random_state=42)

clf.fit(X_trainHist, y_trainHist)

predicationsHist = clf.predict(X_testHist)

# R^2 on the held-out rows. (adjusted_rand_score is a clustering-agreement
# metric and is not meaningful for price predictions.) The variable name
# `accuracy` is kept for backward compatibility; it holds the R^2 score.
accuracy = metrics.r2_score(y_testHist, predicationsHist)

print(f"R^2 score: {accuracy:.2f}")
# Fraction of test predictions within 10% of the actual sale price.
customAccuracy(predicationsHist, y_testHist, 10)

In [None]:
# Side-by-side table of actual price, predicted price, and the signed
# percentage error of the prediction relative to the actual price.
checkHist = pd.DataFrame(y_testHist).assign(
    predicitons=predicationsHist,
    percent=(predicationsHist - y_testHist) / y_testHist * 100,
)
# Render floats with two decimals everywhere from here on (global pandas option).
pd.set_option('display.float_format', lambda value: f'{value:.2f}')

In [None]:
# Share of predictions landing within 20% of the actual price, followed by the
# first 15 rows of the comparison table.
print(customAccuracy(predicted=predicationsHist, actual=y_testHist, accuracy=20))
checkHist.iloc[:15]

# Regression

In [None]:
# Split features / target. The fixed seed keeps the split reproducible.
# NOTE(review): test_size=0.6 holds out 60% of the rows, leaving only 40% for
# training -- confirm this is intentional.
X_trainReg, X_testReg, y_trainReg, y_testReg = train_test_split(
    df[['yearbuilt','universalsize','beds','bathstotal','zipcode']],
    df['prices'],
    test_size=0.6,
    random_state=42,
)

# Neural networks are sensitive to feature scale, and these columns presumably
# differ by orders of magnitude (e.g. beds vs. zipcode). Standardise the inputs
# inside a pipeline so the scaler is fit on the training rows only and
# `regr.predict` / `regr.score` still accept raw, unscaled features.
regr = make_pipeline(
    preprocessing.StandardScaler(),
    MLPRegressor(random_state=42, max_iter=5000),
).fit(X_trainReg, y_trainReg)
predictionsReg = regr.predict(X_testReg)

# R^2 on the held-out rows (Pipeline.score delegates to the final estimator).
regr.score(X_testReg, y_testReg)

In [None]:
# Side-by-side table of actual price, predicted price, and the signed
# percentage error of the prediction relative to the actual price.
checkReg = pd.DataFrame(y_testReg).assign(
    predicitons=predictionsReg,
    percent=(predictionsReg - y_testReg) / y_testReg * 100,
)
# Render floats with two decimals everywhere from here on (global pandas option).
pd.set_option('display.float_format', lambda value: f'{value:.2f}')

In [None]:
# Share of predictions landing within 20% of the actual price, followed by the
# first 15 rows of the comparison table.
print(customAccuracy(predicted=predictionsReg, actual=y_testReg, accuracy=20))
checkReg.iloc[:15]

In [None]:
# Actual vs. predicted sale price for the test rows. Points on the diagonal
# are perfect predictions.
fig, ax = plt.subplots()
# Both axes share the same 0-6,000,000 window so the y = x diagonal runs corner
# to corner; points outside this window are not shown.
ax.set_ylim([0, 6000000])
ax.set_xlim([0, 6000000])
# Label both artists so the legend has entries to show (calling legend() with
# no labelled artists produces an empty legend and a warning).
ax.scatter(y_testReg, predictionsReg, alpha=0.3, label="Test-set predictions")
ax.axline((0, 0), slope=1, label="Perfect prediction (y = x)")
ax.set_xlabel("Actual Value", fontsize=14)
ax.set_ylabel("Predicted Value", fontsize=14)
# Show full dollar amounts on both axes instead of scientific notation.
ax.ticklabel_format(style='plain', axis='both')
ax.legend(fontsize=14)