# linear regression

#### imports and misc

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model as lm
import sklearn.metrics as metrics
from dataset import x_train, x_test, y_train, y_test
from pprint import pprint
import seaborn as sns


In [None]:
def set_color(_fig, _ax):
    """Apply the dark colour scheme to a figure and one of its axes.

    The figure and axes backgrounds are filled with ``#1b212c``; the four
    spines, both axis labels, the title and the tick marks/labels on both
    axes are coloured white, and a faint grid (alpha 0.1) is switched on.

    Args:
        _fig: Object exposing ``patch.set_facecolor`` (a matplotlib figure).
        _ax: Object exposing ``patch``, ``spines``, ``xaxis``, ``yaxis``,
            ``title``, ``grid`` and ``tick_params`` (a matplotlib axes).
    """
    background = '#1b212c'
    foreground = 'white'

    # Same background for the figure canvas and the plotting area.
    for surface in (_fig, _ax):
        surface.patch.set_facecolor(background)

    # Frame around the plotting area.
    for side in ('bottom', 'top', 'left', 'right'):
        _ax.spines[side].set_color(foreground)

    # Axis labels.
    for axis in (_ax.xaxis, _ax.yaxis):
        axis.label.set_color(foreground)

    _ax.grid(alpha=0.1)
    _ax.title.set_color(foreground)

    # Tick marks and tick labels.
    for axis_name in ('x', 'y'):
        _ax.tick_params(axis=axis_name, colors=foreground)

#### scatter plot of measured vs. predicted values + identity (y = x) reference line

In [None]:
# Fit an ordinary least-squares model on the training split and predict
# the held-out test targets (``fit`` returns the estimator itself).
regr = lm.LinearRegression().fit(x_train, y_train)
y_pred = regr.predict(x_test)
mae = metrics.mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)

# Measured vs. predicted scatter; points on the dashed diagonal are
# predictions that match the measured value exactly.
fig, ax = plt.subplots(figsize=(10, 10), dpi=300)
ax.scatter(y_test, y_pred, color='black')
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
set_color(fig, ax)
fig.savefig("../images/linear_reg/scatter_plot.png")
plt.show()

#### intercept and coefficients

In [None]:
# One row per feature column, holding that feature's fitted weight.
coef = pd.DataFrame(regr.coef_, x_train.columns, columns=['Coefficient'])

# Order the feature -> weight mapping by descending absolute weight;
# dict preserves insertion order, so the sorted order survives.
cl = coef.to_dict()["Coefficient"]
ranked = sorted(cl.items(), key=lambda pair: abs(pair[1]), reverse=True)
cl = dict(ranked)

print("Intercept:", regr.intercept_)
print("Coefficients sorted by impact: ")
# sort_dicts=False keeps the magnitude ordering instead of sorting by key.
pprint(cl, sort_dicts=False)

#### show MAE and the differences between y_test and y_pred

In [None]:
# Per-sample absolute prediction error on the test set.
df = pd.DataFrame()
df['actual'] = y_test
df['predicted'] = y_pred
df['diff'] = abs(y_test - y_pred)
# Only errors below 30000 are drawn in the distribution; the mean line
# further down is still computed over every row of ``df``.
df2 = df[df["diff"] < 30000]

sns.set_theme(style="ticks", palette="pastel")
fig, ax = plt.subplots(figsize=(10, 6), dpi=300)
# ``sns.distplot`` is deprecated (and removed in recent seaborn releases);
# ``histplot`` with ``kde=True, stat="density"`` is the documented
# replacement for a density histogram with a KDE overlay. The original
# ``fit_kws`` had no effect because no ``fit=`` distribution was supplied.
sns.histplot(
    df2["diff"],
    kde=True,
    stat="density",
    kde_kws=dict(cut=3),
    color="#81acc3",
    alpha=0.4,
    ax=ax,
)

# The line marks the mean of the absolute errors (the MAE), so it is
# labelled as such rather than as a median, and the legend is drawn so the
# label is actually visible.
ax.axvline(np.mean(df["diff"]), color='r', linestyle='--', label='Mean absolute error')
ax.legend()

sns.despine(offset=10, trim=True)
set_color(fig, ax)
plt.xlabel('Price deviation')
plt.xlim(0, None)
plt.savefig('../images/linear_reg/regr_error_dist.png', dpi=300)
plt.show()