In [48]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import statsmodels.formula.api as sm
import numpy as np


In [49]:
# Read the exercise data set (response y, predictors x1 and x2) and preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer='../Orginal Data/ex1/exercise.csv')
data.head(n=5)

Unnamed: 0,y,x1,x2
0,15.68,6.87,14.09
1,6.18,4.4,4.35
2,18.1,0.43,18.09
3,9.07,2.73,8.65
4,17.97,3.25,17.68


In [50]:
# Positional hold-out split: the first 40 rows are used for fitting, every
# row after that is kept aside for prediction.
train, test = data.iloc[:40], data.iloc[40:]
train.describe()

Unnamed: 0,y,x1,x2
count,40.0,40.0,40.0
mean,13.59025,5.3775,11.7815
std,5.279126,3.153414,5.946086
min,3.29,0.26,0.72
25%,9.325,2.8425,6.6525
50%,15.59,5.69,13.245
75%,18.0025,8.0425,17.3875
max,21.63,9.81,19.68


In [51]:
# Fit an ordinary-least-squares model of y on both predictors, using the
# training rows only, then show the estimated coefficients.
ols_spec = sm.ols("y ~ x1 + x2", train)
model = ols_spec.fit()
print(model.params)

Intercept    1.315135
x1           0.514810
x2           0.806920
dtype: float64


In [61]:
# Full regression report for the fitted model (fit statistics and
# coefficient table).
summary_table = model.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.972
Model:                            OLS   Adj. R-squared:                  0.971
Method:                 Least Squares   F-statistic:                     652.4
Date:                Tue, 10 Dec 2019   Prob (F-statistic):           1.41e-29
Time:                        23:51:56   Log-Likelihood:                -50.985
No. Observations:                  40   AIC:                             108.0
Df Residuals:                      37   BIC:                             113.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      1.3151      0.388      3.392      0.0

In [74]:
# Training residuals: observed response minus the model's fitted value.
fitted = model.predict(train[['x1', 'x2']])
r = train.y - fitted

# Plot the residuals in sample order.
fig2 = go.Figure(data=go.Scatter(y=r))
residual_labels = {
    "title": "Residual",
    "xaxis_title": "sample number",
    "yaxis_title": "value of residual",
}
fig2.update_layout(**residual_labels)
fig2.show()

# Summary statistics of the residuals.
print(r.describe())

count    4.000000e+01
mean    -7.327472e-16
std      8.766518e-01
min     -9.585359e-01
25%     -5.864645e-01
50%     -3.355516e-01
75%      3.973277e-01
max      2.854778e+00
dtype: float64


Model jest dobrze dopasowany do danych, o czym świadczy wysoka wartość współczynnika determinacji $R^2$ (0,972).

In [24]:
# Predict the response for the held-out test rows from their x1/x2 values.
predicted = model.predict(exog=test[['x1', 'x2']])


In [None]:
# create regression surface
#
# Evaluate the fitted plane
#     y_hat = Intercept + b_x1 * x1 + b_x2 * x2
# on a regular 300 x 300 grid spanning the observed range of both predictors.
# The results are left in `x`, `y` (grid axes) and `m` (surface heights) for
# the plotting cell.

min_x = data.x1.min()
max_x = data.x1.max()
min_y = data.x2.min()
max_y = data.x2.max()

x = np.linspace(min_x, max_x, 300)  # grid along x1
y = np.linspace(min_y, max_y, 300)  # grid along x2

coef = model.params

# Look the coefficients up by label rather than by position: the fitted
# parameters are ordered (Intercept, x1, x2), so positional access pairs the
# intercept with x1 and uses the x2 slope as the constant term.
#
# Plotly surfaces read z[i][j] as the height at (x[j], y[i]); x1 therefore
# varies along columns and x2 along rows. Broadcasting a row vector against a
# column vector yields that (len(y), len(x)) layout directly, as an ndarray.
m = (
    coef['Intercept']
    + coef['x1'] * x[np.newaxis, :]
    + coef['x2'] * y[:, np.newaxis]
)

In [79]:
# display data set, model and prediction
#
# One 3-D figure with the training observations, the model's predictions for
# the held-out rows, and the regression surface `m` evaluated over (x, y).
fig = go.Figure()
fig.add_trace(go.Scatter3d(x=train.x1, y=train.x2, z=train.y, mode='markers', name='training data'))
fig.add_trace(go.Scatter3d(x=test.x1, y=test.x2, z=predicted, mode='markers', name='predicted'))
fig.add_surface(x=x, y=y, z=m, opacity=0.7, name='data linear regression', showscale=False)

# Plotly labels 3-D axes "x", "y", "z" by default, which is misleading here:
# the figure's y axis carries x2 and its z axis carries the response y.
fig.update_layout(
    scene=dict(xaxis_title='x1', yaxis_title='x2', zaxis_title='y'),
)

fig.show()