# Regresja Liniowa

### Generowanie danych

In [None]:
import numpy as np
import pandas as pd
# Seed the global NumPy RNG so that "Restart & Run All" regenerates identical
# CSV files (and therefore identical fits and plots) on every run.
SEED = 42
np.random.seed(SEED)

# 1d: target = 4 + 3*x plus standard-normal noise, x drawn from [0, 1)
n = 100
X = np.random.rand(n)
y = 4 + 3*X + np.random.randn(n)
pd.DataFrame({"zmienna_1": X, "target": y}).to_csv("dane_1.csv", index=False)

# 2d: target = 4 - 3*x1 + 5*x2 with noise scaled down by a factor of 100
n = 100
X1 = np.random.rand(n)
X2 = np.random.rand(n)
y = 4 -3*X1 + 5*X2 + np.random.randn(n) / 100
pd.DataFrame({"zmienna_1": X1, 
              "zmienna_2": X2, 
              "target": y}).to_csv("dane_2.csv", index=False)

# non linear (quadratic): target = x^2 + 2x - 3 plus noise, x uniform on [-10, 10)
n = 1_000
X = np.random.uniform(-10, 10, n)
y =  X*X + 2*X -3 + 5 * np.random.randn(n)
pd.DataFrame({"zmienna_1": X, "target": y}).to_csv("dane_sq.csv", index=False)

# non linear (cubic): target = x^3 + 3x^2 + 2x - 3 plus noise
n = 100
X = np.random.uniform(-10, 10, n)
y =  X**3 + 3*X*X + 2*X -3 + 50 * np.random.randn(n)
pd.DataFrame({"zmienna_1": X, "target": y}).to_csv("dane_cb.csv", index=False)

# tiny, very noisy linear sample (only 10 points): target = 0.2x - 3 plus noise
n = 10
X = np.random.uniform(-10, 10, n)
y =  0.2*X -3 + 10* np.random.randn(n)
pd.DataFrame({"zmienna_1": X, "target": y}).to_csv("dane_pl.csv", index=False)

### Regresja jednowymiarowa

In [None]:
import pandas as pd

# Load the 1-D dataset and append a constant column of ones so the intercept
# can be estimated as an ordinary coefficient.
df = pd.read_csv("dane_1.csv").assign(intercept=1)
df

In [None]:
# Design matrix (feature column followed by the column of ones) and the
# target kept as a 2-D column so the matrix products below line up.
feature_cols = ['zmienna_1', 'intercept']
X = df[feature_cols].to_numpy()
y = df[['target']].to_numpy()

$\hat{\theta} = (X^T \cdot X) ^ {-1} \cdot X^T \cdot y$

In [None]:
# Closed-form least-squares solution (normal equation); the rows of theta
# follow the column order of X.
theta = np.linalg.inv(X.T @ X) @ X.T @ y
theta

In [None]:
# Evaluate the fitted line: theta[0] multiplies 'zmienna_1', theta[1] is the
# coefficient of the constant column.
slope, intercept = theta[0], theta[1]
y_pred = intercept + slope * df['zmienna_1']

In [None]:
# `plt` is used by every plotting cell in this notebook but was never
# imported, so a fresh kernel failed here with a NameError.
import matplotlib.pyplot as plt

# Fitted line: theta[0] is the slope for 'zmienna_1', theta[1] the intercept
# (same order as the columns of the design matrix X).
y_pred = theta[1] + theta[0] * df['zmienna_1']
plt.scatter(df['zmienna_1'], df['target'], alpha=0.3)
plt.plot(df['zmienna_1'], y_pred, c='red')
plt.xlabel('zmienna_1')
plt.ylabel('target')
plt.show()

### Metoda najmniejszych kwadratów – `statsmodels`

http://net-informations.com/ds/mla/ols.htm

In [4]:
!pip install statsmodels

Collecting statsmodels
  Downloading statsmodels-0.13.2-cp39-cp39-macosx_10_9_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m23.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting patsy>=0.5.2
  Downloading patsy-0.5.2-py2.py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.7/233.7 KB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: patsy, statsmodels
Successfully installed patsy-0.5.2 statsmodels-0.13.2


In [5]:
import pandas
import statsmodels.api as sm
# Ordinary least squares of Price on Food_Quality using statsmodels.
df = pandas.read_csv("restaurants.csv")
Y = df['Price']
# add_constant adds the constant column that serves as the intercept term.
X = sm.add_constant(df['Food_Quality'])
model = sm.OLS(Y, X).fit()
summary = model.summary()
print(summary)

                            OLS Regression Results                            
Dep. Variable:                  Price   R-squared:                       0.321
Model:                            OLS   Adj. R-squared:                  0.300
Method:                 Least Squares   F-statistic:                     15.15
Date:                Sun, 10 Apr 2022   Prob (F-statistic):           0.000474
Time:                        22:48:41   Log-Likelihood:                -111.50
No. Observations:                  34   AIC:                             227.0
Df Residuals:                      32   BIC:                             230.0
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          -23.9019     17.718     -1.349   

### Regresja wielowymiarowa

In [None]:
import pandas as pd

# Load the two-feature dataset and append the constant column used for the
# intercept.
df = pd.read_csv("dane_2.csv").assign(intercept=1)
df

In [None]:
# Design matrix: both features followed by the column of ones; the target is
# kept as a 2-D column.
feature_cols = ['zmienna_1', 'zmienna_2', 'intercept']
X = df[feature_cols].to_numpy()
y = df[['target']].to_numpy()

In [None]:
# Normal-equation solution; the rows of theta follow the column order of X.
theta = np.linalg.inv(X.T @ X) @ X.T @ y
theta

In [None]:
# Coefficients follow the column order of the design matrix:
# zmienna_1, zmienna_2, intercept.
w1, w2, b = theta.ravel()

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

# Both features go on the horizontal axes and the target on the vertical one,
# so the data points and the fitted plane share one coordinate system.
ax.scatter(df['zmienna_1'], df['zmienna_2'], df['target'])

x = df['zmienna_1'].values
z = df['zmienna_2'].values

(xx, zz) = np.meshgrid(np.arange(x.min(), x.max(), 0.01), np.arange(z.min(), z.max(), 0.01))

# Evaluate the fitted regression plane on the grid. It is stored under its
# own name so that the target vector `y` is not overwritten.
y_plane = w1 * xx + w2 * zz + b

ax.plot_surface(xx, zz, y_plane, alpha=0.3)
ax.set_xlabel('zmienna_1')
ax.set_ylabel('zmienna_2')
ax.set_zlabel('target')
plt.show()


### Feature selection - backward elimination

### Regresja wielomianowa

In [None]:
import pandas as pd

# Load the quadratic dataset and append the constant column used for the
# intercept.
df = pd.read_csv("dane_sq.csv").assign(intercept=1)

In [None]:
# Linear design matrix (feature plus column of ones) and 2-D target column.
feature_cols = ['zmienna_1', 'intercept']
X = df[feature_cols].to_numpy()
y = df[['target']].to_numpy()

In [None]:
# Normal-equation solution for a straight line through the quadratic data.
theta = np.linalg.inv(X.T @ X) @ X.T @ y
theta

In [None]:
# Straight-line fit drawn over the quadratic data.
feature_col = df['zmienna_1']
y_pred = theta[1] + theta[0] * feature_col
plt.scatter(feature_col, df['target'], alpha=0.3)
plt.plot(feature_col, y_pred, c='red')
plt.show()

In [None]:
from sklearn.linear_model import LinearRegression
import pandas as pd

# Same linear fit as above, this time delegated to scikit-learn, which
# estimates the intercept itself, so no constant column is added.
df = pd.read_csv("dane_sq.csv")

X = df[['zmienna_1']].to_numpy()
y = df[['target']].to_numpy()

model = LinearRegression()
model.fit(X, y)

In [None]:
# Report the fitted parameters (label spelling corrected: "Coefficients").
print(f"Coefficients: {model.coef_}")
print(f"Intercept:    {model.intercept_}")

In [None]:
# Predictions of the scikit-learn line drawn over the raw data.
feature_col = df['zmienna_1']
y_pred = model.predict(X)
plt.scatter(feature_col, df['target'], alpha=0.3)
plt.plot(feature_col, y_pred, c='red')
plt.show()

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import pandas as pd

# Reload the quadratic dataset; feature and target are kept as 2-D columns.
df = pd.read_csv("dane_sq.csv")
X = df[['zmienna_1']].to_numpy()
y = df[['target']].to_numpy()

In [None]:
# Keep the fitted transformer under a name (as the degree-10 example further
# down does) so new inputs can later be expanded with poly.transform(...).
# include_bias=False: LinearRegression estimates the intercept on its own.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

In [None]:
# Linear regression on the polynomially expanded features.
model = LinearRegression()
model.fit(X_poly, y)

# Report the fitted parameters (label spelling corrected: "Coefficients").
print(f"Coefficients: {model.coef_}")
print(f"Intercept:    {model.intercept_}")

In [None]:
# Predictions are drawn as small red dots (not a line) because the rows of
# df are not sorted by the feature.
feature_col = df['zmienna_1']
y_pred = model.predict(X_poly)
plt.scatter(feature_col, df['target'], alpha=0.3)
plt.scatter(feature_col, y_pred, c='red', s=0.5)
plt.show()

In [None]:
# Degree-10 polynomial fitted to the small dane_pl.csv sample. Rows are
# sorted by the feature so the data is in left-to-right order.
df = pd.read_csv("dane_pl.csv").sort_values('zmienna_1')
X = df[['zmienna_1']].values
y = df[['target']].values

poly = PolynomialFeatures(degree=10, include_bias=False)
X_poly = poly.fit_transform(X)

model = LinearRegression()
model.fit(X_poly, y)

# Report the fitted parameters (label spelling corrected: "Coefficients").
print(f"Coefficients: {model.coef_}")
print(f"Intercept:    {model.intercept_}")

y_pred = model.predict(X_poly)

# Red squares: model predictions at the training inputs.
plt.scatter(df['zmienna_1'], y_pred, c='red', alpha=0.3, marker='s', s=100)

# Red curve: the fitted polynomial evaluated on a dense grid spanning the
# observed feature range, showing how it behaves between the training points.
xt = np.linspace(X.min(), X.max(), 100).reshape(-1, 1)
yt = model.predict(poly.transform(xt))
plt.plot(xt, yt, c='red')

# Black dots: the observed targets.
plt.scatter(df['zmienna_1'], df['target'], color='black')

plt.show()

### Gradient prosty

In [None]:
import pandas as pd

# Reload the 1-D dataset with a constant column and build the design matrix
# (feature, ones) plus the 2-D target column for gradient descent.
df = pd.read_csv("dane_1.csv").assign(intercept=1)
X = df[['zmienna_1', 'intercept']].to_numpy()
y = df[['target']].to_numpy()

In [None]:
# Batch gradient descent on the mean squared error; the line implied by
# theta after each update is drawn over the data.
eta, n_iterations = 0.1, 20  # learning rate, number of update steps
m = len(df)                  # number of samples

theta = np.random.randn(2, 1)  # random initialization

plt.figure(figsize=(10, 7))
plt.scatter(df['zmienna_1'], df['target'], alpha=0.3)

for iteration in range(n_iterations):
    # Gradient of the MSE with respect to theta: (2/m) * X^T (X theta - y)
    residuals = X @ theta - y
    gradients = 2/m * (X.T @ residuals)
    theta = theta - eta * gradients

    # Line implied by the current parameters (theta[0] slope, theta[1] intercept).
    y_pred = theta[1] + theta[0] * df['zmienna_1']
    plt.plot(df['zmienna_1'], y_pred,
             c='red', linewidth=0.5, linestyle='--', alpha=0.5)

plt.show()
print(theta)