# Simple Linear Regression

In [1]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [2]:
# Single predictor written directly as a column: 6 rows, 1 feature.
x = np.array([[5], [15], [25], [35], [45], [55]])
# Observed responses, one per row of x.
y = np.array([5, 20, 14, 32, 22, 38])


In [3]:
print(x)

[[ 5]
 [15]
 [25]
 [35]
 [45]
 [55]]


In [4]:
print(y)

[ 5 20 14 32 22 38]


In [5]:
model = LinearRegression()

In [6]:
model.fit(x, y)

LinearRegression()

In [7]:
# Shorthand for the two preceding cells: fit() hands back the fitted estimator
# (see the LinearRegression() repr echoed above), so construction and fitting
# can be chained into a single expression.
model = LinearRegression().fit(x, y)

In [8]:
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.715875613747954


In [9]:
print('intercept:', model.intercept_)

intercept: 5.633333333333333


In [10]:
print('slope:', model.coef_)

slope: [0.54]


In [11]:
# Same data, but with y passed as a 6x1 column instead of a flat array. The
# fitted intercept then prints as a one-element array rather than a scalar.
new_model = LinearRegression().fit(x, y.reshape((-1, 1)))
print('intercept:', new_model.intercept_)

intercept: [5.63333333]


In [12]:
print('slope:', new_model.coef_)

slope: [[0.54]]


In [13]:
y_pred = model.predict(x)
print('predicted response:', y_pred, sep='\n')

predicted response:
[ 8.33333333 13.73333333 19.13333333 24.53333333 29.93333333 35.33333333]


In [14]:
# Reproduce predict() by hand from the fitted intercept and slope. Because x is
# a 6x1 column, the result keeps that column shape (see the printed output)
# instead of the flat array predict() returned in the previous cell.
y_pred = model.intercept_ + model.coef_ * x
print('predicted response:', y_pred, sep='\n')

predicted response:
[[ 8.33333333]
 [13.73333333]
 [19.13333333]
 [24.53333333]
 [29.93333333]
 [35.33333333]]


In [15]:
# Five unseen inputs (0..4), lifted to a column so they match the layout of x.
x_new = np.arange(5)[:, np.newaxis]
print(x_new)


[[0]
 [1]
 [2]
 [3]
 [4]]


In [16]:
y_new = model.predict(x_new)
print(y_new)

[5.63333333 6.17333333 6.71333333 7.25333333 7.79333333]


# Multiple Linear Regression

In [17]:
import numpy as np
from sklearn.linear_model import LinearRegression

In [18]:
# Eight observations of two predictors: one row per observation.
x = np.array([
    [0, 1],
    [5, 1],
    [15, 2],
    [25, 5],
    [35, 11],
    [45, 15],
    [55, 34],
    [60, 35],
])
# Matching responses, in the same row order as x.
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

In [19]:
print(x)

[[ 0  1]
 [ 5  1]
 [15  2]
 [25  5]
 [35 11]
 [45 15]
 [55 34]
 [60 35]]


In [20]:
print(y)

[ 4  5 20 14 32 22 38 43]


In [21]:
model = LinearRegression().fit(x, y)

In [22]:
r_sq = model.score(x, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.8615939258756776


In [23]:
print('intercept:', model.intercept_)

intercept: 5.52257927519819


In [24]:
print('slope:', model.coef_)

slope: [0.44706965 0.25502548]


In [25]:
y_pred = model.predict(x)
print('predicted response:', y_pred, sep='\n')

predicted response:
[ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [26]:
# Hand-rolled prediction: scale each feature column by its coefficient, add the
# scaled features together row by row, then shift by the intercept.
y_pred = (x * model.coef_).sum(axis=1) + model.intercept_
print('predicted response:', y_pred, sep='\n')

predicted response:
[ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [27]:
# Ten consecutive integers arranged as five new two-feature observations.
x_new = np.arange(10).reshape(5, 2)
print(x_new)


[[0 1]
 [2 3]
 [4 5]
 [6 7]
 [8 9]]


In [28]:
y_new = model.predict(x_new)
print(y_new)

[ 5.77760476  7.18179502  8.58598528  9.99017554 11.3943658 ]


# Polynomial Regression

In [29]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [30]:
# Single predictor as a 6x1 column.
x = np.array([[5], [15], [25], [35], [45], [55]])
# Responses for the polynomial fit, one per row of x.
y = np.array([15, 11, 2, 8, 25, 32])

In [31]:
transformer = PolynomialFeatures(degree=2, include_bias=False)

In [32]:
transformer.fit(x)

PolynomialFeatures(include_bias=False)

In [33]:
x_ = transformer.transform(x)

In [34]:
# Same settings as the transformer built above, but fitted and applied to x in
# a single fit_transform() call.
x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)

In [35]:
print(x_)

[[   5.   25.]
 [  15.  225.]
 [  25.  625.]
 [  35. 1225.]
 [  45. 2025.]
 [  55. 3025.]]


In [36]:
model = LinearRegression().fit(x_, y)

In [37]:
r_sq = model.score(x_, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.8908516262498563


In [38]:
print('intercept:', model.intercept_)

intercept: 21.372321428571418


In [39]:
print('coefficients:', model.coef_)

coefficients: [-1.32357143  0.02839286]


In [40]:
x_ = PolynomialFeatures(degree=2, include_bias=True).fit_transform(x)

In [41]:
print(x_)

[[1.000e+00 5.000e+00 2.500e+01]
 [1.000e+00 1.500e+01 2.250e+02]
 [1.000e+00 2.500e+01 6.250e+02]
 [1.000e+00 3.500e+01 1.225e+03]
 [1.000e+00 4.500e+01 2.025e+03]
 [1.000e+00 5.500e+01 3.025e+03]]


In [42]:
# x_ now carries a leading column of ones (include_bias=True above), so the
# model's own intercept is switched off. The constant term then appears as the
# first entry of coef_, and intercept_ is reported as 0.0.
model = LinearRegression(fit_intercept=False).fit(x_, y)

In [43]:
r_sq = model.score(x_, y)
print('coefficient of determination:', r_sq)

coefficient of determination: 0.8908516262498564


In [44]:
print('intercept:', model.intercept_)

intercept: 0.0


In [45]:
print('coefficients:', model.coef_)

coefficients: [21.37232143 -1.32357143  0.02839286]


In [46]:
y_pred = model.predict(x_)

In [47]:
print('predicted response:', y_pred, sep='\n')

predicted response:
[15.46428571  7.90714286  6.02857143  9.82857143 19.30714286 34.46428571]


# Multiple input variables

In [48]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

In [49]:
# Two-predictor data set: 8 rows, one observation per row.
x = np.array([[0, 1], [5, 1], [15, 2], [25, 5],
              [35, 11], [45, 15], [55, 34], [60, 35]])
# Response values aligned with the rows of x.
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

In [50]:
# Degree-2 expansion of both predictors, without a bias column. The model
# fitted on x_ below reports five coefficients, one per generated column.
x_ = PolynomialFeatures(degree=2, include_bias=False).fit_transform(x)

In [51]:
model = LinearRegression().fit(x_, y)

In [52]:
# Collect the fit statistics once; the following cells only print them.
r_sq = model.score(x_, y)
intercept = model.intercept_
coefficients = model.coef_

In [53]:
y_pred = model.predict(x_)

In [54]:
print('coefficient of determination:', r_sq)

coefficient of determination: 0.945370144912782


In [55]:
print('intercept:', intercept)

intercept: 0.8430556452396445


In [56]:
print('coefficients:', coefficients, sep='\n')

coefficients:
[ 2.44828275  0.16160353 -0.15259677  0.47928683 -0.4641851 ]


In [57]:
print('predicted response:', y_pred, sep='\n')

predicted response:
[ 0.54047408 11.36340283 16.07809622 15.79139    29.73858619 23.50834636
 39.05631386 41.92339046]


# Advanced Linear Regression With statsmodels

In [58]:
import numpy as np
import statsmodels.api as sm

In [59]:
# Predictors (8 observations x 2 features) and responses for the OLS example.
x = np.array(
    [[0, 1], [5, 1], [15, 2], [25, 5], [35, 11], [45, 15], [55, 34], [60, 35]]
)
y = np.array([4, 5, 20, 14, 32, 22, 38, 43])

In [60]:
# Prepend a column of ones to x (visible in the printout below); the
# coefficient estimated for that column plays the role of the intercept.
# Note that this rebinds x, so later cells see the three-column version.
x = sm.add_constant(x)

In [61]:
print(x)

[[ 1.  0.  1.]
 [ 1.  5.  1.]
 [ 1. 15.  2.]
 [ 1. 25.  5.]
 [ 1. 35. 11.]
 [ 1. 45. 15.]
 [ 1. 55. 34.]
 [ 1. 60. 35.]]


In [62]:
print(y)

[ 4  5 20 14 32 22 38 43]


In [63]:
# Argument order is response first, regressors second — the reverse of the
# fit(x, y) calls used with scikit-learn earlier in this notebook. x already
# includes the constant column added above.
model = sm.OLS(y, x)

In [64]:
results = model.fit()

In [65]:
print(results.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.862
Model:                            OLS   Adj. R-squared:                  0.806
Method:                 Least Squares   F-statistic:                     15.56
Date:                Mon, 24 Jan 2022   Prob (F-statistic):            0.00713
Time:                        13:14:49   Log-Likelihood:                -24.316
No. Observations:                   8   AIC:                             54.63
Df Residuals:                       5   BIC:                             54.87
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          5.5226      4.431      1.246      0.2



In [66]:
 print('coefficient of determination:', results.rsquared)

coefficient of determination: 0.8615939258756776


In [67]:
print('adjusted coefficient of determination:', results.rsquared_adj)

adjusted coefficient of determination: 0.8062314962259487


In [68]:
print('regression coefficients:', results.params)

regression coefficients: [5.52257928 0.44706965 0.25502548]


In [69]:
print('predicted response:', results.fittedvalues, sep='\n')

predicted response:
[ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [70]:
print('predicted response:', results.predict(x), sep='\n')

predicted response:
[ 5.77760476  8.012953   12.73867497 17.9744479  23.97529728 29.4660957
 38.78227633 41.27265006]


In [71]:
x_new = sm.add_constant(np.arange(10).reshape((-1, 2)))
print(x_new)

[[1. 0. 1.]
 [1. 2. 3.]
 [1. 4. 5.]
 [1. 6. 7.]
 [1. 8. 9.]]


In [72]:
y_new = results.predict(x_new)
print(y_new)

[ 5.77760476  7.18179502  8.58598528  9.99017554 11.3943658 ]


# Installing sklearn

In [73]:
!python -m pip install -U "scikit-learn==0.23.1"

Collecting scikit-learn==0.23.1
  Using cached scikit_learn-0.23.1-cp38-cp38-win_amd64.whl (6.8 MB)
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.0.2
    Uninstalling scikit-learn-1.0.2:
      Successfully uninstalled scikit-learn-1.0.2


ERROR: Could not install packages due to an OSError: [WinError 5] Access is denied: 'C:\\Users\\LEELA\\anaconda3\\Lib\\site-packages\\~-learn\\.libs\\vcomp140.dll'
Consider using the `--user` option or check the permissions.



# train_test_split()

In [74]:
import numpy as np
from sklearn.model_selection import train_test_split

In [75]:
# Integers 1..24 laid out as 12 two-feature rows.
x = np.arange(1, 25).reshape(-1, 2)
# Binary label for each of the 12 rows.
y = np.array([0, 1, 1, 0, 1, 0,
              0, 1, 1, 0, 1, 0])

In [76]:
x

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12],
       [13, 14],
       [15, 16],
       [17, 18],
       [19, 20],
       [21, 22],
       [23, 24]])

In [77]:
y

array([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0])

In [78]:
 x_train, x_test, y_train, y_test = train_test_split(x, y)

In [79]:
x_train

array([[19, 20],
       [23, 24],
       [21, 22],
       [ 5,  6],
       [ 3,  4],
       [13, 14],
       [17, 18],
       [ 9, 10],
       [15, 16]])

In [80]:
x_test

array([[ 7,  8],
       [11, 12],
       [ 1,  2]])

In [81]:
y_train

array([0, 0, 1, 1, 1, 0, 1, 1, 1])

In [82]:
y_test

array([0, 0, 0])

In [83]:
# An integer test_size is an absolute row count: 4 rows go to the test set (see
# x_test below) and the remaining 8 to training. random_state=4 seeds the
# shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=4, random_state=4)

In [84]:
x_train

array([[17, 18],
       [ 5,  6],
       [23, 24],
       [ 1,  2],
       [ 3,  4],
       [11, 12],
       [15, 16],
       [21, 22]])

In [85]:
x_test

array([[ 7,  8],
       [ 9, 10],
       [13, 14],
       [19, 20]])

In [86]:
y_train

array([1, 1, 0, 0, 1, 0, 1, 1])

In [87]:
y_test

array([0, 1, 0, 0])

In [88]:
# A fractional test_size (0.33 of 12 rows) again yields 4 test rows. stratify=y
# keeps the 0/1 balance of y in both parts: the outputs below show four of each
# label in y_train and two of each in y_test.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=4, stratify=y)

In [89]:
x_train

array([[21, 22],
       [ 1,  2],
       [15, 16],
       [13, 14],
       [17, 18],
       [19, 20],
       [23, 24],
       [ 3,  4]])

In [90]:
x_test

array([[11, 12],
       [ 7,  8],
       [ 5,  6],
       [ 9, 10]])

In [91]:
y_train

array([1, 0, 1, 0, 1, 0, 0, 1])

In [92]:
y_test

array([0, 0, 1, 1])

In [93]:
# shuffle=False keeps the original row order: the first 8 rows become the
# training set and the last 4 the test set (compare the outputs below with x).
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, shuffle=False)

In [94]:
x_train

array([[ 1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8],
       [ 9, 10],
       [11, 12],
       [13, 14],
       [15, 16]])

In [95]:
x_test

array([[17, 18],
       [19, 20],
       [21, 22],
       [23, 24]])

In [96]:
y_train

array([0, 1, 1, 0, 1, 0, 0, 1])

In [97]:
y_test

array([1, 0, 1, 0])

# Supervised Machine Learning With train_test_split()

In [98]:
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [99]:
# Inputs 0..19 as a single-feature column (20 rows x 1 column).
x = np.arange(20)[:, np.newaxis]

In [100]:
# Twenty responses, one per row of x, wrapped five to a line for readability.
y = np.array([
    5, 12, 11, 19, 30,
    29, 23, 40, 51, 54,
    74, 62, 68, 73, 89,
    84, 89, 101, 99, 106,
])


In [101]:
x

array([[ 0],
       [ 1],
       [ 2],
       [ 3],
       [ 4],
       [ 5],
       [ 6],
       [ 7],
       [ 8],
       [ 9],
       [10],
       [11],
       [12],
       [13],
       [14],
       [15],
       [16],
       [17],
       [18],
       [19]])

In [102]:
y

array([  5,  12,  11,  19,  30,  29,  23,  40,  51,  54,  74,  62,  68,
        73,  89,  84,  89, 101,  99, 106])

In [103]:
# Hold out 8 of the 20 samples for testing, leaving 12 for training.
# random_state=0 seeds the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=8, random_state=0)

In [104]:
 model = LinearRegression().fit(x_train, y_train)

In [105]:
model.intercept_

3.1617195496417523

In [106]:
model.coef_

array([5.53121801])

In [107]:
model.score(x_train, y_train)

0.9868175024574795

In [108]:
model.score(x_test, y_test)

0.9465896927715023

# Regression example

In [3]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split

In [4]:
# Load the data set as a (features, target) pair instead of a Bunch object.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older releases — confirm the pinned version
# or plan a replacement data set (the scores recorded below would change).
x, y = load_boston(return_X_y=True)

In [5]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)

# Linear Regression

In [6]:
from sklearn.linear_model import LinearRegression

In [7]:
model = LinearRegression().fit(x_train, y_train)

In [8]:
model.score(x_train, y_train)

0.7668160223286261

In [9]:
model.score(x_test, y_test)

0.6882607142538013

# GradientBoostingRegressor

In [10]:
from sklearn.ensemble import GradientBoostingRegressor

In [11]:
model = GradientBoostingRegressor(random_state=0).fit(x_train, y_train)

In [12]:
model.score(x_train, y_train)

0.9859065238883613

In [13]:
model.score(x_test, y_test)

0.8530127436482149

# RandomForestRegressor

In [14]:
from sklearn.ensemble import RandomForestRegressor

In [15]:
 model = RandomForestRegressor(random_state=0).fit(x_train, y_train)

In [16]:
model.score(x_train, y_train)

0.9811695664860354

In [17]:
model.score(x_test, y_test)

0.8325867908704008