# Linear Regression with scikit-learn

This section creates and measures a linear regression model using sklearn.

* Method: Ordinary Least Squares
* Dataset: Sample Stocks
* The model estimates the relationship between dividend yield (the predictor) and returns (the response)


In [1]:
#Import Dependencies
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
%matplotlib inline

In [8]:
# Load the sample stock data from the notebook-relative data folder.
# NOTE(review): the saved output of this cell is a FileNotFoundError for this path —
# confirm the notebook is launched from the directory that contains ./data/.
csv_path = "./data/sample_stocks.csv"
stock_data = pd.read_csv(csv_path)
stock_data.head()  # head() shows the first five rows by default

FileNotFoundError: File b'./data/sample_stocks.csv' does not exist

In [None]:
# Check the data types: display the dtype pandas inferred for each column of the CSV
stock_data.dtypes

# Fit a Linear Regression Model

In [None]:
# Build the feature matrix and the target.
# Both selections use a list of column names, so each stays a one-column DataFrame
# (2-D) rather than becoming a Series.
X = stock_data.loc[:, ['dividendyield']]
y = stock_data.loc[:, ['returns']]

In [None]:
# Create a linear regression model (unfitted; default settings)
lm = LinearRegression()
# Bare expression so the notebook displays the estimator's repr
lm

In [None]:
# Hold out 33% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=5,
)

In [None]:
# Train the linear model on the training split only; the test split stays unseen
lm.fit(X=X_train, y=Y_train)

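Intercept: the predicted value of the response variable when every predictor variable equals zero. Each slope coefficient, by contrast, represents the mean change in the response variable for one unit of change in the predictor variable while holding everything else constant. It isolates the role of one variable from all others.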

In [None]:
# Report the intercept learned by the fitted linear model
intercept = lm.intercept_
print('Estimated intercept coefficient: {}'.format(intercept))

In [None]:
# Number of coefficients:
# `coef_` has shape (n_targets, n_features) when the model is fit on a 2-D target
# (the target here is the one-column DataFrame stock_data[['returns']]), so len()
# would count targets rather than coefficients. np.size counts every fitted
# coefficient regardless of that layout.
print('Number of coefficients: {}'.format(np.size(lm.coef_)))

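Note: the regression coefficients (below) give the estimated change in the response for a one-unit change in each feature. They are not correlation coefficients, so their magnitude depends on the units of the feature.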

In [None]:
# Create a dataframe with the features and coefficients.
# np.ravel flattens `coef_` (2-D when the target is a one-column DataFrame) so each
# feature name is paired with a scalar coefficient instead of a whole row array.
fc_df = pd.DataFrame(
    list(zip(X.columns, np.ravel(lm.coef_))),
    columns=['features', 'coefficients'],
)
fc_df

In [None]:
# Scatter the two modelled columns against each other.
# The axes are labelled so the figure stands on its own: returns on x,
# dividend yield on y, matching the argument order of plt.scatter below.
fig = plt.figure(figsize=(20,10))
plt.scatter(stock_data.returns, stock_data.dividendyield)
plt.xlabel("Returns")
plt.ylabel("Dividend Yield")
plt.title("Relationship between Returns and Dividend Yield")
plt.show()

In [None]:
# Predict returns for the held-out test features, then preview the first five predictions
y_pred = lm.predict(X_test)
y_pred[:5]

In [None]:
# Score the linear model on both splits (for a regressor, score() is R^2);
# a large gap between the two numbers would indicate overfitting.
train_score = lm.score(X_train, Y_train)
test_score = lm.score(X_test, Y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

In [None]:
# Create a plot to compare actual returns (Y_test) and the predicted returns (y_pred)
fig = plt.figure(figsize=(20,10))
plt.scatter(Y_test, y_pred)
# Raw strings keep the LaTeX backslashes from being read as string escapes.
plt.xlabel(r"Actual Returns: $Y_i$")
plt.ylabel(r"Predicted Returns: $\hat{Y}_i$")
plt.title(r"Actual vs. Predicted Returns: $Y_i$ vs. $\hat{Y}_i$")
plt.show()

## Model Evaluation

### Mean Squared Error

* The average of the squared differences between the predictions and the actual values. Squaring removes the direction (sign) of each error and penalizes large errors more heavily; it measures accuracy for continuous variables.
* Always non-negative
* Values closer to zero (0) are better

In [None]:
# Mean Squared Error (MSE) of the linear model on the training split
train_predictions = lm.predict(X_train)
mse = mean_squared_error(Y_train, train_predictions)
print("MSE Training Data: {}".format(mse))

In [None]:
# MSE of the linear model on the held-out test split
test_predictions = lm.predict(X_test)
mse_test = mean_squared_error(Y_test, test_predictions)
print("MSE Test Data: {}".format(mse_test))

### Coefficient of Determination ($R^2$)

* The proportion of the variability in the response that is explained by its relationship to the model's features; it indicates how well the model is predicting.
* A score of 1 means a perfect prediction
* A score of 0 means the model always predicts the expected value of y, disregarding the input features

In [None]:
# R^2 of the test-set predictions, printed to two decimal places
r2_test = r2_score(Y_test, y_pred)
print("Variance Score: %.2f" % r2_test)

### Residual Plot

**Residuals**: the difference between the predictions and the actuals.


**Interpretation**: If the model is working well, the residuals should be randomly scattered around the zero line. If there is structure in the residuals, the model is not capturing something, such as an interaction between two variables or a time dependence. Check the parameters of your model.

In [None]:
# Create a residual plot (residual = prediction - actual).
# Predict once per split and flatten everything to 1-D so the subtraction is
# element-wise on plain arrays rather than mixing an ndarray with a DataFrame.
train_pred = np.ravel(lm.predict(X_train))
test_pred = np.ravel(lm.predict(X_test))
train_resid = train_pred - np.ravel(Y_train)
test_resid = test_pred - np.ravel(Y_test)

fig = plt.figure(figsize=(20,10))
plt.scatter(train_pred, train_resid, c='b', s=40, alpha=0.5)
plt.scatter(test_pred, test_resid, c='g', s=40)
# Span the zero line over the actual range of predictions; a fixed span such as
# 0-50 does not depend on this dataset and may miss the plotted points entirely.
plt.hlines(
    y=0,
    xmin=min(train_pred.min(), test_pred.min()),
    xmax=max(train_pred.max(), test_pred.max()),
)
plt.xlabel("Predicted Returns")
plt.ylabel("Residuals")
plt.title("Residual Plot Using Training (Blue) and Test (Green) Data")
plt.show()

## Logistic Regression with Sklearn

This section creates and measures a logistic regression model with sklearn.

* Method: [Logistic Regression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report



In [None]:
# Logistic regression is a classifier, so it needs discrete class labels. `y` is the
# `returns` column that the linear model above regresses on; if it is continuous,
# LogisticRegression.fit rejects it with "Unknown label type: 'continuous'".
# NOTE(review): the notebook never defines a class label. A positive-return flag
# (1 if returns > 0, else 0) is used here as an assumption — confirm the intended
# classification target and that both classes actually occur in the data.
y_class = (y['returns'] > 0).astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y_class, test_size=0.33, random_state=42
)

In [None]:
# Build a logistic regression classifier with default settings and train it
# on the training split
lr = LogisticRegression()
lr.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for each row of the held-out test features.
# Note: this rebinds `y_pred`, replacing the linear model's predictions from earlier.
y_pred = lr.predict(X_test)

In [None]:
# Score the logistic regression model `lr` fitted in this section — not `lm`, the
# linear regression model from the previous section. For a classifier, score()
# reports mean accuracy.
print(f"Training Data Score: {lr.score(X_train, y_train)}")
print(f"Testing Data Score: {lr.score(X_test, y_test)}")

In [None]:
# Compare the actual test labels (y_test) with the predicted labels (y_pred)
plt.title("Actual vs. Predicted")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.scatter(y_test, y_pred)
plt.show()

In [None]:
# Fraction of test labels predicted correctly, printed to two decimal places
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy Score: %.2f" % accuracy)

In [None]:
# 5-fold cross-validation on the training split, using a fresh (unfitted) classifier
# so the folds do not reuse the already-trained `lr`
clf = LogisticRegression()
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=5)
print(scores)

In [None]:
# Text summary of classification metrics for the test-set predictions
report = classification_report(y_test, y_pred)
print(report)

## Lasso Regression with scikit-learn

This section creates and measures a LASSO regression model using sklearn.


In [None]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

import matplotlib.pyplot as plt
%matplotlib inline
    

In [None]:
# Re-split the original X / y for the Lasso model: 30% held out, fixed seed.
# This rebinds X_train / X_test / y_train / y_test from the previous section.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Create an instance of a Lasso Regression model and fit it on the training split
# NOTE(review): the `normalize` argument was deprecated in scikit-learn 1.0 and removed
# in 1.2, where this call raises TypeError. Migrating (e.g. scaling the features in a
# Pipeline and adjusting alpha) changes the fitted coefficients, so confirm the target
# scikit-learn version before changing it.
model = Lasso(alpha=0.3, normalize=True)
model.fit(X_train, y_train)

In [None]:
# Report the intercept learned by the fitted Lasso model
lasso_intercept = model.intercept_
print('Estimated intercept coefficient: {}'.format(lasso_intercept))

In [None]:
# Create a dataframe with the features and coefficients.
# np.ravel guarantees a flat 1-D array: depending on the scikit-learn version, Lasso's
# `coef_` for a single feature and a one-column target can be 0-d or 1-d, and zip()
# cannot iterate over a 0-d array.
fc_df = pd.DataFrame(
    list(zip(X.columns, np.ravel(model.coef_))),
    columns=['features', 'coefficients'],
)
fc_df.head()

In [None]:
# Number of rows in the feature/coefficient table
fc_df.shape[0]

In [None]:
# Predict with the Lasso model on the held-out test features.
# Note: this rebinds `y_pred`, replacing the predictions from the previous section.
y_pred = model.predict(X_test)

In [None]:
# Create a plot to compare actual returns (y_test) and the predicted returns (y_pred)
fig = plt.figure(figsize=(20,10))
plt.scatter(y_test, y_pred)
# Raw strings keep the LaTeX backslashes from being read as string escapes.
plt.xlabel(r"Actual Returns: $Y_i$")
plt.ylabel(r"Predicted Returns: $\hat{Y}_i$")
plt.title(r"Actual vs. Predicted Returns: $Y_i$ vs. $\hat{Y}_i$")
plt.show()

In [None]:
# Score the Lasso `model` fitted in this section — not `lm`, the plain linear
# regression from the first section. For a regressor, score() reports R^2.
print(f"Training Data Score: {model.score(X_train, y_train)}")
print(f"Testing Data Score: {model.score(X_test, y_test)}")

In [None]:
# Mean Squared Error (MSE) of the Lasso model on the training split
lasso_train_predictions = model.predict(X_train)
mse = mean_squared_error(y_train, lasso_train_predictions)
print("MSE Training Data: {}".format(mse))

In [None]:
# MSE of the Lasso model on the held-out test split
lasso_test_predictions = model.predict(X_test)
lasso_mse_test = mean_squared_error(y_test, lasso_test_predictions)
print("MSE Test Data: {}".format(lasso_mse_test))

In [None]:
# R^2 of the Lasso test-set predictions, printed to two decimal places
lasso_r2_test = r2_score(y_test, y_pred)
print("Variance Score: %.2f" % lasso_r2_test)

In [None]:
# # Create a residual plot
# fig = plt.figure(figsize=(20,10))
# plt.scatter(model.predict(X_train), model.predict(X_train) - y_train, c='b', s=40, alpha=0.5)
# plt.scatter(model.predict(X_test), model.predict(X_test) - y_test, c='g', s=40)
# plt.hlines(y=0, xmin=0, xmax=50)
# # plt.ylabel("Residuals")
# # plt.title("Residual Plot Using Training (Blue) and Test (Green) Data")
# plt.show()