<div style="text-align:center">
    <img src="../files/monolearn-logo.png" height="150px">
    <h1>ML course</h1>
    <h3>Session 04: Linear Regression(Simple and Multiple), Boston house price prediction project</h3>
    <h4><a href="https://amzenterprise.ir/">Ali Momenzadeh</a></h4>
</div>

### Linear Regression

#### Import libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

In [None]:
# Load seaborn's "tips" example dataset; later cells rely on its
# total_bill, tip, size, sex, smoker, day and time columns.
df = sns.load_dataset('tips')

#### EDA

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
# df = df.dropna()

In [None]:
df.sample(5)

In [None]:
df.groupby('day').count()

In [None]:
# Total bill and tip per day of the week.
# numeric_only=True keeps the categorical columns (sex, smoker, time) out of the
# aggregation; pandas >= 2.0 raises TypeError when asked to sum them.
df2 = df.groupby('day').sum(numeric_only=True)
df2 = df2.drop(columns='size')  # a summed party size is not meaningful here
df2['percent'] = df2['tip'] / df2['total_bill'] * 100  # tip as a percentage of the bill
df2

In [None]:
# Totals for smokers vs. non-smokers.
# numeric_only=True skips the categorical columns, which pandas >= 2.0 refuses to sum.
df3 = df.groupby('smoker').sum(numeric_only=True)
df3['percent'] = df3['tip'] / df3['total_bill'] * 100  # tip as a percentage of the bill
df3

In [None]:
# Totals for every (day, party size) pair.
# numeric_only=True skips the categorical columns, which pandas >= 2.0 refuses to sum.
df4 = df.groupby(['day', 'size']).sum(numeric_only=True)
df4['percent'] = df4['tip'] / df4['total_bill'] * 100
# Rows whose percent is NaN (e.g. 0/0 for day/size combinations without any bill)
# carry no information, so drop them.
df4 = df4.dropna()
df4

### Storytelling - Visualization

In [None]:
sns.catplot(x="day" , kind="count", data=df)

In [None]:
sns.catplot(x="day", hue="size", kind="count", data=df)

In [None]:
# Pearson correlation between the numeric columns only. The categorical columns
# (sex, smoker, day, time) make a plain DataFrame.corr() raise on pandas >= 2.0.
corr = df.select_dtypes(include='number').corr()
sns.heatmap(corr, annot=True)

In [None]:
corr

#### Convert non-numeric values (Encoding the independent variables)

In [None]:
df

#### 1. First approach

In [None]:
# df.replace({ 'sex': {'Male':0 , 'Female':1}, 'smoker' : {'No': 0, 'Yes': 1}}, inplace=True)
# # df = df.replace({ 'sex': {'Male':0 , 'Female':1}, 'smoker' : {'No': 0 , 'Yes': 1}} )

# df.head()

In [None]:
# days = pd.get_dummies(df['day'])
# days.sample(5)

In [None]:
# days = pd.get_dummies(df['day'])
# df = pd.concat([df,days],axis=1)
# df

In [None]:
# times = pd.get_dummies(df['time'])
# df = pd.concat([df,times],axis=1)

In [None]:
# df.head()

#### 2. Second approach

In [None]:
categorical_cols = ['sex', 'smoker', 'day', 'time']

# One-hot encode the categorical columns. `df` is overwritten, so on a second run the
# original columns are already gone; encoding only the ones still present keeps the
# cell safe to re-run instead of raising KeyError.
cols_to_encode = [col for col in categorical_cols if col in df.columns]
if cols_to_encode:
    df = pd.get_dummies(df, columns=cols_to_encode)

In [None]:
df.head()

#### Train and test (Regression)

In [None]:
# Predictors: bill amount and party size.
X = df[['total_bill','size']]
# Target: the tip. Double brackets keep it as a one-column DataFrame (2-D).
Y = df[['tip']]

In [None]:
X

In [None]:
Y

In [None]:
import sklearn

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [None]:
# Hold out 25% of the rows as a test set; random_state=0 makes the split reproducible.
X_train, X_test , y_train , y_test = train_test_split(X, Y, test_size=0.25, random_state=0)

In [None]:
# Fit the linear model on the training split (fit() returns the estimator itself),
# then predict tips for the held-out test rows.
regressor = LinearRegression().fit(X_train, y_train)
predictions = regressor.predict(X_test)

In [None]:
print(predictions)

In [None]:
X_test

In [None]:
X_test.iloc[0, :]

In [None]:
predictions[0]

In [None]:
y_test

In [None]:
sns.histplot(y_test - predictions)

In [None]:
# Actual vs. predicted tips, coloured by the absolute prediction error.
# A colormap only takes effect when per-point values are passed through `c`;
# without it matplotlib ignores `cmap` and the colorbar carries no information.
# The palette is also no longer bound to `map`, which shadowed the Python builtin.
palette = sns.cubehelix_palette(as_cmap=True)
actual = np.ravel(y_test)          # flatten the one-column target to 1-D
predicted = np.ravel(predictions)  # predictions come back with shape (n, 1)

f, ax = plt.subplots()
points = ax.scatter(actual, predicted, c=np.abs(actual - predicted), cmap=palette)
f.colorbar(points, label="Absolute error")
ax.set_xlabel("Actual tip")
ax.set_ylabel("Predicted tip")
plt.show()

### Model evaluation

<img src="../files/4/metrics.jpg">
<br/>
<img src="../files/4/sst-see-ssr.jpg">
<br/>
<img src="../files/4/regression-error.png">
<br/>
<img src="../files/4/adjusted-r2.png">

𝑅^2 : It is a measure of the linear relationship between X and Y. It is interpreted as the proportion of the variance in the dependent variable that is predictable from the independent variable.

Adjusted 𝑅^2 :The adjusted R-squared compares the explanatory power of regression models that contain different numbers of predictors.

MAE (Mean Absolute Error): It is the mean of the absolute value of the errors. It measures the difference between two continuous variables, here actual and predicted values of y. 

MSE (Mean Squared Error): The mean square error (MSE) is just like the MAE, but squares the difference before summing them all instead of using the absolute value. 

RMSE (Root Mean Squared Error): It is the square root of the mean of the squared errors, i.e. the square root of the MSE.

In [None]:
# Test-set error metrics; the MSE is computed once and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, predictions)
print("R2 Score:", metrics.r2_score(y_test, predictions))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, predictions))
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))

In [None]:
# r2score = regressor.score(X_test, y_test)
# r2score

#### Do Not Predict Or Evaluate On Train Data (Because it is pointless!)

- When we say that evaluation should never be done on the training set, it means that the real performance of the model can only be estimated on a separate test set.
<br/>
- It's totally fine to calculate the performance of a system on the training data, and it's often useful (e.g. to detect overfitting by comparing it with the performance on the test set). Of course the obtained result does not represent in any way the real performance of the system, so it's important to make sure that there's no confusion by mentioning it clearly.

In [None]:
# train_predictions = regressor.predict(X_train)
# print("R2 Score:", metrics.r2_score(y_train, train_predictions))
# print("Mean Absolute Error:", metrics.mean_absolute_error(y_train, train_predictions))
# print("Mean Squared Error:", metrics.mean_squared_error(y_train, train_predictions))
# print("Root Mean Squared Error:", np.sqrt(metrics.mean_squared_error(y_train, train_predictions)))

In [None]:
# r2score = regressor.score(X_train, y_train)
# r2score

In [None]:
regressor.coef_

In [None]:
regressor.intercept_

### Predict on new data

In [None]:
X.head()

In [None]:
# The model was fitted on a DataFrame, so it remembers the feature names.
# Passing the new observation as a DataFrame with the same columns keeps the
# feature order explicit and avoids scikit-learn's "X does not have valid
# feature names" warning that a bare ndarray triggers.
new_customer = pd.DataFrame([[16.99, 2]], columns=X.columns)
regressor.predict(new_customer)

<hr/>

### Multiple-linear Regression

#### Import libraries

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
#When using the 'inline' backend, your matplotlib graphs will be included in your notebook, next to the code.

import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

#### Load and prepare data

##### Boston House Dataset

<img src="../files/4/Boston-house-price-prediction.png" width=75% />

The problem that we are going to solve here is that given a set of features that describe a house in Boston, our machine learning model must predict the house price. To train our machine learning model with boston housing data, we will be using scikit-learn’s boston dataset.

In this dataset, each row describes a boston town or suburb. There are 506 rows and 13 attributes (features) with a target column (price). https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.names

In [None]:
#imports datasets from scikit-learn
from sklearn import datasets

# load_boston was deprecated in scikit-learn 1.0 and removed in 1.2, where importing
# it raises ImportError. Use it when available; otherwise rebuild an equivalent
# object from the original StatLib file (needs network access).
try:
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    raw = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                      sep=r"\s+", skiprows=22, header=None)
    # Every record spans two physical lines: 11 features on the first line,
    # then 2 more features followed by the target on the second.
    boston = Bunch(
        data=np.hstack([raw.values[::2, :], raw.values[1::2, :2]]),
        target=raw.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
        DESCR="Boston house prices dataset: 506 rows, 13 features, "
              "target = median home value in $1000s. "
              "Source: http://lib.stat.cmu.edu/datasets/boston",
    )

In [None]:
print(boston.DESCR)

In [None]:
boston.feature_names

In [None]:
# Initializing the dataframe
data = pd.DataFrame(boston.data, columns=boston.feature_names)

In [None]:
# See head of the dataset
data.head()

In [None]:
#Adding target variable to dataframe
data['PRICE'] = boston.target 
# Median value of owner-occupied homes in $1000s

In [None]:
data.head()

In [None]:
#Check the shape of dataframe
data.shape

In [None]:
data.columns

In [None]:
data.dtypes

In [None]:
# Identifying the unique number of values in the dataset
data.nunique()

In [None]:
# Check for missing values
data.isnull().sum()

In [None]:
# Viewing the data statistics
data.describe()

In [None]:
# Finding out the correlation between the features
corr = data.corr()
corr.shape

In [None]:
# Plotting the heatmap of correlation between features
plt.figure(figsize=(25,25))
sns.heatmap(corr, cbar=True, square= True, fmt='.1f', annot=True, annot_kws={'size':15}, cmap='Greens')

In [None]:
corr

### Simple VS Multiple Linear Regression

#### Simple Linear Regression (SLR)

In [None]:
# Simple linear regression: a single predictor column (RM).
# Double brackets keep X as a one-column DataFrame (2-D) rather than a Series.
X = data[['RM']]
#X = data["ZN"].values.reshape((-1,1))
# Target: the PRICE column added to the frame above.
y = data['PRICE']

In [None]:
# Splitting to training and testing data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Import library for Linear Regression
from sklearn.linear_model import LinearRegression

# Create a Linear regressor
slr = LinearRegression()

# Train the model using the training sets 
slr.fit(X_train, y_train)

In [None]:
slr.intercept_

In [None]:
slr.coef_

In [None]:
# Pair each feature name with its fitted coefficient.
# Building the frame column-wise keeps 'Coefficients' as a float column; stacking the
# names and values as rows and transposing turns every column into object dtype.
coeffcients = pd.DataFrame({'Attribute': X_train.columns, 'Coefficients': slr.coef_})
coeffcients

#### Model evaluation

In [None]:
from sklearn import metrics

In [None]:
# Model prediction on test data
y_pred = slr.predict(X_test)

In [None]:
# Test-set error metrics for the single-feature model; the MSE is computed once
# and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print('R^2:', metrics.r2_score(y_test, y_pred))
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', mse)
print('RMSE:', np.sqrt(mse))

In [None]:
# Actual vs. predicted prices on the test set (explicit figure/axes interface)
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel="Prices", ylabel="Predicted prices", title="Prices vs Predicted prices")
plt.show()

In [None]:
sns.displot(y_test-y_pred)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

In [None]:
sns.regplot(x=X_test, y=y_test, line_kws={'color': 'g'})

In [None]:
ax = plt.axes()
ax.scatter(X_test, y_test)
ax.plot(X_test, y_pred)

ax.set_xlabel('RM')
ax.set_ylabel('Price')

ax.axis('tight')

#### Multiple Linear Regression (MLR)

In [None]:
# Spliting target variable and independent variables
X = data.drop(['PRICE'], axis = 1)
y = data['PRICE']

In [None]:
# Splitting to training and testing data

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
# Import library for Linear Regression
from sklearn.linear_model import LinearRegression

# Create a Linear regressor
mlr = LinearRegression()

# Train the model using the training sets 
mlr.fit(X_train, y_train)

In [None]:
mlr.intercept_

In [None]:
mlr.coef_

In [None]:
# Pair each feature name with its fitted coefficient.
# Building the frame column-wise keeps 'Coefficients' as a float column; stacking the
# names and values as rows and transposing turns every column into object dtype.
coeffcients = pd.DataFrame({'Attribute': X_train.columns, 'Coefficients': mlr.coef_})
coeffcients

#### Model evaluation

In [None]:
# Model prediction on test data
y_pred = mlr.predict(X_test)

In [None]:
# Test-set error metrics for the multi-feature model; the MSE is computed once
# and reused for the RMSE.
mse = metrics.mean_squared_error(y_test, y_pred)
print("R2 Score:", metrics.r2_score(y_test, y_pred))
print("Mean Absolute Error:", metrics.mean_absolute_error(y_test, y_pred))
print("Mean Squared Error:", mse)
print("Root Mean Squared Error:", np.sqrt(mse))

In [None]:
# Actual vs. predicted prices on the test set (explicit figure/axes interface)
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(xlabel="Prices", ylabel="Predicted prices", title="Prices vs Predicted prices")
plt.show()

In [None]:
sns.displot(y_test-y_pred)
plt.title("Histogram of Residuals")
plt.xlabel("Residuals")
plt.ylabel("Frequency")
plt.show()

#### Polynomial Regression

Polynomial regression extends the linear model by adding extra predictors, obtained by raising each of the original predictors to a power. For example, a cubic regression uses three variables, $X$, $X^2$, and $X^3$, as predictors. This approach provides a simple way to obtain a non-linear fit to data.
<br/>
For example, if a dataset had one input feature X, then a polynomial feature would be the addition of a new feature (column) where values were calculated by squaring the values in X, e.g. X^2. This process can be repeated for each input variable in the dataset, creating a transformed version of each.

The features created after transformation include:

* The bias (the value of 1.0)
* Values raised to a power for each degree (e.g. x^1, x^2, x^3, …)
* Interactions between all pairs of features (e.g. x1 * x2, x1 * x3, …)

For example, with two input variables with values 2 and 3 and a degree of 2, the features created would be:

* 1 (the bias)
* 2^1 = 2
* 3^1 = 3
* 2^2 = 4
* 3^2 = 9
* 2 * 3 = 6

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# demonstrate the types of features created
from numpy import asarray
from sklearn.preprocessing import PolynomialFeatures
# define the toy dataset: three identical rows with the two input values 2 and 3
# (kept under its own name so the Boston `data` DataFrame is not overwritten)
demo_input = asarray([[2,3],[2,3],[2,3]])
print(demo_input)
# perform a polynomial features transform of the dataset
trans = PolynomialFeatures(degree=2)
demo_features = trans.fit_transform(demo_input)
# each row becomes [1, x1, x2, x1^2, x1*x2, x2^2]
print(demo_features)

##### A polynomial of degree 2 (quadratic polynomial)

In [None]:
X_train

In [None]:
poly_features = PolynomialFeatures(degree=2)

# Expand the features to degree 2. The transformer is fitted on the training split
# only; the test split is merely transformed, so nothing is ever learned from test data.
X_train_quadratic = poly_features.fit_transform(X_train)
x_test_quadratic = poly_features.transform(X_test)

# fit a linear model on the expanded features
quadratic = LinearRegression()
quadratic.fit(X_train_quadratic, y_train)

# predictions on the training set (used to compare train vs. test fit)
y_train_predicted = quadratic.predict(X_train_quadratic)

# predictions on the test set
y_test_predicted = quadratic.predict(x_test_quadratic)

In [None]:
X_train_quadratic

In [None]:
X_train_quadratic[0]

In [None]:
len(X_train_quadratic[0])

In [None]:
metrics.r2_score(y_train, y_train_predicted)

In [None]:
metrics.r2_score(y_test, y_test_predicted)

In [None]:
plt.scatter(y_test, y_test_predicted)
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.title("Prices vs Predicted prices")
plt.show()

##### A polynomial of degree 3 (cubic polynomial)

In [None]:
poly_features = PolynomialFeatures(degree=3)

# Expand the features to degree 3. The transformer is fitted on the training split
# only; the test split is merely transformed, so nothing is ever learned from test data.
X_train_cubic = poly_features.fit_transform(X_train)
x_test_cubic = poly_features.transform(X_test)

# fit a linear model on the expanded features
cubic = LinearRegression()
cubic.fit(X_train_cubic, y_train)

# predictions on the training set (used to compare train vs. test fit)
y_train_predicted = cubic.predict(X_train_cubic)

# predictions on the test set
y_test_predicted = cubic.predict(x_test_cubic)

In [None]:
metrics.r2_score(y_train, y_train_predicted)

In [None]:
metrics.r2_score(y_test, y_test_predicted)

In [None]:
plt.scatter(y_test, y_test_predicted)
plt.xlabel("Prices")
plt.ylabel("Predicted prices")
plt.title("Prices vs Predicted prices")
plt.show()