# LINEAR REGRESSION

## Linear Regression performed on 1-D and multidimensional datasets using:

- Scikit Learn 
- Pure Python

### Installing and importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [None]:
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

## Linear Regression on 1-D dataset of Salary v/s Years Experience

### Using Scikit Learn

#### STEP - 1
- Importing csv file containing our dataset
- Splitting the dataset into training and test set
- Reshaping the dimensions
- Visualizing dataset using graphs

In [None]:
# Load the salary dataset from a CSV file in the working directory.
salary_dataset = pd.read_csv('salary_data.csv')

In [None]:
# Display the loaded DataFrame.
salary_dataset

In [None]:
# Select the feature column and the target column.
X = salary_dataset['YearsExperience']
y = salary_dataset['Salary']

print(X.shape)
print(y.shape)

In [None]:
# Convert both columns to NumPy arrays with a single column each,
# i.e. shape (n_samples, 1).
X = X.values.reshape((-1, 1))
y = y.values.reshape((-1, 1))

print(X.shape)
print(y.shape)

In [None]:
# Hold out 25% of the rows as the test set; random_state fixes the shuffle.
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
# Shapes of the four splits.
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

In [None]:
# Make sure every split is a 2-D column array of shape (n_samples, 1).
# np.asarray accepts both NumPy arrays and pandas Series, so this cell works
# whether or not X and y were already converted to arrays before the split
# (a plain ndarray has no `.values` attribute).
train_x = np.asarray(train_x).reshape((-1, 1))
test_x = np.asarray(test_x).reshape((-1, 1))
train_y = np.asarray(train_y).reshape((-1, 1))
test_y = np.asarray(test_y).reshape((-1, 1))

In [None]:
# Shapes of the four splits after reshaping.
print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

In [None]:
# Scatter plot of the training set only.
%matplotlib inline
plt.xlabel("Experience (in Years)")
plt.ylabel("Salary")
plt.title("Experience v/s Salary")
plt.scatter(train_x, train_y, color="red", marker="o");

In [None]:
# Training and test sets on the same axes, using matplotlib's default colours.
%matplotlib inline
plt.title("Experience v/s Salary")
plt.xlabel("Experience (in Years)")
plt.ylabel("Salary")
plt.scatter(train_x, train_y, marker='+')
plt.scatter(test_x, test_y, marker='+')

In [None]:
# Training set in red, test set in blue.
%matplotlib inline
plt.xlabel("Experience (in Years)")
plt.ylabel("Salary")
plt.title("Experience v/s Salary")
plt.scatter(train_x, train_y, color="red", marker="o")
plt.scatter(test_x, test_y, color="blue", marker="o");

#### STEP-2

- modelling the linear regression model
- fitting the training set
- getting coefficient and intercept

In [None]:
# Create the scikit-learn linear regression estimator.
model = linear_model.LinearRegression()

In [None]:
# Fit the estimator on the training split.
model.fit(train_x, train_y)

In [None]:
# Fitted coefficient(s).
model.coef_

In [None]:
# Fitted intercept.
model.intercept_

In [None]:
# Predict for a single sample; the input is passed as a nested list,
# i.e. one row with one feature.
model.predict([[10.5]])

Reference on why `predict` expects a 2-D array rather than a 1-D one: https://stackoverflow.com/questions/45554008/error-in-python-script-expected-2d-array-got-1d-array-instead

#### STEP-3

- Predicting for test set
- Getting cost, r2 score
- Plotting the predicted result

In [None]:
# Predictions of the fitted estimator for the test split.
y_test_predict = model.predict(test_x)

In [None]:
y_test_predict

In [None]:
# Mean squared error between the true and predicted test targets.
mse = mean_squared_error(test_y, y_test_predict)
print(mse)

In [None]:
# R^2 score on the test split.
r2 = r2_score(test_y, y_test_predict)
print(r2)

In [None]:
# All samples plus the fitted line evaluated on the full feature array X.
plt.xlabel("Experience (in Years)")
plt.ylabel("Salary")
plt.title("Experience v/s Salary")
plt.scatter(X, y, marker='o')
plt.plot(X, model.predict(X), '-r');

In [None]:
# Training (red) and test (blue) points with the test predictions in green.
plt.xlabel("Experience (in Years)")
plt.ylabel("Salary")
plt.title("Experience v/s Salary")
plt.scatter(train_x, train_y, color="red", marker="o")
plt.scatter(test_x, test_y, color="blue", marker="o")
plt.plot(test_x, y_test_predict, '-g');

In [None]:
# Score of the estimator on the full X, y (training and test rows together).
model.score(X, y)

### Using Python

#### STEP-1

- defining functions for forward prop, backprop, cost computations, parameters update and parameters initialization

In [None]:
def parameters_initialization(n_x, m):
    """Build the zero-initialised parameter dictionary of the linear model.

    Args:
        n_x: Number of input features; sets the width of the weight row.
        m: Number of examples. Accepted but not used by this function.

    Returns:
        dict with 'w' (zeros of shape (1, n_x)) and 'b' (zeros of shape (1, 1)).
    """
    return {
        'w': np.zeros((1, n_x)),
        'b': np.zeros((1, 1)),
    }

def compute_cost(y_hat, y):
    """Return the halved mean squared error between predictions and targets.

    Args:
        y_hat: Predicted values, broadcast-compatible with ``y``.
        y: Target values; the example count is read from ``y.shape[1]``.

    Returns:
        ``(1 / (2 * m)) * sum((y_hat - y) ** 2)`` with ``m = y.shape[1]``.
    """
    num_examples = y.shape[1]
    squared_errors = np.square(y_hat - y)
    scale = 1/(2*num_examples)

    return scale*np.sum(squared_errors)

def compute_gradients(X, y, y_hat):
    """Compute the gradients of the cost w.r.t. the weights and the bias.

    Args:
        X: Input matrix; its transpose is multiplied with the residuals.
        y: Target values; the example count is read from ``y.shape[1]``.
        y_hat: Predicted values for ``X``.

    Returns:
        dict with 'dw' (residuals dotted with ``X.T``, divided by m) and
        'db' (row-wise residual sum divided by m, dimensions kept).
    """
    num_examples = y.shape[1]
    residuals = y_hat - y

    weight_grad = (1/num_examples)*np.dot(residuals, X.T)
    # keepdims=True keeps 'db' two-dimensional.
    bias_grad = (1/num_examples)*np.sum(residuals, axis=1, keepdims=True)

    return {'dw': weight_grad, 'db': bias_grad}

def update_parameters(grads, parameters, learning_rate):
    """Apply one gradient-descent step to the parameters, in place.

    Args:
        grads: dict holding the gradients under 'dw' and 'db'.
        parameters: dict holding 'w' and 'b'; both entries are updated in place.
        learning_rate: Step size multiplied with each gradient.

    Returns:
        The same ``parameters`` dict that was passed in.
    """
    for param_key, grad_key in (('w', 'dw'), ('b', 'db')):
        step = learning_rate*grads[grad_key]
        parameters[param_key] -= step

    return parameters

#### STEP-2

- defining the `model` function, which runs gradient descent, plots the cost and returns the updated parameters
- defining the `predict` function, which predicts the output for new datasets or the test set

In [None]:
def model(X, y, num_iterations, learning_rate):
    """Fit a linear model with batch gradient descent and plot the cost curve.

    Args:
        X: Input matrix of shape (n_x, m): features along axis 0, examples
            along axis 1.
        y: Target values, passed on to ``compute_cost`` and
            ``compute_gradients``.
        num_iterations: Number of gradient-descent steps to run.
        learning_rate: Step size used for every parameter update.

    Returns:
        The parameters dict (keys 'w' and 'b') after the final update.
    """
    n_x = X.shape[0]
    m = X.shape[1]
    # The cost is sampled every 1000 iterations; the iteration number is
    # stored alongside so the plot's x-axis really shows iterations.
    sampled_iterations = []
    costs = []

    parameters = parameters_initialization(n_x, m)
    for i in range(num_iterations):
        # Forward pass: linear prediction for every example.
        y_hat = np.dot(parameters['w'], X) + parameters['b']
        cost = compute_cost(y_hat, y)
        gradients = compute_gradients(X, y, y_hat)
        # update_parameters modifies `parameters` in place.
        update_parameters(gradients, parameters, learning_rate)

        if i % 1000 == 0:
            sampled_iterations.append(i)
            costs.append(cost)

    plt.plot(sampled_iterations, costs)
    plt.ylabel("Cost")
    plt.xlabel("Num Iterations")
    plt.title("Cost v/s Num Iterations")
    plt.show()

    return parameters

In [None]:
def predict(X, y, parameters):
    """Predict with the linear model and measure the cost against ``y``.

    Args:
        X: Input matrix multiplied from the left by ``parameters['w']``.
        y: Target values the predictions are compared with.
        parameters: dict holding the weights 'w' and the bias 'b'.

    Returns:
        Tuple ``(y_predict, cost)``: the predictions and the value of
        ``compute_cost`` for them.
    """
    weights, bias = parameters['w'], parameters['b']
    y_predict = np.dot(weights, X) + bias

    return y_predict, compute_cost(y_predict, y)

#### STEP-3

- defining the linear regression function that takes the inputs, performs GD optimization and predictions, and plots the results

In [None]:
def LinearRegression(train_x, test_x, train_y, test_y, num_iterations=7500, learning_rate=0.01):
    """Train the gradient-descent model, report the test cost and plot the fit.

    Args:
        train_x: Training inputs, passed to ``model``.
        test_x: Test inputs, passed to ``predict``.
        train_y: Training targets.
        test_y: Test targets.
        num_iterations: Number of gradient-descent steps (default 7500).
        learning_rate: Gradient-descent step size (default 0.01).

    Returns:
        The fitted parameters dict (keys 'w' and 'b') so callers can reuse
        the trained model instead of only seeing the printed values.
    """
    parameters = model(train_x, train_y, num_iterations, learning_rate)
    print("Coef: ", parameters['w'])
    print("Intercept: ", parameters['b'])
    # Only the cost on the test split is reported here.
    _, test_cost = predict(test_x, test_y, parameters)
    print("Cost:", test_cost)

    # Join the splits side by side again so every sample appears in the plot.
    X = np.hstack((train_x, test_x))
    y = np.hstack((train_y, test_y))
    pred, _ = predict(X, y, parameters)

    plt.xlabel("Experience (in Years)")
    plt.ylabel("Salary")
    plt.title("Experience v/s Salary")
    plt.scatter(X, y, marker='o')
    plt.scatter(X, pred)
    plt.show()

    return parameters

In [None]:
# Reload the salary data and pick the feature and target columns.
salary_dataset = pd.read_csv("salary_data.csv")
X = salary_dataset['YearsExperience']
y = salary_dataset['Salary']

# Split 75/25 and turn every split into a row vector of shape (1, n_samples),
# the layout used by the gradient-descent functions.
train_x, test_x, train_y, test_y = [
    split.values.reshape((1, -1))
    for split in train_test_split(X, y, test_size=0.25, random_state=42)
]

LinearRegression(train_x, test_x, train_y, test_y, 5000, 0.03)

## Linear Regression on multidimensional dataset

### Using Scikit Learn

### STEP-1
- Loading and cleaning the dataset

In [None]:
# Load the startups dataset from a CSV file in the working directory.
df = pd.read_csv("50_startups.csv")

In [None]:
# First rows of the dataset.
df.head()

In [None]:
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Number of rows per value of the State column.
df.State.value_counts()

In [None]:
# Every column except the last is a feature; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

In [None]:
X

In [None]:
# Ten randomly chosen target values.
y.sample(10)

In [None]:
# One-hot encode the State column; drop_first=True omits the first category.
states = pd.get_dummies(X['State'], drop_first=True)

In [None]:
# Reassign instead of dropping in place: X was sliced out of df, and mutating
# such a slice in place can trigger pandas' SettingWithCopyWarning.
X = X.drop('State', axis=1)

In [None]:
X.head()

In [None]:
# Append the one-hot encoded state columns to the feature frame.
X = pd.concat([X, states], axis=1)

In [None]:
# Ten randomly chosen feature rows.
X.sample(10)

In [None]:
print(X.shape)
print(y.shape)

In [None]:
# Per-column mean and standard deviation of the features.
# NOTE(review): these are computed on the full X, before the train/test split
# that follows later in the notebook.
means = X.mean()
print(means)

stddvs = X.std()
print(stddvs)

In [None]:
# Standardise the three spend columns: subtract the column mean and divide by
# the column standard deviation. The dummy state columns are left untouched.
X[['R&D Spend', 'Administration', 'Marketing Spend']] = (X[['R&D Spend', 'Administration', 'Marketing Spend']] - means[['R&D Spend', 'Administration', 'Marketing Spend']])/stddvs[['R&D Spend', 'Administration', 'Marketing Spend']]

In [None]:
X

#### STEP-2
- dividing dataset to train and test set


In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [None]:
# Shapes of the four splits.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

#### STEP-3
- Model the dataset
- Predict on test set
- get coefficient and intercept
- Calculate cost and r2 score

In [None]:
# Importing the class directly rebinds the name `LinearRegression`, which
# earlier in this notebook referred to the hand-written function.
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [None]:
# Fit the estimator on the training split.
model.fit(X_train, y_train)

In [None]:
# Predictions for the test split.
y_pred = model.predict(X_test)

In [None]:
y_pred

In [None]:
# Mean squared error and R^2 score on the test split.
mse = mean_squared_error(y_test, y_pred)
print("MSE: ", mse)

r2 = r2_score(y_test, y_pred)
print("r2 score: ", r2)

In [None]:
# Fitted coefficients.
model.coef_

In [None]:
# Fitted intercept.
model.intercept_

In [None]:
# Score of the estimator on the full X, y (training and test rows together).
model.score(X, y)

### Using Python

##### As this is multiple linear regression, the fit is not plotted, because the data has more than two dimensions.

#### STEP-1

- defining functions for forward prop, backprop, cost computations, parameters update and parameters initialization

In [None]:
def parameters_initialization(n_x, m):
    """Create all-zero weights and bias for a linear model.

    Args:
        n_x: Number of input features (columns of the weight row vector).
        m: Number of examples. Accepted but not used by this function.

    Returns:
        dict mapping 'w' to a (1, n_x) zero array and 'b' to a (1, 1) zero array.
    """
    weights = np.zeros((1, n_x))
    bias = np.zeros((1, 1))

    return dict(w=weights, b=bias)

def compute_cost(y_hat, y):
    """Compute half the mean squared error of the predictions.

    Args:
        y_hat: Predicted values, broadcast-compatible with ``y``.
        y: Target values; ``y.shape[1]`` is taken as the number of examples.

    Returns:
        ``(1 / (2 * m)) * sum((y_hat - y) ** 2)`` with ``m = y.shape[1]``.
    """
    sample_count = y.shape[1]
    error = y_hat - y
    total_squared_error = np.sum(np.square(error))

    return (1/(2*sample_count))*total_squared_error

def compute_gradients(X, y, y_hat):
    """Compute the cost gradients for the weights and the bias.

    Args:
        X: Input matrix; ``X.shape[1]`` is taken as the number of examples.
        y: Target values.
        y_hat: Predicted values for ``X``.

    Returns:
        dict with 'dw' (residuals dotted with ``X.T``, divided by m) and
        'db' (sum of all residuals divided by m, reduced to a scalar).
    """
    sample_count = X.shape[1]
    error = y_hat - y
    inv_m = 1/sample_count

    return {
        'dw': inv_m*np.dot(error, X.T),
        # Summing over every axis yields a scalar bias gradient.
        'db': inv_m*np.sum(error),
    }

def update_parameters(parameters, gradients, learning_rate):
    """Take one gradient-descent step, modifying the parameters in place.

    Args:
        parameters: dict holding 'w' and 'b'; both entries are updated in place.
        gradients: dict holding the gradients under 'dw' and 'db'.
        learning_rate: Step size multiplied with each gradient.

    Returns:
        The same ``parameters`` dict that was passed in.
    """
    weight_step = learning_rate*gradients['dw']
    parameters['w'] -= weight_step

    bias_step = learning_rate*gradients['db']
    parameters['b'] -= bias_step

    return parameters

#### STEP-2

- defining the `model` function, which runs gradient descent, plots the cost and returns the updated parameters
- defining the `predict` function, which predicts the output for new datasets or the test set

In [None]:
def model(X, y, num_iterations, learning_rate):
    """Fit a linear model with batch gradient descent and plot the cost curve.

    Args:
        X: Input matrix of shape (n_x, m): features along axis 0, examples
            along axis 1.
        y: Target values, passed on to ``compute_cost`` and
            ``compute_gradients``.
        num_iterations: Number of gradient-descent steps to run.
        learning_rate: Step size used for every parameter update.

    Returns:
        The parameters dict (keys 'w' and 'b') after the final update.
    """
    n_x = X.shape[0]
    m = X.shape[1]
    # The cost is sampled every 1000 iterations; the iteration number is
    # stored alongside so the plot's x-axis really shows iterations.
    sampled_iterations = []
    costs = []

    parameters = parameters_initialization(n_x, m)

    for i in range(num_iterations):
        # Forward pass: linear prediction for every example.
        y_hat = np.dot(parameters['w'], X) + parameters['b']
        cost = compute_cost(y_hat, y)
        gradients = compute_gradients(X, y, y_hat)
        parameters = update_parameters(parameters, gradients, learning_rate)

        if i % 1000 == 0:
            sampled_iterations.append(i)
            costs.append(cost)

    plt.plot(sampled_iterations, costs)
    plt.xlabel("Num Iterations")
    plt.ylabel("Cost")
    plt.title("Cost v/s Num Iterations")
    plt.show()

    return parameters

In [None]:
def predict(X, y, parameters):
    """Evaluate the linear model on ``X`` and score it against ``y``.

    Args:
        X: Input matrix multiplied from the left by ``parameters['w']``.
        y: Target values the predictions are compared with.
        parameters: dict holding the weights 'w' and the bias 'b'.

    Returns:
        Tuple ``(y_predict, cost)``: the predictions and the value of
        ``compute_cost`` for them.
    """
    linear_part = np.dot(parameters['w'], X)
    y_predict = linear_part + parameters['b']
    cost = compute_cost(y_predict, y)

    return (y_predict, cost)

#### STEP-3

- defining the linear regression function that takes the inputs and performs GD optimization and predictions

In [None]:
def LinearRegression(train_x, test_x, train_y, test_y, num_iterations=7500, learning_rate=0.01):
    """Train the gradient-descent model and report its cost on the test split.

    Args:
        train_x: Training inputs, passed to ``model``.
        test_x: Test inputs, passed to ``predict``.
        train_y: Training targets.
        test_y: Test targets.
        num_iterations: Number of gradient-descent steps (default 7500).
        learning_rate: Gradient-descent step size (default 0.01).

    Returns:
        The fitted parameters dict (keys 'w' and 'b') so callers can reuse
        the trained model instead of only seeing the printed values.
    """
    parameters = model(train_x, train_y, num_iterations, learning_rate)
    print("Coef:", parameters['w'])
    print("Intercept:", parameters['b'])
    # Only the cost on the test split is reported; the predictions themselves
    # are not needed because nothing is plotted for the multi-feature case.
    _, test_cost = predict(test_x, test_y, parameters)
    print("Cost on test set:", test_cost)

    return parameters

In [None]:
X

In [None]:
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)

print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

train_x = train_x.T
test_x = test_x.T
train_y = train_y.values.reshape((1, -1))
test_y = test_y.values.reshape((1, -1))

print(train_x.shape)
print(test_x.shape)
print(train_y.shape)
print(test_y.shape)

In [None]:
# Train the gradient-descent model for 5000 iterations with learning rate 0.03.
LinearRegression(train_x, test_x, train_y, test_y, 5000, 0.03)