# Regression Analysis

In this notebook, we'll explore how regression models work in a practical way. 

But we will not use pre-defined functions here. Instead, let's dive into the calculational steps of the process!! 

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mp
import sympy as sym

In [2]:
df = pd.read_csv("Employee.csv")

In [3]:
df

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2017,Bangalore,3,34,Male,No,0,0
1,Bachelors,2013,Pune,1,28,Female,No,3,1
2,Bachelors,2014,New Delhi,3,38,Female,No,2,0
3,Masters,2016,Bangalore,3,27,Male,No,5,1
4,Masters,2017,Pune,3,24,Male,Yes,2,1
...,...,...,...,...,...,...,...,...,...
4648,Bachelors,2013,Bangalore,3,26,Female,No,4,0
4649,Masters,2013,Pune,2,37,Male,No,2,1
4650,Masters,2018,New Delhi,3,27,Male,No,5,1
4651,Bachelors,2012,Bangalore,3,30,Male,Yes,2,0


In [4]:
df.shape

(4653, 9)

In [5]:
df.columns


Index(['Education', 'JoiningYear', 'City', 'PaymentTier', 'Age', 'Gender',
       'EverBenched', 'ExperienceInCurrentDomain', 'LeaveOrNot'],
      dtype='object')

In [6]:
df.isna().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

#  Linear Regression

In [7]:
### 1. Define cost function for one feature 
### 2. Write Optimization algorithm 
## 3. Scale up and write for multiple features

In [14]:
def cost_function(data, weight, bias):
    error_sum = 0
    for x in range(len(data)):
        error_sum += (weight*data[x][0] + bias-data[x][1])**2
    mean_sq_error = error_sum / (2 * len(data))
    return mean_sq_error

In [15]:
def gradient_descent(data, weight, bias):
    learning_rate = 1
    weight_sym = sym.Symbol('weight')
    bias_sym = sym.Symbol('bias')
    for i in range(10):
        weight -= learning_rate*sym.diff(cost_function(data,weight_sym,bias_sym),weight_sym)
        bias -= learning_rate*sym.diff(cost_function(data,weight_sym,bias_sym),bias_sym)
        learning_rate = learning_rate * 0.999
    return (weight.evalf(),bias.evalf())
    

In [10]:
array = [[x, y] for x, y in zip(df['JoiningYear'], df['LeaveOrNot'])]

In [16]:
print(gradient_descent(array,1,1))

(-20060.1932525129*bias - 40422587.1551976*weight + 6900.58265645255, -9.95511979025179*bias - 20060.1932525129*weight + 4.42320904027571)


In [11]:
print(array)

[[2017, 0], [2013, 1], [2014, 0], [2016, 1], [2017, 1], [2016, 0], [2015, 0], [2016, 1], [2016, 0], [2017, 0], [2012, 1], [2016, 0], [2018, 1], [2016, 0], [2012, 0], [2017, 0], [2014, 0], [2014, 0], [2015, 1], [2016, 1], [2012, 0], [2017, 0], [2017, 0], [2015, 0], [2017, 0], [2013, 0], [2016, 0], [2015, 0], [2013, 1], [2017, 0], [2013, 0], [2014, 0], [2018, 1], [2016, 0], [2017, 1], [2014, 0], [2012, 0], [2018, 1], [2017, 1], [2014, 0], [2015, 0], [2017, 0], [2013, 1], [2014, 0], [2015, 0], [2014, 0], [2012, 0], [2013, 0], [2017, 0], [2013, 0], [2017, 0], [2015, 1], [2016, 0], [2014, 0], [2017, 0], [2013, 0], [2016, 0], [2014, 0], [2013, 0], [2017, 1], [2013, 0], [2015, 0], [2013, 1], [2014, 1], [2016, 0], [2018, 1], [2013, 0], [2014, 0], [2015, 0], [2017, 1], [2018, 1], [2017, 0], [2014, 0], [2014, 0], [2017, 1], [2015, 1], [2014, 0], [2017, 0], [2012, 0], [2015, 0], [2017, 0], [2013, 1], [2016, 0], [2017, 0], [2015, 1], [2016, 1], [2017, 1], [2015, 0], [2014, 0], [2012, 0], [2018, 1]