In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr

Let's explore this airline dataset:

In [2]:
# Read the airline table from a CSV file; the path is relative to the
# directory the notebook kernel is running in.
df = pd.read_csv('data/airlines.csv')
# Show the first few rows as a quick sanity check of the columns.
df.head()

Unnamed: 0,Airline,Mishandled baggage (per 1000 passengers),Percentage On Time Arrivals
0,American West,4.36,81.9
1,United,4.0,80.9
2,Southwest,4.42,78.4
3,US Airways,7.16,78.3
4,Continental,4.62,75.7


View the scatterplot:

In [None]:
# Scatter plot of on-time arrival percentage against mishandled-baggage rate,
# one point per airline.
X = df['Mishandled baggage (per 1000 passengers)']
Y = df['Percentage On Time Arrivals']

# Use the explicit Axes interface so the figure is self-contained and the
# labels are attached to a known Axes rather than pyplot's implicit state.
fig, ax = plt.subplots()
ax.plot(X, Y, '.')
ax.set_xlabel('Mishandled baggage (per 1000 passengers)')
ax.set_ylabel('Percentage On Time Arrivals')
# The trailing semicolon keeps the Text(...) repr out of the cell output.
ax.set_title('On-time arrivals vs. mishandled baggage');

Let's run a linear regression model:

In [None]:
# Response variable: the on-time arrival percentage.
y = df['Percentage On Time Arrivals']
# Predictors: every column of df that is neither the airline name nor the
# response itself.
X = df.drop(columns=['Airline', 'Percentage On Time Arrivals'])
# Build an ordinary least-squares model and fit it to the full dataset in one
# step (fit returns the fitted estimator).
model = LinearRegression().fit(X, y)
# Report the fitted intercept ...
print('b: ', model.intercept_)
# ... and the fitted coefficient(s).
print('m: ', model.coef_)

Watch these videos by Andrew Ng on Gradient Descent.

https://www.coursera.org/learn/machine-learning/lecture/8SpIM/gradient-descent

https://www.coursera.org/learn/machine-learning/lecture/GFFPB/gradient-descent-intuition


Finally, watch the following video, paying special attention to the formulas explained between 0:00 and 4:30:

https://www.coursera.org/lecture/machine-learning/gradient-descent-for-linear-regression-kCvQc

<img src="images/gradient1.png" width=500>
<img src="images/gradient2.png" width=500>
<img src="images/simultaneous.png" width=700>

### Exercise: Gradient Descent
Write a function called gradient_descent that takes in an input array, X, and output array, Y, a learning rate, alpha, and a tolerance, tol, and prints the $\theta_0$ and $\theta_1$ values obtained by the algorithm above. 


For example, if
```
X = df['Mishandled baggage (per 1000 passengers)']
Y = df['Percentage On Time Arrivals']
```
then 
```gradient_descent(X,Y,0.05,0.0001)```
should give us something close (but not exactly equal) to the values we found above:

b (theta0) = 87.20288782505382 and m (theta1) = -2.15244604.


Hint: you may want to use an outer while loop that deals with the tolerance and updates the $\theta_0$ and $\theta_1$ values and an inner for loop that updates the $\frac{\partial}{\partial \theta_0}J(\theta_0, \theta_1)$ and $\frac{\partial}{\partial \theta_1}J(\theta_0, \theta_1)$ values.

In [3]:
def gradient_descent(X, Y, alpha=0.05, tol=0.0000001, max_iter=None):
    """Fit ``y = theta0 + theta1 * x`` by batch gradient descent.

    Both parameters start at 100 and are updated simultaneously using the
    gradient of the mean squared-error cost. Iteration stops as soon as the
    change in ``theta1`` between two consecutive iterates is no larger than
    ``tol`` (or is NaN, which happens after a diverging run overflows).

    Parameters
    ----------
    X : 1-D array-like of numbers
        Predictor values. Values are used positionally, so a pandas Series
        with a non-default index is handled correctly.
    Y : 1-D array-like of numbers
        Response values, same length as ``X``.
    alpha : float, default 0.05
        Learning rate (step size).
    tol : float, default 1e-7
        Convergence threshold on ``|theta1_new - theta1_old|``.
    max_iter : int or None, default None
        Optional cap on the number of loop iterations. ``None`` means the
        loop runs until the tolerance criterion is met.

    Returns
    -------
    (theta0, theta1, count) : tuple of (float, float, int)
        The most recent intercept and slope, and the number of loop
        iterations executed after the initial update step.

    Raises
    ------
    ValueError
        If ``X`` and ``Y`` are not 1-D sequences of equal, non-zero length.
    """
    # Convert once to float arrays: this makes indexing positional and lets
    # each gradient be computed as a single vectorised expression.
    x_arr = np.asarray(X, dtype=float)
    y_arr = np.asarray(Y, dtype=float)
    if x_arr.ndim != 1 or x_arr.shape != y_arr.shape:
        raise ValueError("X and Y must be one-dimensional and of equal length")
    if x_arr.size == 0:
        raise ValueError("X and Y must not be empty")

    theta0 = 100.0
    theta1 = 100.0
    count = 0
    while True:
        # Residuals of the current line; both partial derivatives reuse them.
        residual = theta0 + theta1 * x_arr - y_arr
        # Simultaneous update: both new values come from the *old* thetas.
        newtheta0 = theta0 - alpha * residual.mean()
        newtheta1 = theta1 - alpha * (residual * x_arr).mean()

        # Written as "not (> tol)" so that a NaN difference ends the loop
        # instead of spinning forever.
        if not abs(newtheta1 - theta1) > tol:
            break
        if max_iter is not None and count >= max_iter:
            break

        theta0, theta1 = newtheta0, newtheta1
        count += 1
    return float(newtheta0), float(newtheta1), count

# Predictor: mishandled-baggage rate for each airline.
x = df['Mishandled baggage (per 1000 passengers)']
# Response: percentage of on-time arrivals for each airline.
y = df['Percentage On Time Arrivals']

# Run gradient descent with the function's default learning rate and
# tolerance; the returned tuple is displayed as the cell output.
gradient_descent(x,y)

(87.20264410238664, -2.1524056672134826, 3931)

$ \theta_j := \theta_j - \alpha \left(h_{\theta}(x^{(i)}) - y^{(i)}\right) x_j^{(i)} $
```

```

In [85]:
import math

# Step-by-step gradient descent with a small learning rate, printing the
# change in Theta1 at every iteration so the convergence can be watched.
alpha = 0.001  # learning rate
x = df['Mishandled baggage (per 1000 passengers)']  # predictor values
y = df['Percentage On Time Arrivals']  # response values
m = len(y)  # number of observations
# Starting point of the descent.
Theta0 = 100
Theta1 = 100


def gradientx(Theta0, Theta1):
    """Partial derivative of the cost with respect to Theta0 (mean residual)."""
    ans = 0
    for i in range(0, m):
        # Keep this exact left-to-right summation order: changing it would
        # alter the floating-point rounding and hence the printed trace.
        ans = ans + Theta0 + Theta1 * x[i] - y[i]
    ans = ans / m
    return ans


def gradienty(Theta0, Theta1):
    """Partial derivative of the cost with respect to Theta1 (mean of residual * x)."""
    ans = 0
    for i in range(0, m):
        ans = ans + (Theta0 + Theta1 * x[i] - y[i]) * x[i]
    ans = ans / m
    return ans


# First simultaneous update: (nowTheta0, nowTheta1) is always one step ahead
# of (Theta0, Theta1).
nowTheta0 = Theta0 - alpha * gradientx(Theta0, Theta1)
nowTheta1 = Theta1 - alpha * gradienty(Theta0, Theta1)
while math.fabs(nowTheta1 - Theta1) > 0.0000001:
    # Compute the next iterate from the current lead iterate.
    nowa = nowTheta0 - alpha * gradientx(nowTheta0, nowTheta1)
    nowb = nowTheta1 - alpha * gradienty(nowTheta0, nowTheta1)
    # The trailing iterate simply becomes the previous lead iterate; there is
    # no need to recompute it with two more gradient passes.
    Theta0, Theta1 = nowTheta0, nowTheta1
    nowTheta0, nowTheta1 = nowa, nowb
    print(abs(nowTheta1 - Theta1))

print(nowTheta0, nowTheta1)
# Expected to be close to 87.20 and -2.15
#87.20 -2.15

3.488606764067214
3.3642565279255905
3.2443387564485846
3.1286954544134318
3.0171742583378176
2.909628235736406
2.805915691532988
2.7058999813729514
2.6094493315903264
2.5164366655920674
2.4267394364308217
2.3402394653457463
2.256822786058457
2.17637949461907
2.0988036046044627
2.0239929074780747
1.9518488379270522
1.8822763439995853
1.81518376187109
1.7504826950744388
1.6880878980349507
1.627917163756841
1.56989121551306
1.513933602395852
1.4599705985903952
1.4079311062388769
1.3577465617669446
1.3093508455491474
1.2626801947943918
1.2176731195365456
1.1742703216196055
1.1324146165706033
1.0920508582573483
1.0531258662318024
1.015588355663212
0.9793888697688615
0.9444797146532657
0.9108148964700646
0.8783500608237489
0.8470424343314384
0.8168507682676811
0.7877352842180194
0.759657621669767
0.732580787470873
0.7064691070903599
0.6812881776160715
0.6570048224278011
0.6335870474861256
0.6110039991792906
0.5892259236726769
0.5682241277072304
0.5479709407952456
0.5284396787636734
0.509604

In [6]:
import math

# Same step-by-step gradient descent with a larger learning rate, printing
# the change in Theta1 together with the iteration counter.
alpha = 0.05  # learning rate
x = df['Mishandled baggage (per 1000 passengers)']  # predictor values
y = df['Percentage On Time Arrivals']  # response values
m = len(y)  # number of observations
# Starting point of the descent.
Theta0 = 100
Theta1 = 100
count = 0  # number of loop iterations executed so far


def gradientx(Theta0, Theta1):
    """Partial derivative of the cost with respect to Theta0 (mean residual)."""
    ans = 0
    for i in range(0, m):
        # Keep this exact left-to-right summation order: changing it would
        # alter the floating-point rounding and hence the printed trace.
        ans = ans + Theta0 + Theta1 * x[i] - y[i]
    ans = ans / m
    return ans


def gradienty(Theta0, Theta1):
    """Partial derivative of the cost with respect to Theta1 (mean of residual * x)."""
    ans = 0
    for i in range(0, m):
        ans = ans + (Theta0 + Theta1 * x[i] - y[i]) * x[i]
    ans = ans / m
    return ans


# First simultaneous update: (nowTheta0, nowTheta1) is always one step ahead
# of (Theta0, Theta1).
nowTheta0 = Theta0 - alpha * gradientx(Theta0, Theta1)
nowTheta1 = Theta1 - alpha * gradienty(Theta0, Theta1)
while math.fabs(nowTheta1 - Theta1) > 0.0000001:
    # Compute the next iterate from the current lead iterate.
    nowa = nowTheta0 - alpha * gradientx(nowTheta0, nowTheta1)
    nowb = nowTheta1 - alpha * gradienty(nowTheta0, nowTheta1)
    # The trailing iterate simply becomes the previous lead iterate; there is
    # no need to recompute it with two more gradient passes.
    Theta0, Theta1 = nowTheta0, nowTheta1
    nowTheta0, nowTheta1 = nowa, nowb
    count += 1
    print(abs(nowTheta1 - Theta1), count)

print(nowTheta0, nowTheta1)
# Expected to be close to 87.20 and -2.15
#87.20 -2.15

141.48867483197247 1
110.68245923252857 2
86.57847383787836 3
67.72892790795373 4
52.978078684359474 5
41.444997027717186 6
32.41748212743475 7
25.361451579571266 8
19.836139280638022 9
15.519679196566727 10
12.137424736979533 11
9.497343962787827 12
7.426465686079718 13
5.812181199167181 14
4.543760252341026 15
3.5571703044563985 16
2.7797913683929574 17
2.177291394049961 18
1.7003946059994979 19
1.3329192467467088 20
1.0398980604322574 21
0.8162319554568858 22
0.6357329988342495 23
0.5000600922943184 24
0.3884211081999025 25
0.30658672438068724 26
0.23709007575397 27
0.18819447113368315 28
0.14449119524158238 29
0.11574552344434053 30
0.08783140931462396 31
0.07141000678815312 32
0.05316329497521477 33
0.044277546086415764 34
0.03195220920928299 35
0.027671914612145798 36
0.018975672494069284 37
0.017507826270071858 38
0.011037969192482233 39
0.011285431468317064 40
0.006183591724386872 41
0.007475033645872875 42
0.003215934932933928 43
0.005140587833255994 44
0.0014027777496390392 4