In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
%matplotlib inline

In [3]:
df = pd.read_csv("exams.csv")

In [4]:
df.head()

Unnamed: 0,exam_1,exam_2,admitted
0,34.62366,78.024693,0
1,30.286711,43.894998,0
2,35.847409,72.902198,0
3,60.182599,86.308552,1
4,79.032736,75.344376,1


In [5]:
df.rename({0:"First Exam Score", 1:"Second Exam Score", 2:"Admission"}, axis="columns", inplace=True)

In [6]:
df.head()

Unnamed: 0,exam_1,exam_2,admitted
0,34.62366,78.024693,0
1,30.286711,43.894998,0
2,35.847409,72.902198,0
3,60.182599,86.308552,1
4,79.032736,75.344376,1


In [7]:
plot = sns.FacetGrid(df, hue="Admission", hue_kws={"marker": ["x", "o"]}, size=5)
plot.map(plt.scatter, "First Exam Score", "Second Exam Score", s=100, linewidth=.5, edgecolor="black")
plot.add_legend()
plt.show()



KeyError: 'Admission'

In [None]:
# initialize θ0, θ1, θ2 as 0
theta = np.zeros(3)
theta

In [None]:
# create a new column for df with X1 = 1
df["x1"] = 1

In [None]:
df = df[["x1", "First Exam Score", "Second Exam Score", "Admission"]]

In [None]:
df.head()

In [None]:
# sepeate the x and y column
x = df.iloc[:, :3]
y = df.iloc[:, 3]

In [None]:
x.head()

In [None]:
y = y.to_frame()

In [None]:
y.head()

In [None]:
# create a sigmoid function to train the x column
def sigmoid(z):
    return 1 / (1+ np.exp(-z))

In [None]:
# convert x,y column to array
x = np.array(x.values)
y = np.array(y.values)

In [None]:
result = sigmoid(np.matrix(x) * np.matrix(theta).T)
result

In [None]:
# create a function to calculate the cost function of the logistic regression
# the cost function for logistic regression is -log(hθ(x)) if y = 1, -log(1-hθ(x)) if y = 0
# the simplify version of the cost function is: -ylog(hθ(x)) - (1-y)log(1-hθ(x))
def costfunction(theta, x, y):
    x = np.matrix(x)
    y = np.matrix(y)
    theta = np.matrix(theta)
    return np.sum(np.multiply(-y, np.log(sigmoid(x * theta.T))) - np.multiply((1 - y), np.log(1 - sigmoid(x * theta.T)))) / (len(x))

In [None]:
costfunction(theta, x, y)

In [None]:
# test with different parameter
costfunction(np.matrix([-24,0.2,0.2]),x,y)

In [None]:
# create a gradient function to calculate the gradient of the logistic regression
def gradient(theta, x, y):
    theta = np.matrix(theta)
    x = np.matrix(x)
    y = np.matrix(y)
    grad = np.zeros(3)
    shape = int(theta.ravel().shape[1])
    error = sigmoid(x * theta.T) - y
    return (x.T * error) / len(x)

In [None]:
gradient(theta, x, y)

In [None]:
# create a gradientDescrnt function to calculate gradien descent
def gradientDescent(x, y, theta, alpha, num_iter):
    x = np.matrix(x)
    y = np.matrix(y)
    theta = np.matrix(theta)
    theta = theta.T
    for i in range(num_iter):
        h = sigmoid(x * theta)
        theta = theta - ((alpha/len(x)) * (x.T * (h - y)))
    return theta.T

In [None]:
a = gradientDescent(x,y,theta,0.001, 1000000)

In [None]:
costfunction(a,x,y)

In [None]:
a

In [None]:
# create a debug function to see if the gradient descent function is working properly
def debug(x,y,theta,alpha, num_iter):
    cost = []
    for i in range(num_iter):
        cost.append(costfunction(gradientDescent(x, y, theta, alpha, i),x, y))
    return cost

In [None]:
y_axis = debug(x,y,theta,0.001, 1000)

In [None]:
x_axis = np.arange(1000)

In [None]:
plt.plot(x_axis, y_axis)
plt.title("J(θ) vs num_iter")
plt.xlabel("num_iter")
plt.ylabel("J(θ)")
plt.show()
# the graph show the cost function decrease after iterated 1000 times with alpha = 0.001

In [None]:
# after iterated for 1000000 time with alpha as 0.001, the cost function is 0.224, which is much smaller than the original cost 0.69

In [None]:
import scipy.optimize as opt
result = opt.fmin_tnc(func=costfunction, x0=theta, fprime=gradient, args=(x, y))

In [None]:
result

In [None]:
# as we can see from the above output, the parameter that will minimized the cost function is θ1 = -25.16131848, θ2 = 0.20623159, θ3 = 0.20147148.

In [None]:
minimized_theta = np.array([-25.16131848, 0.20623159, 0.20147148])

In [None]:
result = sigmoid(np.matrix(x) * np.matrix(minimized_theta).T)

In [None]:
# craete a list that contain the value of the result
g = []
for i in result:
    if i >= 0.5:
        g.append(1)
    else:
        g.append(0)

In [None]:
# create a list that contain the value of the y axis
u = []
for i in y:
    u.append(i[0])

In [None]:
result = g
y_axis = u

In [None]:
#create a function to test the accuracy of the minimized_theta
def acurracy(result, y_axis):
    total = 0
    i = 0
    if len(result) == len(y_axis):
        length = len(result)
    while i < length:
        if result[i] == y_axis[i]:
            total += 1
        i += 1
    return total/length

In [None]:
acurracy(result, y_axis)
# as we see from the result, the accuracy of this model with the dataset is 89%, which is not bad.

In [None]:
a = np.arange(30, 101)

In [None]:
# the decision boundary formula for this dataset is θ1X1 + θ2Χ2 + θ3Χ3 >= 0
# which in this case is -25.16131848X1 + 0.20623159X2 + 0.20147148X2 >= 0
# with a little algebra, the formula become X3 = (25.16131848 - 0.20623159X1)/0.20147148
def formula(a):
    lis = []
    for i in a:
        lis.append((25.16131848-0.20623159*i)/0.20147148)
    return lis

In [None]:
Y_AXIS = formula(a)

In [None]:
X_AXIS = list(a)

In [None]:
# make a graph with decision boundary
plot = sns.FacetGrid(df, hue="Admission", hue_kws={"marker": ["x", "o"]}, size=5)
plot.map(plt.scatter, "First Exam Score", "Second Exam Score", s=100, linewidth=.5, edgecolor="black")
plot.add_legend()
plt.plot(X_AXIS,Y_AXIS)
plt.show()