Extra Credit: Create a function in python that performs 25 iterations of gradient descent on f.  The function should return the estimated x-value and its corresponding y-value. (f(x) = x^2 - 4x + 5).

In [1]:
#how to SYMBOLICALLY differentiate in python 
import sympy as sp
x = sp.Symbol("x") #telling what to differentiate with respect to. "Symbol" HAS to be capitalized
sp.diff(x**2 - 4*x + 5, x) 

2*x - 4

In [2]:
import numpy as np
from scipy.misc import derivative

In [3]:
def f(x):
    return x**2 - 4*x + 5

In [4]:
def gradient_descent(f, iteration_num, learning_rate):
    x1 = np.random.randint(0,1000)
    for i in range(iteration_num):
        x1 -= (learning_rate)*(derivative(f,x1))
    return x1, f(x1)

In [5]:
gradient_descent(f,25,0.2)

(2.0014613168047384, 1.0000021354468034)

In [6]:
#Gradient Descent

In [7]:
import numpy as np
import pandas as pd

In [8]:
data = pd.read_csv("ecommerce_data.csv")
data.head()
#data that will be used. Targets will be binary values of user_action column, features will be the all other columns.

Unnamed: 0,is_mobile,n_products_viewed,visit_duration,is_returning_visitor,time_of_day,user_action
0,1,0,0.65751,0,3,0
1,1,1,0.568571,0,2,1
2,1,0,0.042246,1,1,0
3,1,1,1.659793,1,1,2
4,0,1,2.014745,1,1,2


In [9]:
data2 = np.copy(data)
data2
#This is important to get the right shape (tuple form) for the targets. HAS TO BE (500,), not (1,500)
    #(Because we are trying to get only the binary values of the targets)

array([[1.        , 0.        , 0.65750995, 0.        , 3.        ,
        0.        ],
       [1.        , 1.        , 0.56857123, 0.        , 2.        ,
        1.        ],
       [1.        , 0.        , 0.042246  , 1.        , 1.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.1728534 , 1.        , 3.        ,
        0.        ],
       [1.        , 0.        , 0.2099644 , 0.        , 3.        ,
        0.        ],
       [0.        , 0.        , 2.61688195, 1.        , 3.        ,
        0.        ]])

In [10]:
targets = data2[:,-1]
targets.shape
#targets of the data
#!!! Note the shape

(500,)

In [11]:
X = data2[:,:-2]
#All features except for one hot encoded column

In [12]:
ohe_column = data2[:,-2].reshape(500,1)
N,D = ohe_column.shape
zero = np.zeros((N,D+3))
zero[np.arange(N),(ohe_column[:,0]-1).astype(int)] = 1
zero
#one hot encoding the categorical column
#!!!Note that there is no need to one hot encode columns of index [0] and [3] because they are already binary in value.

array([[0., 0., 1., 0.],
       [0., 1., 0., 0.],
       [1., 0., 0., 0.],
       ...,
       [0., 0., 1., 0.],
       [0., 0., 1., 0.],
       [0., 0., 1., 0.]])

In [13]:
X = np.hstack((X,zero))
X = X[targets <= 1]
X
#making all the features and removing data with non-binary targets

array([[1.        , 0.        , 0.65750995, ..., 0.        , 1.        ,
        0.        ],
       [1.        , 1.        , 0.56857123, ..., 1.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.042246  , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.1728534 , ..., 0.        , 1.        ,
        0.        ],
       [1.        , 0.        , 0.2099644 , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 2.61688195, ..., 0.        , 1.        ,
        0.        ]])

In [14]:
N,D = X.shape

In [15]:
targets = targets[targets <= 1]
targets.shape
#target data, with only binary values of targets

(398,)

In [16]:
def standardize(data,col_list):
    for i in range(len(col_list)):
        data[:,col_list[i]] = (data[:,col_list[i]] - data[:,col_list[i]].mean())/data[:,col_list[i]].std()
    return data

In [17]:
X = standardize(X,[1,2])
X
#standardizing the 2 numerical columns (n_products_viewed and visit_duration)

array([[ 1.        , -0.67398867, -0.27476744, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        ,  0.46265324, -0.37446708, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.        , -0.67398867, -0.96447376, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [ 0.        , -0.67398867, -0.81806383, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.        , -0.67398867, -0.77646267, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.        , -0.67398867,  1.92167401, ...,  0.        ,
         1.        ,  0.        ]])

In [18]:
W = np.random.randn(D)
W = W.reshape(1,D)
W
#randomly selecting weights

array([[ 0.79656335, -0.42133549,  0.8743829 ,  0.13211258, -1.23448104,
        -0.09930986,  0.28804321, -0.88577931]])

In [19]:
def sigmoid(z):
    return 1/(1+np.exp(-z))
#sigmoid function

In [20]:
predictions = sigmoid(np.dot(W,X.T))

In [21]:
np.mean(np.round(predictions) == targets)
#initial accuracy of model

0.3969849246231156

In [22]:
iteration_num = 10001
a = 0.2
while iteration_num > 0:
    predictions = sigmoid(np.dot(W,X.T))
    f_prime = np.dot((predictions - targets),X)
    
    W -= a*f_prime
    iteration_num -= 1

W
#optimal weights

array([[ 2.23667282,  8.94168273, -1.67204811,  2.58528138, -4.07370185,
        -4.0223233 , -4.84842388, -1.96846815]])

In [23]:
np.mean(np.round(predictions) == targets)
#accuracy of model after 10 000 iterations of gradient descent

0.9547738693467337

In [26]:
class Gradient_descent():
    def sigmoid(z):
            return 1/(1+ np.exp(-z))
        
    def __init__(self, feature_data, target_data, iteration_num, a):
        N,D = feature_data.shape
        W = np.random.randn(D)
        W = W.reshape(1,D)
        predictions = sigmoid(np.dot(W,feature_data.reshape(D,N)))
        print(np.mean(np.round(predictions)== target_data))
        
        for i in range(iteration_num):
            predictions = sigmoid(np.dot(W, feature_data.reshape(D,N)))
            cost_derivative = np.dot((predictions - target_data),feature_data)
            W -= a*cost_derivative
            print(np.mean(np.round(predictions)== target_data))
        print(W)
        print(np.mean(np.round(predictions)== target_data))

In [27]:
W_opt = Gradient_descent(X, targets, 10, 0.2)

0.5251256281407035
0.5251256281407035
0.49748743718592964
0.5
0.49748743718592964
0.5100502512562815
0.5125628140703518
0.5125628140703518
0.5125628140703518
0.5125628140703518
0.5125628140703518
[[-30.19034506 303.70852694 -91.20033056 -40.955365   -38.89645629
  -17.74358701   1.0593466  -24.25217564]]
0.5125628140703518
