In [1]:
# Import Modules/Libraries
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt

%matplotlib inline
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

np.random.seed(1)

import warnings
#suppress warnings with numpy for sigmoid function
warnings.filterwarnings('ignore')

In [2]:
# Load datasets
# Read the pre-cleaned train and test CSVs. index_col=False tells pandas not
# to use the first column of the file as the DataFrame index.
train = pd.read_csv('../datasets/clean/cleaned_train.csv', index_col = False)
test = pd.read_csv('../datasets/clean/cleaned_test.csv', index_col = False)

In [4]:
# Preview the first three rows of the training frame.
train.head(3)

Unnamed: 0,Astronomy,Herbology,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Charms,Flying,Gryffindor,Hufflepuff,Ravenclaw,Slytherin
0,-1.014194,0.878628,0.377371,1.021139,0.345639,0.512444,0.219633,-0.686183,1.204553,-0.50033,0,0,1,0
1,-1.137535,-1.36569,-2.109573,-0.540256,-1.204191,0.258503,0.653769,0.412462,-1.002983,-1.386928,0,0,0,1
2,-0.780078,1.261379,0.718622,1.828915,1.005195,0.133871,1.314249,0.882556,1.825184,0.086673,0,0,1,0


In [3]:
# Separate the one-hot house label columns (Y) from the remaining columns,
# which serve as the independent variables (X). Both are plain NumPy arrays.
target_columns = ['Gryffindor', 'Hufflepuff', 'Ravenclaw', 'Slytherin']
X_train = np.array(train.drop(target_columns, axis=1))
y_train = np.array(train.loc[:, target_columns])

In [5]:
# Report the array dimensions and the number of training examples.
print('The shape of X_train is: ', X_train.shape, sep='')
print('The shape of y_train is: ', y_train.shape, sep='')
print('We have m = %d training examples' % y_train.shape[0])

The shape of X_train is: (1600, 10)
The shape of y_train is: (1600, 4)
We have m = 1600 training examples


In [171]:
def initialize(X):
    """Build the starting parameters and the bias-augmented design matrix.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix without an intercept column.

    Returns
    -------
    thetas : numpy.ndarray of shape (n + 1, 1)
        All-zero parameter column: one entry per feature plus the intercept.
    X : numpy.ndarray of shape (m, n + 1)
        The input with a column of ones prepended as column 0.
    """
    n_samples, n_features = np.shape(X)[0], np.shape(X)[1]
    bias_column = np.ones((n_samples, 1))
    design_matrix = np.c_[bias_column, X]
    return np.zeros((n_features + 1, 1)), design_matrix

In [172]:
def sigmoid(z):
    """Numerically stable logistic function 1 / (1 + exp(-z)).

    Parameters
    ----------
    z : scalar or array-like
        Input value(s).

    Returns
    -------
    NumPy scalar for scalar input, otherwise an ndarray with the shape of z,
    holding values in [0, 1].

    Notes
    -----
    The direct formula evaluates exp(-z), which overflows for large negative
    z and emits a RuntimeWarning. Here the exponential is only ever taken of
    a non-positive number, so it cannot overflow:

        z >= 0:  1 / (1 + exp(-|z|))
        z <  0:  exp(-|z|) / (1 + exp(-|z|))
    """
    z = np.asarray(z)
    decay = np.exp(-np.abs(z))  # always in (0, 1], never overflows
    result = np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    # Indexing with () turns a 0-d array back into a scalar and leaves
    # n-d arrays unchanged.
    return result[()]

In [173]:
# Sanity check: the logistic function should return 0.5 at z = 0.
print("sigmoid(0) = ", sigmoid(0), sep="")

sigmoid(0) = 0.5


In [237]:
def cost(X, y, thetas):
    """Average binary cross-entropy of a logistic model.

    Parameters
    ----------
    X : numpy.ndarray of shape (m, n + 1)
        Design matrix (as used with ``thetas`` in ``np.dot(X, thetas)``).
    y : numpy.ndarray
        Binary labels with m entries; the result shape follows from
        ``y.T.dot(...)`` exactly as before (e.g. ``(1,)`` for 1-D labels).
    thetas : numpy.ndarray of shape (n + 1, 1)
        Model parameters.

    Returns
    -------
    numpy.ndarray
        ``-(y^T log(h) + (1 - y)^T log(1 - h)) / len(y)`` with
        ``h = sigmoid(X @ thetas)``.
    """
    # Keep probabilities strictly inside (0, 1): a saturated sigmoid returns
    # exactly 0.0 or 1.0, and log(0) = -inf turns the cost into inf or nan.
    eps = 1e-15
    h = np.clip(sigmoid(np.dot(X, thetas)), eps, 1 - eps)
    positive_term = y.T.dot(np.log(h))
    negative_term = (1 - y).T.dot(np.log(1 - h))
    return -(negative_term + positive_term) / len(y)

In [238]:
# Zero-initialised parameter column and the copy of X_train with a leading
# column of ones; X is what predict() is later called with.
thetas, X = initialize(X_train)

In [241]:
def fit(X, y, alpha=0.00001, iter=100):
    """Fit a binary logistic regression with batch gradient descent.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix WITHOUT a bias column; ``initialize`` adds it.
    y : array-like with m entries
        Binary labels; reshaped internally to a column of shape (m, 1).
    alpha : float, default 0.00001
        Step size. The gradient below is summed over the examples (it is not
        divided by m), so the effective step grows with the number of rows.
    iter : int, default 100
        Number of gradient-descent updates.

    Returns
    -------
    cost_list : numpy.ndarray of shape (iter,)
        Cost measured after each update.
    thetas : numpy.ndarray of shape (n + 1, 1)
        Learned parameters (intercept first).
    """
    thetas, X = initialize(X)
    # Loop-invariant: build the label column once instead of on every update.
    y_col = np.reshape(y, (len(y), 1))
    cost_list = np.zeros(iter)
    for i in range(iter):
        residual = sigmoid(np.dot(X, thetas)) - y_col
        thetas = thetas - alpha * np.dot(X.T, residual)
        # cost() returns a size-1 array; store it as a plain scalar, since
        # implicit array-to-scalar conversion is deprecated in recent NumPy.
        cost_list[i] = np.asarray(cost(X, y_col, thetas)).item()
    return cost_list, thetas

In [234]:
# Train before displaying: cost_list is not assigned anywhere else in the
# notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): column 3 ('Slytherin') is assumed because the accuracy check
# compares predictions against y_train[:, 3] - confirm the intended target.
cost_list, thetas = fit(X_train, y_train[:, 3])
cost_list, thetas

(array([0.68613234, 0.6792497 , 0.67249633, 0.66586933, 0.65936585,
        0.6529831 , 0.6467183 , 0.64056875, 0.63453179, 0.62860481,
        0.62278525, 0.61707061, 0.61145842, 0.6059463 , 0.60053189,
        0.5952129 , 0.58998708, 0.58485226, 0.57980628, 0.57484708,
        0.5699726 , 0.56518088, 0.56046997, 0.55583799, 0.55128311,
        0.54680353, 0.54239752, 0.53806337, 0.53379944, 0.52960412,
        0.52547584, 0.52141308, 0.51741435, 0.51347821, 0.50960326,
        0.50578812, 0.50203148, 0.49833203, 0.49468851, 0.49109971,
        0.48756442, 0.48408149, 0.48064979, 0.47726821, 0.4739357 ,
        0.4706512 , 0.46741371, 0.46422224, 0.46107583, 0.45797355,
        0.45491448, 0.45189774, 0.44892247, 0.44598782, 0.44309299,
        0.44023717, 0.43741958, 0.43463948, 0.43189611, 0.42918878,
        0.42651677, 0.4238794 , 0.42127602, 0.41870596, 0.4161686 ,
        0.41366333, 0.41118954, 0.40874664, 0.40633407, 0.40395126,
        0.40159768, 0.39927278, 0.39697606, 0.39

In [235]:
def predict(X, thetas):
    """Turn logistic-model probabilities into hard 0/1 labels.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix compatible with ``np.dot(X, thetas)``.
    thetas : numpy.ndarray
        Model parameters.

    Returns
    -------
    list of int
        One label per row of ``sigmoid(X @ thetas)``: 1 when the probability
        is strictly greater than 0.5, otherwise 0.
    """
    probabilities = sigmoid(np.dot(X, thetas))
    return [1 if p > 0.5 else 0 for p in probabilities]

In [236]:
# Training accuracy for the column-3 ('Slytherin') target: the fraction of
# rows where the predicted label equals the true label.
# The previous expression used `weights` and `y`, which are never defined in
# this notebook; `thetas` and `y_train` are the names the other cells use.
# X is the bias-augmented matrix returned by initialize(X_train).
np.mean(np.array(predict(X, thetas)) == y_train[:, 3])

0.99125

In [213]:
# Linear scores X.thetas on the training set; initialize() is called only to
# get the design matrix with its leading column of ones (element [1]).
z = initialize(X_train)[1].dot(thetas)


In [221]:
# Second one-hot target column ('Hufflepuff', per the order of target_columns).
y_train[:, 1]

array([0, 0, 0, ..., 0, 1, 1])