# Logreg train - Python 42

In [18]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
import copy

In [20]:
data_raw1 = pd.read_csv("dataset_train.csv")
data_raw2 = pd.read_csv("dataset_test.csv")
data_raw = pd.concat([data_raw1, data_raw2])
data_raw_1 = data_raw.drop(['Index'], axis=1)
data_raw_1.head()

Unnamed: 0,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64
4,Gryffindor,Gaston,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,-444.262537,599.324514,-3.444377,937.434724,4.311066,-0.26407,-256.3873,157.98


In [3]:
results = data_raw_1.describe()
results

Unnamed: 0,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
count,1953.0,1955.0,1956.0,1961.0,1955.0,1955.0,1957.0,1946.0,1955.0,1960.0,1952.0,2000.0,2000.0
mean,49724.612903,41.451669,1.189644,-0.417844,3.205737,-223.708112,495.785943,2.936453,1030.253905,5.915848,-0.038082,-243.335749,22.2399
std,16444.807977,518.651589,5.187046,5.193183,4.103768,488.49943,105.348963,4.40351,44.360287,3.161791,0.981402,8.770542,97.12546
min,-24370.0,-966.740546,-10.295663,-10.162119,-8.727,-1086.496835,283.869609,-8.858993,906.62732,-4.697484,-3.313676,-261.04892,-181.47
25%,38819.0,-488.265275,-4.300239,-5.243928,3.1825,-577.927473,398.497892,2.219926,1026.510177,3.554956,-0.660419,-250.58645,-40.7125
50%,49114.0,272.071636,3.50684,-2.70741,4.634,-415.425616,467.730624,4.355191,1045.78509,5.857253,-0.025599,-244.831995,-2.515
75%,60698.0,521.974961,5.416,4.881403,5.655,252.532155,596.599814,5.780673,1058.649546,8.236317,0.621809,-232.59871,49.4575
max,104956.0,1016.21194,11.612895,9.667405,10.032,1092.388611,745.39622,11.889713,1099.966073,13.536762,3.205525,-225.42814,282.43


Code for the logistic regression

In [63]:
class logreg:
    
    def __init__(self, lr=0.01, iterations=100000, fit_intercept=True):
        self.lr = lr
        self.iterations = iterations
        self.fit_intercept = fit_intercept
        
    def __add_intercept(self,X):
        cols = X.columns
        X['One'] = 1
        return X.filter(['One'] + cols)
        
    def __sigmoid(z):
        return 1/(1+np.exp(-z))
        
    def __loss(y,h):
        return (-y*np.log(h) - (1-y)*np.log(1-h))
        
    def __fit(self, X, y):
        if self.fit_intercept:
            X = self.add_intercept(X)
    
        #We initialize the theta
        self.theta = np.zeros(X.shape[1])
        
        for i in np.arange(0, self.iterations):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = 1/y.size * np.dot(X.T, (h-y))
            self.theta -= lr*gradient
        
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold

As we want to have a One-versus-all approach, we divide the "Hogwarts House" category into dummy variables - each representing one house. 

In [31]:
houses = pd.get_dummies(data_raw_1['Hogwarts House'])
new_data = pd.concat([data_raw_1, houses], axis = 1)
new_data.head()

Unnamed: 0,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,...,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying,Gryffindor,Hufflepuff,Ravenclaw,Slytherin
0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,...,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89,0,0,1,0
1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,...,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45,0,0,0,1
2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,...,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42,0,0,1,0
3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,...,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64,1,0,0,0
4,Gryffindor,Gaston,Gibbs,1998-09-27,Left,60158.0,436.775204,-7.820623,,2.236,...,-3.444377,937.434724,4.311066,-0.26407,-256.3873,157.98,1,0,0,0


In [68]:
model_gryffindor = logreg()

In [65]:
new_data['Hogwarts House'].unique()

array(['Ravenclaw', 'Slytherin', 'Gryffindor', 'Hufflepuff', nan],
      dtype=object)

In [66]:
X = new_data.drop(['Hogwarts House', 'First Name', 'Last Name', 'Birthday', 'Best Hand', 'Gryffindor', 'Ravenclaw', 'Hufflepuff', 'Slytherin'], axis = 1)
y = new_data.Gryffindor

In [69]:
model_gryffindor.fit(X,y)

AttributeError: 'logreg' object has no attribute 'fit'

In [54]:
model = LogisticRegression(lr=0.1, num_iter=300000)
X = data_raw1.drop(['Hogwarts House', 'First Name', 'Last Name', 'Birthday', 'Best Hand'], axis = 1)
y = data_raw1['Hogwarts House']
%time model.fit(X, y)  
preds = model.predict(X, 0.3)
# accuracy
(preds == y).mean()
1.0

TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')



1.0

In [48]:
class LogisticRegression:
    def __init__(self, lr=0.01, num_iter=100000, fit_intercept=True):
        self.lr = lr
        self.num_iter = num_iter
        self.fit_intercept = fit_intercept
    
    def __add_intercept(self, X):
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)
    
    def __sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    def __loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def fit(self, X, y):
        if self.fit_intercept:
            X = self.__add_intercept(X)
        
        # weights initialization
        self.theta = np.zeros(X.shape[1])
        
        for i in range(self.num_iter):
            z = np.dot(X, self.theta)
            h = self.__sigmoid(z)
            gradient = np.dot(X.T, (h - y)) / y.size
            self.theta -= self.lr * gradient
            
            if(self.verbose == True and i % 10000 == 0):
                z = np.dot(X, self.theta)
                h = self.__sigmoid(z)
                print(f'loss: {self.__loss(h, y)} \t')
    
    def predict_prob(self, X):
        if self.fit_intercept:
            X = self.__add_intercept(X)
    
        return self.__sigmoid(np.dot(X, self.theta))
    
    def predict(self, X, threshold):
        return self.predict_prob(X) >= threshold

In [10]:
model = LogisticRegression(lr=0.1, num_iter=300000)
X = data_raw1.drop(['Hogwarts House', 'First Name', 'Last Name', 'Birthday', 'Best Hand'], axis = 1)
y = data_raw1['Hogwarts House']
%time model.fit(X, y)  
preds = model.predict(X, 0.3)
# accuracy
(preds == y).mean()
1.0

TypeError: ufunc 'subtract' did not contain a loop with signature matching types dtype('<U32') dtype('<U32') dtype('<U32')



1.0