# Logreg train - Python 42

In [6]:
import pandas as pd
import numpy as np
import copy

In [7]:
# Read the training CSV and drop its 'Index' column in one chained step,
# then preview the first four rows.
train_data = pd.read_csv("dataset_train.csv").drop(columns=['Index'])
train_data.head(4)

Unnamed: 0,Hogwarts House,First Name,Last Name,Birthday,Best Hand,Arithmancy,Astronomy,Herbology,Defense Against the Dark Arts,Divination,Muggle Studies,Ancient Runes,History of Magic,Transfiguration,Potions,Care of Magical Creatures,Charms,Flying
0,Ravenclaw,Tamara,Hsu,2000-03-30,Left,58384.0,-487.886086,5.72718,4.878861,4.722,272.035831,532.484226,5.231058,1039.788281,3.790369,0.715939,-232.79405,-26.89
1,Slytherin,Erich,Paredes,1999-10-14,Right,67239.0,-552.060507,-5.987446,5.520605,-5.612,-487.340557,367.760303,4.10717,1058.944592,7.248742,0.091674,-252.18425,-113.45
2,Ravenclaw,Stephany,Braun,1999-11-03,Left,23702.0,-366.076117,7.725017,3.660761,6.14,664.893521,602.585284,3.555579,1088.088348,8.728531,-0.515327,-227.34265,30.42
3,Gryffindor,Vesta,Mcmichael,2000-08-19,Left,32667.0,697.742809,-6.497214,-6.977428,4.026,-537.001128,523.982133,-4.809637,920.391449,0.821911,-0.01404,-256.84675,200.64


## Code for the logistic regression

### Preprocessing the training data

In [57]:
class preprocess:
    """Turn a labelled CSV file into a min-max-scaled design matrix.

    The public entry point is :meth:`preprocess`. It reads the file, drops
    incomplete rows and the ``Index`` column, one-hot encodes the label
    column, keeps only the numeric columns, rescales each of them to the
    ``[0, 1]`` range and prepends a constant ``Intercept`` column.
    The class holds no state, so it needs no constructor arguments.
    """

    def __cleaning(self, path, column):
        """Read ``path`` and append one indicator column per value of ``column``.

        Rows containing any missing value are dropped and the row index is
        reset, then the ``Index`` column is removed.

        Parameters
        ----------
        path : str or path-like
            CSV file to read. It must contain an ``Index`` column.
        column : str
            Name of the categorical column to one-hot encode.

        Returns
        -------
        pandas.DataFrame
            The cleaned frame followed by the indicator columns.
        """
        data = pd.read_csv(path)
        data = data.dropna().reset_index(drop=True)
        data = data.drop(['Index'], axis=1)
        # Ask for float indicators explicitly: pandas >= 2.0 returns boolean
        # dummies by default, and booleans cannot be subtracted when the
        # columns are rescaled later on.
        spread = pd.get_dummies(data[str(column)], dtype=float)
        return pd.concat([data, spread], axis=1)

    def __minimum(self, frame):
        """Return the smallest value of ``frame`` (per column for a DataFrame)."""
        return frame.min()

    def __maximum(self, frame):
        """Return the largest value of ``frame`` (per column for a DataFrame)."""
        return frame.max()

    def __selection(self, data):
        """Keep the numeric columns of ``data`` and min-max scale each one.

        Every column is mapped with ``(x - min) / (max - min)``. A column
        whose values are all equal has a zero range; it is mapped to ``0.0``
        instead of producing ``NaN`` from a 0/0 division.

        Returns
        -------
        pandas.DataFrame
            Float frame with the same index as ``data``.
        """
        numeric = data.select_dtypes(include='number')
        lows = self.__minimum(numeric)
        spans = self.__maximum(numeric) - lows
        # Replace zero ranges by 1 so constant columns become 0.0, not NaN.
        spans = spans.where(spans != 0, 1)
        return (numeric - lows) / spans

    def __add_intercept(self, data):
        """Return a copy of ``data`` with a leading ``Intercept`` column of ones.

        The input frame is left untouched. An existing ``Intercept`` column,
        if any, is replaced.
        """
        with_bias = data.drop(columns=['Intercept'], errors='ignore').copy()
        with_bias.insert(0, 'Intercept', 1)
        return with_bias

    def preprocess(self, path, column):
        """Run the whole pipeline on the CSV file at ``path``.

        Parameters
        ----------
        path : str or path-like
            CSV file to load.
        column : str
            Name of the label column to one-hot encode.

        Returns
        -------
        pandas.DataFrame
            ``Intercept`` followed by the scaled numeric columns and the
            indicator columns created from ``column``.
        """
        data = self.__cleaning(path, column)
        inter_data = self.__selection(data)
        final_data = self.__add_intercept(inter_data)
        return final_data

We create a class for the logistic regression model.

In [65]:
class LogReg:
    """Binary logistic regression fitted with batch gradient descent.

    Parameters
    ----------
    lr : float
        Step size applied to the gradient at every iteration.
    iterations : int
        Number of gradient-descent steps performed by :meth:`fit`.

    After :meth:`fit`, the learned coefficients are stored in ``theta``
    (one value per column of the training matrix).
    """

    def __init__(self, lr, iterations):
        self.lr = lr
        self.iterations = iterations

    def __sigmoid(self, z):
        """Return ``1 / (1 + exp(-z))`` element-wise without overflowing.

        ``exp`` is only ever evaluated on non-positive numbers, so very
        negative inputs yield 0.0 instead of triggering an overflow.
        """
        z = np.asarray(z, dtype=float)
        decay = np.exp(-np.abs(z))  # always in (0, 1]
        return np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))

    def fit(self, X, y):
        """Estimate ``theta`` from the design matrix ``X`` and 0/1 labels ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Design matrix; an intercept column must already be included if
            one is wanted.
        y : array-like of length n_samples
            Target values.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, if ``y`` is empty, or if the
            number of rows in ``X`` differs from the length of ``y``.
        """
        features = np.asarray(X, dtype=float)
        labels = np.asarray(y, dtype=float).ravel()
        if features.ndim != 2:
            raise ValueError("X must be two-dimensional")
        if labels.size == 0 or features.shape[0] != labels.size:
            raise ValueError("X and y must have the same, non-zero number of rows")

        self.theta = np.zeros(features.shape[1])
        # Loop invariants hoisted out of the descent loop.
        scale = 1 / labels.size
        features_t = features.T

        for _ in np.arange(0, self.iterations):
            h = self.__sigmoid(np.dot(features, self.theta))
            gradient = scale * np.dot(features_t, h - labels)
            self.theta -= self.lr * gradient

    def probability(self, X):
        """Return the fitted probability of the positive class for each row of ``X``."""
        return self.__sigmoid(np.dot(np.asarray(X, dtype=float), self.theta))

    def predict(self, X, threshold=0.5):
        """Return a 0/1 label for each row of ``X``.

        A row is labelled 1 when its probability is at least ``threshold``.
        The result is deterministic: calling the method twice on the same
        input gives the same labels.
        """
        return (self.probability(X) >= threshold).astype(int)

Now that we have a single binary regression model, a one-vs-all (OvA) solution is to train four separate models, one per house, and then keep, for each student, the house whose model gives the highest probability.

In [116]:
def weights(data, outputs, model):
    """Fit ``model`` once per target column and tabulate the fitted ``theta``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the target columns.
    outputs : list
        Names of the target columns; every other column is used as a feature.
    model : object
        Anything exposing ``fit(X, y)`` and, after fitting, a ``theta``
        attribute.

    Returns
    -------
    pandas.DataFrame
        One ``'Weights <target>'`` column per entry of ``outputs``, indexed
        by the integer position of each feature column.
    """
    features = data.drop(outputs, axis=1)
    feature_count = len(list(features.columns))

    # Start from an empty frame carrying only the positional index, then
    # collect one single-column frame per target and join them in one go.
    pieces = [pd.DataFrame(index=np.arange(0, feature_count))]
    for target in outputs:
        model.fit(features, data[target])
        fitted = pd.DataFrame(model.theta)
        fitted.columns = ['Weights ' + str(target)]
        pieces.append(fitted)

    return pd.concat(pieces, axis=1)

def accuracy(data, outputs, model):
    """Return the one-vs-all accuracy of ``model`` on ``data``.

    The model is refitted once per target column. Each row is then assigned
    the target whose fitted model gives it the highest probability, and that
    prediction is compared with the row's actual target (the column of
    ``outputs`` holding the largest value in that row).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding both the feature columns and the target columns.
    outputs : list
        Names of the target columns; every other column is used as a feature.
    model : object
        Anything exposing ``fit(X, y)`` and ``probability(X)``.

    Returns
    -------
    float
        Fraction of rows whose predicted target equals the actual one.

    Raises
    ------
    ValueError
        If ``data`` has no rows.
    """
    if len(data) == 0:
        raise ValueError("accuracy() needs at least one row of data")

    features = data.drop(outputs, axis=1)
    # Work with plain arrays so the comparison below is positional and does
    # not depend on how ``data`` happens to be indexed.
    actual = data.filter(outputs, axis=1).idxmax(axis=1).to_numpy()

    scores = {}
    for target in outputs:
        model.fit(features, data[target])
        scores[target] = np.asarray(model.probability(features), dtype=float).ravel()

    # One row per sample (not per feature column): the table is indexed by
    # the rows of ``data``.
    probabilities = pd.DataFrame(scores, index=data.index)
    predicted = probabilities.idxmax(axis=1).to_numpy()

    hits = int((predicted == actual).sum())
    return hits / len(actual)

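Wrap-up function: preprocess the data, train one model per house, save the weights to a CSV file and print the accuracy measured on the training data.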

In [119]:
def training(path, output_column,
             outputs=('Gryffindor', 'Slytherin', 'Ravenclaw', 'Hufflepuff'),
             lr=0.01, iterations=1000, weights_path='logreg_weights.csv'):
    """Preprocess a CSV file, train one-vs-all models and save their weights.

    The fitted weights are written to ``weights_path`` and the accuracy
    measured on the same data is printed.

    Parameters
    ----------
    path : str or path-like
        CSV file containing the training data.
    output_column : str
        Name of the label column to one-hot encode.
    outputs : iterable of str, optional
        Label values to train a model for. Each must be a column of the
        preprocessed data. Defaults to the four Hogwarts houses.
    lr : float, optional
        Learning rate passed to ``LogReg``.
    iterations : int, optional
        Number of gradient-descent steps passed to ``LogReg``.
    weights_path : str or path-like, optional
        Destination of the weights CSV file.

    Raises
    ------
    KeyError
        If one of ``outputs`` is not a column of the preprocessed data.
    """
    # A list is required downstream: pandas treats a tuple passed to
    # ``drop`` as a single label rather than a collection of labels.
    targets = list(outputs)

    pre = preprocess()
    data = pre.preprocess(path, output_column)

    missing = [target for target in targets if target not in data.columns]
    if missing:
        raise KeyError(f"target columns not found after preprocessing: {missing}")

    model = LogReg(lr=lr, iterations=iterations)

    weights(data, targets, model).to_csv(weights_path)
    print(accuracy(data, targets, model))

In [120]:
# Train on the full training set, using the house name as the label column.
training(path='dataset_train.csv', output_column='Hogwarts House')

0.9280575539568345
