In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline


In [30]:
data = pd.read_csv('Admission_Predict.csv')

In [31]:
data.head(5)

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.0,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.8
4,5,314,103,2,2.0,3.0,8.21,0,0.65


In [32]:
X = data[data.columns[1:7]].values
y = data['Research'].values

In [33]:
print(X)

[[337.   118.     4.     4.5    4.5    9.65]
 [324.   107.     4.     4.     4.5    8.87]
 [316.   104.     3.     3.     3.5    8.  ]
 ...
 [330.   116.     4.     5.     4.5    9.45]
 [312.   103.     3.     3.5    4.     8.78]
 [333.   117.     4.     5.     4.     9.66]]


In [34]:
(X.shape[1],1)

(6, 1)

In [35]:
print(y)

[1 1 1 1 0 1 1 0 0 0 1 1 1 1 1 0 0 1 0 0 1 0 1 1 1 1 0 1 0 0 1 1 1 1 1 1 0
 0 0 0 1 1 1 0 1 1 1 0 1 1 1 1 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 1 1 1 1 1 1 1
 0 1 1 0 1 0 1 1 1 1 1 0 0 0 0 1 1 0 0 1 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 0 0
 1 1 0 1 1 0 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0 0 0 1 1 0 1 1 0 1 1 1 1 1 1 0 1
 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0
 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 1 1 1 1 0 0 0 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0
 0 0 0 0 0 0 0 1 0 1 0 0 1 1 1 1 0 0 0 0 1 1 0 1 0 0 1 1 1 0 1 1 0 0 0 1 1
 1 1 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 1 1 0
 0 0 1 0 0 0 1 1 0 1 1 1 0 0 1 1 1 0 0 0 0 0 1 1 1 1 0 0 0 1 0 0 1 0 1 1 1
 1 1 1 0 1 1 1 1 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 0 1 1 1 0 1 1 1 0 0 1
 0 1 1 1 0 0 0 0 0 1 1 1 1 0 1 1 0 0 0 1 0 0 1 0 1 1 1 1 0 1]


In [36]:
def train_test_split(X, y, split=0.2):
    indices = np.random.permutation(X.shape[0]) ## X.shape[0] == rows of data which is equal to 400. we are going to shuffle our data to prevent overfitting and to normalize our data
    # it would be ideal to shuffle data cause some data might already be sorted by their classification, the model might pick on this and overfit
    split = int(split * X.shape[0])

    train_indices = indices[split:]
    test_indices = indices[:split]

    x_train, x_test = X[train_indices], X[test_indices]
    y_train, y_test = y[train_indices], y[test_indices]

    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test = train_test_split(X, y)

print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)


(320, 6) (80, 6) (320,) (80,)


### Defining the structure of our model

In [53]:
class LogisticRegression:
    def __init__(self,lr=0.01,num_iter=100):
        self.lr = lr
        self.num_iter = num_iter
        
    def predict(self, X):## given the array/features we pass, we will be given a predicted output
        X = self.normalize(X)
        linear = self.__linear(X)
        preds = self.non_linear(linear)
        return (preds >= 0.5).astype('int') ## 0.5 is the decision boundary
        
    def __linear(self,X):#2nd step
        ## linear regression, applied weights to the sum and then will be passed to the sigmoid function which will then be passed on to prediction
        ## now that we initialized our weights, we can proceed to dot product X with w then we add the bias
        return np.dot(X,self.weights) + self.bias
    
    def non_linear(self,X):##3rd step, sigmoid function
        return (1/1+np.exp(-X))
    
    def fit(self, x_train, y_train):##we fit our X & Y features and appply gradient descent, the weights will be adjusted after each step
        self.init_weights(x_train)
        
        ## https://www.statisticshowto.datasciencecentral.com/normalized/
        self.X_mean = x_train.mean().T
        self.X_std = y_train.std().T
        
        ## we normalize our data before starting gradient descent
        x_train = self.normalize(x_train)
        
        for i in range (self.num_iter):
            
            work = self.non_linear(self.__linear(x_train))
            gradient = work - y_train
            
            delta_w = np.mean(np.dot(gradient,x_train), axis=0, keepdims=True).T
            delta_b = np.mean(gradient)

            # update weights
            self.weights = self.weights - (self.lr * delta_w)
            self.bias = self.bias - (self.lr * delta_b)
        return self
                 
        
    def accuracy(self, X, y):## to calculate the accuracy of our model, it is equal to the mean of correct predictions
        preds = self.predict(X)
        return np.mean(preds == y)
    
        
    def normalize(self, X): ## we need to constantly shuffle our data during each step of gradient descent, 
        ## formula is equal to; x = x - mean of x/ std of x given (x1,x2...xn)
        X = (X - (self.X_mean)) / self.X_std ## we are standardizing our data
        return X
        
        
    def init_weights(self,X):#1st step
        self.weights = np.random.rand(X.shape[1],1) ## this will generate a (6x1) matrix. So when we will grab the input data of a  (400X6) matrix, when multiplied with the weights it will become a (400X1) matrix
        self.bias = np.zeros((1,)) ## bias set to 0 

    
    def loss(self, X, y):
        probs = self.non_linear(self.__linear(X))

        # entropy when true class is positive
        pos_log = y * np.log(probs + 1e-15)
        # entropy when true class is negative
        neg_log = (1 - y) * np.log((1 - probs) + 1e-15)

        l = -np.mean(pos_log + neg_log)
        return l
        

In [54]:
lr = LogisticRegression()
lr.fit(x_train, y_train)

<__main__.LogisticRegression at 0x27871647978>

In [55]:
print('Accuracy on test set: {:.2f}%'.format(lr.accuracy(x_test, y_test) * 100))
print('Loss on test set: {:.2f}'.format(lr.loss(x_test, y_test)))

Accuracy on test set: 51.25%
Loss on test set: nan


### As we can see from above, the loss result came out as NaN and the accuracy is not that great. This could come down either because the model overfitted the data and it needs to be normalized or the learning rate is too large. In this case, the learning rate is alright and we did standardize the data when fitting our model. So yeah, im not sure but its fun to think about.

### Using Sklearn Library

In [56]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

X = data[data.columns[1:7]].values
y = data['Research'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

lr = LogisticRegression()
lr.fit(X_train, y_train)
lr.predict(x_test[0].reshape(1,-1))
predictions = lr.predict(x_test)
score = lr.score(X_test, y_test)
print(score)

0.725


