## Logistic Regression

In [1]:
import numpy as np
import pandas
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split   # Split arrays or matrices into random train and test subsets
from sklearn.preprocessing import StandardScaler       # Scaling data (calculating the standard score)

# Loading 'breast cancer' dataset in iris variable
bc = datasets.load_breast_cancer()

# Initializing breast cancer data(excludes target column) and target column into variables
# x contains all data(excluding target column)
# y contains target column
x, y = bc['data'], bc['target']

n_samples, n_features = x.shape

# train_test_split creates random data for training and testing and split both
# number of samples will be always equal in both training and testing data
# x_train will get 80% rows with all columns(excluding target column); test_size is 0.2 which means 20% data is for test(x_test)
# y_train will get 80% rows of target column; test_size is 0.2 means 20% data is for test(y_test)
# random_state: Controls the shuffling applied to the data before applying the split.
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2, random_state=1)


# StandardScalar calculates the standard score
# The standard score of a sample x is calculated as: z = (x - μ) / σ
# where μ (mean or avg) = ( Σ x ) / n ; σ = √[{(Σx - μ)**2}/n]
sc = StandardScaler()

# sc.fit_transform(data) calculates the value of each parameter and applies to the dataset
x_train = sc.fit_transform(x_train)

# sc.transform(data) applies the values of the parameters on the actual data 
# only transform() must  be applied on test dataset because it has to learn calculatation(how to map one input space to another) 
# from training dataset thus it would be able to apply same operation on unseen data
x_test = sc.transform(x_test)

# Convert numpy arrays to torch tensors as well as mention datatype
x_train = torch.tensor(x_train.astype(np.float32))
x_test = torch.tensor(x_test.astype(np.float32))
y_train = torch.tensor(y_train.astype(np.float32))
y_test = torch.tensor(y_test.astype(np.float32))

# Changing the shape of tensor
y_train = y_train.view(y_train.shape[0],1)
y_test = y_test.view(y_test.shape[0],1)


# manual class and function for Logistic Regression
class LogisticRegress(nn.Module):
    
    def __init__(self,n_input):
        super(LogisticRegress,self).__init__()
        self.a = nn.Linear(n_input, 1)
        
    def forward(self, x):
        y_pred = torch.sigmoid(self.a(x)) #clamp(min=0, max=1)
        return y_pred
    

# Logistic regression model
model = LogisticRegress(n_features)


learning_rate = 1e-2
epoch = 500

# Loss and optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
loss = nn.MSELoss()


# Train the model
for i in range(epoch):
    
    # Forward pass
    output = model(x_train)
    ls = loss(output,y_train)
    
    # Backward and optimize
    optimizer.zero_grad()
    ls.backward()
    optimizer.step()
    if i%50==0:
        [w,b] = model.parameters()
        print(f"epoch={i+1}, weight={w[0][0]:.5f}, loss={ls:.15f}")

        
# Test the model
# In test phase, we don't need to compute gradients (for memory efficiency)
# with torch.no_grad() temporarily set all the requires_grad flag to false
with torch.no_grad():
    pred = model(x_test).round()
    acc  = 100 * pred.eq(y_test).sum() / float(y_test.shape[0])
              
    print(f'\naccuracy={acc:.5f}%')
    

epoch=1, weight=0.05578, loss=0.321095347404480
epoch=51, weight=-0.01545, loss=0.138198614120483
epoch=101, weight=-0.05084, loss=0.094870850443840
epoch=151, weight=-0.07374, loss=0.077183559536934
epoch=201, weight=-0.09074, loss=0.067363113164902
epoch=251, weight=-0.10429, loss=0.060972627252340
epoch=301, weight=-0.11559, loss=0.056403946131468
epoch=351, weight=-0.12527, loss=0.052931535989046
epoch=401, weight=-0.13376, loss=0.050177294760942
epoch=451, weight=-0.14131, loss=0.047923214733601

accuracy=96.49123%
