# Importing libraries

In [1]:
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm_notebook as tqdm

# PyTorch

In [2]:
from torch import nn
import torchvision
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor, Compose
from torchsummary import summary

# Sklearn

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Device

In [4]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

# Helper functions

In [5]:
object = []
def object_identifier(dataset):
    for name, dtypes in dataset.dtypes.items():
        if dtypes == 'object':
            object.append(name)
    return object

In [6]:
label = LabelEncoder()
def label_encoder(dataset, object):
    for i in object:
        dataset[i] = label.fit_transform(dataset[i])
    return dataset

In [7]:
def unique_identifier(dataset):
    for i in dataset:
        unique = dataset[i].unique()
        print(f'{i}: {unique}, Length: {len(unique)}')

# Data

In [8]:
data = pd.read_csv('task_1a_dataset.csv')

In [9]:
data.head()

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,Bachelors,2012,Bangalore,3,37,Male,No,0,0
1,Masters,2017,New Delhi,2,28,Male,No,4,0
2,Bachelors,2017,New Delhi,2,36,Male,No,3,0
3,Bachelors,2015,Bangalore,3,27,Male,Yes,5,0
4,Bachelors,2017,Bangalore,3,29,Male,No,4,0


In [10]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4633 entries, 0 to 4632
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Education                  4633 non-null   object
 1   JoiningYear                4633 non-null   int64 
 2   City                       4633 non-null   object
 3   PaymentTier                4633 non-null   int64 
 4   Age                        4633 non-null   int64 
 5   Gender                     4633 non-null   object
 6   EverBenched                4633 non-null   object
 7   ExperienceInCurrentDomain  4633 non-null   int64 
 8   LeaveOrNot                 4633 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 325.9+ KB


In [11]:
data.describe()

Unnamed: 0,JoiningYear,PaymentTier,Age,ExperienceInCurrentDomain,LeaveOrNot
count,4633.0,4633.0,4633.0,4633.0,4633.0
mean,2015.061731,2.698468,29.385711,2.908483,0.343622
std,1.864144,0.560995,4.821685,1.557005,0.474968
min,2012.0,1.0,22.0,0.0,0.0
25%,2013.0,3.0,26.0,2.0,0.0
50%,2015.0,3.0,28.0,3.0,0.0
75%,2017.0,3.0,32.0,4.0,1.0
max,2018.0,3.0,41.0,7.0,1.0


In [12]:
data.isna().sum()

Education                    0
JoiningYear                  0
City                         0
PaymentTier                  0
Age                          0
Gender                       0
EverBenched                  0
ExperienceInCurrentDomain    0
LeaveOrNot                   0
dtype: int64

In [13]:
# Object calling function 
object_identifier(data)

['Education', 'City', 'Gender', 'EverBenched']

In [14]:
# Label encoding
label_encoder(data, object)

Unnamed: 0,Education,JoiningYear,City,PaymentTier,Age,Gender,EverBenched,ExperienceInCurrentDomain,LeaveOrNot
0,0,2012,0,3,37,1,0,0,0
1,1,2017,1,2,28,1,0,4,0
2,0,2017,1,2,36,1,0,3,0
3,0,2015,0,3,27,1,1,5,0
4,0,2017,0,3,29,1,0,4,0
...,...,...,...,...,...,...,...,...,...
4628,0,2013,0,3,26,0,0,4,0
4629,1,2013,2,2,37,1,0,2,1
4630,1,2018,1,3,27,1,0,5,1
4631,0,2012,0,3,30,1,1,2,0


In [15]:
# Unique elements
unique_identifier(data)

Education: [0 1 2], Length: 3
JoiningYear: [2012 2017 2015 2013 2016 2014 2018], Length: 7
City: [0 1 2], Length: 3
PaymentTier: [3 2 1], Length: 3
Age: [37 28 36 27 29 22 23 31 30 25 26 40 34 39 35 38 32 24 33 41], Length: 20
Gender: [1 0], Length: 2
EverBenched: [0 1], Length: 2
ExperienceInCurrentDomain: [0 4 3 5 2 1 7 6], Length: 8
LeaveOrNot: [0 1], Length: 2


# Data spliting

In [16]:
X = data.drop(['LeaveOrNot'], axis=1)
y = data['LeaveOrNot']

In [17]:
x_train, x_test, y_train, y_test = train_test_split(X,y, test_size=0.3)

## Normalizing the dataset

In [18]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [19]:
type(x_train), type(x_test), type(y_train), type(y_test)

(numpy.ndarray,
 numpy.ndarray,
 pandas.core.series.Series,
 pandas.core.series.Series)

In [20]:
y_train = y_train.to_numpy()
y_test = y_test.to_numpy()

In [21]:
type(x_train), type(x_test), type(y_train), type(y_test)

(numpy.ndarray, numpy.ndarray, numpy.ndarray, numpy.ndarray)

In [22]:
x_train = torch.from_numpy(x_train).type(torch.float)
y_train = torch.from_numpy(y_train).type(torch.float)
x_test = torch.from_numpy(x_test).type(torch.float)
y_test = torch.from_numpy(y_test).type(torch.float)

In [23]:
type(x_train), type(x_test), type(y_train), type(y_test)

(torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor)

# Dataloaders

In [24]:
batch_size = 32
train_loader = DataLoader(list(zip(x_train, y_train)), batch_size=batch_size, shuffle=True)
test_loader = DataLoader(list(zip(x_test, y_test)), batch_size=batch_size)

## Accuracy function

In [25]:
def accuracy_fn(y_true, y_pred):
    correct = torch.eq(y_true, y_pred).sum().item()
    acc = (correct/len(y_pred)) * 100
    return acc

# Model building

In [26]:
class BinaryClassification(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer_1 = nn.Linear(in_features=8, out_features=32)
        self.layer_2 = nn.Linear(in_features=32, out_features=32)
        self.layer_3 = nn.Linear(in_features=32, out_features=1)
        self.relu = nn.ReLU()

    def forward(self,x):
        return self.layer_3(self.relu(self.layer_2(self.relu(self.layer_1(x)))))

In [27]:
model = BinaryClassification().to(device)
model

BinaryClassification(
  (layer_1): Linear(in_features=8, out_features=32, bias=True)
  (layer_2): Linear(in_features=32, out_features=32, bias=True)
  (layer_3): Linear(in_features=32, out_features=1, bias=True)
  (relu): ReLU()
)

In [28]:
summary(model)

Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            288
├─Linear: 1-2                            1,056
├─Linear: 1-3                            33
├─ReLU: 1-4                              --
Total params: 1,377
Trainable params: 1,377
Non-trainable params: 0


Layer (type:depth-idx)                   Param #
├─Linear: 1-1                            288
├─Linear: 1-2                            1,056
├─Linear: 1-3                            33
├─ReLU: 1-4                              --
Total params: 1,377
Trainable params: 1,377
Non-trainable params: 0

# Loss and optimizers

In [29]:
model = BinaryClassification()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCEWithLogitsLoss()

In [30]:
epochs = 500
for epoch in range(epochs):
    model.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output.squeeze(), batch_y)
        loss.backward()
        optimizer.step()

    # Evaluate the model on the test set
    model.eval()
    test_predictions = []
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            output = model(batch_x)
            predictions = torch.round(torch.sigmoid(output))
            test_predictions.extend(predictions.tolist())
    
    accuracy = accuracy_score(y_test.tolist(), test_predictions)
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {loss.item():.4f}, Accuracy: {accuracy * 100:.2f}%")

Epoch 1/500, Loss: 0.5754, Accuracy: 74.03%
Epoch 2/500, Loss: 0.4835, Accuracy: 75.11%
Epoch 3/500, Loss: 0.7241, Accuracy: 77.63%
Epoch 4/500, Loss: 0.3560, Accuracy: 78.92%
Epoch 5/500, Loss: 0.1314, Accuracy: 79.06%
Epoch 6/500, Loss: 0.3533, Accuracy: 80.65%
Epoch 7/500, Loss: 0.3286, Accuracy: 81.15%
Epoch 8/500, Loss: 0.2609, Accuracy: 82.30%
Epoch 9/500, Loss: 0.6553, Accuracy: 82.16%
Epoch 10/500, Loss: 0.3592, Accuracy: 82.30%
Epoch 11/500, Loss: 0.4747, Accuracy: 83.17%
Epoch 12/500, Loss: 0.8056, Accuracy: 83.17%
Epoch 13/500, Loss: 0.7115, Accuracy: 82.88%
Epoch 14/500, Loss: 0.6114, Accuracy: 83.17%
Epoch 15/500, Loss: 0.1266, Accuracy: 82.88%
Epoch 16/500, Loss: 0.1074, Accuracy: 83.09%
Epoch 17/500, Loss: 0.1972, Accuracy: 83.17%
Epoch 18/500, Loss: 0.6837, Accuracy: 82.66%
Epoch 19/500, Loss: 0.3072, Accuracy: 83.17%
Epoch 20/500, Loss: 0.3345, Accuracy: 83.09%
Epoch 21/500, Loss: 0.1490, Accuracy: 82.81%
Epoch 22/500, Loss: 0.5412, Accuracy: 82.95%
Epoch 23/500, Loss: