Imports

In [58]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
import math

MODEL

In [59]:
class LogisticRegression(nn.Module):
    def __init__(self, n_input=29, n_hidden = 128, n_output = 1):
        super(LogisticRegression, self).__init__()
        
        self.linear1 = nn.Linear(n_input, n_hidden)
        self.linear2 = nn.Linear(n_hidden, 128)
        self.linear3 = nn.Linear(128, 32)
        self.linear4 = nn.Linear(32, 8)
        self.linear5 = nn.Linear(8,1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.relu(x)
        x = self.linear4(x)
        x = self.relu(x)
        x = self.linear5(x)
        x = self.sigmoid(x)
        return x



Dataset

In [60]:
class PassangerDataset(Dataset):
    def __init__(self):

        self.train = pd.read_csv('train.csv', index_col='PassengerId')
        
        X_train = self.train.drop(columns=['Name'])
        X_train[['Cabin_1', 'Cabin_2', 'Cabin_3']] = X_train['Cabin'].str.split('/').apply(pd.Series)
        X_train.drop(columns='Cabin', inplace=True)

        numeric_columns = X_train.select_dtypes(include=['float', 'int'])
        X_train.fillna(numeric_columns.mean(), inplace=True)
        X_train.fillna(0.5, inplace=True)

        X_train = pd.get_dummies(X_train, columns=['HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3'])
        self.y = torch.tensor(X_train['Transported'].values)
        
        x_float32 = X_train.drop('Transported', axis=1).values.astype('float32')
        x_not_norm = torch.tensor(x_float32, dtype=torch.float32).view(-1, x_float32.shape[1])

        min_vals = torch.min(x_not_norm, dim=0).values
        max_vals = torch.max(x_not_norm, dim=0).values

        normalized_matrix = (x_not_norm - min_vals) / (max_vals - min_vals)
        self.x = normalized_matrix
    def __getitem__(self, index):
        return self.x[index],self.y[index]

    def __len__(self):
        return len(self.train)

Data cleaning - test

In [61]:
target = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv',index_col='PassengerId')

Y_test = torch.tensor(target['Transported'],dtype=torch.float32).view(-1,1)
X_test = test.drop(columns=['Name'])
X_test[['Cabin_1', 'Cabin_2', 'Cabin_3']] = X_test['Cabin'].str.split('/').apply(pd.Series)
X_test.drop(columns='Cabin', inplace=True)

numeric_columns = X_test.select_dtypes(include=['float', 'int'])
X_test.fillna(numeric_columns.mean(), inplace=True)
X_test.fillna(0.5, inplace=True)

X_test = pd.get_dummies(X_test, columns=['HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3'])
X_test.head()
x_float32 = X_test.values.astype('float32')
X_test = torch.tensor(x_float32,dtype=torch.float32)

In [63]:
dataset = PassangerDataset()
dataloader = DataLoader(dataset=dataset,batch_size=64,shuffle=True,num_workers=2)

model = LogisticRegression(29)
loss_function = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.01)

total_samples = len(dataset)
n_iterations = math.ceil(total_samples/4)

num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0.0

    for i, (inputs, labels) in enumerate(dataloader):
        # Forward pass
        y_predicted = model(inputs)
        y = labels.float()
        y = y.view(-1,1)
        loss = loss_function(y_predicted, y)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # print(f'Epoch {epoch + 1}/{num_epochs}, Step {i + 1}/{n_iterations}, Loss: {loss.item():.4f}')

    average_loss = total_loss / n_iterations
    print(f'Epoch {epoch + 1} - Average Loss: {average_loss:.4f}')
    with torch.no_grad():
        # print(X_test.shape)
        predictions = model(X_test)
        predictions_cls = predictions.round()
        acc = predictions_cls.eq(Y_test).sum() / float(Y_test.shape[0])
        print(f'Accuracy: {acc:.4f}')
        print(f'CHECK: PREDICTIONS 5: {predictions_cls[:5]} ACTUAL VALUES 5: {Y_test[:5]} PREDICTION RATE: {predictions_cls[:5].eq(Y_test[:5]).sum() / float(Y_test[:5].shape[0])}')

print('Training completed.')

Epoch 1 - Average Loss: 0.0440
Accuracy: 0.8614
CHECK: PREDICTIONS 5: tensor([[0.],
        [1.],
        [0.],
        [0.],
        [1.]]) ACTUAL VALUES 5: tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]]) PREDICTION RATE: 0.6000000238418579
Epoch 2 - Average Loss: 0.0436
Accuracy: 0.8508
CHECK: PREDICTIONS 5: tensor([[0.],
        [1.],
        [0.],
        [0.],
        [1.]]) ACTUAL VALUES 5: tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]]) PREDICTION RATE: 0.6000000238418579
Epoch 3 - Average Loss: 0.0435
Accuracy: 0.8473
CHECK: PREDICTIONS 5: tensor([[0.],
        [1.],
        [0.],
        [0.],
        [1.]]) ACTUAL VALUES 5: tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.]]) PREDICTION RATE: 0.6000000238418579


KeyboardInterrupt: 