Imports

In [98]:
import torch
import torch.nn as nn
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
import math

MODEL

In [None]:
class LogisticRegression(nn.Module):
    def __init__(self, n_input=29, n_hidden = 128, n_output = 1):
        super(LogisticRegression, self).__init__()
        
        self.linear1 = nn.Linear(n_input, n_hidden)
        self.linear2 = nn.Linear(n_hidden, 128)
        self.linear3 = nn.Linear(128, 32)
        self.linear4 = nn.Linear(32, 8)
        self.linear5 = nn.Linear(8,1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.tanh = nn.Tanh()
    def forward(self, x):
        x = self.linear1(x)
        x = self.relu(x)
        x = self.linear2(x)
        x = self.relu(x)
        x = self.linear3(x)
        x = self.relu(x)
        x = self.linear4(x)
        x = self.relu(x)
        x = self.linear5(x)
        x = self.sigmoid(x)
        return x



Dataset

In [94]:
class PassangerDataset(Dataset):
    def __init__(self):

        self.train = pd.read_csv('train.csv', index_col='PassengerId')
        
        X_train = self.train.drop(columns=['Name'])
        X_train[['Cabin_1', 'Cabin_2', 'Cabin_3']] = X_train['Cabin'].str.split('/').apply(pd.Series)
        X_train.drop(columns='Cabin', inplace=True)

        numeric_columns = X_train.select_dtypes(include=['float', 'int'])
        X_train.fillna(numeric_columns.mean(), inplace=True)
        X_train.fillna(0.5, inplace=True)

        X_train = pd.get_dummies(X_train, columns=['HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3'])
        self.y = torch.tensor(X_train['Transported'].values)
        
        x_float32 = X_train.drop('Transported', axis=1).values.astype('float32')
        x_not_norm = torch.tensor(x_float32, dtype=torch.float32).view(-1, x_float32.shape[1])

        min_vals = torch.min(x_not_norm, dim=0).values
        max_vals = torch.max(x_not_norm, dim=0).values

        normalized_matrix = (x_not_norm - min_vals) / (max_vals - min_vals)
        self.x = normalized_matrix
    def __getitem__(self, index):
        return self.x[index],self.y[index]

    def __len__(self):
        return len(self.train)

Data cleaning - test

In [99]:
target = pd.read_csv('sample_submission.csv')
test = pd.read_csv('test.csv',index_col='PassengerId')

X_test = test.drop(columns=['Name'])
X_test[['Cabin_1', 'Cabin_2', 'Cabin_3']] = X_test['Cabin'].str.split('/').apply(pd.Series)
X_test.drop(columns='Cabin', inplace=True)

numeric_columns = X_test.select_dtypes(include=['float', 'int'])
X_test.fillna(numeric_columns.mean(), inplace=True)
X_test.fillna(0.5, inplace=True)

X_test = pd.get_dummies(X_test, columns=['HomePlanet', 'Destination', 'Cabin_1', 'Cabin_3'])
X_test.head()
x_float32 = X_test.values.astype('float32')
X_test = torch.tensor(x_float32,dtype=torch.float32,requires_grad=True)

In [104]:
def calculate_accuracy(model, dataloader):
    correct = 0
    total = 0
    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient tracking during evaluation
        for inputs, labels in dataloader:
            outputs = model(inputs)
            predicted = (outputs >= 0.5).float()  # Convert to binary predictions (0 or 1)
            print(labels.size(0))
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            print(correct)
    accuracy = 100 * correct / total
    return accuracy

In [105]:
dataset = PassangerDataset()

# Assuming you have a custom PassengerDataset called 'dataset'
total_samples = len(dataset)
train_size = int(0.8 * total_samples)  # 80% of the data for training

# Split the dataset into training and testing sets
train_dataset, test_dataset = random_split(dataset, [train_size, total_samples - train_size])

# Create data loaders for both sets
batch_size = 64
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)


model = LogisticRegression(29)
loss_function = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(),lr=0.002)

total_samples = len(train_dataset)
n_iterations = math.ceil(total_samples/batch_size)

num_epochs = 100
for epoch in range(num_epochs):
    total_loss = 0.0

    for i, (inputs, labels) in enumerate(train_dataloader):
        # Forward pass
        y_predicted = model(inputs)
        y = labels.float().view(-1,1)
        loss = loss_function(y_predicted, y)

        # Backpropagation and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # print(f'Epoch {epoch + 1}/{num_epochs}, Step {i + 1}/{n_iterations}, Loss: {loss.item():.4f}')

    average_loss = total_loss / n_iterations
    print(f'Epoch {epoch + 1} - Average Loss: {average_loss:.4f}')

    test_accuracy = calculate_accuracy(model,test_dataloader)
    print(f'Testing Accuracy: {test_accuracy:.2f}%')

print('Training completed.')


Epoch 1 - Average Loss: 0.5605
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
11
Testing Accuracy: 3187.75%
Epoch 2 - Average Loss: 0.4937
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
11
Testing Accuracy: 3188.84%
Epoch 3 - Average Loss: 0.4621
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
64
11
Testing Accuracy: 3170.50%


KeyboardInterrupt: 

In [None]:
# import csv

# f = open('submission.csv','w')
# writer = csv.writer(f)
print(X_test)