In [177]:
import pandas as pd
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F
import seaborn as sns

import os
import sys

from tqdm import tqdm

In [178]:

# current_folder_path = os.path.dirname(os.path.abspath(__file__))
current_folder_path = os.path.abspath('')

# Training datatset cleaning
train_data = pd.read_csv(f"{current_folder_path}/train.csv")
# Getting rid of too specific string values
train_data = train_data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Getting rid of NaN values among the remaining columns
# nan_count = train_data.isna().sum().sum()
# print(nan_count)
train_data.dropna(axis=0, inplace=True)
print(train_data.head())

train_labels = train_data[["Survived"]]

train_features = train_data.drop(columns=["Survived"])
train_features = pd.get_dummies(train_features)


# Testing datatset cleaning
test_data = pd.read_csv(f"{current_folder_path}/test.csv")
submission_data = pd.get_dummies(test_data.fillna(value=0).drop(columns=["PassengerId", "Name", "Ticket", "Cabin"]))
# print(train_features.columns.values)
submission_data = submission_data.reindex(columns=train_features.columns.values)
print(submission_data.head())
print(np.sum(submission_data.isna().to_numpy()))

answer = pd.read_csv(f"{current_folder_path}/gender_submission.csv")[["Survived"]]
test_data[["Survived"]] = answer[["Survived"]]

# Getting rid of too specific string values
test_data = test_data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])

# Getting rid of NaN values among the remaining columns
test_data.dropna(axis=0, inplace=True)
print(test_data.head())

test_labels = test_data[["Survived"]]

test_features = test_data.drop(columns=["Survived"])
test_features = pd.get_dummies(test_features)

# print(test_data.head())
# print(test_features.tail())
# print(test_labels.tail())


   Survived  Pclass     Sex   Age  SibSp  Parch     Fare Embarked
0         0       3    male  22.0      1      0   7.2500        S
1         1       1  female  38.0      1      0  71.2833        C
2         1       3  female  26.0      0      0   7.9250        S
3         1       1  female  35.0      1      0  53.1000        S
4         0       3    male  35.0      0      0   8.0500        S
   Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  Embarked_C  \
0       3  34.5      0      0   7.8292           0         1           0   
1       3  47.0      1      0   7.0000           1         0           0   
2       2  62.0      0      0   9.6875           0         1           0   
3       3  27.0      0      0   8.6625           0         1           0   
4       3  22.0      1      1  12.2875           1         0           0   

   Embarked_Q  Embarked_S  
0           1           0  
1           0           1  
2           1           0  
3           0           1  
4      

In [179]:
print("\n train_features \n", train_features.head(1), "\n train_labels \n", train_labels.head(1))
print("\n test_features \n", test_features.head(1), "\n test_labels \n", test_labels.head(1))
print("\n test_features \n", submission_data.head(1))


 train_features 
    Pclass   Age  SibSp  Parch  Fare  Sex_female  Sex_male  Embarked_C  \
0       3  22.0      1      0  7.25           0         1           0   

   Embarked_Q  Embarked_S  
0           0           1   
 train_labels 
    Survived
0         0

 test_features 
    Pclass   Age  SibSp  Parch    Fare  Sex_female  Sex_male  Embarked_C  \
0       3  34.5      0      0  7.8292           0         1           0   

   Embarked_Q  Embarked_S  
0           1           0   
 test_labels 
    Survived
0         0

 test_features 
    Pclass   Age  SibSp  Parch    Fare  Sex_female  Sex_male  Embarked_C  \
0       3  34.5      0      0  7.8292           0         1           0   

   Embarked_Q  Embarked_S  
0           1           0  


#### NN structure definition

In [180]:
class SimpleNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(SimpleNN, self).__init__()
        self.input_size = input_size
        self.hidden_size= hidden_size
        self.output_size = output_size
        
        self.flatten = nn.Flatten()
        self.relu = nn.ReLU()

        self.input_layer = nn.Linear(input_size, hidden_size)
        
        self.layer1 = nn.Linear(hidden_size, hidden_size//2)
        self.layer2 = nn.Linear(hidden_size//2, hidden_size//4)
        
        self.batchnorm1 = nn.BatchNorm1d(hidden_size//2)
        self.batchnorm2 = nn.BatchNorm1d(hidden_size//4)

        self.output_layer = nn.Linear(hidden_size//4, output_size)
        self.sigmoid = nn.Sigmoid()  
        self.dropout = nn.Dropout1d(p=0.1)
            
    def forward(self, x):
        x = self.input_layer(x)
        x = self.relu(x)
        
        x = self.layer1(x)
        x = self.relu(x)
        x = self.batchnorm1(x)
        
        x = self.layer2(x)
        x = self.relu(x)
        x = self.batchnorm2(x)
        
        x = self.dropout(x)
        
        x = self.output_layer(x)  
        #x = self.relu(x)    
        x = self.sigmoid(x)
        return x        


In [181]:
train_features = torch.from_numpy(train_features.to_numpy()).float()
train_labels = torch.from_numpy(train_labels.to_numpy()).float()
test_features = torch.from_numpy(test_features.to_numpy()).float()
test_labels = torch.from_numpy(test_labels.to_numpy()).float()
submission_features = torch.from_numpy(submission_data.to_numpy()).float()

train_features = F.normalize(train_features)
test_features = F.normalize(test_features)

print(train_features.size(), train_labels.size(), test_features.size(), test_labels.size())


torch.Size([891, 10]) torch.Size([891, 1]) torch.Size([331, 10]) torch.Size([331, 1])


#### Model training

In [182]:
m, n = train_features.shape
input_size = n
print(input_size)
hidden_size = 64
output_size = 1

model = SimpleNN(input_size, hidden_size, output_size)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
#optimize = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.5)

model.train() 
running_loss = 0.0

losses_list = []
# num_epochs = int(input("Number of epochs : "))  
num_epochs = 500     
for epoch in tqdm(range(num_epochs)):
    optimizer.zero_grad()
    outputs = model(train_features)
    loss = criterion(outputs, train_labels)
    loss.backward()
    optimizer.step()
    running_loss += loss.item()
    losses_list.append(loss.item())

print(f"Epoch {epoch+1}, Loss: {running_loss}")
# print(np.round(losses_list[::10], 6))
p = (sns.lineplot(losses_list))
# p.label(x="Epochs", y="Loss", title="Losses")

10


  0%|          | 0/500 [00:00<?, ?it/s]


RuntimeError: all elements of input should be between 0 and 1

#### Model testing

In [None]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    outputs = model(test_features) 
    print(outputs[0:10].T)
    #predicted = torch.round(outputs.data)
    #_, predicted = torch.max(outputs.data, 1)
    predicted = torch.round(outputs.data)
    # predicted = np.round(outputs.numpy())
    total += test_labels.size(0)
    # correct += (predicted.numpy() == test_labels.numpy().T).sum().item()
    correct += (predicted == test_labels).sum().item()        
    
    print(predicted.numpy()[0:10].T, test_labels.numpy().T[0, 0:10])
    #correct += (predicted == test_labels.numpy().T[0]).sum().item()

    
# print(np.unique(predicted.numpy(), return_counts=True))
# print(np.unique(test_labels.numpy(), return_counts=True))
# print(predicted.numpy()[0:10], test_labels.numpy().T[0, 0:10])

accuracy = 100 * correct / total
print(f"Accuracy on test set: {accuracy:.2f}%")



RuntimeError: mat1 and mat2 shapes cannot be multiplied (331x10 and 11x64)

#### Test data submission

In [None]:

print(submission_data.head())

with torch.no_grad():
    outputs = model(submission_features) 
    print(outputs[0:10])
    submission_prediction = torch.round(outputs.data)
    print(submission_prediction[0:10])

    
prediction_array = submission_prediction.numpy()
submission_dataframe = pd.DataFrame(pd.read_csv(f"{current_folder_path}/test.csv")[["PassengerId"]])
submission_dataframe[["Survived"]] = pd.DataFrame(submission_prediction)

submission_dataframe.head()



   Pclass   Age  SibSp  Parch     Fare  Sex_female  Sex_male  Embarked_0  \
0       3  34.5      0      0   7.8292           0         1         NaN   
1       3  47.0      1      0   7.0000           1         0         NaN   
2       2  62.0      0      0   9.6875           0         1         NaN   
3       3  27.0      0      0   8.6625           0         1         NaN   
4       3  22.0      1      1  12.2875           1         0         NaN   

   Embarked_C  Embarked_Q  Embarked_S  
0           0           1           0  
1           0           0           1  
2           0           1           0  
3           0           0           1  
4           0           0           1  
tensor([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan]])
tensor([[nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan],
        [nan]])


Unnamed: 0,PassengerId,Survived
0,892,
1,893,
2,894,
3,895,
4,896,
