In [89]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score
from sklearn.utils.class_weight import compute_class_weight

In [90]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

In [91]:
df = pd.read_csv('train.csv')

In [92]:
# assume your data is in a pandas DataFrame called 'data'
x = df.drop('y', axis=1) # input features
y = df['y'] # target variable

# split the data into training and validation sets with 80% for training
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.2, random_state=42)

x_train = x_train.fillna(0)
x_val = x_val.fillna(0)

In [93]:
# Normalize the columns of x_train
scaler = MinMaxScaler()
x_train_normalized = scaler.fit_transform(x_train)
y_train_normalized = y_train - 1
# y_train_normalized = scaler.fit_transform(y_train.values.reshape(-1, 1))

# Convert x_train and y_train to PyTorch tensors
x_train_tensor = torch.Tensor(x_train_normalized)
y_train_tensor = torch.Tensor(y_train_normalized.values)

# Create a TensorDataset for x_train and y_train
# train_dataset = TensorDataset(x_train_tensor, y_train_tensor)

In [94]:
# Define the neural network architecture
class NeuralNetwork(nn.Module):
    def __init__(self):
        super(NeuralNetwork, self).__init__()
        self.fc1 = nn.Linear(37, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

In [95]:
# weights = compute_class_weight(class_weight='balanced', classes=np.unique(y_train_normalized), y=y_train_normalized)
weights = compute_class_weight(class_weight={1:5, 0:1}, classes=np.unique(y_train_normalized), y=y_train_normalized)
sample_weights = torch.tensor([weights[cls] for cls in y_train_normalized])

In [96]:
# Create an instance of the neural network
model = NeuralNetwork()

# Define the loss function and optimizer
criterion = nn.BCELoss(weight=sample_weights)
optimizer = optim.Adam(model.parameters(), lr=10e-3)

# Create a DataLoader for the training dataset
# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [97]:
# Train the model
num_epochs = 2500
for epoch in range(num_epochs):
    # outputs = torch.round(model(x_train_tensor))
    outputs = model(x_train_tensor)
    targets = y_train_tensor.view(-1)  # Convert targets to long data type and flatten
    loss = criterion(outputs.view(-1), targets)  # Reshape outputs to match target size
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
    if epoch%100==0:
        print(f'In {epoch}|{num_epochs} the loss is: {loss.item()}')

In 0|2500 the loss is: 0.8602239489555359
In 100|2500 the loss is: 0.16723892092704773
In 200|2500 the loss is: 0.07789908349514008
In 300|2500 the loss is: 0.042769718915224075
In 400|2500 the loss is: 0.041186295449733734
In 500|2500 the loss is: 0.03392040356993675
In 600|2500 the loss is: 0.02827925607562065
In 700|2500 the loss is: 0.022245338186621666
In 800|2500 the loss is: 0.015899116173386574
In 900|2500 the loss is: 0.008162301033735275
In 1000|2500 the loss is: 0.004333669785410166
In 1100|2500 the loss is: 0.0026602735742926598
In 1200|2500 the loss is: 0.0017937816446647048
In 1300|2500 the loss is: 0.0012909072684124112
In 1400|2500 the loss is: 0.0009733779588714242
In 1500|2500 the loss is: 0.0007612625486217439
In 1600|2500 the loss is: 0.0006097671575844288
In 1700|2500 the loss is: 0.0004996348870918155
In 1800|2500 the loss is: 0.00041454468737356365
In 1900|2500 the loss is: 0.0003496503923088312
In 2000|2500 the loss is: 0.0002982827718369663
In 2100|2500 the los

In [98]:
# Normalize the columns of x_val
x_val_normalized = scaler.fit_transform(x_val)
y_val_normalized = y_val - 1

# Convert x_val and y_val to PyTorch tensors
x_val_tensor = torch.Tensor(x_val_normalized)
y_val_tensor = torch.Tensor(y_val_normalized.values)

# Create a TensorDataset for x_val and y_val
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

# Create a DataLoader for the validation dataset
val_loader = DataLoader(val_dataset, batch_size=32)

In [99]:
# Evaluate the model on the validation set
model.eval()
total_val_loss = 0
correct = 0
total = 0
val_criterion = nn.BCELoss()
with torch.no_grad():
    for inputs, targets in val_loader:
        outputs = model(inputs)
        predicted = torch.round(outputs).squeeze().long()  # Squeeze and convert to long
        total_val_loss += val_criterion(outputs.view(-1), targets).item()
        total += targets.size(0)
        correct += (predicted == targets).sum().item()

val_loss = total_val_loss / len(val_loader)
accuracy = correct / total

print(f'Validation Loss: {val_loss:.4f}')
print(f'Correct Predictions: {correct}/{total}')
print(f'Validation Accuracy: {accuracy * 100:.2f}%')

Validation Loss: 0.6784
Correct Predictions: 192/200
Validation Accuracy: 96.00%


In [100]:
f1_score(np.array(y_val_normalized).reshape(-1,1), torch.round(model(x_val_tensor)).detach().numpy(), average='weighted')

0.958318701782389

In [101]:
x_test = pd.read_csv('test.csv')
x_test = x_test.fillna(0)

x_test_normalized = scaler.fit_transform(x_test)

# Convert x_train and y_train to PyTorch tensors
x_test_tensor = torch.Tensor(x_test_normalized)
x_test_tensor

tensor([[0.7671, 0.3034, 0.3297,  ..., 0.4099, 0.0000, 0.0000],
        [0.3591, 0.6307, 0.6129,  ..., 0.1156, 0.0000, 0.0000],
        [0.4859, 0.2978, 0.2459,  ..., 0.0663, 0.0000, 0.0000],
        ...,
        [0.9036, 0.5929, 0.3801,  ..., 0.1531, 0.0000, 0.0000],
        [0.3665, 0.5872, 0.4438,  ..., 0.0816, 0.0000, 0.0000],
        [0.8895, 0.3518, 0.2517,  ..., 0.0221, 0.0000, 0.0000]])

In [107]:
test_out = torch.round(model(x_test_tensor)).detach().numpy()
test_out = test_out + 1

In [108]:
np.savetxt('test_nn2.txt', test_out, delimiter=',')   # test_out is an array