In [6]:
import pandas as pd
import utils

In [7]:
# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv")
# drop duplicates
train = train.drop_duplicates()
X = train.agg(utils.transform_dataset) # utils.transform_dataset is a dicitionary which applies a transforming function on each column
y= X["Body_Level"]
y= y-1
X = X.drop("Body_Level", axis=1)

# Split the dataset into train and test sets
# sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)


In [8]:
from imblearn.over_sampling import SMOTE

# Initialize the LazyClassifier and fit the training data
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader



# Convert the data into PyTorch tensors
X_train_tensor = torch.from_numpy(X_train.values).float()
X_test_tensor = torch.from_numpy(X_test.values).float()
y_train_tensor = torch.from_numpy(y_train.values).long()
y_test_tensor = torch.from_numpy(y_test.values).long()

# Define the neural network architecture
class Net(nn.Module):
    def __init__(self, dropout=0):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(16, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 15)
        self.fc5 = nn.Linear(15, 4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        # x= self.dropout(x)
        x = torch.relu(self.fc2(x))
        x= self.dropout(x)
        x = torch.relu(self.fc3(x))
        # x= self.dropout(x)
        x = torch.relu(self.fc4(x))
        x = self.fc5(x)
        # x = torch.softmax(x, dim=1)  # Apply the softmax function along the class dimension

        return x


In [10]:

# Create a PyTorch data loader for the training data
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Initialize the neural network, loss function, and optimizer
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Train the neural network
for epoch in range(500):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print('Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(train_loader)))


Epoch 1 loss: 1.366
Epoch 2 loss: 1.248
Epoch 3 loss: 1.025
Epoch 4 loss: 0.814
Epoch 5 loss: 0.674
Epoch 6 loss: 0.592
Epoch 7 loss: 0.565
Epoch 8 loss: 0.514
Epoch 9 loss: 0.520
Epoch 10 loss: 0.502
Epoch 11 loss: 0.486
Epoch 12 loss: 0.479
Epoch 13 loss: 0.464
Epoch 14 loss: 0.460
Epoch 15 loss: 0.451
Epoch 16 loss: 0.445
Epoch 17 loss: 0.446
Epoch 18 loss: 0.454
Epoch 19 loss: 0.444
Epoch 20 loss: 0.413
Epoch 21 loss: 0.415
Epoch 22 loss: 0.416
Epoch 23 loss: 0.437
Epoch 24 loss: 0.388
Epoch 25 loss: 0.385
Epoch 26 loss: 0.376
Epoch 27 loss: 0.383
Epoch 28 loss: 0.370
Epoch 29 loss: 0.377
Epoch 30 loss: 0.396
Epoch 31 loss: 0.356
Epoch 32 loss: 0.345
Epoch 33 loss: 0.344
Epoch 34 loss: 0.360
Epoch 35 loss: 0.324
Epoch 36 loss: 0.322
Epoch 37 loss: 0.316
Epoch 38 loss: 0.326
Epoch 39 loss: 0.316
Epoch 40 loss: 0.312
Epoch 41 loss: 0.303
Epoch 42 loss: 0.292
Epoch 43 loss: 0.339
Epoch 44 loss: 0.301
Epoch 45 loss: 0.280
Epoch 46 loss: 0.315
Epoch 47 loss: 0.274
Epoch 48 loss: 0.259
E

In [11]:
with torch.no_grad():
    outputs = net(X_train_tensor)
    _, predicted_train = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))


with torch.no_grad():
    outputs = net(X_test_tensor)
    _, predicted_test = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the test data: %.3f' % (100 * torch.sum(y_test_tensor == predicted_test) / len(y_test_tensor)))

# get train acc

Accuracy of the network on the train data: 98.652
Accuracy of the network on the test data: 91.980


In [12]:
# get F1 score
from sklearn.metrics import f1_score
print("f1_train",f1_score(y_train_tensor, predicted_train, average='macro'))
print("f1_test",f1_score(y_test_tensor, predicted_test, average='macro'))

# f1 train


f1_train 0.9865249116093808
f1_test 0.8947593729868082


In [13]:
# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv")
# drop duplicates
train = train.drop_duplicates()
X = train.agg(utils.transform_dataset) # utils.transform_dataset is a dicitionary which applies a transforming function on each column
y= X["Body_Level"]
y= y-1
X = X.drop("Body_Level", axis=1)

# Split the dataset into train and test sets
# sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# test to test and val
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=.2, random_state=42)

from imblearn.over_sampling import SMOTE

# Initialize the LazyClassifier and fit the training data
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)


In [14]:
X_train_tensor = torch.from_numpy(X_train.values).float()
X_test_tensor = torch.from_numpy(X_test.values).float()
y_train_tensor = torch.from_numpy(y_train.values).long()
y_test_tensor = torch.from_numpy(y_test.values).long()
y_val_tensor = torch.from_numpy(y_val.values).long()
X_val_tensor = torch.from_numpy(X_val.values).float()

net = Net()

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.001)

# Train the neural network with L2 regularization
count =0
for epoch in range(500):
    running_loss = 0.0
    count +=1
    for i, data in enumerate(train_loader, 0):
        # Get the inputs
        inputs, labels = data

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    if count % 100 == 0:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %.3f' %
                (epoch + 1, i + 1, running_loss / 2000))
        running_loss = 0.0
        # print acc
        with torch.no_grad():
            outputs = net(X_train_tensor)
            _, predicted_train = torch.max(outputs.data, 1)
        print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))
        # get val
        with torch.no_grad():
            outputs = net(X_val_tensor)
            _, predicted_val = torch.max(outputs.data, 1)
        print('Accuracy of the network on the val data: %.3f' % (100 * torch.sum(y_val_tensor == predicted_val) / len(y_val_tensor)))

[100,    51] loss: 0.005
Accuracy of the network on the train data: 89.074
Accuracy of the network on the val data: 89.831
[200,    51] loss: 0.003
Accuracy of the network on the train data: 93.194
Accuracy of the network on the val data: 94.915
[300,    51] loss: 0.003
Accuracy of the network on the train data: 90.185
Accuracy of the network on the val data: 86.441
[400,    51] loss: 0.001
Accuracy of the network on the train data: 94.444
Accuracy of the network on the val data: 98.305
[500,    51] loss: 0.001
Accuracy of the network on the train data: 96.065
Accuracy of the network on the val data: 100.000


In [15]:
# print acc
with torch.no_grad():
    outputs = net(X_train_tensor)
    _, predicted_train = torch.max(outputs.data, 1)
print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))

with torch.no_grad():
    outputs = net(X_test_tensor)
    _, predicted_test = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the test data: %.3f' % (100 * torch.sum(y_test_tensor == predicted_test) / len(y_test_tensor)))
# get F1
from sklearn.metrics import f1_score
print("f1_test",f1_score(y_test_tensor, predicted_test, average='macro'))

Accuracy of the network on the train data: 96.065
Accuracy of the network on the test data: 96.154
f1_test 0.9510091509313423


In [19]:

X_train_tensor = torch.from_numpy(X_train.values).float()
X_test_tensor = torch.from_numpy(X_test.values).float()
X_val_tensor = torch.from_numpy(X_val.values).float()

y_train_tensor = torch.from_numpy(y_train.values).long()
y_test_tensor = torch.from_numpy(y_test.values).long()
y_val_tensor = torch.from_numpy(y_val.values).long()

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

net = Net(dropout=0.2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.001)

# Train the neural network with L2 regularization
count =0
model_fin = None
last_val_acc = 0
for epoch in range(500):
    running_loss = 0.0
    count +=1
    for i, data in enumerate(train_loader, 0):
        # Get the inputs
        inputs, labels = data

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    if count % 50 == 0:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %.3f' %
                (epoch + 1, i + 1, running_loss / 2000))
        running_loss = 0.0
        # print acc
        with torch.no_grad():
            outputs = net(X_train_tensor)
            _, predicted_train = torch.max(outputs.data, 1)
        print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))
        # get val
        with torch.no_grad():
            outputs = net(X_val_tensor)
            _, predicted_val = torch.max(outputs.data, 1)
        val_acc= 100 * torch.sum(y_val_tensor == predicted_val) / len(y_val_tensor)
        if (val_acc >= last_val_acc) :
            last_val_acc = val_acc
            model_fin = net
        print('Accuracy of the network on the val data: %.3f' % (100 * torch.sum(y_val_tensor == predicted_val) / len(y_val_tensor)))

[50,    68] loss: 0.009
Accuracy of the network on the train data: 89.259
Accuracy of the network on the val data: 86.441
[100,    68] loss: 0.007
Accuracy of the network on the train data: 92.824
Accuracy of the network on the val data: 86.441
[150,    68] loss: 0.005
Accuracy of the network on the train data: 94.213
Accuracy of the network on the val data: 91.525
[200,    68] loss: 0.004
Accuracy of the network on the train data: 96.759
Accuracy of the network on the val data: 91.525
[250,    68] loss: 0.004
Accuracy of the network on the train data: 96.620
Accuracy of the network on the val data: 91.525
[300,    68] loss: 0.003
Accuracy of the network on the train data: 97.222
Accuracy of the network on the val data: 93.220
[350,    68] loss: 0.002
Accuracy of the network on the train data: 97.639
Accuracy of the network on the val data: 91.525
[400,    68] loss: 0.002
Accuracy of the network on the train data: 97.778
Accuracy of the network on the val data: 89.831
[450,    68] loss

In [20]:
#get acc test
with torch.no_grad():
    outputs = model_fin(X_test_tensor)
    _, predicted_test = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the test data: %.3f' % (100 * torch.sum(y_test_tensor == predicted_test) / len(y_test_tensor)))
# get f1
print('F1 score of the network on the test data: %.3f' % (f1_score(y_test_tensor, predicted_test, average='macro')))


# get acc train
with torch.no_grad():
    outputs = model_fin(X_train_tensor)
    _, predicted_train = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))

Accuracy of the network on the test data: 94.444
F1 score of the network on the test data: 0.915
Accuracy of the network on the train data: 98.843


In [18]:
# print len of them all

print(len(y_train_tensor))
print(len(y_test_tensor))
print(len(y_val_tensor))


2160
234
59


## Now train on the whole dataset and plot the loss over time

In [None]:

X_train_tensor = torch.from_numpy(X_train.values).float()
X_test_tensor = torch.from_numpy(X_test.values).float()
X_val_tensor = torch.from_numpy(X_val.values).float()

y_train_tensor = torch.from_numpy(y_train.values).long()
y_test_tensor = torch.from_numpy(y_test.values).long()
y_val_tensor = torch.from_numpy(y_val.values).long()


# combine all of them in a whole X and y
X = torch.cat((X_train_tensor, X_test_tensor, X_val_tensor), 0)
y = torch.cat((y_train_tensor, y_test_tensor, y_val_tensor), 0)

In [None]:

net = Net(dropout=0.2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.001)

# Train the neural network with L2 regularization
count =0
model_fin = None
last_val_acc = 0
for epoch in range(500):
    running_loss = 0.0
    count +=1
    for i, data in enumerate(train_loader, 0):
        # Get the inputs
        inputs, labels = data

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    if count % 50 == 0:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %.3f' %
                (epoch + 1, i + 1, running_loss / 2000))
        running_loss = 0.0
        # print acc
        with torch.no_grad():
            outputs = net(X_train_tensor)
            _, predicted_train = torch.max(outputs.data, 1)
        print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))
        # get val
        with torch.no_grad():
            outputs = net(X_val_tensor)
            _, predicted_val = torch.max(outputs.data, 1)
        val_acc= 100 * torch.sum(y_val_tensor == predicted_val) / len(y_val_tensor)
        if (val_acc >= last_val_acc) :
            last_val_acc = val_acc
            model_fin = net
        print('Accuracy of the network on the val data: %.3f' % (100 * torch.sum(y_val_tensor == predicted_val) / len(y_val_tensor)))

In [27]:
# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv")
# drop duplicates
train = train.drop_duplicates()
X = train.agg(utils.transform_dataset) # utils.transform_dataset is a dicitionary which applies a transforming function on each column
y= X["Body_Level"]
y= y-1
X = X.drop("Body_Level", axis=1)

from imblearn.over_sampling import SMOTE

# Initialize the LazyClassifier and fit the training data
# sm = SMOTE(random_state=42)
# X, y = sm.fit_resample(X, y)


In [28]:
X_tensor = torch.from_numpy(X.values).float()
y_tensor = torch.from_numpy(y.values).long()
train_dataset = TensorDataset(X_tensor, y_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)


net = Net(dropout=0.2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.001)

# Train the neural network with L2 regularization
count =0
model_fin = None
last_acc = 0
last_val_acc = 0
for epoch in range(500):
    running_loss = 0.0
    count +=1
    for i, data in enumerate(train_loader, 0):
        # Get the inputs
        inputs, labels = data

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    with torch.no_grad():
        outputs = net(X_tensor)
        _, predicted = torch.max(outputs.data, 1)
    acc= 100 * torch.sum(y_tensor == predicted) / len(y_tensor)
    if (acc >= last_acc) :
        last_acc = acc
        model_fin = net
    

        

In [29]:
# print acc

with torch.no_grad():
    outputs = model_fin(X_tensor)
    _, predicted_train = torch.max(outputs.data, 1)
print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_tensor == predicted_train) / len(y_tensor)))

# print acc on test
with torch.no_grad():
    outputs = model_fin(X_test_tensor)
    _, predicted_test = torch.max(outputs.data, 1)
print('Accuracy of the network on the test data: %.3f' % (100 * torch.sum(y_test_tensor == predicted_test) / len(y_test_tensor)))


Accuracy of the network on the train data: 83.732
Accuracy of the network on the test data: 85.470
