In [3]:
import pandas as pd
import utils

In [6]:




# Read the csv file and preprocess it: convert qualitative attributes to integers
train = pd.read_csv("train.csv")
# drop duplicates
train = train.drop_duplicates()
X = train.agg(utils.transform_dataset) # utils.transform_dataset is a dicitionary which applies a transforming function on each column
X["Is_Int"] = 0

X["Is_Int"] = (abs(round(X["Veg_Consump"]) - X["Veg_Consump"]) < 0.01).astype(int) \
+ (abs(round(X["Water_Consump"]) - X["Water_Consump"]) < 0.01).astype(int) \
+ (abs(round(X["Phys_Act"]) - X["Phys_Act"]) < 0.01).astype(int) \
+ (abs(round(X["Time_E_Dev"]) - X["Time_E_Dev"]) < 0.01).astype(int) \
+ (abs(round(X["Age"]) - X["Age"]) < 0.01).astype(int) \
+ (abs(round(X["Meal_Count"]) - X["Meal_Count"]) < 0.01).astype(int)
y= X["Body_Level"]
y= y-1
X = X.drop("Body_Level", axis=1)


In [8]:

# Split the dataset into train and test sets
# sklearn
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.4)

In [9]:
from imblearn.over_sampling import SMOTE

# Initialize the LazyClassifier and fit the training data
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader



# Convert the data into PyTorch tensors
X_train_tensor = torch.from_numpy(X_train.values).float()
X_test_tensor = torch.from_numpy(X_test.values).float()
y_train_tensor = torch.from_numpy(y_train.values).long()
y_test_tensor = torch.from_numpy(y_test.values).long()

# Define the neural network architecture
class Net(nn.Module):
    def __init__(self, dropout=0):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(17, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 15)
        self.fc5 = nn.Linear(15, 4)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        # x= self.dropout(x)
        x = torch.relu(self.fc2(x))
        x= self.dropout(x)
        x = torch.relu(self.fc3(x))
        # x= self.dropout(x)
        x = torch.relu(self.fc4(x))
        x = self.fc5(x)
        # x = torch.softmax(x, dim=1)  # Apply the softmax function along the class dimension

        return x


In [11]:

# Create a PyTorch data loader for the training data
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Initialize the neural network, loss function, and optimizer
net = Net()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001)

# Train the neural network
for epoch in range(500):
    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    print('Epoch %d loss: %.3f' % (epoch + 1, running_loss / len(train_loader)))


Epoch 1 loss: 1.300
Epoch 2 loss: 1.036
Epoch 3 loss: 0.754
Epoch 4 loss: 0.658
Epoch 5 loss: 0.581
Epoch 6 loss: 0.512
Epoch 7 loss: 0.457
Epoch 8 loss: 0.408
Epoch 9 loss: 0.383
Epoch 10 loss: 0.370
Epoch 11 loss: 0.335
Epoch 12 loss: 0.328
Epoch 13 loss: 0.297
Epoch 14 loss: 0.291
Epoch 15 loss: 0.276
Epoch 16 loss: 0.253
Epoch 17 loss: 0.263
Epoch 18 loss: 0.252
Epoch 19 loss: 0.234
Epoch 20 loss: 0.234
Epoch 21 loss: 0.227
Epoch 22 loss: 0.225
Epoch 23 loss: 0.226
Epoch 24 loss: 0.230
Epoch 25 loss: 0.204
Epoch 26 loss: 0.220
Epoch 27 loss: 0.199
Epoch 28 loss: 0.226
Epoch 29 loss: 0.207
Epoch 30 loss: 0.218
Epoch 31 loss: 0.216
Epoch 32 loss: 0.204
Epoch 33 loss: 0.185
Epoch 34 loss: 0.182
Epoch 35 loss: 0.196
Epoch 36 loss: 0.187
Epoch 37 loss: 0.177
Epoch 38 loss: 0.169
Epoch 39 loss: 0.198
Epoch 40 loss: 0.206
Epoch 41 loss: 0.183
Epoch 42 loss: 0.170
Epoch 43 loss: 0.163
Epoch 44 loss: 0.167
Epoch 45 loss: 0.163
Epoch 46 loss: 0.170
Epoch 47 loss: 0.169
Epoch 48 loss: 0.161
E

In [13]:
with torch.no_grad():
    outputs = net(X_train_tensor)
    _, predicted_train = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))


with torch.no_grad():
    outputs = net(X_test_tensor)
    _, predicted_test = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the test data: %.3f' % (100 * torch.sum(y_test_tensor == predicted_test) / len(y_test_tensor)))


# get F1 score
from sklearn.metrics import f1_score
print("f1_train",f1_score(y_train_tensor, predicted_train, average='macro'))
print("f1_test",f1_score(y_test_tensor, predicted_test, average='macro'))

# f1 train


Accuracy of the network on the train data: 100.000
Accuracy of the network on the test data: 92.321
f1_train 1.0
f1_test 0.9106609829885894


In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, random_state=42)

# test to test and val
X_test, X_val, y_test, y_val = train_test_split(X_test, y_test, test_size=.2, random_state=42)

from imblearn.over_sampling import SMOTE

# Initialize the LazyClassifier and fit the training data
sm = SMOTE(random_state=42)
X_train, y_train = sm.fit_resample(X_train, y_train)

In [16]:

X_train_tensor = torch.from_numpy(X_train.values).float()
X_test_tensor = torch.from_numpy(X_test.values).float()
X_val_tensor = torch.from_numpy(X_val.values).float()

y_train_tensor = torch.from_numpy(y_train.values).long()
y_test_tensor = torch.from_numpy(y_test.values).long()
y_val_tensor = torch.from_numpy(y_val.values).long()

net = Net(dropout=0.2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=0.001)

# Train the neural network with L2 regularization
count =0
model_fin = None
last_val_acc = 0
for epoch in range(500):
    running_loss = 0.0
    count +=1
    for i, data in enumerate(train_loader, 0):
        # Get the inputs
        inputs, labels = data

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Print statistics
        running_loss += loss.item()
    if count % 50 == 0:    # print every 2000 mini-batches
        print('[%d, %5d] loss: %.3f' %
                (epoch + 1, i + 1, running_loss / 2000))
        running_loss = 0.0
        # print acc
        with torch.no_grad():
            outputs = net(X_train_tensor)
            _, predicted_train = torch.max(outputs.data, 1)
        print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))
        # get val
        with torch.no_grad():
            outputs = net(X_val_tensor)
            _, predicted_val = torch.max(outputs.data, 1)
        val_acc= 100 * torch.sum(y_val_tensor == predicted_val) / len(y_val_tensor)
        if (val_acc >= last_val_acc) :
            last_val_acc = val_acc
            model_fin = net
        print('Accuracy of the network on the val data: %.3f' % (100 * torch.sum(y_val_tensor == predicted_val) / len(y_val_tensor)))

[50,    53] loss: 0.005
Accuracy of the network on the train data: 88.426
Accuracy of the network on the val data: 98.305
[100,    53] loss: 0.003
Accuracy of the network on the train data: 92.130
Accuracy of the network on the val data: 96.610
[150,    53] loss: 0.002
Accuracy of the network on the train data: 94.769
Accuracy of the network on the val data: 98.305
[200,    53] loss: 0.002
Accuracy of the network on the train data: 94.676
Accuracy of the network on the val data: 100.000
[250,    53] loss: 0.002
Accuracy of the network on the train data: 95.556
Accuracy of the network on the val data: 94.915
[300,    53] loss: 0.001
Accuracy of the network on the train data: 94.722
Accuracy of the network on the val data: 98.305
[350,    53] loss: 0.001
Accuracy of the network on the train data: 96.204
Accuracy of the network on the val data: 100.000
[400,    53] loss: 0.001
Accuracy of the network on the train data: 95.509
Accuracy of the network on the val data: 100.000
[450,    53] l

In [17]:
#get acc test
with torch.no_grad():
    outputs = model_fin(X_test_tensor)
    _, predicted_test = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the test data: %.3f' % (100 * torch.sum(y_test_tensor == predicted_test) / len(y_test_tensor)))
# get f1
print('F1 score of the network on the test data: %.3f' % (f1_score(y_test_tensor, predicted_test, average='macro')))


# get acc train
with torch.no_grad():
    outputs = model_fin(X_train_tensor)
    _, predicted_train = torch.max(outputs.data, 1)
# print acc
print('Accuracy of the network on the train data: %.3f' % (100 * torch.sum(y_train_tensor == predicted_train) / len(y_train_tensor)))

Accuracy of the network on the test data: 96.154
F1 score of the network on the test data: 0.951
Accuracy of the network on the train data: 95.880
