# Assignment 1

In [160]:
from google.colab import drive
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim

In [161]:
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [162]:
# Path to the dataset
data_main_path  = '/content/drive/MyDrive/ColabNotebooks/Winter Semester 2023 24/NSI/Datasets-20231011/'

## Task 1 - Spam

In [163]:
# Path to email dataset
email_data_path = data_main_path + 'email/'

In [164]:
def parse_email_data(folder_path):
    emails = []
    labels = []  # Spam 1, Ham 0

    # Iterate through ham emails
    ham_folder_path = os.path.join(folder_path, 'ham')
    for filename in os.listdir(ham_folder_path):
        with open(os.path.join(ham_folder_path, filename), 'r', encoding='utf-8', errors='ignore') as file:
            lines = file.readlines()
            if len(lines) >= 3:
                date = lines[0].strip()
                subject = lines[1].strip()
                body = ''.join(lines[2:]).strip()
                emails.append({'date': date, 'subject': subject, 'body': body})
                labels.append(0)

    # Iterate through spam emails
    spam_folder_path = os.path.join(folder_path, 'spam')
    for filename in os.listdir(spam_folder_path):
        with open(os.path.join(spam_folder_path, filename), 'r', encoding='utf-8', errors='ignore') as file:
            lines = file.readlines()
            if len(lines) >= 3:
                date = lines[0].strip()
                subject = lines[1].strip()
                body = ''.join(lines[2:]).strip()
                emails.append({'date': date, 'subject': subject, 'body': body})
                labels.append(1)

    return emails, labels

In [186]:
train_email_data = email_data_path + 'train/'
parsed_emails_train, train_labels = parse_email_data(train_email_data)

test_email_data = email_data_path + 'test/'
parsed_emails_test, test_labels = parse_email_data(test_email_data)

val_email_data = email_data_path + 'val/'
parsed_emails_val, val_labels = parse_email_data(val_email_data)

In [187]:
# Print some examples
for parsed_emails in parsed_emails_train[:2]:
  print("Date:", parsed_emails['date'])
  print("Subject:", parsed_emails['subject'])
  print("Body Text:", parsed_emails['body'])
  print()
  print("Label:", "spam" if train_labels[0] == 1 else "ham")
  print("------")

Date: Subject: pennzenergy property details
Subject: - - - - - - - - - - - - - - - - - - - - - - forwarded by ami chokshi / corp / enron on 12 / 17 / 99 04 : 03
Body Text: pm - - - - - - - - - - - - - - - - - - - - - - - - - - -
dscottl @ . com on 12 / 14 / 99 10 : 56 : 01 am
to : ami chokshi / corp / enron @ enron
cc :
subject : pennzenergy property details
ami , attached is some more details on the devon south texas properties . let
me
know if you have any questions .
david
- devon stx . xls

Label: ham
------
Date: Subject: hpl fuel gas buy - back for december 1999
Subject: fyi :
Body Text: - - - - - - - - - - - - - - - - - - - - - - forwarded by gregg lenart / hou / ect on 12 / 16 / 99 02 : 02 pm
- - - - - - - - - - - - - - - - - - - - - - - - - - -
enron north america corp .
from : sally shuler @ enron 12 / 16 / 99 01 : 55 pm
to : gregg lenart / hou / ect @ ect
cc :
subject : hpl fuel gas buy - back for december 1999
- - - - - - - - - - - - - - - - - - - - - - forwarded by sally s

## Task 2 - Shapes

In [141]:
# Path to shapes dataset
shapes_data_path = data_main_path + 'shapes/'

In [142]:
# Function to parse .wld files and extract features
def parse_shapes_data(file_path):
    with open(file_path, "r") as file:
        data = json.load(file)
        objects = []

        # Iterate through each object in the data list
        for item in data:
            consts = item["Consts"]
            predicates = item["Predicates"]
            tags = item["Tags"]

            # Extract features for each object
            const_label = consts[0]
            shape, size = predicates
            position = tuple(tags)

            objects.append({
                'label': const_label,
                'shape': shape,
                'size': size,
                'position': position
            })

        return objects

In [143]:
# Function to parse .wld files in a folder
def parse_shapes_files_in_folder(folder_path):
    parsed_objects = []
    for filename in os.listdir(folder_path):
        if filename.endswith(".wld"):
            file_path = os.path.join(folder_path, filename)
            objects_data = parse_shapes_data(file_path)
            parsed_objects.extend(objects_data)
    return parsed_objects

In [144]:
parsed_objects_train = parse_shapes_files_in_folder(os.path.join(shapes_data_path, 'train'))
parsed_objects_test = parse_shapes_files_in_folder(os.path.join(shapes_data_path, 'test'))
parsed_objects_val = parse_shapes_files_in_folder(os.path.join(shapes_data_path, 'val'))

In [145]:
# Print some examples
for parsed_objects in parsed_objects_train[:5]:
  print("Shape:", parsed_objects['shape'])
  print("Label:", parsed_objects['label'])
  print("Size:", parsed_objects['size'])
  print("Position:", parsed_objects['position'])
  print("------")

Shape: Tet
Label: e
Size: Large
Position: (0, 0)
------
Shape: Tet
Label: b
Size: Large
Position: (0, 2)
------
Shape: Dodec
Label: d
Size: Medium
Position: (1, 1)
------
Shape: Dodec
Label: c
Size: Medium
Position: (2, 1)
------
Shape: Dodec
Label: a
Size: Medium
Position: (2, 3)
------


## Task 3 - Math

In [146]:
# Path to math dataset
math_data_path = data_main_path + 'math/'

In [147]:
# Function to parse the expressions and compute the results
def parse_and_compute_expressions(file_path):
    with open(file_path, "r") as file:
        lines = file.readlines()
        expressions = []

        for line in lines:
            # Extract arithmetic expression from the line
            expression = line.strip()

            # Parse the expression into a tree using AST
            parse_tree = ast.parse(expression, mode='eval')

            # Compute the result of the expression
            result = eval(compile(parse_tree, filename="", mode='eval'))
            expressions.append({
                'expression': expression,
                'result': result
            })

        return expressions

In [148]:
def parse_and_solve_expression(file_path):
    parsed_expressions = parse_and_compute_expressions(file_path)
    return parsed_expressions

# Train, test, and val
parsed_expressions_train = parse_and_solve_expression(os.path.join(math_data_path, 'train.txt'))
parsed_expressions_test = parse_and_solve_expression(os.path.join(math_data_path, 'test.txt'))
parsed_expressions_val = parse_and_solve_expression(os.path.join(math_data_path, 'val.txt'))

In [149]:
# Print some examples
for parsed_expressions in parsed_expressions_train[:5]:
  print("Expression:", parsed_expressions['expression'])
  print("Result:", parsed_expressions['result'])
  print()

Expression: (-2 * 9 * 5)
Result: -90

Expression: (4 * 8 * 5)
Result: 160

Expression: (-6 * ((-1 - 1) - -6))
Result: -24

Expression: (-9 * 4 * (1 - (1 * -8)) * 0)
Result: 0

Expression: (8 * -4 * 5)
Result: -160



## Task 4 - Iris

In [150]:
# Path to iris dataset
iris_data_path = data_main_path + 'iris/'

### Task a - Parse the data

In [183]:
def parse_iris_data(file_path, has_column_names=False):
    if has_column_names:
      # Since Test and Val files dont have column names
        data = pd.read_csv(file_path, index_col=0)
    else:
        column_names = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm', 'Species']
        data = pd.read_csv(file_path, names=column_names, index_col=0)
    features = data[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
    labels = data['Species']
    return features, labels

# train, test, and val
train_features, train_labels = parse_iris_data(os.path.join(iris_data_path, 'train.csv'), has_column_names=True)
test_features, test_labels = parse_iris_data(os.path.join(iris_data_path, 'test.csv'))
val_features, val_labels = parse_iris_data(os.path.join(iris_data_path, 'val.csv'))

In [184]:
# Print some examples
print("Train Features:")
print(train_features.head())

print()

print("Train Labels:")
print(train_labels.head())

Train Features:
    SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
Id                                                          
1             5.1           3.5            1.4           0.2
3             4.7           3.2            1.3           0.2
4             4.6           3.1            1.5           0.2
5             5.0           3.6            1.4           0.2
6             5.4           3.9            1.7           0.4

Train Labels:
Id
1    Iris-setosa
3    Iris-setosa
4    Iris-setosa
5    Iris-setosa
6    Iris-setosa
Name: Species, dtype: object


### Task b - Feed-forward Network

In [169]:
# Convert features and labels to Tensors
train_features_tensor = torch.tensor(train_features.values, dtype=torch.float32)
train_labels_tensor = torch.tensor(pd.Categorical(train_labels).codes, dtype=torch.long)

test_features_tensor = torch.tensor(test_features.values, dtype=torch.float32)
test_labels_tensor = torch.tensor(pd.Categorical(test_labels).codes, dtype=torch.long)

val_features_tensor = torch.tensor(val_features.values, dtype=torch.float32)
val_labels_tensor = torch.tensor(pd.Categorical(val_labels).codes, dtype=torch.long)

In [170]:
# Hyperparams

input_size = 4
output_size = 3
hidden_size = 512
criterion = nn.CrossEntropyLoss()

In [171]:
model = nn.Sequential(
    nn.Linear(input_size, hidden_size),
    nn.ReLU(),
    nn.Linear(hidden_size, output_size),
    nn.Softmax(dim=1)
)

In [172]:
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [173]:
# Training
num_epochs = 100
for epoch in range(num_epochs):

    outputs = model(train_features_tensor)
    loss = criterion(outputs, train_labels_tensor)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Print the loss every 10 epochs
    if (epoch+1) % 10 == 0:
        print(f'At Epoch {epoch+1} the Loss is: {loss.item():.4f}')

At Epoch 10 the Loss is: 0.9362
At Epoch 20 the Loss is: 0.8179
At Epoch 30 the Loss is: 0.7582
At Epoch 40 the Loss is: 0.7164
At Epoch 50 the Loss is: 0.6856
At Epoch 60 the Loss is: 0.6642
At Epoch 70 the Loss is: 0.6484
At Epoch 80 the Loss is: 0.6365
At Epoch 90 the Loss is: 0.6273
At Epoch 100 the Loss is: 0.6201


In [174]:
# Test Set Performance

with torch.no_grad():
    model.eval()
    predictions = model(test_features_tensor)
    _, predicted_classes = torch.max(predictions, 1)
    accuracy = torch.sum(predicted_classes == test_labels_tensor).item() / test_labels_tensor.size(0)
    print(f'Test Accuracy: {accuracy*100:.2f}%')

Test Accuracy: 94.74%


In [175]:
# Validation Set Performance

with torch.no_grad():
    model.eval()
    predictions = model(val_features_tensor)
    _, predicted_classes = torch.max(predictions, 1)
    accuracy = torch.sum(predicted_classes == val_labels_tensor).item() / val_labels_tensor.size(0)
    print(f'Val Accuracy: {accuracy*100:.2f}%')

Val Accuracy: 90.91%
