In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## Reading & Preprocessing Course Recommendations

In [None]:
df = pd.read_csv("/kaggle/input/labelled-dataset/labeled_data_student_to_courseRecomm.csv")

In [3]:
courses_list = []
for index, row in df.iterrows():
    for course in row['Recommendations'].replace('[', '').replace(']', '').split("',"):
        course = course.replace("'", "").lstrip().rstrip()
        courses_list.append(course)
courses_list = list(set(courses_list))

In [4]:
len(courses_list)

91

Creating course labels

In [5]:
# Add columns for each course with a default of 0
for course in courses_list:
    df[course] = 0

# Set the value to 1 where the course is recommended
for index, row in df.iterrows():
    for course in row['Recommendations'].replace('[', '').replace(']', '').split("',"):
        course = course.replace("'", "").lstrip().rstrip()
        if course in courses_list:  # Check if the course is in the list
            df.at[index, course] = 1

df.head()

Unnamed: 0,Column1,ID,Field_Of_Study,Primary_Hobby,Secondary_Hobby,Gender,Desired_Career_Field,Country_Of_Origin,combined_text,Recommendations,...,Environmental Transport Phenomena,Engineering Management Seminar,Chemical Fate of Organic Compounds,Fundamentals of Game Development,Introduction to Cyber Policy,Introduction to Computer Science,Functional Ecology of Plants,Compiler Construction,Introduction to Quantum Engineering,Introduction to Random Signals and Noise
0,0,100001,AI,Photography,Writing,Female,AI Ethics Specialist,USA,AI Ethics Specialist AI Photography Writing,"['Deep Learning Applications', 'Special Topics...",...,0,0,0,0,0,0,0,0,0,0
1,1,100002,Biomedical Engineering,Dance,Drawing,Male,Biomedical Research,Canada,Biomedical Research Biomedical Engineering Dan...,"['Modeling Cellular and Molecular Systems', 'B...",...,0,0,0,0,0,0,0,0,0,0
2,2,100003,Civil Engineering,Wood Working,Hiking,Female,Structural Engineer,UK,Structural Engineer Civil Engineering Wood Wor...,['Physical Chemical Processes in Environmental...,...,1,0,0,0,0,0,0,0,0,0
3,3,100004,Climate and Sustainability Engineering,Hiking,Gardening,Male,Renewable Energy Engineer,Australia,Renewable Energy Engineer Climate and Sustaina...,"['Climate Tech Startups and Investors', 'Globa...",...,0,0,0,0,0,0,0,0,0,0
4,4,100005,Computational Mechanics and Scientific Computing,Mountain Biking,Photography,Female,High-Performance Computing Specialist,Germany,High-Performance Computing Specialist Computat...,"['History of Computing, Cryptography, and Robo...",...,0,0,0,0,0,0,0,0,0,0


Forming user features and course_features from the dataframe

In [6]:
# User_features
X = pd.get_dummies(df.iloc[:, [2,3,4,6]])
X

Unnamed: 0,Field_Of_Study_AI,Field_Of_Study_AI Ethics Specialist,Field_Of_Study_Biomedical Engineering,Field_Of_Study_Biomedical Research,Field_Of_Study_Business Management,Field_Of_Study_Civil Engineering,Field_Of_Study_Climate and Sustainability Eng.,Field_Of_Study_Climate and Sustainability Engineering,Field_Of_Study_Computational Mechanics and Scientific Computing,Field_Of_Study_Computer Programming,...,Desired_Career_Field_Structural Engineer,Desired_Career_Field_Sustainable Building Design Engineer,Desired_Career_Field_Tissue Engineer,Desired_Career_Field_Transportation Engineer,Desired_Career_Field_UI/UX Designer,Desired_Career_Field_UX Researcher,Desired_Career_Field_User Experience (UX) Designer,Desired_Career_Field_Video Game Designer,Desired_Career_Field_Video Gaming,Desired_Career_Field_Water Resources Engineer
0,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,True,False,False,False,False,...,True,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,False,False,False,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
325,False,False,False,False,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
326,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
327,False,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [7]:
Y = df.iloc[:, 10:] 
Y

Unnamed: 0,Narrative Design,Algorithmic Game Theory,Control of Dynamic Systems,Sourcing Data for Analytics,"History of Computing, Cryptography, and Robotic Devices","Security Incident Detection, Response, and Resilience",Designing Customer Experiences in Technology,Using Real-Time Data to Improve Customer Quality Experience,Intermediate Electromagnetic Theory,Applied Biological Principles and Processes in Environmental Engineering,...,Environmental Transport Phenomena,Engineering Management Seminar,Chemical Fate of Organic Compounds,Fundamentals of Game Development,Introduction to Cyber Policy,Introduction to Computer Science,Functional Ecology of Plants,Compiler Construction,Introduction to Quantum Engineering,Introduction to Random Signals and Noise
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
324,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
325,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
326,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
327,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
user_features = X.columns.to_list()

## Creating Train & Test Data Loaders

In [10]:
X = X.values
Y = Y.values

In [11]:
# Train-test split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [12]:
# Create datasets
class NCFDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return torch.tensor(self.features[idx], dtype=torch.float32), torch.tensor(self.labels[idx], dtype=torch.float32)

In [13]:
train_dataset = NCFDataset(X_train, Y_train)
test_dataset = NCFDataset(X_test, Y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

## Modelling

In [14]:
import torch.nn as nn
import torch.nn.functional as F

class NCFModel(nn.Module):
    def __init__(self, num_users_features, num_courses):
        super(NCFModel, self).__init__()
        self.fc1 = nn.Linear(num_users_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.output = nn.Linear(64, num_courses)
    
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = torch.sigmoid(self.output(x))  # Using sigmoid since this is a binary classification
        return x

# Initialize the model
model = NCFModel(X_train.shape[1], Y_train.shape[1])

In [15]:
from torch.optim import Adam

optimizer = Adam(model.parameters(), lr=0.01)
criterion = nn.BCELoss()

def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for features, labels in loader:
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

# Training loop
for epoch in range(100):
    train_loss = train(model, train_loader, optimizer, criterion)
    print(f"Epoch {epoch+1}, Loss: {train_loss}")

Epoch 1, Loss: 0.3777135382680332
Epoch 2, Loss: 0.2103238929720486
Epoch 3, Loss: 0.19193562164026148
Epoch 4, Loss: 0.1825723797082901
Epoch 5, Loss: 0.1728925073848051
Epoch 6, Loss: 0.15298964258502512
Epoch 7, Loss: 0.1333636628354297
Epoch 8, Loss: 0.11938937972573672
Epoch 9, Loss: 0.10980912517098819
Epoch 10, Loss: 0.10072635420981575
Epoch 11, Loss: 0.09265061424059026
Epoch 12, Loss: 0.08313809203750946
Epoch 13, Loss: 0.07631368211963598
Epoch 14, Loss: 0.0680597273304182
Epoch 15, Loss: 0.06067829381893663
Epoch 16, Loss: 0.05592446620850002
Epoch 17, Loss: 0.05043403300292352
Epoch 18, Loss: 0.04353537629632389
Epoch 19, Loss: 0.03907065816661891
Epoch 20, Loss: 0.0377949780839331
Epoch 21, Loss: 0.03260949933353592
Epoch 22, Loss: 0.029794879148111623
Epoch 23, Loss: 0.027113843161393616
Epoch 24, Loss: 0.02688415359486552
Epoch 25, Loss: 0.02496788135784514
Epoch 26, Loss: 0.024116959422826767
Epoch 27, Loss: 0.022296004709513748
Epoch 28, Loss: 0.022100623697042465
Epo

## Evaluating on Test Data

In [16]:
def evaluate(model, loader):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for features, labels in loader:
            outputs = model(features)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
    return total_loss / len(loader)

test_loss = evaluate(model, test_loader)
print(f"Test Loss: {test_loss}")

Test Loss: 0.3832641690969467


## Testing to Validation Dataset - Human Evaluation

In [24]:
val_data = pd.read_excel('/kaggle/input/val-data/validation.xlsx')

In [26]:
val_data.drop(columns = ['Gender', 'Country_Of_Origin'], inplace = True)

In [37]:
def predict(new_student, user_features_cols, courses_list, model):
    """
    Predict the top 5 course recommendations for a new student.

    Args:
    new_student (dict): A dictionary containing the new student's features.
    model_path (str): The path to the trained neural collaborative filtering model.

    Returns:
    list: A list of the top 5 recommended courses for the new student.
    """

    # Create a DataFrame for the new student with all columns initialized to zero
    new_student_df = pd.DataFrame(0, index=[0], columns=user_features_cols)

    # Update the DataFrame with new student data by setting the appropriate fields
    for key in new_student:
        feature_name = key + "_" + new_student[key]
        if feature_name in new_student_df.columns:
            new_student_df.loc[0, feature_name] = 1

    # Convert the DataFrame to a PyTorch tensor
    new_student_tensor = torch.tensor(new_student_df.values.astype(np.float32))
    model.eval()

    # Perform prediction using the model
    with torch.no_grad():
        predictions = model(new_student_tensor)

    # Convert the model's predictions to probabilities
    course_probabilities = predictions.numpy().flatten()

    # Identify the indices of the top 5 courses
    top_course_indices = np.argsort(course_probabilities)[-5:][::-1]

    # Map indices to course names
    top_courses = [courses_list[idx] for idx in top_course_indices]

    return top_courses

In [39]:
user = ['Field_Of_Study', 'Primary_Hobby', 'Secondary_Hobby', 'Desired_Career_Field']
for row in val_data.iterrows():
    new_student = {}
    for val in user:
        new_student[val] = row[1][val]
    print(new_student)
    print(predict(new_student, user_features, list(courses_list), model))

{'Field_Of_Study': 'Environmental Engineering', 'Primary_Hobby': 'Beekeeping', 'Secondary_Hobby': 'Embroidery', 'Desired_Career_Field': 'Environmental Engineer'}
['Global Change Biology: From Molecules to Organisms', 'Deep-Sea Science and Environmental Management', 'Writing about Performance', 'Physical Chemical Processes in Environmental Engineering', 'Environmental Justice: Theory and Practice for Environmental Scientists and Policy Professionals']
{'Field_Of_Study': 'Financial Technology', 'Primary_Hobby': 'Shopping', 'Secondary_Hobby': 'Knitting', 'Desired_Career_Field': 'Financial Technology'}
['Data Science', 'Advanced Topics in Design and Technology Innovation', 'Special Topics in Electrical and Computer Engineering', 'Negotiations and Consultative Selling in Technology', 'Operations Management']
{'Field_Of_Study': 'Game Design Development and Innovation', 'Primary_Hobby': 'Video Gaming', 'Secondary_Hobby': 'Drawing', 'Desired_Career_Field': 'Video Game Designer'}
['Interdiscipl