In [62]:
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from spellchecker import SpellChecker
import pandas as pd

# Prepare training data

In [63]:
# Load the dataset
# Later cells read its 'program' column (input text) and 'label' column (target category).
df = pd.read_csv("admissionData.csv")

# Remove pandas' row-display cap so the complete per-label table is printed.
# NOTE: this is a global pandas option and stays in effect for the rest of the session.
pd.set_option("display.max_rows", None)
# Last expression of the cell: number of rows per label (the class balance).
df['label'].value_counts()

label
computer science                       405
undeclared engineering                 359
life science                           185
business administration                175
biomedical engineering                 131
engineering science                    129
nuclear engineering                    129
commerce                               122
management                             119
software engineering                   116
math                                   115
cs/bba                                 112
computer engineering                    99
accounting                              93
physics                                 93
nanotechnology engineering              91
arts/business                           90
environmental science                   85
biochemistry                            84
physical sciences                       83
communications                          82
management engineering                  77
bba/finmath                             73
educa

In [64]:
# Encode labels as integers and build dictionaries to convert in both directions.
# Sorting the distinct labels makes the integer ids deterministic across runs.
sorted_labels = sorted(df['label'].unique())
label2num = {name: index for index, name in enumerate(sorted_labels)}
num2label = dict(enumerate(sorted_labels))

# Parallel lists: raw program strings and their integer class ids
programs = df['program'].tolist()
labels = [label2num[name] for name in df['label'].tolist()]

label2num

{'accounting': 0,
 'accounting and financial management': 1,
 'aerospace engineering': 2,
 'architectural engineering': 3,
 'architecture': 4,
 'arts': 5,
 'arts/business': 6,
 'bba/finmath': 7,
 'biochemistry': 8,
 'biology': 9,
 'biomedical engineering': 10,
 'biomedical science': 11,
 'business administration': 12,
 'business technology managment': 13,
 'chemical engineering': 14,
 'civil engineering': 15,
 'commerce': 16,
 'communications': 17,
 'computer engineering': 18,
 'computer science': 19,
 'computing and financial management': 20,
 'cs/bba': 21,
 'cs/math/stats': 22,
 'dev degree': 23,
 'economics': 24,
 'education': 25,
 'electrical engineering': 26,
 'engineering science': 27,
 'environmental engineering': 28,
 'environmental science': 29,
 'forestry': 30,
 'game design': 31,
 'geomatics': 32,
 'global business and digital arts': 33,
 'health science': 34,
 'humanities': 35,
 'industrial engineering': 36,
 'kinesiology': 37,
 'law': 38,
 'life science': 39,
 'management'

In [65]:
# Dataset class
class ProgramClassificationDataset(Dataset):
    """Dataset that pairs raw program strings with their integer class labels.

    Items are tokenized on access rather than up front.

    Args:
        programs: sequence of program-name strings.
        labels: sequence of integer class ids, aligned with ``programs``.
        tokenizer: callable invoked as
            ``tokenizer(text, return_tensors='pt', max_length=..., padding='max_length', truncation=True)``
            and returning a mapping with 'input_ids' and 'attention_mask' tensors.
        max_length: length every sequence is padded/truncated to.
    """

    def __init__(self, programs, labels, tokenizer, max_length):
        self.programs, self.labels = programs, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples (one per program string)."""
        return len(self.programs)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors.

        Keys: 'input_ids' and 'attention_mask' (flattened to 1-D) and 'label'.
        """
        text, target = self.programs[idx], self.labels[idx]
        # Tokenization only turns text into ids; embeddings are computed inside the model.
        tokens = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        # Flatten so that the DataLoader can stack samples into a batch.
        sample = {key: tokens[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(target)
        return sample

# Create classifier architecture

In [66]:
# Classifier architecture
class BERTClassifier(nn.Module):
    """Pretrained BERT encoder followed by dropout and a linear classification head.

    Args:
        bert_model_name: model name or path handed to ``BertModel.from_pretrained``.
        num_classes: number of output classes (width of the logit vector).
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them stable.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)  # 10% dropout, active in train() mode only
        hidden_size = self.bert.config.hidden_size
        self.fc = nn.Linear(hidden_size, num_classes)  # projects the pooled vector to class logits

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class scores (logits), one row per input sequence."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # pooler_output is BERT's pooled sequence vector: the final [CLS] hidden
        # state after the model's pooling layer, not the raw [CLS] hidden state.
        return self.fc(self.dropout(encoded.pooler_output))


# Function that trains the model
def train(model, data_loader, optimizer, scheduler, device):
    """Run one training epoch over ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` and
            returning class logits.
        data_loader: iterable of batches; each batch is a mapping with
            'input_ids', 'attention_mask' and 'label' tensors.
        optimizer: optimizer over ``model``'s parameters.
        scheduler: learning-rate scheduler, stepped once per batch.
        device: device the batch tensors are moved to before the forward pass.

    Returns:
        float: mean cross-entropy loss over the batches processed, or 0.0 if
        ``data_loader`` yielded no batches. Callers may ignore it.
    """
    model.train()   # Enable training mode

    # The loss module is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss()
    total_loss = 0.0
    num_batches = 0

    for batch in data_loader:
        optimizer.zero_grad()   # Clear gradients left over from the previous step

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        loss = criterion(outputs, labels)
        loss.backward()     # Backpropagation
        optimizer.step()    # Update weights
        scheduler.step()    # Update learning rate (per batch, not per epoch)

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches if num_batches else 0.0


# Function that evaluates the model
def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: module called as ``model(input_ids=..., attention_mask=...)`` and
            returning class logits.
        data_loader: iterable of batches with 'input_ids', 'attention_mask'
            and 'label' tensors.
        device: device the batch tensors are moved to.

    Returns:
        tuple: ``(accuracy, report)`` where ``accuracy`` comes from
        ``accuracy_score`` and ``report`` from ``classification_report``
        (called with ``zero_division=0``).
    """
    model.eval()    # Switch to evaluation mode

    y_pred, y_true = [], []

    with torch.no_grad():   # No gradients are needed for scoring
        for batch in data_loader:
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['label'].to(device)

            logits = model(input_ids=ids, attention_mask=mask)

            # The predicted class is the index of the largest logit in each row.
            y_pred += logits.argmax(dim=1).cpu().tolist()
            y_true += targets.cpu().tolist()

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, zero_division=0)
    return accuracy, report


# Function that predicts a program (i.e inputs a given program into the model), also returns confidence level
def predict_program(program, model, tokenizer, device, max_length=128):
    """Classify a single program string.

    Args:
        program: text to classify.
        model: classifier called as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: tokenizer applied to ``program`` (padded/truncated to ``max_length``).
        device: device the encoded tensors are moved to.
        max_length: token length used for padding and truncation.

    Returns:
        tuple: ``(predicted_program, confidence)`` where ``predicted_program`` is
        looked up in the notebook-level ``num2label`` mapping and ``confidence``
        is the highest softmax probability expressed as a percentage (0-100).
    """
    model.eval()    # Switch to evaluation mode

    encoded = tokenizer(program, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)

    with torch.no_grad():   # Inference only, no gradients required
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )

        # Winning class index -> human-readable label
        class_index = logits.argmax(dim=1).item()
        predicted_program = num2label[class_index]

        # Softmax turns logits into probabilities; report the largest as a percentage
        probabilities = logits.softmax(dim=1)
        confidence = probabilities.max(dim=1).values.item() * 100

    return predicted_program, confidence

# Function to check and correct spelling
def correct_spelling(text, spellcheck=None):
    """Spell-correct ``text`` word by word.

    Words are obtained with ``text.split()``, so the result is re-joined with
    single spaces and surrounding whitespace is dropped.

    Args:
        text: input string.
        spellcheck: optional checker supporting ``word in spellcheck`` and
            ``spellcheck.correction(word)``. When omitted, a ``SpellChecker``
            extended with the words from 'corpus.txt' is built for this call.
            Pass a prebuilt instance to avoid reloading the corpus on every call.

    Returns:
        str: the text with each unknown word replaced by the checker's
        correction; a word is kept unchanged when it is known or when the
        checker has no correction for it (``correction`` returned ``None``).
    """
    if spellcheck is None:
        # Default checker: stock dictionary plus the custom domain corpus
        spellcheck = SpellChecker()
        spellcheck.word_frequency.load_text_file('corpus.txt')

    corrected_words = []
    for word in text.split():
        if word in spellcheck:
            corrected_words.append(word)
            continue
        suggestion = spellcheck.correction(word)
        # Fall back to the original word instead of letting None reach join()
        corrected_words.append(word if suggestion is None else suggestion)

    return ' '.join(corrected_words)

# Set up the model

In [67]:
# Hyperparameters
# These are notebook-level globals read by the cells below.
bert_model_name = 'bert-base-uncased'   # Pretrained checkpoint used for both tokenizer and encoder
num_classes = len(label2num)            # One output class per distinct label in the dataset
max_length = 128                        # Token length every input is padded/truncated to
batch_size = 16                         # Samples per DataLoader batch
num_epochs = 6                          # Full passes over the training set
learning_rate = 2e-5                    # Initial learning rate handed to the optimizer

In [68]:
# Tokenizer
# Loaded from the same checkpoint name as the encoder (bert_model_name).
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

In [69]:
# Split data into training and test sets
# 80/20 split; random_state is fixed so the split is reproducible.
# NOTE(review): the split is not stratified, so rare labels may end up on only
# one side of the split - confirm this is acceptable for the smallest classes.
X_train, X_test, y_train, y_test = train_test_split(programs, labels, test_size=0.2, random_state=42)

# Wrap each split in the tokenizing dataset defined earlier in the notebook
train_dataset = ProgramClassificationDataset(X_train, y_train, tokenizer, max_length)
test_dataset = ProgramClassificationDataset(X_test, y_test, tokenizer, max_length)

# Only the training loader shuffles; the test loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size)

In [70]:
# Instantiate model
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Build the classifier and move its parameters to the selected device.
model = BERTClassifier(bert_model_name, num_classes).to(device)

In [71]:
# Optimizer and Scheduler
# AdamW over every model parameter (BERT encoder and classification head alike).
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# The scheduler is stepped once per batch, so its horizon is batches-per-epoch x epochs.
total_steps = len(train_dataloader) * num_epochs
# Linear learning-rate schedule with no warmup steps.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Training

In [72]:
# Train model
# One pass over the training loader per epoch, followed by a full evaluation.
# NOTE(review): the per-epoch metrics are computed on the test split; there is
# no separate validation split, so these numbers are not a held-out estimate
# if they are used to choose num_epochs or other settings.
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    train(model, train_dataloader, optimizer, scheduler, device)
    accuracy, report = evaluate(model, test_dataloader, device)

    # The report rows are keyed by integer class id; use num2label to map them back to names.
    print(f"Accuracy: {accuracy:.4f}")
    print(report)

Epoch 1/6
Accuracy: 0.7421
              precision    recall  f1-score   support

           0       0.44      1.00      0.62        16
           1       0.00      0.00      0.00         4
           2       0.00      0.00      0.00         5
           3       0.00      0.00      0.00         6
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         6
           6       0.93      1.00      0.96        13
           7       1.00      0.58      0.73        19
           8       0.94      1.00      0.97        15
           9       1.00      0.86      0.92         7
          10       1.00      0.71      0.83        35
          11       0.75      1.00      0.86        12
          12       0.76      0.91      0.83        35
          13       0.73      1.00      0.85        11
          14       0.00      0.00      0.00         7
          15       1.00      0.93      0.96        14
          16       1.00      0.62      0.77        24


In [73]:
# Save model
# Only the parameters (state_dict) are written, not the class itself, so loading
# requires constructing a BERTClassifier with the same architecture first.
# The file is written relative to the current working directory.
torch.save(model.state_dict(), "bert_program_classifier.pth")

In [74]:
# Load weights
# map_location remaps the saved tensors onto the current `device`, so a checkpoint
# written on a GPU machine also loads on a CPU-only one.
# strict=True (spelled out by keyword) requires the checkpoint keys to match the model exactly.
model.load_state_dict(torch.load("bert_program_classifier.pth", map_location=device), strict=True)

<All keys matched successfully>

# Test Model

In [75]:
# Test prediction: a long, mixed-discipline program title
test_text = "Integrated Biomedical Engineering & Health Sciences"
corrected_text = correct_spelling(test_text)
predicted_label = predict_program(corrected_text, model, tokenizer, device)
# predict_program returns (category, confidence in percent)
predicted_category, confidence = predicted_label
print(f"Input program: {test_text}")
print(f"Predicted category: {predicted_category}")
print(f"Confidence: {confidence}")

Input program: Integrated Biomedical Engineering & Health Sciences
Predicted category: biomedical engineering
Confidence: 54.99767065048218


In [76]:
# Test prediction: exact label text
test_text = "computer science"
corrected_text = correct_spelling(test_text)
predicted_label = predict_program(corrected_text, model, tokenizer, device)
# predict_program returns (category, confidence in percent)
predicted_category, confidence = predicted_label
print(f"Input program: {test_text}")
print(f"Predicted category: {predicted_category}")
print(f"Confidence: {confidence}")

Input program: computer science
Predicted category: computer science
Confidence: 97.23633527755737


In [77]:
# Test prediction: exact label text
test_text = "computer engineering"
corrected_text = correct_spelling(test_text)
predicted_label = predict_program(corrected_text, model, tokenizer, device)
# predict_program returns (category, confidence in percent)
predicted_category, confidence = predicted_label
print(f"Input program: {test_text}")
print(f"Predicted category: {predicted_category}")
print(f"Confidence: {confidence}")

Input program: computer engineering
Predicted category: computer engineering
Confidence: 88.51482272148132


In [78]:
# Test prediction: abbreviated second word
test_text = "computer eng"
corrected_text = correct_spelling(test_text)
predicted_label = predict_program(corrected_text, model, tokenizer, device)
# predict_program returns (category, confidence in percent)
predicted_category, confidence = predicted_label
print(f"Input program: {test_text}")
print(f"Predicted category: {predicted_category}")
print(f"Confidence: {confidence}")

Input program: computer eng
Predicted category: computer engineering
Confidence: 67.91397333145142


In [79]:
# Test prediction: both words abbreviated
test_text = "comp sci"
corrected_text = correct_spelling(test_text)
predicted_label = predict_program(corrected_text, model, tokenizer, device)
# predict_program returns (category, confidence in percent)
predicted_category, confidence = predicted_label
print(f"Input program: {test_text}")
print(f"Predicted category: {predicted_category}")
print(f"Confidence: {confidence}")

Input program: comp sci
Predicted category: computer science
Confidence: 94.17394399642944
