In [3]:
!pip install chardet transformers

Collecting transformers
  Obtaining dependency information for transformers from https://files.pythonhosted.org/packages/15/fc/7b6dd7e1adc0a6407b845ed4be1999e98b6917d0694e57316d140cc85484/transformers-4.39.3-py3-none-any.whl.metadata
  Using cached transformers-4.39.3-py3-none-any.whl.metadata (134 kB)
Collecting safetensors>=0.4.1 (from transformers)
  Obtaining dependency information for safetensors>=0.4.1 from https://files.pythonhosted.org/packages/cb/f6/19f268662be898ff2a23ac06f8dd0d2956b2ecd204c96e1ee07ba292c119/safetensors-0.4.3-cp311-none-win_amd64.whl.metadata
  Downloading safetensors-0.4.3-cp311-none-win_amd64.whl.metadata (3.9 kB)
Using cached transformers-4.39.3-py3-none-any.whl (8.8 MB)
Downloading safetensors-0.4.3-cp311-none-win_amd64.whl (287 kB)
   ---------------------------------------- 0.0/287.3 kB ? eta -:--:--
   -- ------------------------------------ 20.5/287.3 kB 320.0 kB/s eta 0:00:01
   ----- --------------------------------- 41.0/287.3 kB 487.6 kB/s eta 0:0


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import numpy as np
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from torch.utils.data import DataLoader, Dataset
import chardet
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# Detect the encoding of the file
with open('../data/raw/courses.csv', 'rb') as file:
    result = chardet.detect(file.read())  # or file.read(100000) if the file is large

# Read the CSV file with the detected encoding
courses_df = pd.read_csv('../data/raw/courses.csv', encoding=result['encoding'])

In [8]:
courses_df.head()

Unnamed: 0.1,Unnamed: 0,Subject,Catalog Number,Course Title,Course Type,Description,Keywords,Grading,Prerequisites
0,1,AIPI - AI for Product Innovation,501,AIPI Seminar,FALL-SPRNG,Current topics in AI for Product Innovation. S...,"AI, Product Innovation, Seminars, Industry lea...",Credit / No Credit,
1,2,AIPI - AI for Product Innovation,504,Introductory Residency,FALL,One week course to introduce the Master of Eng...,"Master of Engineering, AI for Product Innovati...",Graded,
2,3,AIPI - AI for Product Innovation,510,Sourcing Data for Analytics,FALL,Course introduces students to the technical an...,"Data privacy, GDPR, Machine learning, Data ana...",Graded,
3,4,AIPI - AI for Product Innovation,520,Modeling Process and Algorithms,FALL-SPRNG,This course is an introduction to the modeling...,"Modeling process, Machine learning algorithms,...",Graded,
4,5,AIPI - AI for Product Innovation,530,Optimization in Practice,FALL,Optimization is the ultimate skill in artifici...,"Optimization, Artificial intelligence, Prescri...",Graded,Corequisite: AIPI 510 and 520


In [9]:
courses_df['combined_text'] = courses_df['Subject'] + " " + courses_df['Course Title'] + " " + courses_df['Description']

In [10]:
students_df = pd.read_csv('../data/raw/Student_Personas_v2.csv')

In [11]:
students_df.head()

Unnamed: 0,ID,Field_Of_Study,Primary_Hobby,Secondary_Hobby,Gender,Desired_Career_Field,Country_Of_Origin
0,100001,AI,Photography,Writing,Female,AI Ethics Specialist,USA
1,100002,Biomedical Engineering,Dance,Drawing,Male,Biomedical Research,Canada
2,100003,Civil Engineering,Wood Working,Hiking,Female,Structural Engineer,UK
3,100004,Climate and Sustainability Engineering,Hiking,Gardening,Male,Renewable Energy Engineer,Australia
4,100005,Computational Mechanics and Scientific Computing,Mountain Biking,Photography,Female,High-Performance Computing Specialist,Germany


In [12]:
students_df['combined_text'] = students_df['Field_Of_Study'] + " " + students_df['Primary_Hobby'] + " " + students_df['Secondary_Hobby'] + " " + students_df['Desired_Career_Field']

In [13]:
class Dataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

In [14]:
# Load pre-trained BERT model
model = BertModel.from_pretrained('bert-base-uncased')

# Place model in evaluation mode and move to GPU if available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [15]:
# Function to generate embeddings
def generate_embeddings(dataloader):
    with torch.no_grad():
        embeddings = []
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())
        embeddings = np.concatenate(embeddings, axis=0)
        return embeddings

In [16]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize course descriptions
# You may want to customize the max_length and the truncation/padding strategy based on your data
courses_inputs = tokenizer(
    list(courses_df['combined_text']), 
    padding=True, 
    truncation=True, 
    return_tensors='pt',
    max_length=16  # Choose a max_length that suits your data
)

# Convert to a Dataset object
courses_dataset = Dataset(courses_inputs)

# DataLoader
courses_loader = DataLoader(courses_dataset, batch_size=32, shuffle=False)

# Generate embeddings
course_embeddings = generate_embeddings(courses_loader)


In [17]:
# Initialize BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize course descriptions
# You may want to customize the max_length and the truncation/padding strategy based on your data
students_inputs = tokenizer(
    list(students_df['combined_text']), 
    padding=True, 
    truncation=True, 
    return_tensors='pt',
    max_length=16  # Choose a max_length that suits your data
)

# Convert to a Dataset object
students_dataset = Dataset(students_inputs)

# DataLoader
students_loader = DataLoader(students_dataset, batch_size=32, shuffle=False)

# Generate embeddings
students_embeddings = generate_embeddings(students_loader)

In [18]:
course_embeddings

array([[-0.5600438 , -0.11545447, -0.34252006, ..., -0.40451112,
        -0.04201803,  0.64492404],
       [-0.47448975, -0.07996289,  0.0720619 , ..., -0.6110867 ,
         0.4166703 ,  0.301448  ],
       [-0.49124107, -0.37004536,  0.17035553, ..., -0.4448683 ,
         0.16554262,  0.27125952],
       ...,
       [-0.25120994, -0.56249136,  0.02402968, ..., -0.98138   ,
        -0.14338209,  0.5311576 ],
       [-0.18095273, -0.38658848, -0.0264961 , ..., -0.40668797,
        -0.18281405,  0.20533113],
       [-0.549462  ,  0.10059454, -0.51944816, ..., -0.19734716,
         0.18437123,  0.24828549]], dtype=float32)

In [19]:
students_embeddings

array([[-0.32210115,  0.3226166 , -0.38599992, ..., -0.38928717,
         0.23479898,  0.40587598],
       [ 0.09504601,  0.02062466, -0.32828873, ..., -0.2907245 ,
         0.28526855,  0.5285694 ],
       [-0.0087065 ,  0.4837623 , -0.27725396, ..., -0.19260749,
         0.17131878,  0.32141453],
       ...,
       [ 0.15727614,  0.2913635 , -0.5118813 , ..., -0.4592735 ,
         0.4518155 ,  0.31083128],
       [-0.01297386,  0.5519116 , -0.46297827, ..., -0.53901154,
         0.20450163,  0.18124276],
       [-0.4158405 ,  0.21593106, -0.35990715, ..., -0.34013417,
         0.26464313, -0.05653283]], dtype=float32)

In [20]:
# Compute the cosine similarity between student embeddings and course embeddings
similarity_matrix = cosine_similarity(students_embeddings, course_embeddings)

In [21]:
students_embeddings.shape

(500, 768)

In [22]:
similarity_matrix.shape

(500, 94)

In [23]:
top_course_indices = np.argsort(similarity_matrix, axis=1)[:, -5:]
top_course_indices.shape

(500, 5)

In [24]:
# Map the top course indices to course IDs, titles, or whatever identifier you use
# Assuming courses_df has a 'Course_ID' column
course_ids = courses_df['Course Title'].values
student_recommendations = course_ids[top_course_indices]

# You can now add these recommendations to the students_df or present them however you prefer
students_df['Recommendations'] = student_recommendations.tolist()

In [25]:
pd.set_option('display.max_colwidth', None)

In [26]:
students_df.drop(columns = ['Gender', 'combined_text', 'Country_Of_Origin']).head(5)

Unnamed: 0,ID,Field_Of_Study,Primary_Hobby,Secondary_Hobby,Desired_Career_Field,Recommendations
0,100001,AI,Photography,Writing,AI Ethics Specialist,"[Special Topics in Electrical and Computer Engineering, Interdisciplinary Introduction to Computer Science, Critical Analysis of Video Games, Fundamentals of Game Development, AIPI Seminar]"
1,100002,Biomedical Engineering,Dance,Drawing,Biomedical Research,"[AIPI Seminar, Computational Microeconomics, Engineering Management Seminar, Graph Analysis with Matrix Computation, Special Topics in Electrical and Computer Engineering]"
2,100003,Civil Engineering,Wood Working,Hiking,Structural Engineer,"[Engineering Management Seminar, Interdisciplinary Introduction to Computer Science, Functional Ecology of Plants, Fundamentals of Game Development, Graph Analysis with Matrix Computation]"
3,100004,Climate and Sustainability Engineering,Hiking,Gardening,Renewable Energy Engineer,"[Environmental Justice: Theory and Practice for Environmental Scientists and Policy Professionals, Cybersecurity Program Development, Operations & Analysis, Engineering Management Seminar, Introductory Residency, Marine Protected Area Monitoring and Management]"
4,100005,Computational Mechanics and Scientific Computing,Mountain Biking,Photography,High-Performance Computing Specialist,"[Engineering Management Seminar, Critical Analysis of Video Games, Biomedical Aspects of Blast and Ballistics (GE, BB), Special Topics in Electrical and Computer Engineering, Marine Protected Area Monitoring and Management]"
