In [2]:
# Standard library
import json
from heapq import nlargest

# Third-party
import numpy as np
import openai
import pandas as pd
import requests
from openai import OpenAI

# Local configuration module that supplies the API key
from config import OPENAI_API_KEY

# Shared OpenAI client; the cells below use it for embedding and chat-completion requests.
client = OpenAI(api_key=OPENAI_API_KEY)

# Default model name passed to the embeddings endpoint by get_embedding.
embedding_model_name = "text-embedding-3-small"

In [3]:
# Read the course table; later cells use its 'Title' and 'Description' columns.
courses_csv_path = 'all_courses.csv'
df = pd.read_csv(courses_csv_path)

In [22]:
# Build one "<Title>: <Description>" string per row; this combined text is what
# the embedding and token-overlap similarity functions operate on.
title_prefix = df['Title'] + ': '
df['title_and_desc'] = title_prefix + df['Description']

#### Most recent attempt
- Recommend a course based on a plain text prompt

In [23]:
# Question that would be shown to the learner.
user_prompt = (
    "What are your learning goals for an online Computer Science course?"
)

# Sample free-text answer used as the retrieval query and quoted in the LLM prompt
# (text kept verbatim).
user_input = (
    "I want to learn the basics of natural language processing, "
    "what course should I take? "
    "I have an undegradaute degree in linguistics"
)

In [28]:
## Step 1: Embed all the courses in the dataset

### Step 1.1: Write function to get embedding of one course
def get_embedding(text, model=embedding_model_name):
    """Request an embedding for a single piece of text.

    Args:
        text: String to embed. Newline characters are replaced by spaces
            before the request is sent.
        model: Name of the embedding model; defaults to the notebook-level
            ``embedding_model_name``.

    Returns:
        The ``embedding`` attribute of the first item in the API response
        (presumably a list of floats -- confirm against the OpenAI client docs).
    """
    single_line_text = " ".join(text.split("\n"))
    response = client.embeddings.create(input=[single_line_text], model=model)
    first_result = response.data[0]
    return first_result.embedding

### Step 1.2: Apply the function to the entire dataset
# NOTE: get_embedding sends one API request per call, so this makes one request
# per row and repeats all of them every time the cell is re-run.
course_texts = df['title_and_desc']
df['embedding'] = course_texts.apply(get_embedding)

### Step 1.3: Save the dataset with embeddings
# index=False keeps the DataFrame index out of the written file.
df.to_csv('all_courses_with_embeddings.csv', index=False)

In [40]:
## Step 2: Write a similarity function that computes the similarity between query (user input) and document (course description)
# Cache mapping query text -> embedding. return_options calls the similarity
# function once per course row, so without this the same query would be
# re-embedded (one API request) for every row.
query_embedding_cache = {}

def embedding_cosine_similarity(query, document):
    """Cosine similarity between a query's embedding and a course's stored embedding.

    The previous version assigned to ``query_embedding`` inside the function
    without declaring it global, which made the name local and raised
    ``UnboundLocalError`` on the ``is None`` check. A single global slot would
    also have returned a stale embedding for any later, different query, so the
    cache is keyed by the query text instead.

    Args:
        query: Text of the user's request.
        document: Row-like object (e.g. a DataFrame row or dict) whose
            ``'embedding'`` entry holds the course embedding as a numeric
            sequence.

    Returns:
        The cosine of the angle between the two vectors, or ``0.0`` when
        either vector has zero length (the cosine is undefined there).
    """
    # First, embed the query (only once per distinct query string)
    if query not in query_embedding_cache:
        query_embedding_cache[query] = get_embedding(query)
    query_vector = np.asarray(query_embedding_cache[query], dtype=float)

    # Next, get the document embedding from the dataset
    document_vector = np.asarray(document['embedding'], dtype=float)

    # Next, compute the cosine similarity between the query and the document
    norm_product = np.linalg.norm(query_vector) * np.linalg.norm(document_vector)
    if norm_product == 0:
        # Avoid a division by zero / NaN score for an all-zero vector.
        return 0.0
    return np.dot(query_vector, document_vector) / norm_product

In [43]:
def jaccard_similarity(query, document):
    """Jaccard overlap between the words of a query and of a course text.

    Both texts are lower-cased and split on single space characters; the score
    is the number of distinct tokens they share divided by the number of
    distinct tokens appearing in either.

    Args:
        query: Text of the user's request.
        document: Row-like object whose ``'title_and_desc'`` entry is the
            course text.

    Returns:
        A float between 0 and 1.
    """
    query_tokens = set(query.lower().split(" "))
    document_tokens = set(document['title_and_desc'].lower().split(" "))
    shared_tokens = query_tokens & document_tokens
    all_tokens = query_tokens | document_tokens
    return len(shared_tokens) / len(all_tokens)

def cosine_similarity(query, document):
    """Set-based cosine similarity between a query and a course text.

    Each text is lower-cased, split on single spaces and treated as a set of
    distinct tokens (a binary bag-of-words vector). For such vectors the cosine
    is ``|A & B| / sqrt(|A| * |B|)``.

    The previous version divided by ``len(query) * len(document)`` computed on
    the raw token lists. That is not a cosine: identical texts did not score 1,
    and longer texts were penalised quadratically.

    Args:
        query: Text of the user's request.
        document: Row-like object whose ``'title_and_desc'`` entry is the
            course text.

    Returns:
        A float between 0 and 1; 1.0 when both texts contain exactly the same
        distinct tokens.
    """
    query_tokens = set(query.lower().split(" "))
    document_tokens = set(document['title_and_desc'].lower().split(" "))
    shared_tokens = query_tokens & document_tokens
    # str.split(" ") always yields at least one token, so neither set is empty
    # and the denominator cannot be zero.
    return len(shared_tokens) / (len(query_tokens) * len(document_tokens)) ** 0.5

def return_options(query, corpus, similarity_function=None, top_k=5):
    """Return the course texts most similar to ``query``.

    Fixes relative to the previous version:
      * every row is scored against the ``query`` argument; the old code
        ignored it and read the notebook-level ``user_input`` instead;
      * rows are fetched by position with ``.iloc``; the old label-based
        lookup only worked when the DataFrame had a default 0..n-1 index;
      * the number of results is configurable through ``top_k``.

    Args:
        query: Text of the user's request.
        corpus: DataFrame with a ``'title_and_desc'`` column; each row is
            passed to ``similarity_function`` as the document.
        similarity_function: Callable ``(query, row) -> score``. ``None``
            (the default) means ``jaccard_similarity``, looked up when the
            function is called.
        top_k: Maximum number of results to return (default 5).

    Returns:
        A list of up to ``top_k`` ``'title_and_desc'`` strings, best match
        first. Empty when ``corpus`` has no rows.
    """
    if similarity_function is None:
        similarity_function = jaccard_similarity

    # Score every course against the query, in row order.
    similarities = [similarity_function(query, doc) for _, doc in corpus.iterrows()]

    # Positions (not index labels) of the top_k highest scores.
    best_positions = nlargest(top_k, range(len(similarities)), key=similarities.__getitem__)

    course_texts = corpus['title_and_desc']
    return [course_texts.iloc[position] for position in best_positions]

In [41]:
# Create the prompt
# Retrieve the best-matching courses for the request, then number them 1..N.
relevant_documents = return_options(user_input, df, similarity_function=embedding_cosine_similarity)
list_of_relevant_documents = [
    f"{rank}. {course}" for rank, course in enumerate(relevant_documents, start=1)
]
relevant_documents_text = "\n".join(list_of_relevant_documents)
prompt = f"""
You are trying to help this user find an online Computer Science course
From my database of CS courses, here were some recommendations based on the user input: {relevant_documents_text}
The user input is: '{user_input}'
Compile a recommendation to the user based on the recommended Computer Science courses and the user input, 
ranking the courses from the database in order of best fit for the user, and providing a brief explanation for why each course is a fit.
"""

# Conversation sent to the model: a fixed system role plus the prompt built above.
chat_messages = [
    {"role": "system", "content": "You are a bot that makes recommendations for Computer Science courses."},
    {"role": "user", "content": prompt},
]

try:
    # Make the request to the OpenAI API
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        max_tokens=1000,
    )

    # Process the response: show the text of the first returned choice
    chatgpt_response = response.choices[0].message.content
    print(chatgpt_response)
except openai.OpenAIError as api_error:
    # Errors raised by the OpenAI client are reported separately from everything else.
    print(f"OpenAI API error occurred: {api_error}")
except Exception as unexpected_error:
    print(f"An error occurred: {unexpected_error}")


Based on your interest in learning the basics of natural language processing (NLP) and your background in linguistics, I recommend the following courses, ranked from best fit to less suitable options:

1. **Select Topics in Python: Natural Language Processing**  
   This course is ideal for someone starting out in NLP, especially with a background in linguistics. It’s designed for learners with some coding experience in Python but who are novices to NLP. The hands-on, self-paced format allows you to code without needing to install software, making it accessible and user-friendly. You'll cover foundational topics like text processing, analyzing speech and semantics, and even building a chatbot, helping you apply your linguistic knowledge practically.

2. **Natural Language and the Computer Representation of Knowledge**  
   This laboratory-oriented course provides a deeper dive into the theory and practice of building computer systems for human language processing. With an emphasis on b