## Imports

In [1]:
import openai
from openai import OpenAI
from config import OPENAI_API_KEY
from heapq import nlargest
import numpy as np

# Shared OpenAI client; the key is read from the local `config` module rather than hardcoded here.
client = OpenAI(
  api_key= OPENAI_API_KEY
)
import rag_utils
import pandas as pd
import requests
import json

# Name of the embedding model. NOTE(review): not referenced by the cells visible here --
# presumably it should match the model used inside rag_utils; confirm.
embedding_model_name = "text-embedding-3-small"
# CSV that is read into the course dataframe in the data-loading cell.
file_name = 'all_courses_with_embeddings.csv'

## Reading in the Data

### (And, if needed, generating the embeddings)

In [3]:
# Load the course catalogue from disk.
df = pd.read_csv(file_name)

# Branch on the 'embedding' column alone: it is the only column the parsing branch reads.
# The previous test (`'title_and_desc' not in ... and 'embedding' not in ...`) sent a frame
# that had 'title_and_desc' but no 'embedding' into the parsing branch, raising KeyError.
if 'embedding' not in df.columns:
    choice = input("Do you want to run the script to generate embeddings? Caution: this will take a while and burn your OpenAI credits. (yes/no)")
    if choice.strip().lower() == 'yes':
        # NOTE(review): assumes embed_all_courses returns the frame with the
        # 'title_and_desc' and 'embedding' columns populated -- confirm in rag_utils.
        df = rag_utils.embed_all_courses(df)
    else:
        # Make the consequence explicit instead of failing later with a KeyError.
        print("Skipping embedding generation: embedding-based similarity will not work without an 'embedding' column.")
else:
    print("Embeddings already exist in the dataframe")
    print("But we need to convert them into NP arrays")
    # The CSV stores each embedding as a JSON-encoded list; decode it into a NumPy array.
    df['embedding'] = df['embedding'].apply(lambda x: np.array(json.loads(x)))
    print("Done")

Embeddings already exist in the dataframe
But we need to convert them into NP arrays
Done


In [4]:
# Sanity check: inspect the type of the first row's embedding after the JSON decoding step.
type(df['embedding'][0])

numpy.ndarray

## User Prompt and Input

In [5]:
# Question posed to the user. Not referenced by the other cells visible in this notebook.
user_prompt = "What are your learning goals for an online Computer Science course?"
# Free-text answer from the user; used as the retrieval query and quoted in the LLM prompt.
user_input = "I want to learn the basics of natural language processing, what course should I take? I have an undegraduate degree in linguistics"

In [6]:
## Step 2: Write a similarity function that computes the similarity between query (user input) and document (course description)
def embedding_cosine_similarity(query, document, query_embedding=None):
    """Cosine similarity between a query embedding and a document's stored embedding.

    Args:
        query: Query text. Only embedded (via ``rag_utils.get_embedding``) when
            ``query_embedding`` is not supplied.
        document: Row or mapping exposing the document vector under ``'embedding'``.
        query_embedding: Optional precomputed query vector, so that a caller scoring
            many documents can embed the query once.

    Returns:
        The cosine of the angle between the two vectors, or ``0.0`` when either
        vector has zero norm.
    """
    if query_embedding is None:
        query_embedding = rag_utils.get_embedding(query)
    document_embedding = document['embedding']
    denominator = np.linalg.norm(query_embedding) * np.linalg.norm(document_embedding)
    # A zero vector would give 0/0 = NaN, and NaN scores break the ordering used when
    # ranking documents. Treat it as "no similarity" instead.
    if denominator == 0:
        return 0.0
    return np.dot(query_embedding, document_embedding) / denominator

In [7]:
def jaccard_similarity(query, document, query_embedding=None):
    """Token-level Jaccard overlap between the query and a document's text.

    Both the query and ``document['title_and_desc']`` are lower-cased and split on
    single spaces. The score is the number of distinct shared tokens divided by the
    number of distinct tokens overall. ``query_embedding`` is accepted so the
    signature matches the other similarity functions; it is not used.
    """
    query_tokens = set(query.lower().split(" "))
    doc_tokens = set(document['title_and_desc'].lower().split(" "))
    shared = query_tokens & doc_tokens
    combined = query_tokens | doc_tokens
    return len(shared) / len(combined)

def cosine_similarity(query, document, query_embedding=None):
    """Cosine similarity between the query and a document as binary bag-of-words vectors.

    Both the query and ``document['title_and_desc']`` are lower-cased and split on
    single spaces. With each text represented as a 0/1 vector over its distinct
    tokens, the cosine is ``|A & B| / sqrt(|A| * |B|)``.

    The earlier version divided by the product of the raw token-list lengths (no
    square root, duplicates counted). That is not a cosine: identical texts did not
    score 1, and long descriptions were penalised far too heavily.

    ``query_embedding`` is accepted so the signature matches the other similarity
    functions; it is not used.

    Returns:
        A float in ``[0, 1]``.
    """
    query_tokens = set(query.lower().split(" "))
    doc_tokens = set(document['title_and_desc'].lower().split(" "))
    shared = query_tokens & doc_tokens
    # str.split(" ") always yields at least one token (possibly ''), so neither set
    # is empty and the denominator cannot be zero.
    return len(shared) / (len(query_tokens) * len(doc_tokens)) ** 0.5

def return_options(query, corpus, similarity_function=cosine_similarity, top_k=5):
    """Return the text of the ``top_k`` documents most similar to ``query``.

    Args:
        query: The user's query text.
        corpus: DataFrame with a ``'title_and_desc'`` column (plus whatever columns
            ``similarity_function`` reads from each row).
        similarity_function: Callable ``(query, row, query_embedding) -> score``.
        top_k: How many documents to return (defaults to 5, the previous fixed value).

    Returns:
        A list of ``'title_and_desc'`` strings, best match first.
    """
    # Embed the query once up front so embedding-based scorers do not call the API per row.
    query_embedding = rag_utils.get_embedding(query)
    # Score against the `query` argument; the earlier version passed the notebook-level
    # `user_input` global, which silently ignored whatever query the caller supplied.
    similarities = [
        similarity_function(query, doc, query_embedding)
        for _, doc in corpus.iterrows()
    ]
    best_positions = nlargest(
        top_k, range(len(similarities)), key=similarities.__getitem__
    )
    # The positions are row offsets, not index labels, so use .iloc. Label lookup with []
    # breaks or returns the wrong row when the corpus index is not 0..n-1.
    return [corpus['title_and_desc'].iloc[pos] for pos in best_positions]

In [8]:
# Retrieve the best-matching courses and number them for inclusion in the prompt
relevant_documents = return_options(
    user_input, df, similarity_function=embedding_cosine_similarity
)
list_of_relevant_documents = [
    f"{rank}. {doc}" for rank, doc in enumerate(relevant_documents, start=1)
]
relevant_documents_text = "\n".join(list_of_relevant_documents)

# Assemble the user-facing prompt around the retrieved courses
prompt = f"""
You are trying to help this user find an online Computer Science course
From my database of CS courses, here were some recommendations based on the user input: {relevant_documents_text}
The user input is: '{user_input}'
Compile a recommendation to the user based on the recommended Computer Science courses and the user input, 
ranking the courses from the database in order of best fit for the user, and providing a brief explanation for why each course is a fit.
"""

# Conversation sent to the chat model: a fixed system role plus the prompt built above
chat_messages = [
    {"role": "system", "content": "You are a bot that makes recommendations for Computer Science courses."},
    {"role": "user", "content": prompt},
]

try:
    # Ask the chat model for the recommendation
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=chat_messages,
        max_tokens=1000,
    )

    # Extract and display the generated text
    chatgpt_response = response.choices[0].message.content
    print(chatgpt_response)
except openai.OpenAIError as e:
    print(f"OpenAI API error occurred: {e}")
except Exception as e:
    print(f"An error occurred: {e}")


Based on your interest in learning the basics of natural language processing (NLP) and your undergraduate degree in linguistics, here are the top course recommendations tailored to your background and goals:

1. **Select Topics in Python: Natural Language Processing**
   - **Why It's a Fit:** This course is specifically designed for learners who are novices to NLP and aims to provide a hands-on approach to coding. Since you already have a linguistics background, this course will allow you to bridge your knowledge of languages with programming by helping you analyze and process text through practical coding experiences, without overwhelming you with too much theory. It also provides instant feedback, making it easier for you to grasp NLP concepts efficiently.

2. **Natural Language and the Computer Representation of Knowledge**
   - **Why It's a Fit:** This course focuses on building systems for human language processing, integrating both theory and practical applications. With your bac