## Imports

In [1]:
import openai
from openai import OpenAI
from config import OPENAI_API_KEY
from heapq import nlargest
import numpy as np

# Shared OpenAI client for the chat-completion cells; the key is read from the local config module.
client = OpenAI(api_key=OPENAI_API_KEY)
import rag_utils
import pandas as pd
import requests
import json

# CSV of courses that the data-loading cell reads into `df`.
file_name = "all_courses_with_embeddings.csv"
# Embedding model name; not referenced by the cells visible here —
# presumably the model the stored embeddings were built with (TODO confirm).
embedding_model_name = 'text-embedding-3-small'

## Reading in the Data

### (And potentially doing the embedding)

In [2]:
df = pd.read_csv(file_name)

# Branch on the column that the parsing step actually needs. The previous test
# ("title_and_desc missing AND embedding missing") sent a frame that had
# `title_and_desc` but no `embedding` into the parsing branch, where
# df['embedding'] raised KeyError.
if 'embedding' in df.columns:
    print("Embeddings already exist in the dataframe")
    print("But we need to convert them into NP arrays")
    # Each stored value is decoded with json.loads and wrapped in an ndarray so
    # the similarity functions can use np.dot / np.linalg.norm on it.
    df['embedding'] = df['embedding'].apply(lambda x: np.array(json.loads(x)))
    print("Done")
else:
    choice = input("Do you want to run the script to generate embeddings? Caution: this will take a while and burn your OpenAI credits. (yes/no)")
    # Accept "Yes" / " yes " as well as the exact lowercase answer.
    if choice.strip().lower() == 'yes':
        df = rag_utils.embed_all_courses(df)
    else:
        # Make the consequence visible instead of failing later with a KeyError.
        print("Warning: no 'embedding' column is available; embedding-based retrieval below will fail.")

Embeddings already exist in the dataframe
But we need to convert them into NP arrays
Done


## User Prompt and Input

In [3]:
# Question put to the learner; not referenced by the cells visible here.
# NOTE(review): "undegraduate" is reproduced exactly as originally written.
user_prompt = (
    "What are your learning goals for an online Computer Science course? "
    "(For example: I want to learn the basics of programming and how to code. "
    "I have an undegraduate degree in Psychology)"
)
# Sample learner answer; used as the retrieval query and quoted in the first LLM prompt.
user_input = (
    "I want to learn the basics of natural language processing, "
    "what course should I take? "
    "I have an undegraduate degree in linguistics"
)

In [4]:
## Step 2: Write a similarity function that computes the similarity between query (user input) and document (course description)
def embedding_cosine_similarity(query, document, query_embedding=None):
    """Cosine similarity between a query embedding and a document's embedding.

    Args:
        query: Query text. It is only embedded (via rag_utils.get_embedding)
            when ``query_embedding`` is not supplied.
        document: Mapping-like row exposing an ``'embedding'`` entry.
        query_embedding: Optional precomputed embedding of ``query``; pass it
            to avoid re-embedding the same query for every document.

    Returns:
        dot(q, d) / (||q|| * ||d||), or 0.0 when either vector has zero norm.
    """
    # Embed the query only if the caller did not already do so.
    if query_embedding is None:
        query_embedding = rag_utils.get_embedding(query)
    document_embedding = document['embedding']
    denominator = np.linalg.norm(query_embedding) * np.linalg.norm(document_embedding)
    # A zero-norm vector would give 0/0 = NaN, which makes any ranking built on
    # the scores unreliable; treat it as "no similarity" instead.
    if denominator == 0:
        return 0.0
    return np.dot(query_embedding, document_embedding) / denominator

In [6]:
def jaccard_similarity(query, document, query_embedding=None):
    """Jaccard overlap of the lowercase, space-split tokens of query and document.

    ``document`` must expose a ``'title_and_desc'`` string. ``query_embedding``
    is accepted so the signature matches the other similarity functions and is
    ignored here.
    """
    query_tokens = set(query.lower().split(" "))
    document_tokens = set(document['title_and_desc'].lower().split(" "))
    shared = query_tokens & document_tokens
    combined = query_tokens | document_tokens
    return len(shared) / len(combined)

def cosine_similarity(query, document, query_embedding=None):
    """Set-based (binary bag-of-words) cosine similarity of query and document.

    Both texts are lowercased and split on single spaces; the score is
    |A ∩ B| / sqrt(|A| * |B|) over the two token sets, so identical texts
    score 1.0 and disjoint texts score 0.0.

    Args:
        query: Query text.
        document: Mapping-like row exposing a ``'title_and_desc'`` string.
        query_embedding: Ignored; present so the signature matches the other
            similarity functions.

    Returns:
        A float in [0, 1].
    """
    query_tokens = set(query.lower().split(" "))
    document_tokens = set(document['title_and_desc'].lower().split(" "))
    shared = query_tokens & document_tokens
    # The previous denominator was len(query_list) * len(document_list): it
    # counted duplicate tokens and omitted the square root, so the result was
    # not a cosine (identical n-token texts scored 1/n instead of 1).
    # split(" ") always yields at least one token, so neither set is empty.
    return len(shared) / (len(query_tokens) * len(document_tokens)) ** 0.5

def return_options(query, corpus, similarity_function=cosine_similarity, top_k=5):
    """Return the best-matching course texts and links for a query.

    Args:
        query: Query text to score every document against.
        corpus: DataFrame with ``'title_and_desc'`` and ``'Link'`` columns
            (plus whatever ``similarity_function`` reads from each row).
        similarity_function: Callable invoked as
            ``f(query, row, query_embedding)`` that returns a comparable score.
        top_k: Number of results to return (default 5, the previous fixed value).

    Returns:
        ``(options_returned, relevant_doc_links)``: two parallel lists ordered
        from most to least similar.
    """
    # Embed once and hand the vector to every call so embedding-based scorers
    # do not re-embed the query per row.
    # NOTE(review): this is also computed when a purely lexical scorer is used.
    query_embedding = rag_utils.get_embedding(query)
    # Score against the `query` argument; the previous code read the notebook
    # global `user_input`, so lexical scorers ignored the query passed in.
    similarities = [
        similarity_function(query, doc, query_embedding)
        for _, doc in corpus.iterrows()
    ]
    top_positions = nlargest(top_k, range(len(similarities)), key=similarities.__getitem__)
    # Positions come from iteration order, so select rows by position (.iloc)
    # rather than by index label; labels only coincide with positions on a
    # default RangeIndex.
    options_returned = [corpus['title_and_desc'].iloc[i] for i in top_positions]
    relevant_doc_links = [corpus['Link'].iloc[i] for i in top_positions]
    return options_returned, relevant_doc_links

In [13]:
# Create the prompt: retrieve the courses most similar to the user's input
# (by embedding cosine similarity) and format them as numbered lists.
relevant_documents, relevant_doc_links = return_options(user_input, df, similarity_function=embedding_cosine_similarity)
list_of_relevant_documents = [f"{i+1}. {doc}" for i, doc in enumerate(relevant_documents)]
list_of_relevant_documents_links = [f"{i+1}. {link}" for i, link in enumerate(relevant_doc_links)]
relevant_documents_text = "\n".join(list_of_relevant_documents)
relevant_documents_links_text = "\n".join(list_of_relevant_documents_links)
# The retrieved descriptions and links are interpolated into the instruction text.
# `prompt` is read again by the follow-up cell that rebuilds the conversation.
prompt = f"""
You are trying to help this user find an online Computer Science course
From my database of CS courses, here were some recommendations based on the user input: {relevant_documents_text}
The user input is: '{user_input}'
Compile a recommendation to the user based on the recommended Computer Science courses and the user input, 
returning the top 3 courses with their links embedded in the title: {relevant_documents_links_text} from the database, ranked in order of best fit for the user, and providing a brief explanation for why each course is a fit.
Additionally, ask the user a relevant question to gather more infomation about whether they possess the prerequisite knowledge to take the courses.
"""
#########

# NOTE(review): both handlers only print the error, so on failure
# `chatgpt_response` stays undefined (or stale from an earlier run) and the
# follow-up cell that reads it will fail or replay old text.
try:
    # Make the request to the OpenAI API
    response = client.chat.completions.create(
        model="gpt-4o-mini",  
        messages=[
            {"role": "system", "content": "You are a bot that makes recommendations for Computer Science courses."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=1000,
    )
    
    # Process the response: keep the text of the first choice for the next turn
    chatgpt_response = response.choices[0].message.content
    print(chatgpt_response)

except openai.OpenAIError as e:
    print(f"OpenAI API error occurred: {e}")
except Exception as e:
    print(f"An error occurred: {e}")
#########


Based on your interest in learning the basics of Natural Language Processing (NLP) and your background in linguistics, here are the top three course recommendations that would suit you well:

1. [Select Topics in Python: Natural Language Processing](https://www.coursera.org/learn/codio-select-topics-python-natural-language-processing)  
   This course is perfect for novices in NLP and provides a hands-on, video-free learning experience. Since you have some experience with linguistics, you will find it engaging to process and analyze text, along with building practical applications such as chatbots. Additionally, this course does not require extensive prior programming knowledge, making it a great entry point.

2. [Natural Language and the Computer Representation of Knowledge](https://ocw.mit.edu/courses/6-863j-natural-language-and-the-computer-representation-of-knowledge-spring-2003/)  
   This laboratory-oriented course aligns well with your linguistics background, as it delves into b

In [12]:
# Sample answer to the prerequisite question asked in the first recommendation.
user_response = "I took an intro programming class taught in Python in college, but do not have much experience with machine learning."

# Ask the model to turn the user's answer into a short retrieval query.
user_response_prompt = f"""
The user responded: '{user_response}'
Based on the user response, could you come up with a short (under 15 words) plain text query for me to run in my vector database so that we can recommend courses to the user to satisfy any missing prerequisites?
Return just the query.
"""

# Replay the first exchange so the model has the conversation context.
# NOTE(review): this depends on `prompt` and `chatgpt_response` from the first
# recommendation cell; a later cell rebinds both names, so running this cell
# after that one replays the wrong turn.
chat_so_far = [
            {"role": "system", "content": "You are a bot that makes recommendations for Computer Science courses."},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": chatgpt_response},
        ]

# Build a separate message list so the query-generation turn is not stored in chat_so_far.
chat_for_query = chat_so_far + [{"role": "user", "content": user_response_prompt}]

# Make the request to the OpenAI API
# NOTE(review): unlike the recommendation cells, this call has no try/except.
response = client.chat.completions.create(
    model="gpt-4o-mini",  
    messages=chat_for_query,
    max_tokens=1000,
)

# Process the response: the model's text is used verbatim as the retrieval query
chatgpt_query = response.choices[0].message.content
print(chatgpt_query)


"Python programming courses for beginners and machine learning basics for non-experts."


In [16]:
# Retrieve courses for the model-generated prerequisite query and format them
# as numbered lists. These names overwrite the results of the first retrieval.
relevant_documents, relevant_doc_links = return_options(chatgpt_query, df, similarity_function=embedding_cosine_similarity)
list_of_relevant_documents = [f"{i+1}. {doc}" for i, doc in enumerate(relevant_documents)]
list_of_relevant_documents_links = [f"{i+1}. {link}" for i, link in enumerate(relevant_doc_links)]
relevant_documents_text = "\n".join(list_of_relevant_documents)
relevant_documents_links_text = "\n".join(list_of_relevant_documents_links)
# NOTE(review): this rebinds `prompt`, replacing the first-turn prompt that the
# follow-up cell reads when it rebuilds chat_so_far.
# NOTE(review): the text below contains the typo "thier"; it is runtime prompt
# text and is left unchanged here.
prompt = f"""
You are trying to help this user find online Computer Science courses to satisfy any missing prerequisites
From my database of CS courses, here were some recommendations based on the user input: {relevant_documents_text}
The user input is: '{user_response}'
Compile a recommendation to the user based on the recommended Computer Science courses and the user input,
returning the top 2 courses with thier links embedded in the title: {relevant_documents_links_text} from the database, ranked in order of best fit for the user, and providing a brief explanation for why each course is a fit.
"""
# NOTE(review): append mutates chat_so_far in place, so re-running this cell
# adds the same user turn again; rebuild chat_so_far first when re-running.
chat_so_far.append({"role": "user", "content": prompt})

# NOTE(review): both handlers only print the error, so on failure
# `chatgpt_response` still holds the previous turn's text.
try:
    # Make the request to the OpenAI API
    response = client.chat.completions.create(
        model="gpt-4o-mini",  
        messages=chat_so_far,
        max_tokens=1000,
    )
    
    # Process the response: overwrite chatgpt_response with the new recommendation
    chatgpt_response = response.choices[0].message.content
    print(chatgpt_response)

except openai.OpenAIError as e:
    print(f"OpenAI API error occurred: {e}")
except Exception as e:
    print(f"An error occurred: {e}")




Based on your background of having taken an introductory programming course in Python and your desire to learn more about machine learning, here are the top two course recommendations that would best suit your needs:

1. **[Introduction to Machine Learning with Python](https://www.coursera.org/learn/introduction-to-machine-learning-with-python)**  
   This course is a perfect fit for you since it introduces the concepts of machine learning using Python, which aligns with your existing programming knowledge. It covers key areas such as supervised and unsupervised learning, deep learning, and the implementation of various machine learning models. Since the course does not require advanced programming skills, it will provide a solid foundation in machine learning concepts while allowing you to leverage your prior experience with Python.

2. **[Python Programming Fundamentals](https://www.coursera.org/learn/python-programming-fundamentals)**  
   While you have already taken an introductor