# Custom Chatbot Project

TODO: In this cell, write an explanation of which dataset you have chosen and why it is appropriate for this task

In [14]:
# environment variables

OPENAI_API_KEY = 'PUT HERE YOU OPENAPI KEY' 

SOURCE_URL = 'https://theanalyst.com/eu/2023/05/101-best-premier-league-facts-2022-23'
PAGE_FILEPATH = './wikipedia.html'
CSV_FILEPATH_WITH_EMBEDDINGS = './wikipedia_with_embeddings.csv'

EMBEDDING_MODEL = 'text-embedding-3-small'
COMPLETION_MODEL = 'gpt-3.5-turbo'

BATCH_SIZE = 25

## Data Wrangling

TODO: In the cells below, load your chosen dataset into a `pandas` dataframe with a column named `"text"`. This column should contain all of your text data, separated into at least 20 rows.

In [15]:
# import libraries

import requests
import pandas as pd
from openai import OpenAI
from bs4 import BeautifulSoup
from typing import List, Union
from scipy.spatial.distance import cosine

In [16]:
# helpers functions
def pull_html_page(url: str):
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        return response.content
    else:
        raise Exception('Connection error')


with open(PAGE_FILEPATH, mode='wb') as html_file:
    html_page = pull_html_page(SOURCE_URL)
    html_file.write(html_page)

### Read Page

In [17]:
with open(PAGE_FILEPATH) as fp:
    soup = BeautifulSoup(fp, 'html.parser')

In [18]:
root_dom_node = soup.find('h2', {'class':'has-text-align-center wp-block-heading'})
root_dom_node

<h2 class="has-text-align-center wp-block-heading"><strong>August</strong></h2>

In [19]:
month_headers = [month_header.find_next('strong') for month_header in soup.find_all('h2', {'class':'has-text-align-center wp-block-heading'})]
month_headers

[<strong>August</strong>,
 <strong>September</strong>,
 <strong>October</strong>,
 <strong>November</strong>,
 <strong>December</strong>,
 <strong>January</strong>,
 <strong>February</strong>,
 <strong>March</strong>,
 <strong>April</strong>,
 <strong>May</strong>]

In [20]:
current_month = None
data = []

for node in root_dom_node.find_all_next():
    if node in month_headers:
        current_month = node.text
    elif node.name == 'ul':
        data.append(f"{current_month} 2024 -- {node.find_next('li').text.strip()}")

In [22]:
import pandas as pd

pd.set_option('display.max_colwidth', None)  
pd.set_option('display.max_rows', None)  

df = pd.DataFrame(data, columns=['text'])
df.head()

Unnamed: 0,text
0,"August 2024 -- On 13 August 2022, Manchester City ended the day top and Manchester United ended the day bottom of the top-flight table for the first time since 29 November 1929."
1,August 2024 -- Erik ten Hag became the first manager to lose each of his first two games in charge of Manchester United since John Chapman in November 1921.
2,"August 2024 -- Harry Kane netted his 185th Premier League goal for Tottenham Hotspur against Wolves, overtaking Sergio Aguero’s record for Premier League goals for a single club (184 for Manchester City)."
3,August 2024 -- Brenden Aaronson’s opening goal in Leeds’ 3-0 win against Chelsea was the first time an American player scored under an American manager (Jesse Marsch) in Premier League history.
4,"August 2024 -- Darwin Núñez came off the bench to score and assist on his Premier League debut for Liverpool against Fulham, only the third player to score and assist as a substitute on debut, along with Sergio Aguero (2011) and Alvaro Morata (2017)."


In [23]:
df.shape

(101, 1)

### Create Embedding Database

In [None]:
from openai import OpenAI

client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
pd.reset_option('display.max_colwidth')  
pd.reset_option('display.max_rows')  

def get_embeddings(prompt: Union[str, List[str]], embedding_model: str) -> List[List[float]]:
    response = client.embeddings.create(
            input=prompt if type(prompt) is list else [prompt],
            model=embedding_model
    )
    return [row.embedding for row in response.data]
                                                                                     

def create_embeddings(df, embedding_model_name: str = EMBEDDING_MODEL, batch_size: int = 25) -> List[List[float]]:
    output = []
    for idx in range(0, len(df), BATCH_SIZE):
        batch = df.iloc[idx:idx+BATCH_SIZE].tolist()
        embeddings = get_embeddings(batch, embedding_model_name)
        output.extend(embeddings)

    return output

df['embedding'] = create_embeddings(df['text'])
df.to_csv(CSV_FILEPATH_WITH_EMBEDDINGS, sep=',', index=False)    
df.head()

## Custom Query Completion

TODO: In the cells below, compose a custom query using your chosen dataset and retrieve results from an OpenAI `Completion` model. You may copy and paste any useful code from the course materials.

In [None]:
# create client
client = OpenAI(api_key=OPENAI_API_KEY)

In [None]:
def build_simple_prompt(question: str):
    return [
        {
            'role': 'user',
            'content': question
        }
    ]

def build_custom_prompt(question: str, database_df):
    return [
        {
            'role': 'system',
            'content': """
            Anser the question based on provided context below. If the question cannot be answered based on provided context, say "I don't know the answer". We have 2024. Context contains facts from season 2022/2023 for English Premier League. Facts are annotated with date and seperated by lines. 
            Context: 
                {}
            """.format('\n\n'.join(build_custom_context(question, database_df)))
        },
        {
            'role': 'user',
            'content': question
        }
    ]

def build_custom_context(question: str, database_df: df, n: int = 5):
    question_embedding = get_embeddings(question, EMBEDDING_MODEL)[0]
    
    df = database_df.copy()
    df["distances"] = df['embedding'].apply(lambda embedding: cosine(embedding, question_embedding))

    df.sort_values("distances", ascending=True, inplace=True)
    return df.iloc[:n]['text'].tolist()


def get_embeddings(prompt: Union[str, List[str]], embedding_model: str) -> List[List[float]]:
    response = client.embeddings.create(
            input=prompt if type(prompt) is list else [prompt],
            model=embedding_model
    )
    return [row.embedding for row in response.data]

def handle_question(prompt, model_name: str = COMPLETION_MODEL):
    response = client.chat.completions.create(
        model=model_name,
        messages=prompt,
        max_tokens=100
    )
    return response.choices[0].message.content

## Custom Performance Demonstration

TODO: In the cells below, demonstrate the performance of your custom query using at least 2 questions. For each question, show the answer from a basic `Completion` model query as well as the answer from your custom query.

In [None]:
df = pd.read_csv(CSV_FILEPATH_WITH_EMBEDDINGS)
df['embedding'] = df['embedding'].apply(lambda value: [float(dim) for dim in value.replace('[', '').replace(']', '').split(',')])

### Question 1

In [None]:
question = 'Who did win the Premier League in season 2022/2023?'
print('__Answer:__', handle_question(build_simple_prompt(question)))
print('__Answer with Context:__', handle_question(build_custom_prompt(question, df)))

### Question 2

In [None]:
question = 'What football team did Harry Kane play in in season 2022/2023?'
print('__Answer:__', handle_question(build_simple_prompt(question)))
print('__Answer with Context:__', handle_question(build_custom_prompt(question, df)))

### Question 3

In [None]:
question = 'What team did finish the match with the most competitive win? What was the result? Who was the opponent?'
print('__Answer:__', handle_question(build_simple_prompt(question)))
print('__Answer with Context:__', handle_question(build_custom_prompt(question, df)))