## Call Wikipedia API to get 2022 events listings

In [1]:
import requests

In [2]:
# Query parameters for fetching the Wikipedia page titled "2022"
# (chosen because OpenAI's models stop in 2021)
params = dict(
    action="query",
    prop="extracts",
    exlimit=1,
    titles="2022",
    explaintext=1,
    formatversion=2,
    format="json",
)

In [3]:
# Fetch the page from the Wikipedia API. The timeout keeps the cell from
# hanging indefinitely on a stalled connection, and raise_for_status() reports
# an HTTP error here instead of as a JSON/KeyError failure in a later cell.
resp = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
resp.raise_for_status()

In [4]:
# Decode the JSON body of the API response
response_dict = resp.json()

In [5]:
# Show the first page's "extract" text split into individual lines
response_dict["query"]["pages"][0]["extract"].split("\n")

['2022 (MMXXII) was a common year starting on Saturday of the Gregorian calendar, the 2022nd year of the Common Era (CE) and Anno Domini (AD) designations, the 22nd  year of the 3rd millennium and the 21st century, and the  3rd   year of the 2020s decade.  ',
 'The year 2022 saw the removal of nearly all COVID-19 restrictions and the reopening of international borders in most countries, and the global rollout of COVID-19 vaccines continued. The global economic recovery from the pandemic continued, though many countries experienced an ongoing inflation surge; in response, many central banks raised their interest rates to landmark levels. The world population reached eight billion people in 2022, though the year also witnessed numerous natural disasters, including two devastating Atlantic hurricanes (Fiona and Ian), and the most powerful volcano eruption of the century so far. The later part of the year also saw the first public release of ChatGPT by OpenAI starting an arms race in artif

In [6]:
import pandas as pd

In [7]:
# One row per line of the page extract, held in a single "text" column
extract_lines = response_dict["query"]["pages"][0]["extract"].split("\n")
df = pd.DataFrame({"text": extract_lines})

In [8]:
# Discard rows whose text is empty
non_empty = df["text"].str.len().gt(0)
df = df[non_empty]
df

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year 2022 saw the removal of nearly all CO...
2,2022 was also dominated by wars and armed conf...
5,== Events ==
8,=== January ===
...,...
242,== Demographics ==
243,The world population was estimated to have rea...
246,== Deaths ==
249,== Nobel Prizes ==


In [9]:
# Discard heading rows, i.e. rows whose text begins with "=="
is_heading = df["text"].str.startswith("==")
df = df[~is_heading]
df

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year 2022 saw the removal of nearly all CO...
2,2022 was also dominated by wars and armed conf...
9,1 January – The Regional Comprehensive Econom...
10,2 January – Abdalla Hamdok resigns as Prime Mi...
...,...
236,21–26 December – A major winter storm hits the...
237,24 December – 2022 Fijian general election: Th...
238,29 December – Brazilian football legend Pelé d...
239,31 December – Former Pope Benedict XVI dies at...


In [10]:
from dateutil.parser import parse

In [11]:
# Give every event line a "<date> – " prefix. Lines that already contain
# " – " (en dash) are kept as-is; a line that parses as a date becomes the
# prefix for the lines after it; any other line gets the current prefix added.
# The new text is collected in a list rather than written through the `row`
# objects yielded by iterrows(), which pandas does not guarantee to write back.
prefix = ""
updated_texts = []
for text in df["text"]:
    if " – " not in text:
        try:
            parse(text)
        except (ValueError, OverflowError):
            # Not a date (dateutil raises ParserError, a ValueError subclass,
            # or OverflowError): attach the most recent date prefix. Until a
            # date line has been seen the prefix is still "".
            text = prefix + " – " + text
        else:
            # A bare date line: remember it for the lines that follow
            prefix = text
    updated_texts.append(text)

# Keep only the lines that now carry the separator; this drops the bare date lines
df = df.assign(text=updated_texts)
df = df[df["text"].str.contains(" – ")].reset_index(drop=True)
df

Unnamed: 0,text
0,– 2022 (MMXXII) was a common year starting on...
1,– The year 2022 saw the removal of nearly all...
2,– 2022 was also dominated by wars and armed c...
3,1 January – The Regional Comprehensive Econom...
4,2 January – Abdalla Hamdok resigns as Prime Mi...
...,...
175,21–26 December – A major winter storm hits the...
176,24 December – 2022 Fijian general election: Th...
177,29 December – Brazilian football legend Pelé d...
178,31 December – Former Pope Benedict XVI dies at...


In [12]:
# Save the cleaned text rows. The output directory is created first because
# DataFrame.to_csv does not create missing parent directories.
import os

os.makedirs("data", exist_ok=True)
df.to_csv("data/text.csv")

## Embedding

In [13]:
import openai

# Load the API key from a file kept outside the notebook. The context manager
# closes the file handle, and strip() removes a trailing newline or other
# surrounding whitespace that would otherwise become part of the key.
with open("../../openai_app.key", "rt") as f:
    openai.api_key = f.read().strip()

In [14]:
# Model name passed as `engine` to the embedding calls in this notebook
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

In [15]:

batch_size = 100
embeddings = []
texts = df["text"].tolist()
for start in range(0, len(texts), batch_size):
    # Request embeddings for the next slice of at most `batch_size` texts
    response = openai.Embedding.create(
        input=texts[start:start + batch_size],
        engine=EMBEDDING_MODEL_NAME
    )

    # Collect the "embedding" of each entry in response["data"]
    for item in response["data"]:
        embeddings.append(item["embedding"])

# Store the collected vectors as a new "embeddings" column
df["embeddings"] = embeddings

In [16]:
# Display the dataframe, which now has an "embeddings" column
df

Unnamed: 0,text,embeddings
0,– 2022 (MMXXII) was a common year starting on...,"[-9.17855777515797e-06, -0.018004480749368668,..."
1,– The year 2022 saw the removal of nearly all...,"[-0.010343171656131744, -0.023116888478398323,..."
2,– 2022 was also dominated by wars and armed c...,"[-0.009752900339663029, -0.0154692018404603, 0..."
3,1 January – The Regional Comprehensive Econom...,"[-0.002039003651589155, -0.02498391643166542, ..."
4,2 January – Abdalla Hamdok resigns as Prime Mi...,"[-0.016290072351694107, 0.005167210940271616, ..."
...,...,...
175,21–26 December – A major winter storm hits the...,"[-0.02596324123442173, -0.01704932563006878, 0..."
176,24 December – 2022 Fijian general election: Th...,"[-0.010850615799427032, -0.007660393137484789,..."
177,29 December – Brazilian football legend Pelé d...,"[-0.006588870193809271, 0.006336679682135582, ..."
178,31 December – Former Pope Benedict XVI dies at...,"[0.01998756267130375, 0.010498648509383202, -0..."


In [17]:
# Save the text rows together with their embeddings. The output directory is
# created first because DataFrame.to_csv does not create missing parent
# directories.
import os

os.makedirs("data", exist_ok=True)
df.to_csv("data/embeddings.csv")

## Cosine Similarity

In [18]:
import ast

import numpy as np
import pandas as pd

# NOTE(review): the embeddings were saved to "data/embeddings.csv" earlier in
# this notebook, but this cell reads "../embeddings.csv" — confirm which file
# is intended.
df = pd.read_csv("../embeddings.csv", index_col=0)

# Each CSV cell holds the text form of a list of floats. ast.literal_eval
# parses that literal without executing arbitrary code, unlike eval, which
# would run whatever expression the file happens to contain.
df["embeddings"] = df["embeddings"].apply(ast.literal_eval).apply(np.array)
df

Unnamed: 0,text,embeddings
0,– 2022 (MMXXII) was a common year starting on...,"[9.742109250510111e-05, -0.018078520894050598,..."
1,– 2022 saw the removal of nearly all COVID-19...,"[-0.006740245968103409, -0.024927902966737747,..."
2,"– The Russian invasion of Ukraine, the larges...","[-0.0062200650572776794, -0.007076282054185867..."
3,"– Many prominent figures died in 2022, includ...","[-0.0067838518880307674, -0.008528643287718296..."
10,January 1 – The Regional Comprehensive Economi...,"[-0.0005128878983668983, -0.02750258892774582,..."
...,...,...
229,December 7 – After substantial protests agains...,"[0.015803100541234016, -0.01866208389401436, -..."
230,December 17 – Leo Varadkar succeeds Micheál Ma...,"[0.008315027691423893, -0.02372470125555992, -..."
231,December 19 – At the UN Biodiversity Conferenc...,"[0.0017093520145863295, -0.009248359128832817,..."
232,December 21–26 – A major winter storm hits the...,"[-0.024563809856772423, -0.020963717252016068,..."


## Find the embedding closest to the question

In [19]:
# Question whose embedding is compared against the row embeddings below
question="When did Russia invaded Ukraine?"

In [20]:
from openai.embeddings_utils import get_embedding
# Embed the question using the model named by EMBEDDING_MODEL_NAME
question_embeddings = get_embedding(question, engine=EMBEDDING_MODEL_NAME)
# Display the resulting embedding
question_embeddings

[0.0020747568923979998,
 -0.018061840906739235,
 0.002959392499178648,
 -0.01575796864926815,
 -0.024858897551894188,
 0.0024216105230152607,
 -0.014510569162666798,
 -0.02499891072511673,
 -0.014879697933793068,
 -0.0198183823376894,
 0.019640183076262474,
 0.026424510404467583,
 -0.007955356501042843,
 -0.013797769322991371,
 -0.007115270476788282,
 -0.010832012630999088,
 0.010882927104830742,
 -0.003538542427122593,
 0.03194871172308922,
 -0.017030825838446617,
 -0.013479555025696754,
 -0.01635621301829815,
 -4.819827154278755e-06,
 -0.00033253387664444745,
 -0.013135883957147598,
 0.007357113528996706,
 0.014586941339075565,
 -0.031388651579618454,
 0.015376112423837185,
 -0.013886869885027409,
 -0.00947642046958208,
 -0.020951226353645325,
 -0.021205797791481018,
 -0.014370555058121681,
 -0.036301881074905396,
 -0.03184688091278076,
 0.009813726879656315,
 -0.008260841481387615,
 0.015579769387841225,
 -0.002591855125501752,
 0.010106484405696392,
 0.018660083413124084,
 -0.00433

In [21]:
from openai.embeddings_utils import distances_from_embeddings

# Cosine distance from the question's embedding to each row's embedding,
# stored in a new "distances" column
row_embeddings = df["embeddings"].to_list()
df["distances"] = distances_from_embeddings(
    question_embeddings, row_embeddings, distance_metric="cosine"
)

In [22]:
# Order the rows by ascending distance and take the first one,
# i.e. the row with the smallest distance to the question
ranked = df.sort_values(by="distances")
shortest = ranked.iloc[0]
shortest

text          March 2 – 2022 Russian invasion of Ukraine: Ru...
embeddings    [-7.068190461723134e-05, -0.03225385770201683,...
distances                                              0.115224
Name: 52, dtype: object

In [25]:
from openai.embeddings_utils import get_embedding, distances_from_embeddings
# Same value as the EMBEDDING_MODEL_NAME assigned earlier in this notebook, repeated in this cell
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

def get_rows_sorted_by_relevance(question, df):
    """
    Rank the rows of a dataframe by how closely they match a question.

    Parameters
    ----------
    question : str
        The question text. It is embedded with EMBEDDING_MODEL_NAME.
    df : pandas.DataFrame
        Rows of text with an "embeddings" column holding one embedding
        per row. The dataframe passed in is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` with an added "distances" column (the cosine
        distance between each row's embedding and the question's
        embedding), sorted from most to least relevant, i.e. by
        ascending distance.
    """

    # Get embeddings for the question text
    question_embeddings = get_embedding(question, engine=EMBEDDING_MODEL_NAME)

    # Work on a copy so the caller's dataframe does not gain a
    # "distances" column as a side effect
    df_copy = df.copy()
    df_copy["distances"] = distances_from_embeddings(
        question_embeddings,
        df_copy["embeddings"].to_list(),
        distance_metric="cosine"
    )

    # Shorter distance = more relevant, so ascending order puts the
    # most relevant rows first
    return df_copy.sort_values("distances", ascending=True)

In [26]:
# Rank every row against the question; rows with the smallest cosine distance come first
get_rows_sorted_by_relevance(question, df)

Unnamed: 0,text,embeddings,distances
52,March 2 – 2022 Russian invasion of Ukraine: Ru...,"[-7.068190461723134e-05, -0.03225385770201683,...",0.115171
79,April 3 – 2022 Russian invasion of Ukraine: As...,"[-0.013741135597229004, -0.02586185373365879, ...",0.115730
188,October 8 – 2022 Russian invasion of Ukraine: ...,"[-0.014346115291118622, -0.017521075904369354,...",0.120102
197,October 29 – 2022 Russian invasion of Ukraine:...,"[-0.007012648973613977, -0.04132407531142235, ...",0.120259
34,February 21–24 – Russian President Vladimir Pu...,"[-0.00456717424094677, -0.005147133022546768, ...",0.121752
...,...,...,...
221,"November 30 – OpenAI releases ChatGPT, an arti...","[-0.011301065795123577, -0.014325111173093319,...",0.291775
74,March 31 – Expo 2020 closes in Dubai after a 6...,"[-0.0031263940036296844, -0.04660109058022499,...",0.294036
210,"November 11 – The cryptocurrency exchange FTX,...","[0.0022525235544890165, -0.025786597281694412,...",0.295230
219,November 20 – 2022 Nepalese general election: ...,"[-0.00422509154304862, -0.0007446320960298181,...",0.296548
