## Call Wikipedia API to get 2022 events listings

In [53]:
import requests

In [54]:
# Query parameters for the MediaWiki API: request the plain-text extract of
# the "2022" article, a year OpenAI's models (which stop in 2021) don't cover.
params = dict(
    action="query",
    prop="extracts",
    exlimit=1,
    titles="2022",
    explaintext=1,
    formatversion=2,
    format="json",
)

In [55]:
# Fetch the extract from the English Wikipedia API. The timeout keeps the cell
# from hanging forever on a stalled connection, and raise_for_status() reports
# an HTTP error here rather than as a confusing JSON/KeyError in a later cell.
resp = requests.get("https://en.wikipedia.org/w/api.php", params=params, timeout=30)
resp.raise_for_status()

In [56]:
# Decode the JSON body of the API response into plain Python objects.
response_dict = resp.json()

In [57]:
# Preview the first page's plain-text extract, split into one string per line.
response_dict["query"]["pages"][0]["extract"].split("\n")

['2022 (MMXXII) was a common year starting on Saturday of the Gregorian calendar, the 2022nd year of the Common Era (CE) and Anno Domini (AD) designations, the 22nd  year of the 3rd millennium and the 21st century, and the  3rd   year of the 2020s decade.  ',
 'The year 2022 saw the removal of nearly all COVID-19 restrictions and the reopening of international borders in most countries, and the global rollout of COVID-19 vaccines continued. The global economic recovery from the pandemic continued, though many countries experienced an ongoing inflation surge; in response, many central banks raised their interest rates to landmark levels. The world population reached eight billion people in 2022, though the year also witnessed numerous natural disasters, including two devastating Atlantic hurricanes (Fiona and Ian), and the most powerful volcano eruption of the century so far. The later part of the year also saw the first public release of ChatGPT by OpenAI starting an arms race in artif

In [58]:
import pandas as pd

In [59]:
# One row per line of the article extract, stored in a single "text" column.
extract_lines = response_dict["query"]["pages"][0]["extract"].split("\n")
df = pd.DataFrame({"text": extract_lines})

In [60]:
# Discard blank lines: keep only rows whose text has at least one character.
has_text = df["text"].str.len().gt(0)
df = df[has_text]
df

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year 2022 saw the removal of nearly all CO...
2,2022 was also dominated by wars and armed conf...
5,== Events ==
8,=== January ===
...,...
245,== Demographics ==
246,The world population was estimated to have rea...
249,== Deaths ==
252,== Nobel Prizes ==


In [61]:
# Remove wiki section headings such as "== Events ==" and "=== January ===".
is_heading = df["text"].str.startswith("==")
df = df[~is_heading]
df

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year 2022 saw the removal of nearly all CO...
2,2022 was also dominated by wars and armed conf...
9,1 January – The Regional Comprehensive Econom...
10,2 January – Abdalla Hamdok resigns as Prime Mi...
...,...
239,21–26 December – A major winter storm hits the...
240,24 December – 2022 Fijian general election: Th...
241,29 December – Brazilian football legend Pelé d...
242,31 December – Former Pope Benedict XVI dies at...


In [62]:
from dateutil.parser import parse

In [63]:
# Give every event line a "<date> – " prefix.
#
# Lines fall into three groups:
#   * already contain " – " (en dash)  -> kept unchanged;
#   * parse as a date on their own     -> become the running prefix and are
#                                         dropped from the result;
#   * anything else                    -> kept as prefix + " – " + text.
#
# The result is collected in a list and turned into a fresh frame instead of
# writing into the rows yielded by iterrows(): pandas documents that those
# rows may be copies, so such writes are not guaranteed to reach `df`.
prefix = ""
prefixed_texts = []
for text in df["text"]:
    # If the text already has " – ", it already has the needed date prefix
    if " – " in text:
        prefixed_texts.append(text)
        continue
    try:
        parse(text)
    except Exception:
        # Not a date: attach the most recent date prefix. Any parse failure
        # means "not a date", so the catch stays broad, but unlike a bare
        # `except:` it lets KeyboardInterrupt / SystemExit propagate.
        prefixed_texts.append(prefix + " – " + text)
    else:
        # The text is a date on its own: it becomes the new prefix
        prefix = text
df = pd.DataFrame({"text": prefixed_texts})
df

Unnamed: 0,text
0,– 2022 (MMXXII) was a common year starting on...
1,– The year 2022 saw the removal of nearly all...
2,– 2022 was also dominated by wars and armed c...
3,1 January – The Regional Comprehensive Econom...
4,2 January – Abdalla Hamdok resigns as Prime Mi...
...,...
177,21–26 December – A major winter storm hits the...
178,24 December – 2022 Fijian general election: Th...
179,29 December – Brazilian football legend Pelé d...
180,31 December – Former Pope Benedict XVI dies at...


In [64]:
# Save the cleaned event texts. With to_csv's default index=True the row index
# is written as well, as an unnamed first column.
df.to_csv("data/text.csv")

## Embedding

In [65]:
import openai
# Read the OpenAI API key from a local key file. The context manager closes
# the file handle, and strip() removes surrounding whitespace (typically a
# trailing newline) that would otherwise become part of the key.
with open("../../openai_app.key", "rt") as f:
    openai.api_key = f.read().strip()

In [67]:
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
batch_size = 100

# Embed the texts in chunks of `batch_size` rows, collecting one vector per
# row in the same order as the rows of `df`.
embeddings = []
texts = df["text"]
for start in range(0, len(df), batch_size):
    batch = texts.iloc[start:start + batch_size].tolist()

    # One API call per chunk
    response = openai.Embedding.create(input=batch, engine=EMBEDDING_MODEL_NAME)

    # Pull the vector out of each returned item
    for item in response["data"]:
        embeddings.append(item["embedding"])

# Attach the collected vectors as a new column
df["embeddings"] = embeddings

In [68]:
# Display the frame, which now has an "embeddings" column next to "text".
df

Unnamed: 0,text,embeddings
0,– 2022 (MMXXII) was a common year starting on...,"[-9.17855777515797e-06, -0.018004480749368668,..."
1,– The year 2022 saw the removal of nearly all...,"[-0.010302220471203327, -0.023034777492284775,..."
2,– 2022 was also dominated by wars and armed c...,"[-0.009752900339663029, -0.0154692018404603, 0..."
3,1 January – The Regional Comprehensive Econom...,"[-0.002039003651589155, -0.02498391643166542, ..."
4,2 January – Abdalla Hamdok resigns as Prime Mi...,"[-0.016290072351694107, 0.005167210940271616, ..."
...,...,...
177,21–26 December – A major winter storm hits the...,"[-0.02596324123442173, -0.01704932563006878, 0..."
178,24 December – 2022 Fijian general election: Th...,"[-0.010966101661324501, -0.007615169510245323,..."
179,29 December – Brazilian football legend Pelé d...,"[-0.006588870193809271, 0.006336679682135582, ..."
180,31 December – Former Pope Benedict XVI dies at...,"[0.019952034577727318, 0.010480967350304127, -..."
