In [2]:
# init
import os
import openai

# Take the API key from the environment so no secret is written in the notebook.
openai.api_key = os.environ.get("OPENAI_API_KEY")



In [3]:
# Settings for the embedding step.
# Name of the embedding model.
embedding_model = "text-embedding-ada-002"
# Tokenizer encoding that goes with text-embedding-ada-002.
embedding_encoding = "cl100k_base"
# Longest text (in tokens) that will be embedded; the model's own limit is 8191.
max_tokens = 8000

In [7]:
import pandas as pd

# Load the CSV and keep only the four columns used in this notebook.
input_path = "./datasets/90minFootballTransferNewsNLP.csv"
df = pd.read_csv(input_path)
df = df[["Title", "Date", "Link", "Content"]]
# Drop rows with any missing field so the string concatenation below never
# has to deal with NaN.
df = df.dropna()
# Build one text field per row from the whitespace-trimmed title and content.
df["Combined"] = (
    "Title: " + df.Title.str.strip() + "; Content: " + df.Content.str.strip()
)

# Number of rows that survived the cleaning.
print(df.shape[0])

# The preview has to be the last statement of the cell: Jupyter only renders a
# bare expression when it is the final one, so a mid-cell `df.head(2)` is
# silently discarded.
df.head(2)

6726


In [8]:
import tiktoken

# Work on the first `top_n` rows only.
top_n = 100
# `.copy()` turns the positional slice into an independent frame, so adding the
# `n_tokens` column below does not write into a slice of the full frame (which
# is what raises pandas' SettingWithCopyWarning).
df = df.iloc[:top_n].copy()

encoding = tiktoken.get_encoding(embedding_encoding)

# Count the tokens of every combined text and drop rows that exceed the limit.
df["n_tokens"] = df.Combined.apply(lambda x: len(encoding.encode(x)))
# `.tail(top_n)` cannot remove anything here (the frame already has at most
# `top_n` rows); the trailing `.copy()` again yields an independent frame so
# that columns can be added to `df` afterwards without a copy warning.
df = df[df.n_tokens <= max_tokens].tail(top_n).copy()
len(df)

100

In [6]:
from openai.embeddings_utils import get_embedding

# Create the output directory up front: `to_csv` raises OSError when the
# directory is missing, and that would only surface after the slow embedding
# step below has already finished.
output_path = "./results/footballTransferNewsEmbeddings.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Calls `get_embedding` once per row of `df.Combined`; this may take a few minutes.
df["embedding"] = df.Combined.apply(lambda x: get_embedding(x, engine=embedding_model))
df.to_csv(output_path)

In [7]:
# Preview the first five rows (five is the default for `head`).
df.head()

Unnamed: 0,Title,Date,Link,Content,Combined,n_tokens,embedding
0,Football transfer rumours: Why Maguire's Man U...,"Aug 15, 2023",https://www.90min.com/posts/football-transfer-...,"90minÂ rounds up the latestÂ transfer news, ru...",Title: Football transfer rumours: Why Maguire'...,458,"[-0.004948447924107313, -0.007800770457834005,..."
1,Chelsea agree Romeo Lavia fee with Southampton,"Aug 15, 2023",https://www.90min.com/posts/chelsea-agree-rome...,Chelsea have finalised an agreement with South...,Title: Chelsea agree Romeo Lavia fee with Sout...,327,"[0.019322646781802177, 0.009196124039590359, 0..."
2,Harry Maguire's proposed West Ham transfer col...,"Aug 15, 2023",https://www.90min.com/posts/harry-maguire-prop...,Harry Maguire's proposed transfer to West Ham ...,Title: Harry Maguire's proposed West Ham trans...,372,"[-0.013167046010494232, -0.023806432262063026,..."
3,Southampton director breaks down Chelsea & Liv...,"Aug 15, 2023",https://www.90min.com/posts/southampton-direct...,Southampton director Jason Wilcox has revealed...,Title: Southampton director breaks down Chelse...,419,"[0.013198532164096832, 0.01562053058296442, 0...."
4,Neymar completes move from PSG to Al Hilal,"Aug 15, 2023",https://www.90min.com/posts/neymar-completes-m...,Saudi Pro League side Al Hilal have confirmed ...,Title: Neymar completes move from PSG to Al Hi...,393,"[-0.006522698327898979, 0.015212981030344963, ..."
