Based on: https://github.com/tomasonjo/blogs/blob/master/youtube/video2graph.ipynb

Uses newspaper3k to download and parse the article text: https://pythonrepo.com/repo/codelucas-newspaper-python-web-crawling

In [1]:
import pandas as pd
import newspaper
import openai
import tiktoken
from secret_credentials import OPENAI_API_KEY


In [2]:
def num_tokens_from_messages(messages, model="gpt-3.5-turbo"):
    """Return the number of prompt tokens used by a list of chat messages.

    Args:
        messages: Iterable of chat message dicts (e.g. ``{"role": ...,
            "content": ...}``). Every value in each dict is tokenized, so all
            values must be strings.
        model: Chat model name. Supported: ``"gpt-3.5-turbo"`` and its dated
            snapshot ``"gpt-3.5-turbo-0301"``, which share the per-message
            overhead constants used below.

    Returns:
        int: Total token count, including the fixed per-message overhead and
        the tokens that prime the assistant's reply.

    Raises:
        NotImplementedError: If ``model`` is not one of the supported names.
    """
    supported_models = ("gpt-3.5-turbo", "gpt-3.5-turbo-0301")
    # Fail fast: reject unsupported models before loading any encoding, so the
    # caller gets the NotImplementedError without tiktoken doing wasted work.
    if model not in supported_models:  # note: future models may deviate from this
        raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.
  See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens.""")

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken does not know this model name; fall back to cl100k_base.
        encoding = tiktoken.get_encoding("cl100k_base")

    num_tokens = 0
    for message in messages:
        num_tokens += 4  # every message follows <im_start>{role/name}\n{content}<im_end>\n
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":  # if there's a name, the role is omitted
                num_tokens -= 1  # role is always required and always 1 token
    num_tokens += 2  # every reply is primed with <im_start>assistant
    return num_tokens

In [3]:
# Authenticate the openai client with the key imported from secret_credentials.
openai.api_key = OPENAI_API_KEY
# System prompt for the chat model: casts it as a financial/economic/political
# analyst and asks for three output sections (entity, relationship,
# current_event), followed by one worked INPUT/OUTPUT example. The text is sent
# to the model verbatim, so any edit to it changes model behavior.
prompt_system = '''You are an expert financial, economic, and political analyst helping to read news articles and extract relevant information (entities and relationships) into a knowledge and current events graph. As input, you will accept the text of a news article. The first line of the input will always be the article headline. You will generate output that contains three sections:
entity - All relevant entities (related to finance, the economy, or politics), labeled with an appropriate descriptive category. Each entity is written on its own line as “LABEL {Entity Name}”
relationship - All direct relationships between the extracted entities. Each relationship is written on its own line as: “{Head Entity Name} RELATIONSHIP {Tail Entity Name}”
current_event - All news items (actions or events described in the article that involve one or more of the extracted entities but are not simple direct relationships), along with the associated entities (only reference entities which you have previously defined in the first section). Each event and associated entities is written on its own line as “NEWS_ITEM {Entity 1} {Entity 2} {…} {Entity n}”

To help you understand the requirements, here is an example:
INPUT
DeSantis threatens Disney with legal retaliation
Florida Governor Ron DeSantis escalated the state's ongoing legal battle with Disney for control over their special district in Orlando, Florida.

OUTPUT
entity:
PLACE {Florida} 
PLACE {Orlando}
PERSON {Ron DeSantis}
COMPANY {Disney}
PLACE {Disney Special District}

relationship:
{Ron DeSantis} GOVERNOR_OF {Florida}
{Disney} OWNS {Disney Special District}
{Disney Special District} IN {Orlando}
{Orlando} IN {Florida} 

current_event:
ONGOING_LEGAL_BATTLE {Florida} {Disney} {Ron DeSantis} {Disney Special District}
'''

In [5]:
# Fetch and parse a single CNN article. The user prompt is the headline on its
# own first line, followed by the article body.
# (Disabled alternative: crawl the whole site with newspaper.build('http://cnn.com')
# and filter the resulting article URLs.)
article_url = "https://www.cnn.com/2023/04/06/tech/korea-samsung-chips-cuts-hnk-intl/index.html"
article = newspaper.Article(article_url)
article.download()
article.parse()

prompt_input = "\r\n".join([article.title, article.text])

In [6]:
# Assemble the chat payload (system instructions first, then the article) and
# display how many prompt tokens it will consume.
system_message = {"role": "system", "content": prompt_system}
user_message = {"role": "user", "content": prompt_input}
prompt_messages = [system_message, user_message]
num_tokens_from_messages(prompt_messages, model="gpt-3.5-turbo")

882

In [7]:
# Send the assembled messages to the chat completion endpoint.
completion_request = {
    "model": "gpt-3.5-turbo",
    "messages": prompt_messages,
    "temperature": 0,  # most deterministic
}
response = openai.ChatCompletion.create(**completion_request)

In [28]:
# Pull the assistant's reply text out of the first returned choice.
first_choice = response["choices"][0]
restext = first_choice["message"]["content"]

In [29]:
# Show the raw model reply; printing the whole string emits the same text as
# printing it line by line.
print(restext)

entity:
COMPANY {Samsung Electronics}
PRODUCT {memory chips}
COMPANY {SK Hynix}
PRODUCT {TV}
PLACE {Seoul}
TIME {January-March 2019}

relationship:
{Samsung Electronics} IS_WORLD'S_LARGEST {memory chip and TV maker}
{Samsung Electronics} OWNS {memory chips}
{SK Hynix} IS_RIVAL_OF {Samsung Electronics}
{memory chips} ARE_PRODUCED_IN {Seoul}

current_event:
CHIP_PRODUCTION_CUT {Samsung Electronics}
DECLINE_IN_MEMORY_DEMAND {Samsung Electronics}
DECLINE_IN_SEMICONDUCTOR_BUYERS {data center operators, smartphone and personal computer makers}
INVENTORY_ADJUSTMENT {Samsung Electronics}
