In [44]:
import pandas as pd
import json
import os
import requests
from dotenv import load_dotenv
import openai

# Load variables from a local .env file into the process environment (python-dotenv).
load_dotenv()

True

In [45]:
# Credentials and endpoint are read from the environment; a missing variable raises KeyError.
BING_KEY = os.environ["BING_SEARCH_V7_SUBSCRIPTION_KEY"]  # sent as the Ocp-Apim-Subscription-Key header
BING_API_ENDPOINT = os.environ["BING_SEARCH_V7_ENDPOINT"]  # URL requested by get_news
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]  # passed to the OpenAI client

In [102]:
def get_news(query="", count=100, timeout=30):
    """Fetch search results for ``query`` from the configured Bing endpoint.

    Args:
        query: Free-text search string, sent as the ``q`` parameter.
        count: Number of results requested from the API.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook on a stalled connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout`` seconds.
    """
    headers = {"Ocp-Apim-Subscription-Key": BING_KEY}
    # Results are requested for the en-US market, sorted by date.
    params = {"mkt": "en-US", "q": query, "count": count, "sortBy": "Date"}
    response = requests.get(
        BING_API_ENDPOINT,
        headers=headers,
        params=params,
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json()

### Specify queries for each location

In [103]:
# One entry per monitored supplier location; each lists the event types to search for.
QUERIES = [
    dict(
        location="Bogota, Colombia",
        events=["natural disaster", "workers strike"],
    ),
    dict(
        location="Solway, Guatemala",
        events=["natural disaster", "workers strike"],
    ),
    dict(
        location="Onca Puma, Brazil",
        events=["natural disaster", "workers strike"],
    ),
]

## Fetch articles from the news API (expensive cell 🥲)

In [105]:
# Collect one flat record per article returned for every (event, location) search.
articles = []

# Plain module-level loops: the loop variables stay visible to later cells.
for query in QUERIES:
    for event in query["events"]:
        # The search text is the event followed by the location.
        search_results = get_news(query=f"{event} {query['location']}")
        for article in search_results["value"]:
            record = {
                "location": query["location"],
                "event": event,
                "name": article["name"],
                "url": article["url"],
                "description": article["description"],
                "timestamp": article["datePublished"],
            }
            articles.append(record)

print(f"Found {len(articles)} articles")

Found 354 articles


In [106]:
# Persist the fetched articles to disk.
os.makedirs("data", exist_ok=True)  # the target directory may not exist yet
# The context manager guarantees the file is flushed and closed after writing.
with open("data/full-news-data.json", "w") as output_file:
    json.dump(articles, output_file, indent=4)

#### Try analysing news with GPT

In [65]:
from openai import OpenAI

# Shared client; fetch_completions sends its chat completion requests through it.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [178]:
def get_system_prompt(location, events=None):
    """Build the system prompt that frames the model as a supply-risk analyst.

    Args:
        location: Name of the supplier location the report is about.
        events: Optional event type(s) to watch for, given as a single string
            or an iterable of strings. Several events are joined with " or ".
            When omitted (or empty) the prompt refers to a generic
            "disruptive event".

    Returns:
        The system prompt text, including the expected report format.
    """
    # The event wording comes from the `events` argument rather than from a
    # notebook global, so the prompt does not depend on cell execution order.
    if events is None:
        event_text = "disruptive event"
    elif isinstance(events, str):
        event_text = events
    else:
        event_text = " or ".join(events) or "disruptive event"
    return f"""
You're an analyst for Outokumpu, a stainless steel manufacturer.
You're tasked with monitoring news articles for events that could impact the company's operations.
You have an important raw material supplier in {location}.
You monitor news articles for potentially disruptive events in {location}.
If there is a {event_text} that could impact the company's operations, you need to mention it in your report.
If the news don't contain important information, you MUST NOT mention it in your daily report.
For important information, the location must match {location} and the event must be important.
Your report is of the form:
Potential risk in {location}: <boolean>
If <boolean> is true, output the following two lines:
RELEVANT NEWS ARTICLES: [[$title$, $content$, $url$], ...]
SUMMARY OF RISK: ...
"""

In [179]:
def get_query_prompt(article_list):
    """Render the given articles as one user-prompt string.

    Each article contributes a Title / Description / URL entry taken from its
    "name", "description" and "url" keys. An empty list yields an empty string.
    """
    # Layout of one entry: a leading newline, then three lines indented by
    # eight spaces, then a trailing newline plus eight spaces.
    entry_template = (
        "\n"
        "        Title: {name}\n"
        "        Description: {description}\n"
        "        URL: {url}\n"
        "        "
    )
    return "".join(
        entry_template.format(
            name=item["name"],
            description=item["description"],
            url=item["url"],
        )
        for item in article_list
    )

In [180]:
# Upper bound on how many articles of each (location, event) pair go into a prompt.
MAX_ARTICLES_PER_EVENT = 5

def generate_prompt_for_each_location(queries, articles):
    """Assemble one prompt bundle per query location.

    For every entry in ``queries`` the articles whose "location" and "event"
    match are selected, keeping at most MAX_ARTICLES_PER_EVENT per event in
    their original order.

    Returns:
        A list of dicts with the keys "location", "system_prompt",
        "query_prompt" and "articles" (the selected articles).
    """
    def _first_matches(place, event_name):
        # Articles for one (location, event) pair, truncated to the cap.
        hits = [
            candidate for candidate in articles
            if candidate["location"] == place
            and candidate["event"] == event_name
        ]
        return hits[:MAX_ARTICLES_PER_EVENT]

    bundles = []
    for entry in queries:
        place = entry["location"]
        system_text = get_system_prompt(place)
        selected = [
            hit
            for event_name in entry["events"]
            for hit in _first_matches(place, event_name)
        ]
        bundles.append(
            dict(
                location=place,
                system_prompt=system_text,
                query_prompt=get_query_prompt(selected),
                articles=selected,
            )
        )
    return bundles

In [181]:
def fetch_completions(prompts_list, model="gpt-3.5-turbo"):
    """Request one chat completion per prompt bundle.

    Args:
        prompts_list: Iterable of dicts that provide "system_prompt" and
            "query_prompt" strings.
        model: Name of the chat model to query. Defaults to "gpt-3.5-turbo".

    Returns:
        A list of dicts, one per input bundle and in the same order, each
        holding the original bundle under "prompt" and the API response
        object under "completion".
    """
    completions = []
    for prompt in prompts_list:
        # The system message carries the analyst instructions; the user
        # message carries the rendered articles.
        messages = [
            {"role": "system", "content": prompt["system_prompt"]},
            {"role": "user", "content": prompt["query_prompt"]},
        ]
        completion = openai_client.chat.completions.create(
            messages=messages,
            model=model,
        )
        completions.append({"prompt": prompt, "completion": completion})
    return completions

## Fetch analysis from GPT for each location (expensive cell 🥲)

In [182]:
# Build one prompt bundle per location from the fetched articles, then query the model for each.
prompts = generate_prompt_for_each_location(QUERIES, articles)
first_completions = fetch_completions(prompts)

In [183]:
# Show each location's report, followed by a blank line.
for c in first_completions:
    report_text = c["completion"].choices[0].message.content
    print(report_text, end="\n\n")

Potential risk in Bogota, Colombia: False

Potential risk in Solway, Guatemala: False

Potential risk in Onca Puma, Brazil: False



## Try adding fake articles about important risks and see if GPT flags them

In [184]:
# Synthetic articles describing clearly disruptive events.
fake_earthquake_bogota = dict(
    location="Bogota, Colombia",
    event="natural disaster",
    name="Earthquake in Bogota",
    url="https://fakeurl.com/earthquake",
    description="A 6.1 magnitude earthquake has struck the Colombian capital, Bogotá, causing buildings to shake and people to run into the streets.",
    timestamp="2021-07-25T21:35:00.0000000Z",
)

fake_strike_onca_puma = dict(
    location="Onca Puma, Brazil",
    event="workers strike",
    name="Factory workers' union calls for strike in Onca Puma",
    url="https://fakeurl.com/workers-strike",
    description="The union representing mining workers in Onca Puma has called for a strike.",
    timestamp="2021-07-25T21:35:00.0000000Z",
)

# The fakes go at the front: generate_prompt_for_each_location keeps only the
# first MAX_ARTICLES_PER_EVENT articles for each (location, event) pair.
extended_articles = [fake_earthquake_bogota, fake_strike_onca_puma, *articles]

## Fetch analysis from GPT for the extended queries (expensive cell 🥲)

In [185]:
# Same pipeline as before, now run on the article list that starts with the two fake articles.
second_prompts = generate_prompt_for_each_location(QUERIES, extended_articles)
second_completions = fetch_completions(second_prompts)

In [186]:
# Show each location's report for the extended article list, followed by a blank line.
for c in second_completions:
    report_text = c["completion"].choices[0].message.content
    print(report_text, end="\n\n")

Potential risk in Bogota, Colombia: True

RELEVANT NEWS ARTICLES: 
[["Earthquake in Bogota", "A 6.1 magnitude earthquake has struck the Colombian capital, Bogotá, causing buildings to shake and people to run into the streets.", "https://fakeurl.com/earthquake"]]

SUMMARY OF RISK: 
There has been a 6.1 magnitude earthquake in Bogotá, Colombia. This could potentially impact the company's operations in the region.

Potential risk in Solway, Guatemala: False

Potential risk in Onca Puma, Brazil: True

RELEVANT NEWS ARTICLES: [['Factory workers' union calls for strike in Onca Puma', 'The union representing mining workers in Onca Puma has called for a strike.', 'https://fakeurl.com/workers-strike']]

SUMMARY OF RISK: There is a risk of a workers' strike in Onca Puma, Brazil. This strike could potentially disrupt the company's operations.

