In [44]:
import pandas as pd
import json
import os
import requests
from dotenv import load_dotenv
import openai

load_dotenv()

True

In [235]:
# Service credentials and endpoint, read from the process environment.
# Indexing os.environ (rather than .get) makes a missing variable fail
# immediately with KeyError instead of surfacing later as an auth error.
BING_KEY = os.environ["BING_SEARCH_V7_SUBSCRIPTION_KEY"]
BING_API_ENDPOINT = os.environ["BING_SEARCH_V7_ENDPOINT"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Model identifier passed to the chat-completions call further down.
MODEL_ID = "gpt-3.5-turbo-1106"

In [102]:
def get_news(query="", count=100, timeout=30):
    """Query the news search endpoint and return the decoded JSON response.

    Args:
        query: Free-text search string sent as the ``q`` parameter.
        count: Number of results requested from the service.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would hang the
            notebook cell on a stalled connection.

    Returns:
        The parsed JSON body of the response. Callers in this notebook read
        the list stored under its ``"value"`` key.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    headers = {"Ocp-Apim-Subscription-Key": BING_KEY}
    params = {"mkt": "en-US", "q": query, "count": count, "sortBy": "Date"}
    response = requests.get(
        BING_API_ENDPOINT,
        headers=headers,
        params=params,
        timeout=timeout,
    )
    # Fail loudly on HTTP errors rather than parsing an error payload.
    response.raise_for_status()
    return response.json()

### Specify queries for each location

In [103]:
# One entry per monitored supplier location; every location is searched for
# the same two event types. Each entry gets its own fresh ``events`` list.
QUERIES = [
    {"location": place, "events": ["natural disaster", "workers strike"]}
    for place in (
        "Bogota, Colombia",
        "Solway, Guatemala",
        "Onca Puma, Brazil",
    )
]

## Fetch articles from the news API (expensive cell 🥲)

In [105]:
# Flat list of article records, one dict per search hit, tagged with the
# location and event type of the search that produced it.
articles = []

# NOTE: `query` and `event` are ordinary loop variables, so they remain bound
# at notebook scope (to their last values) after this cell has run.
for query in QUERIES:
    for event in query["events"]:
        # Search string is simply "<event> <location>".
        q = f"{event} {query['location']}"
        news_articles = get_news(query=q)
        # The hits are read from the "value" key of the response payload.
        for article in news_articles["value"]:
           articles.append(
               {
                    "location": query["location"],
                    "event": event,
                    "name": article["name"],
                    "url": article["url"],
                    "description": article["description"],
                    "timestamp": article["datePublished"]
                }
            )

print(f"Found {len(articles)} articles")

Found 354 articles


In [106]:
# Persist the fetched articles so the expensive search cell need not be rerun.
# The context manager guarantees the file is flushed and closed; passing a bare
# open(...) to json.dump leaves the handle to be closed whenever it is collected.
with open("data/full-news-data.json", "w") as news_file:
    json.dump(articles, news_file, indent=4)

#### Try analysing news with GPT

In [65]:
from openai import OpenAI

# Client configured explicitly with the API key loaded from the environment.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [178]:
def get_system_prompt(location, event="disruptive event"):
    """Build the system prompt that frames the model as a supply-risk analyst.

    Args:
        location: Supplier location the analyst is monitoring; interpolated
            into the prompt text.
        event: Kind of event the analyst should watch for. This used to be
            read from a same-named global left behind by an earlier loop,
            which raised NameError on a fresh kernel and otherwise silently
            picked up whichever event that loop processed last. It is now an
            explicit argument with a generic default.

    Returns:
        The prompt as a single multi-line string.
    """
    return f"""
You're an analyst for Outokumpu, a stainless steel manufacturer.
You're tasked with monitoring news articles for events that could impact the company's operations.
You have an important raw material supplier in {location}.
You monitor news articles for potentially disruptive events in {location}.
If there is a {event} that could impact the company's operations, you need to mention it in your report.
If the news don't contain important information, you MUST NOT mention it in your daily report.
For important information, the location must match {location} and the event must be important.
"""

In [231]:
# Upper bound on how many articles of each event type are kept per location.
MAX_ARTICLES_PER_EVENT = 5

def get_articles_for_location(location, all_articles):
    """Select the articles for one location, capped per event type.

    The cap used to be applied to the location as a whole, so only the first
    ``MAX_ARTICLES_PER_EVENT`` matches were kept regardless of event and
    later event types never reached the model. The cap is now applied
    separately to each event type, as the constant's name indicates.

    Args:
        location: Location string to match exactly against each article's
            ``"location"`` value.
        all_articles: Iterable of article dicts. Each must have a
            ``"location"`` key; a missing ``"event"`` key is treated as its
            own (``None``) event bucket.

    Returns:
        A new list with the matching articles in their original order, at
        most ``MAX_ARTICLES_PER_EVENT`` for each distinct event.
    """
    kept_per_event = {}
    selected = []
    for article in all_articles:
        if article["location"] != location:
            continue
        event = article.get("event")
        already_kept = kept_per_event.get(event, 0)
        if already_kept >= MAX_ARTICLES_PER_EVENT:
            continue
        kept_per_event[event] = already_kept + 1
        selected.append(article)
    return selected

In [232]:
def get_query_prompt(articles):
    """Render the articles as one text block for the user message.

    Each article contributes its title, description and URL on separate
    indented lines; an empty input yields an empty string.
    """
    entry_template = (
        "\n"
        "        Title: {name}\n"
        "        Description: {description}\n"
        "        URL: {url}\n"
        "        "
    )
    return "".join(
        entry_template.format(
            name=item["name"],
            description=item["description"],
            url=item["url"],
        )
        for item in articles
    )

In [285]:
def get_tool_for_location(location):
    """Return the tool list describing the ``report_risk_status`` function.

    The list holds a single function tool whose JSON-schema parameters are a
    required ``has_risk`` boolean plus optional title, summary and supporting
    articles. ``location`` only appears in the function description.
    """
    def text_field(description):
        # Every free-text field in the schema has the same shape.
        return {"type": "string", "description": description}

    article_schema = {
        "type": "object",
        "properties": {
            "title": text_field("The title of the article"),
            "content": text_field("The content of the article"),
            "url": text_field("The URL of the article"),
        },
    }

    properties = {
        "has_risk": {
            "type": "boolean",
            "description": "Whether there is a potential risk for operations or not",
        },
        "risk_title": text_field(
            "Given if there is a risk. One sentence title for the risk for operations intended for analysts. For example: Potential disruption due to a labor strike in the region."
        ),
        "risk_summary": text_field(
            "Given if there is a risk. A few sentences summary for the risk for operations intended for analysts."
        ),
        "articles": {
            "type": "array",
            "description": "Given if there is a risk. The articles that support the risk.",
            "items": article_schema,
        },
    }

    function_spec = {
        "name": "report_risk_status",
        "description": f"Report the risk status for {location} based on the news articles",
        "parameters": {
            "type": "object",
            "properties": properties,
            "required": ["has_risk"],
        },
    }

    return [{"type": "function", "function": function_spec}]

In [286]:
def get_prompts_for_location(location, articles):
    """Assemble the chat request pieces for one location.

    Returns a dict with the system/user ``messages``, the ``tools`` list and
    a ``tool_choice`` that forces the ``report_risk_status`` function.
    """
    user_message = {"role": "user", "content": get_query_prompt(articles)}
    system_message = {"role": "system", "content": get_system_prompt(location)}
    forced_tool = {"type": "function", "function": {"name": "report_risk_status"}}
    return {
        # System instructions first, then the article digest.
        "messages": [system_message, user_message],
        "tools": get_tool_for_location(location),
        "tool_choice": forced_tool,
    }

In [287]:
def get_completion_for_location(location, articles):
    """Request a risk assessment completion for one location.

    Args:
        location: Location whose articles should be assessed.
        articles: Full article list; it is narrowed to this location by
            ``get_articles_for_location`` before being sent.

    Returns:
        The chat completion object returned by the OpenAI client.
    """
    location_articles = get_articles_for_location(location, articles)
    prompts = get_prompts_for_location(location, location_articles)
    # Go through the explicitly configured ``openai_client`` rather than the
    # module-level ``openai`` namespace, so the request uses the API key this
    # notebook loaded instead of whatever implicit default is in effect.
    return openai_client.chat.completions.create(
        model=MODEL_ID,
        messages=prompts["messages"],
        tools=prompts["tools"],
        tool_choice=prompts["tool_choice"],
    )

In [288]:
def parse_risk_status_from_completion(completion):
    """Extract the tool-call arguments of a completion as a dict.

    Reads the first tool call of the first choice and decodes its JSON
    arguments. On any failure the problem and the completion are printed and
    a "no risk" status is returned instead of raising.
    """
    try:
        first_call = completion.choices[0].message.tool_calls[0]
        risk_status = json.loads(first_call.function.arguments)
    except Exception as error:
        # Best effort: report what went wrong and fall back to "no risk".
        print(f"------\nError parsing risk status: {error}")
        print(f"Related completion: {completion}------\n")
        risk_status = { "has_risk": False }
    return risk_status

In [289]:
# Baseline run: one assessment per monitored location over the fetched articles.
risk_statuses = []
for query in QUERIES:
    completion = get_completion_for_location(query["location"], articles)
    risk_status = parse_risk_status_from_completion(completion)
    risk_statuses.append(
        dict(location=query["location"], risk_status=risk_status)
    )

# Show the collected assessments as indented JSON.
print(json.dumps(risk_statuses, indent=4))

[
    {
        "location": "Bogota, Colombia",
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": "Solway, Guatemala",
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": "Onca Puma, Brazil",
        "risk_status": {
            "has_risk": false
        }
    }
]


## Try adding fake articles describing an important risk and see whether GPT flags them

In [290]:
# Synthetic article used to check that a clearly disruptive event is flagged.
# It has the same keys as the records built from the search results.
fake_earthquake_bogota = {
    "location": "Bogota, Colombia",
    "event": "natural disaster",
    "name": "Earthquake in Bogota",
    "url": "https://fakeurl.com/earthquake",
    # "Bogotá" was stored as mis-decoded UTF-8 ("BogotÃ¡"); restored to the
    # intended character so the model receives clean text.
    "description": "A 6.1 magnitude earthquake has struck the Colombian capital, Bogotá, causing buildings to shake and people to run into the streets.",
    "timestamp": "2021-07-25T21:35:00.0000000Z"
}

# Synthetic strike announcement for a second location.
fake_strike_onca_puma = {
    "location": "Onca Puma, Brazil",
    "event": "workers strike",
    "name": "Factory workers' union calls for strike in Onca Puma",
    "url": "https://fakeurl.com/workers-strike",
    "description": "The union representing mining workers in Onca Puma has called for a strike.",
    "timestamp": "2021-07-25T21:35:00.0000000Z"
}

## Fetch analysis from GPT for the extended articles (expensive cell 🥲)

In [291]:
# The fake articles go at the front of the list because
# get_articles_for_location keeps only the earliest matches for a location,
# so anything appended at the end could be cut off.
extended_articles = [fake_earthquake_bogota, fake_strike_onca_puma, *articles]

# Same assessment as the baseline run, now over the extended article list.
risk_statuses_extended = []
for query in QUERIES:
    completion = get_completion_for_location(query["location"], extended_articles)
    risk_status = parse_risk_status_from_completion(completion)
    risk_statuses_extended.append(
        dict(location=query["location"], risk_status=risk_status)
    )

# Show the collected assessments as indented JSON.
print(json.dumps(risk_statuses_extended, indent=4))

[
    {
        "location": "Bogota, Colombia",
        "risk_status": {
            "has_risk": true,
            "risk_title": "Potential risk due to a 6.1 magnitude earthquake in Bogota",
            "risk_summary": "An earthquake of 6.1 magnitude has struck Bogot\u00e1, causing buildings to shake and people to run into the streets. This could potentially have an impact on the company's operations, and a thorough assessment of the situation is recommended.",
            "articles": [
                {
                    "title": "Earthquake in Bogota",
                    "content": "A 6.1 magnitude earthquake has struck the Colombian capital, Bogot\u00e1, causing buildings to shake and people to run into the streets.",
                    "url": "https://fakeurl.com/earthquake"
                }
            ]
        }
    },
    {
        "location": "Solway, Guatemala",
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": "Onca Puma, 