In [337]:
import json
import os
import requests
from dotenv import load_dotenv
import openai
from geopy.distance import geodesic
import xml.etree.ElementTree as ET
from datetime import datetime

# Load variables from a local .env file into the process environment so the
# API credentials below can be read from os.environ.
load_dotenv()

True

In [235]:
# API credentials and endpoints come from environment variables; a missing
# variable raises KeyError here instead of failing later inside a request.
BING_KEY = os.environ["BING_SEARCH_V7_SUBSCRIPTION_KEY"]
BING_API_ENDPOINT = os.environ["BING_SEARCH_V7_ENDPOINT"]
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
# Chat model identifier passed as `model=` when requesting completions.
MODEL_ID = "gpt-3.5-turbo-1106"

In [102]:
def get_news(query="", count=100, timeout=30):
    """Search the Bing news endpoint and return the decoded JSON response.

    Args:
        query: Free-text search string sent as the ``q`` parameter.
        count: Number of articles requested from the API.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled connection.

    Returns:
        The parsed JSON body of the response (``response.json()``).

    Raises:
        requests.HTTPError: If the API answers with an error status code.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    headers = {"Ocp-Apim-Subscription-Key": BING_KEY}
    params = {"mkt": "en-US", "q": query, "count": count, "sortBy": "Date"}
    response = requests.get(
        BING_API_ENDPOINT, headers=headers, params=params, timeout=timeout
    )
    # Fail loudly on 4xx/5xx instead of trying to decode an error payload.
    response.raise_for_status()
    return response.json()

### Specify queries for each location
Queries contain:
- The location's country, city and coordinates
- Events to search for in each location


In [293]:
# One entry per monitored site.
# - "location": country, city and "coordinates" given as [latitude, longitude]
#   (index 0 is used as latitude and index 1 as longitude when distances to
#   disasters are computed).
# - "events": search phrases; each one is combined with the city and country
#   to form a news search query.
QUERIES = [
    {
        "location": {
            "country": "Colombia",
            "city": "Bogota",
            "coordinates": [4.678390790414742, -74.08310452775451],
        },
        "events": [
            "natural disaster",
            "workers strike",
        ],
    },
    {
        "location": {
            "country": "Guatemala",
            "city": "El Estor",
            "coordinates": [15.5322197047923, -89.33265507494376],
        },
        "events": [
            "natural disaster",
            "workers strike",
        ],
    },
    {
        "location": {
            "country": "Brazil",
            "city": "Onca Puma",
            "coordinates": [-6.605335306591729, -51.100066887737015],
        },
        "events": [
            "natural disaster",
            "workers strike",
        ],
    },
    {
        "location": {
            "country": "Holland",
            "city": "Terneuzen",
            "coordinates": [51.33013942257549, 3.83565678442852],
        },
        "events": [
            "natural disaster",
            "workers strike",
        ],
    }
]

## Fetch articles from news api (expensive cell 🥲)

In [294]:
# Run one news search per (event, location) pair and flatten every hit into a
# uniform record that also remembers which location and event it belongs to.
ALL_ARTICLES = []

for query in QUERIES:
    for event in query["events"]:
        place = query["location"]
        search_text = f"{event} {place['city']} {place['country']}"
        for hit in get_news(query=search_text)["value"]:
            record = {
                "location": query["location"],
                "event": event,
                "name": hit["name"],
                "url": hit["url"],
                "description": hit["description"],
                "timestamp": hit["datePublished"],
            }
            ALL_ARTICLES.append(record)

print(f"Found {len(ALL_ARTICLES)} articles")

Found 414 articles


In [295]:
# Persist the fetched articles. The context manager flushes and closes the file
# handle instead of leaving that to the garbage collector.
with open("data/full-news-data.json", "w") as articles_file:
    json.dump(ALL_ARTICLES, articles_file, indent=4)

## Fetch disaster geodata

In [448]:
def calculate_distance(lat1, lon1, lat2, lon2):
    """Return the geodesic distance in kilometers between two (lat, lon) points."""
    return geodesic((lat1, lon1), (lat2, lon2)).kilometers

In [449]:
def parse_xml(content_xml, locations, url):
    """Extract the disasters in a GDACS RSS feed that may affect given locations.

    Every ``<item>`` is compared with every location. An item counts as
    affecting a location when the distance between them is below a radius
    selected by the item's alert level (Green: 100 km, Orange: 1000 km,
    Red: 10000 km).

    Args:
        content_xml: The RSS document as text.
        locations: Iterable of dicts holding a ``"coordinates"`` entry of the
            form ``[latitude, longitude]``.
        url: Source URL stored on every returned record.

    Returns:
        A list of dicts with the keys ``title``, ``description``, ``timestamp``
        (formatted ``YYYY-MM-DDTHH:MM:SSZ``), ``locations_affected`` and
        ``url``. Items that affect none of the locations are left out, as are
        items that carry no geo point or alert level.
    """
    namespaces = {
        "geo": "http://www.w3.org/2003/01/geo/wgs84_pos#",
        "gdacs": "http://www.gdacs.org",
    }
    # Maximum distance (km) at which an alert of each level is considered
    # relevant for a location.
    alert_radius_km = {"Green": 100, "Orange": 1000, "Red": 10000}

    root = ET.fromstring(content_xml)
    result_data = []
    for item in root.findall(".//item"):
        title = item.find("./title").text
        description = item.find("./description").text
        pub_date_raw = item.find("./pubDate").text
        date = datetime.strptime(
            pub_date_raw, "%a, %d %b %Y %H:%M:%S %Z"
        ).strftime("%Y-%m-%dT%H:%M:%SZ")

        # Items without a position or alert level cannot be matched to a location.
        point = item.find(".//geo:Point", namespaces=namespaces)
        if point is None:
            continue
        lat_text = point.findtext("./geo:lat", namespaces=namespaces)
        lon_text = point.findtext("./geo:long", namespaces=namespaces)
        alert_level = item.findtext("./gdacs:alertlevel", namespaces=namespaces)
        if lat_text is None or lon_text is None or alert_level is None:
            continue

        max_distance = alert_radius_km.get(alert_level)
        if max_distance is None:
            # Unknown alert level: no radius is defined, so nothing can match.
            continue

        # The feed stores coordinates as text; convert once per item.
        event_lat = float(lat_text)
        event_lon = float(lon_text)

        locations_affected = []
        for location in locations:
            location_lat = location["coordinates"][0]
            location_lon = location["coordinates"][1]
            distance = calculate_distance(location_lat, location_lon, event_lat, event_lon)
            # The previous Red branch was written as `alert_level == "Red" < 10000`,
            # a chained comparison that never looked at the distance and raised
            # TypeError for Red alerts.
            if distance < max_distance:
                locations_affected.append(location)

        # Only add disasters that are affecting specified locations
        if locations_affected:
            result_data.append(
                {
                    "title": title,
                    "description": description,
                    "timestamp": date,
                    "locations_affected": locations_affected,
                    "url": url,
                }
            )
    return result_data

## Fetch disaster data (expensive cell 🥲)

In [450]:
def fetch_disaster_data_for_locations(locations, timeout=30):
    """Download the GDACS RSS feed and return the disasters near ``locations``.

    Args:
        locations: Location dicts forwarded to ``parse_xml``.
        timeout: Seconds to wait for the feed before giving up.

    Returns:
        Whatever ``parse_xml`` returns for the downloaded feed.

    Raises:
        requests.HTTPError: If the feed request answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    url = "https://www.gdacs.org/xml/rss.xml"
    response = requests.get(url, timeout=timeout)
    # Stop on HTTP errors so an error page is never handed to the XML parser.
    response.raise_for_status()
    content_xml = response.content.decode("utf-8")
    return parse_xml(content_xml, locations, url)

In [451]:
# Gather the monitored sites, look up disasters near them and store the result.
locations = [entry["location"] for entry in QUERIES]
DISASTER_DATA = fetch_disaster_data_for_locations(locations)

with open("data/disaster-data.json", "w") as json_file:
    json.dump(DISASTER_DATA, json_file)

### Turn disaster data into similar objects as news

In [452]:
def generate_articles_for_disaster(disaster):
    """Build one news-style record per location affected by ``disaster``.

    The records use the same keys as the news search records (``location``,
    ``event``, ``name``, ``url``, ``description``, ``timestamp``); the event is
    always ``"natural disaster"``.
    """
    return [
        {
            "location": site,
            "event": "natural disaster",
            "name": disaster["title"],
            "url": disaster["url"],
            "description": disaster["description"],
            "timestamp": disaster["timestamp"],
        }
        for site in disaster["locations_affected"]
    ]


In [453]:
# Flatten the per-disaster article lists into one list.
DISASTER_ARTICLES = [
    article
    for disaster in DISASTER_DATA
    for article in generate_articles_for_disaster(disaster)
]

In [454]:
# Disaster-derived records come first, followed by the news search results.
COMBINED_ARTICLES = [*DISASTER_ARTICLES, *ALL_ARTICLES]

In [455]:
from openai import OpenAI

# OpenAI client authenticated with the key read from the environment above.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [456]:
def parse_location_name(location):
    """Format a location dict as ``"<city>, <country>"``."""
    return "{}, {}".format(location["city"], location["country"])

In [457]:
def get_system_prompt(location):
    """Return the system prompt that frames the analyst role for ``location``.

    The text starts and ends with a newline and mentions the formatted
    location name in three places.
    """
    place = parse_location_name(location)
    lines = [
        "",
        "You're an analyst for Outokumpu, a stainless steel manufacturer.",
        "You're tasked with monitoring news articles for events that could impact the company's operations.",
        f"You have an important raw material supplier in {place}.",
        f"You monitor news articles for potentially disruptive events in {place}.",
        "If there is an event that could impact the company's operations, you need to mention it in your report.",
        "If the news don't contain important information, you MUST NOT mention it in your daily report.",
        f"For important information, the location must match {place} and the event must be important.",
        "",
    ]
    return "\n".join(lines)

In [458]:
def get_articles_for_location(location, articles, max_per_event=5):
    """Select the articles for ``location``, keeping at most N per event type.

    The previous implementation sliced the filtered list as a whole, so the
    cap applied per location and later event types (e.g. "workers strike")
    were crowded out by earlier ones. The cap is now applied separately to
    every distinct ``"event"`` value, as the parameter name states.

    Args:
        location: Location dict with ``"city"`` and ``"country"``.
        articles: Iterable of article dicts carrying ``"location"`` and
            ``"event"`` entries.
        max_per_event: Maximum number of articles kept for each event type.

    Returns:
        A list of the matching articles in their original order.
    """
    location_name = parse_location_name(location)
    kept_per_event = {}
    selected = []
    for article in articles:
        if parse_location_name(article["location"]) != location_name:
            continue
        event = article.get("event")
        already_kept = kept_per_event.get(event, 0)
        if already_kept >= max_per_event:
            continue
        kept_per_event[event] = already_kept + 1
        selected.append(article)
    return selected

In [459]:
def get_query_prompt(articles):
    """Render the articles as the user prompt text.

    Each article contributes a block with its title, description and URL; the
    lines of a block are indented by eight spaces, and an empty input gives an
    empty string.
    """
    pad = " " * 8
    return "".join(
        f"\n{pad}Title: {item['name']}"
        f"\n{pad}Description: {item['description']}"
        f"\n{pad}URL: {item['url']}"
        f"\n{pad}"
        for item in articles
    )

In [460]:
def get_tool_for_location(location):
    """Return the tool list describing ``report_risk_status`` for ``location``.

    The list holds a single function tool whose JSON schema asks for a
    mandatory ``has_risk`` flag plus optional title, summary and supporting
    articles.
    """
    place = parse_location_name(location)

    # Schema of one supporting article inside the "articles" array.
    article_schema = {
        "type": "object",
        "properties": {
            "title": {
                "type": "string",
                "description": "The title of the article",
            },
            "content": {
                "type": "string",
                "description": "The content of the article",
            },
            "url": {
                "type": "string",
                "description": "The URL of the article",
            },
        },
    }

    report_properties = {
        "has_risk": {
            "type": "boolean",
            "description": "Whether there is a potential risk for operations or not",
        },
        "risk_title": {
            "type": "string",
            "description": "Given if there is a risk. One sentence title for the risk for operations intended for analysts. For example: Potential disruption due to a labor strike in the region.",
        },
        "risk_summary": {
            "type": "string",
            "description": "Given if there is a risk. A few sentences summary for the risk for operations intended for analysts.",
        },
        "articles": {
            "type": "array",
            "description": "Given if there is a risk. The articles that support the risk.",
            "items": article_schema,
        },
    }

    report_function = {
        "name": "report_risk_status",
        "description": f"Report the risk status for {place} based on the news articles",
        "parameters": {
            "type": "object",
            "properties": report_properties,
            "required": ["has_risk"],
        },
    }
    return [{"type": "function", "function": report_function}]

In [461]:
def get_prompts_for_location(location, articles):
    """Assemble the chat request pieces for one location.

    Returns a dict with ``messages`` (system prompt followed by the user
    prompt built from ``articles``), ``tools`` and a ``tool_choice`` that
    names the ``report_risk_status`` function.
    """
    user_message = {"role": "user", "content": get_query_prompt(articles)}
    system_message = {"role": "system", "content": get_system_prompt(location)}
    forced_tool = {"type": "function", "function": {"name": "report_risk_status"}}
    return {
        "messages": [system_message, user_message],
        "tools": get_tool_for_location(location),
        "tool_choice": forced_tool,
    }

In [462]:
def get_completion_for_location(location, articles):
    """Ask the chat model for a risk report on ``location``.

    The articles are first narrowed down with ``get_articles_for_location``
    and turned into a request with ``get_prompts_for_location``. The request
    is sent through ``openai_client``, the client configured with the API key,
    rather than through the implicit module-level client.

    Args:
        location: Location dict the report is about.
        articles: All candidate articles, for any location.

    Returns:
        The chat completion object returned by the OpenAI client.
    """
    selected_articles = get_articles_for_location(location, articles)
    prompts = get_prompts_for_location(location, selected_articles)
    return openai_client.chat.completions.create(
        messages=prompts["messages"],
        tools=prompts["tools"],
        tool_choice=prompts["tool_choice"],
        model=MODEL_ID,
        max_tokens=500,
        temperature=0.3,
    )

In [463]:
def parse_risk_status_from_completion(completion):
    """Decode the JSON arguments of the first tool call in ``completion``.

    Any failure (no tool call, malformed JSON, unexpected shape) is reported
    on stdout and answered with ``{"has_risk": False}`` so that one bad
    response does not abort the whole run.
    """
    try:
        first_call = completion.choices[0].message.tool_calls[0]
        parsed = json.loads(first_call.function.arguments)
    except Exception as e:
        print(f"------\nError parsing risk status: {e}")
        print(f"Related completion: {completion}------\n")
        return {"has_risk": False}
    else:
        return parsed

In [464]:
# One completion request per monitored location, paired with that location.
risk_statuses = [
    {
        "location": query["location"],
        "risk_status": parse_risk_status_from_completion(
            get_completion_for_location(query["location"], COMBINED_ARTICLES)
        ),
    }
    for query in QUERIES
]

# pretty print the results json
print(json.dumps(risk_statuses, indent=4))

[
    {
        "location": {
            "country": "Colombia",
            "city": "Bogota",
            "coordinates": [
                4.678390790414742,
                -74.08310452775451
            ]
        },
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": {
            "country": "Guatemala",
            "city": "El Estor",
            "coordinates": [
                15.5322197047923,
                -89.33265507494376
            ]
        },
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": {
            "country": "Brazil",
            "city": "Onca Puma",
            "coordinates": [
                -6.605335306591729,
                -51.100066887737015
            ]
        },
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": {
            "country": "Holland",
            "city": "Terneuzen",
            "coordinates": [
  

## Try adding a fake article of an important risk and see if GPT elevates that

In [465]:
# Synthetic article with the same keys as the fetched news records. It is used
# to check whether a clearly relevant event gets reported; the URL is a
# placeholder.
fake_strike_onca_puma = {
    "location": {
        "country": "Brazil",
        "city": "Onca Puma",
        "coordinates": [-6.605335306591729, -51.100066887737015],
    },
    "event": "workers strike",
    "name": "Factory workers' union calls for strike in Onca Puma",
    "url": "https://fakeurl.com/workers-strike",
    "description": "The union representing mining workers in Onca Puma has called for a strike.",
    "timestamp": "2021-07-25T21:35:00.0000000Z"
}

## Fetch analysis from GPT for the extended queries (expensive cell 🥲)

In [466]:
# The fake article goes first so that it falls within the article cap applied
# by get_articles_for_location.
extended_articles = [fake_strike_onca_puma, *COMBINED_ARTICLES]

risk_statuses_extended = [
    {
        "location": query["location"],
        "risk_status": parse_risk_status_from_completion(
            get_completion_for_location(query["location"], extended_articles)
        ),
    }
    for query in QUERIES
]

# pretty print the results json
print(json.dumps(risk_statuses_extended, indent=4))

[
    {
        "location": {
            "country": "Colombia",
            "city": "Bogota",
            "coordinates": [
                4.678390790414742,
                -74.08310452775451
            ]
        },
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": {
            "country": "Guatemala",
            "city": "El Estor",
            "coordinates": [
                15.5322197047923,
                -89.33265507494376
            ]
        },
        "risk_status": {
            "has_risk": false
        }
    },
    {
        "location": {
            "country": "Brazil",
            "city": "Onca Puma",
            "coordinates": [
                -6.605335306591729,
                -51.100066887737015
            ]
        },
        "risk_status": {
            "has_risk": true,
            "risk_title": "Potential disruption due to a labor strike in the region",
            "risk_summary": "The union representing minin

In [467]:
# Dump risk_statuses_extended to a file. The context manager flushes and closes
# the handle instead of leaving that to the garbage collector.
with open("data/risk-statuses.json", "w") as risk_file:
    json.dump(risk_statuses_extended, risk_file, indent=4)