In [None]:
# !pip install openai
# !pip install python-dotenv

In [2]:
import os
import json
import time
import openai
import sqlite3

from datetime import datetime
from dotenv import load_dotenv
from webdriver_manager.chrome import ChromeDriverManager 
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Load the variables from .env into the environment so that later cells can
# read them with os.getenv() (OPENAI_API_KEY is read further down).
load_dotenv()

## Connecting to the News Website

In [None]:
# Landing page of the news site; every browser session in this notebook opens it first.
URL = "https://www.maritimelogisticsprofessional.com"

In [None]:
# Start Chrome through a driver binary resolved by webdriver-manager,
# give element lookups a 5 second implicit wait, then open the site.
chrome_service = ChromeService(ChromeDriverManager().install())
browser = webdriver.Chrome(service=chrome_service)
browser.implicitly_wait(5)
browser.get(URL)

In [None]:
# Dismiss the cookie banner if it shows up; the rest of the notebook works
# either way, so a missing button is only reported, not treated as an error.
try:
    # until() returns whatever the expected condition resolved to -- here the
    # clickable button itself -- so there is no need to look it up a second time
    # (a second lookup could hit a banner that was re-rendered in between).
    cookie_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//button[text()='Ok']"))
    )
    cookie_button.click()
except (NoSuchElementException, TimeoutException):
    print("The cookie acceptance button was not found on the page.")

In [None]:
# List of latest news: every element on the current page carrying the
# "snippet-flex" class. The bare name on the last line displays the list.
latest_news = browser.find_elements(By.CLASS_NAME, "snippet-flex")
latest_news

In [None]:
# Collect all URLs before navigating: the elements in latest_news belong to
# the page that is currently loaded.
# Each href is read once; elements without an href (None) are skipped, since
# `'/news/' in None` would raise a TypeError.
article_urls = [
    href
    for href in (element.get_attribute('href') for element in latest_news)
    if href and '/news/' in href
]
# Premium version
premium = False

article_urls

In [None]:
# Article: take the href of the first listing entry as the sample article
# that the following cells open and scrape.
# NOTE(review): this reads latest_news directly rather than the '/news/'-filtered
# article_urls -- confirm the first snippet is always an article link.
article_link = latest_news[0].get_attribute('href')
article_link

In [None]:
# Get article: pause for 5 seconds, then load the sample article page in the
# existing browser session.
time.sleep(5)
browser.get(article_link)

In [None]:
# Title: text of the <h1> tagged with itemprop='name'
title_element = browser.find_element(By.CSS_SELECTOR, "h1[itemprop='name']")
article_title = title_element.text

# Body: all <p> elements inside the div marked as the article body
article_body_element = browser.find_element(By.CSS_SELECTOR, "div[property='articleBody']")
article_paragraphs = article_body_element.find_elements(By.TAG_NAME, "p")

# Join the paragraph texts with single spaces to get the body as one string
paragraph_texts = [p.text for p in article_paragraphs]
article_text = " ".join(paragraph_texts)

# Show what was scraped
print("Title:", article_title)
print("Article Text:", article_text)

In [None]:
# Reuse the hrefs that were captured while the listing page was still loaded.
# By this point the browser has already navigated to an article, so the
# elements in latest_news are stale and reading their attributes again would
# fail; article_urls holds plain strings and is safe to iterate.
urls = list(article_urls)

for url in urls:
    print(f"Getting {url}")
    time.sleep(5)  # Consider using WebDriverWait here instead of time.sleep for better efficiency
    browser.get(url)
    # Now you can perform your scraping logic on each article page

In [None]:
# End the exploratory browser session; the "Full Cycle" section opens its own.
browser.quit()

---

## Categorizing the Article

In [None]:
# Load keywords from JSON file.
# The file is expected to map each category name to a list of keywords
# (that is how classify_article() consumes it).
# The encoding is given explicitly so that non-ASCII keywords are read the
# same way on every platform instead of depending on the locale default.
with open('../json/keywords.json', 'r', encoding='utf-8') as file:
    keywords = json.load(file)

def classify_article(article_text, keywords):
    """Return the category whose keywords occur most often in the article.

    Matching is a case-insensitive substring test: both the article text and
    each keyword are lower-cased before comparison. A keyword counts once per
    category no matter how many times it appears in the text.

    Args:
        article_text: Full text of the article. ``None`` is treated as empty.
        keywords: Mapping of category name -> iterable of keyword strings.

    Returns:
        The name of the category with the highest number of matching
        keywords. On a tie the category that comes first in ``keywords``
        wins. If no keyword matches, ``"Unclassified or Neutral"``.
    """
    max_count = 0
    max_category = "Unclassified or Neutral"

    # Convert article text to lower case for comparison
    article_text_lower = (article_text or "").lower()

    # Check for the presence of each keyword in the article
    for category, category_keywords in keywords.items():
        count = sum(
            keyword.lower() in article_text_lower
            for keyword in category_keywords
            # An empty keyword is a substring of every text; ignore it so it
            # cannot inflate a category's score.
            if keyword
        )
        # Strictly greater: the first category reaching a count keeps it on ties
        if count > max_count:
            max_count = count
            max_category = category

    return max_category

# Usage example: classify the article text scraped earlier in the notebook
# against the keyword table loaded above and report the winning category.
classification = classify_article(article_text, keywords)
print(f"The article is related to: {classification}")


---

## Connect to the OpenAI API

In [None]:
# Read the API key from the environment (never hard-code it in the notebook)
# and hand it to the openai module for subsequent calls.
openai_api_key = os.getenv('OPENAI_API_KEY')
openai.api_key = openai_api_key

def summarize_text(text, max_tokens=400):
    """Ask the OpenAI completion endpoint for a Spanish summary of an article.

    The prompt requests two markdown paragraphs: a business summary of at most
    350 characters and a 500-character 'Impacto en LATAM:' section.

    Args:
        text: Article body to summarize.
        max_tokens: Upper bound on generated tokens. The prompt asks for about
            850 characters of Spanish output, which does not fit in the
            previously hard-coded 150 tokens (the second section got cut off),
            so the default leaves room for the full answer.

    Returns:
        The generated text with surrounding whitespace stripped.

    NOTE(review): `openai.Completion` with `text-davinci-003` is the legacy
    completions interface -- confirm the model and endpoint are still
    available for the installed openai package before enabling premium mode.
    """
    response = openai.Completion.create(
        engine="text-davinci-003",
        prompt= f"""
Please summarize the key points of the article for a business audience in a concise paragraph, limiting the summary to no more than 350 characters. Then, in a separate paragraph of 500 characters, elaborate on the potential impact of the situation described in the article on maritime logistics, port operations, and supply chain management, specifically focusing on its implications for Latin America. Label this second paragraph 'Impacto en LATAM:' and ensure there is a clear separation between the two sections. Present your response in Spanish and format it as markdown text for clarity:\n\n{text}
                """,
        temperature=0.7,
        max_tokens=max_tokens,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    return response.choices[0].text.strip()

#summary = summarize_text(article_text)

In [None]:
# Placeholder values used while the AI summary call is disabled
# (the summarize_text() call is commented out).
summary = "Get Premium for enabling AI-powered summary!"
location = "Global / LATAM"

---

## Save Original Article, Title and Summary in the Database

In [None]:
# Open the SQLite database file
def connect_to_db(db_path):
    """Return a new sqlite3 connection to the database at ``db_path``."""
    connection = sqlite3.connect(db_path)
    return connection

# Per-day articles table
def create_daily_table(cursor, table_name):
    """Create the articles table ``table_name`` unless it already exists.

    The table has an auto-incrementing ``id`` plus the text columns ``title``,
    ``text``, ``summary``, ``classification`` and ``location``.

    Args:
        cursor: sqlite3 cursor used to run the statement.
        table_name: Name of the table; it is interpolated into the SQL text
            as-is, so it must be a trusted identifier.
    """
    ddl = f"""CREATE TABLE IF NOT EXISTS {table_name} (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        title TEXT,
                        text TEXT,
                        summary TEXT,
                        classification TEXT,
                        location TEXT
                    );"""
    cursor.execute(ddl)

# Per-day links table
def create_daily_links_table(cursor, table_name, articles_table_name):
    """Create the links table ``table_name`` unless it already exists.

    Each row holds an auto-incrementing ``id``, the ``article_id`` it belongs
    to, the article ``title`` and the ``link``. ``article_id`` is declared as
    a foreign key to ``articles_table_name(id)``.

    Args:
        cursor: sqlite3 cursor used to run the statement.
        table_name: Name of the links table (interpolated as-is; must be trusted).
        articles_table_name: Name of the articles table referenced by the
            foreign key (interpolated as-is; must be trusted).
    """
    ddl = f"""CREATE TABLE IF NOT EXISTS {table_name} (
                        id INTEGER PRIMARY KEY AUTOINCREMENT,
                        article_id INTEGER,
                        title TEXT,
                        link TEXT,
                        FOREIGN KEY (article_id) REFERENCES {articles_table_name}(id)
                    );"""
    cursor.execute(ddl)

# Inserts an article into the daily table
def insert_article_data(cursor, table_name, article_title, article_text, summary, classification, location):
    """Insert an article unless one with the same title is already stored.

    Args:
        cursor: sqlite3 cursor used for the lookup and the insert.
        table_name: Articles table name (interpolated as-is; must be trusted).
        article_title: Title of the article; also the de-duplication key.
        article_text: Full body text.
        summary: Summary text to store alongside the article.
        classification: Category label for the article.
        location: Location label for the article.

    Returns:
        The ``id`` of the existing row when the title is already present
        (the stored row is left untouched), otherwise the ``id`` of the
        newly inserted row.
    """
    # Check if an article with the same title already exists
    cursor.execute(f"SELECT id FROM {table_name} WHERE title = ?", (article_title,))
    existing_article = cursor.fetchone()

    # Identity test against None, consistent with insert_link_data
    if existing_article is not None:
        return existing_article[0]  # Return the ID of the existing article

    cursor.execute(f"""INSERT INTO {table_name} (title, text, summary, classification, location)
                        VALUES (?, ?, ?, ?, ?);""",
                   (article_title, article_text, summary, classification, location))
    return cursor.lastrowid  # Return the ID of the new article

# Store a link for an article in the daily links table
def insert_link_data(cursor, table_name, article_id, article_title, link):
    """Insert ``link`` for ``article_id`` unless that exact pair is already stored.

    Args:
        cursor: sqlite3 cursor used for the lookup and the insert.
        table_name: Links table name (interpolated as-is; must be trusted).
        article_id: ``id`` of the article the link belongs to.
        article_title: Title stored alongside the link.
        link: URL of the article.

    Returns:
        None. The function only writes to the database.
    """
    lookup_sql = f"SELECT id FROM {table_name} WHERE article_id = ? AND link = ?"
    cursor.execute(lookup_sql, (article_id, link))

    # Nothing to do when this (article, link) pair is already recorded
    if cursor.fetchone() is not None:
        return

    row = (article_id, article_title, link)
    cursor.execute(f"""INSERT INTO {table_name} (article_id, title, link)
                           VALUES (?, ?, ?);""",
                   row)

### Database Initialization

In [None]:
# Define the path to the SQLite database
db_path = '../data/news/maritime_news.db'

# Table names carry today's date (MMDDYYYY), so each day gets its own pair of tables
current_date = datetime.now().strftime("%m%d%Y")
news_table_name = f"news_{current_date}"
links_table_name = f"news_links_{current_date}"

# Establish a connection and create a cursor
conn = connect_to_db(db_path)
try:
    cursor = conn.cursor()

    # Create tables for today's date with news
    create_daily_table(cursor, news_table_name)

    # Create table for storing links
    create_daily_links_table(cursor, links_table_name, news_table_name)

    # Commit the changes
    conn.commit()
finally:
    # Close the connection even if table creation failed, so the database
    # file is not left open by a half-finished cell.
    conn.close()

### Database News/Links Insert or Update

In [None]:
# Establish a connection and create a cursor
conn = connect_to_db(db_path)
try:
    cursor = conn.cursor()

    # Make sure today's articles table exists (no-op when it already does),
    # then insert the article data and get the article ID
    create_daily_table(cursor, news_table_name)
    article_id = insert_article_data(cursor, news_table_name, article_title, article_text, summary, classification, location)

    # Insert the link data into the links table
    insert_link_data(cursor, links_table_name, article_id, article_title, article_link)

    # Commit the changes
    conn.commit()
finally:
    # Close the connection even if an insert failed
    conn.close()

---

## Full Cycle

In [None]:
# Create a browser session
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
browser.implicitly_wait(2)
browser.get(URL)

# Cookies button: dismiss the banner if present; its absence is only reported
try:
    # until() returns the clickable button itself, so click that element
    # instead of locating it a second time.
    cookie_button = WebDriverWait(browser, 10).until(
        EC.element_to_be_clickable((By.XPATH, "//button[text()='Ok']"))
    )
    cookie_button.click()
except (NoSuchElementException, TimeoutException):
    print("The cookie acceptance button was not found on the page.")

In [None]:
# GLOBAL LATEST NEWS
#
# Visit every link from the front-page listing, scrape title and body,
# classify the article and store article + link in today's tables.

# List of latest global news
latest_news = browser.find_elements(By.CLASS_NAME, "snippet-flex")
# Collect all URLs before navigating (the elements belong to the listing page);
# entries without an href are dropped because browser.get(None) would fail.
article_urls = [
    href
    for href in (element.get_attribute('href') for element in latest_news)
    if href
]
# Premium version
premium = False

# Establish a connection and create a cursor
conn = connect_to_db(db_path)
cursor = conn.cursor()

try:
    for article in article_urls:
        # Get article
        time.sleep(5)
        browser.get(article)

        try:
            # Find the title element and the article body element
            title_element = browser.find_element(By.CSS_SELECTOR, "h1[itemprop='name']")
            article_body_element = browser.find_element(By.CSS_SELECTOR, "div[property='articleBody']")
        except NoSuchElementException:
            # The listing is not filtered, so some links may not be article
            # pages; skip those instead of aborting the whole run.
            print(f"Skipping {article}: article title/body not found.")
            continue

        article_title = title_element.text
        article_paragraphs = article_body_element.find_elements(By.TAG_NAME, "p")

        # Combine the text of all paragraphs to form the body text
        article_text = " ".join(paragraph.text for paragraph in article_paragraphs)

        # AI-powered Summary
        if premium:
            # Keep the result: without the assignment the previous article's
            # summary (or the placeholder) would be stored instead.
            summary = summarize_text(article_text)
        else:
            # Summary not premium
            summary = "Get Premium for enabling AI-powered summary!"

        # Article classification
        classification = classify_article(article_text, keywords)
        # Article location
        location = "Global"

        # Insert the article data into the table and get the article ID
        article_id = insert_article_data(cursor, news_table_name, article_title, article_text, summary, classification, location)
        # Insert the link data into the links table. Use the URL of the page
        # being processed, not `article_link` left over from an earlier cell.
        insert_link_data(cursor, links_table_name, article_id, article_title, article)

        # Commit after each article so earlier rows survive a later failure
        conn.commit()
finally:
    # Close browser and database connection even when scraping fails midway
    browser.quit()
    conn.close()

In [36]:
# Same landing page as before; re-declared so this section can be run on its own.
URL = "https://www.maritimelogisticsprofessional.com"

# Create a browser session (no implicit wait is set on this one; the explicit
# `wait` object below is used for synchronisation instead)
browser = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install())) 
browser.get(URL)

# Define a wait variable with a timeout of 10 seconds; it is reused by the
# LATAM filtering cell.
wait = WebDriverWait(browser, 10)

# Cookies button
try:
    # Wait for the button to be clickable and click it.
    # until() returns the element the condition resolved to.
    cookie_button = wait.until(
        EC.element_to_be_clickable((By.XPATH, "//button[text()='Ok']")),
        message="Cookie button not clickable."
    )
    cookie_button.click()
except TimeoutException:
    # The banner did not appear within the timeout; continue without it
    print("The cookie acceptance button was not found on the page.")

In [38]:
# NOTE(review): in document order this cell runs BEFORE the LATAM filtering
# cell that follows, which still uses `browser` and `wait`; the execution
# counts (36, 38, 37) suggest it was actually run after it. On a top-to-bottom
# run it closes the session too early -- consider moving or removing this cell.
browser.quit()

In [37]:
# Filter for LATAM: suffix of the category URL we are looking for
filter_news = '/south-america'

try:
    # Wait for the category links to be present and iterate through them
    cat_links = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".cat-link")))

    latam_link = None
    for cat_link_div in cat_links:
        a_element = cat_link_div.find_element(By.TAG_NAME, 'a')
        href = a_element.get_attribute('href')
        # An anchor without an href yields None; guard before calling endswith()
        if href and href.endswith(filter_news):
            latam_link = href
            break  # Exit the loop once we find the LATAM link

    # Check if the LATAM link was found before proceeding
    if latam_link:
        # Now navigate to the LATAM link
        browser.get(latam_link)

        # Wait for snippets to be present after navigating to the LATAM page
        snippets = wait.until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, ".snippet")))

        # Process each snippet: link, headline (<h2>) and teaser text (<p>)
        # NOTE(review): the recorded output of this cell shows the same section
        # URL for every snippet, so the href read here is probably not the
        # article link -- confirm against the page markup.
        for snippet in snippets:
            news_link = snippet.get_attribute('href')
            news_title = snippet.find_element(By.TAG_NAME, 'h2').text
            body_text = snippet.find_element(By.TAG_NAME, 'p').text

            print(news_link)
            print(news_title)
            print(body_text)
    else:
        print("LATAM link was not found")
finally:
    # Always release the browser, including when a wait times out
    browser.quit()


https://www.maritimeprofessional.com/south-america
APM Terminals Callao Expansion Moves Forward
APM Terminals Callao commenced works on stage 3A of the Multipurpose North Terminal Modernization Project in the Port of Callao this week. The $95 million, 100% private investment, exceeds contractual commitments in the addendum by almost 40%.Works to be carried out during 2024 include the construction of a battery of 12 vertical silos for clean grains. This will increase capacity from 25,000 to 85…
https://www.maritimeprofessional.com/south-america
Venezuela Oil Exports Rising, but Shipping Delays Persist
Venezuela's oil exports slightly increased in February to some 670,000 barrels per day (bpd), but ongoing shipping delays worsened a bottleneck of tankers waiting to load, according to documents and vessel monitoring data.State-run oil firm…
https://www.maritimeprofessional.com/south-america
Tanker Backlog Grows in Venezuela as PDVSA Struggles to Deliver Oil
A bottleneck of vessels waiting