In [1]:
# Install required libraries
!pip install google-auth google-auth-oauthlib google-auth-httplib2 google-api-python-client beautifulsoup4


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# Required Libraries
import os
import pickle
import base64
import re
import google.auth
import google.auth.transport.requests
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from bs4 import BeautifulSoup

In [3]:
# Authenticate and create the service
SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']
creds = None

# Load existing credentials if available
if os.path.exists('token.pickle'):
    with open('token.pickle', 'rb') as token:
        creds = pickle.load(token)

# If no valid credentials are available, request login
if not creds or not creds.valid:
    if creds and creds.expired and creds.refresh_token:
        creds.refresh(Request())
    else:
        flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
        creds = flow.run_local_server(port=0)

    # Save the credentials for next run
    with open('token.pickle', 'wb') as token:
        pickle.dump(creds, token)

service = build('gmail', 'v1', credentials=creds)
user_info = service.users().getProfile(userId='me').execute()
print(f"Authenticated with account: {user_info['emailAddress']}")


Authenticated with account: viverratesting3@gmail.com


In [4]:
# Fetch unread emails with specified query
results = service.users().messages().list(userId='me', q='is:unread').execute()
messages = results.get('messages', [])

newsletters = []
for message in messages:
    msg = service.users().messages().get(userId='me', id=message['id']).execute()
    newsletters.append(msg)

print(f"Fetched {len(newsletters)} newsletters.")


Fetched 100 newsletters.


In [5]:
# Extract and clean text from email content
extracted_data = []

for email in newsletters:
    payload = email.get('payload', {})
    parts = payload.get('parts')
    
    # Check if 'parts' exists and use the first part, otherwise use the body directly
    if parts:
        part = parts[0]
        data = part['body'].get('data')
    else:
        data = payload.get('body', {}).get('data')

    # If there's no data, skip this email
    if not data:
        continue

    msg_str = base64.urlsafe_b64decode(data.encode('ASCII')).decode('utf-8')
    soup = BeautifulSoup(msg_str, 'html.parser')
    text = soup.get_text()
    links = [a['href'] for a in soup.find_all('a', href=True)]
    
    # Remove headers, footers, and irrelevant parts (customize this as needed)
    cleaned_text = re.sub(r'\s+', ' ', text).strip()
    
    extracted_data.append({'text': cleaned_text, 'links': links})

print("Extracted and cleaned text from newsletters.")
for data in extracted_data:
    print(f"Text: {data['text']}\nLinks: {data['links']}\n")


Extracted and cleaned text from newsletters.
Text: View this post on the web at https://latinamericadailybriefing.substack.com/p/protests-gather-steam-in-venezuela Spontaneous protests started in diverse locations around Venezuela yesterday, after electoral authorities said Nicolás Maduro won reelection in Sunday’s presidential vote, without releasing tallies proving the results. Laboratorio de Paz [ https://substack.com/redirect/017afa8f-9c7a-41f8-9ba6-f79857bbeb8e?j=eyJ1IjoiMzFmdXh4In0.L520X0qdNbvIdiyCXDPuD09Laf4PkKgM7ejvjLgFci4 ] verified 210 protests — notably in working class neighborhoods that have traditionally been chavista strongholds [ https://substack.com/redirect/5a7222fb-80e6-408a-a174-2fb5e96070bf?j=eyJ1IjoiMzFmdXh4In0.L520X0qdNbvIdiyCXDPuD09Laf4PkKgM7ejvjLgFci4 ] — throughout the day, and at least seven statues of Hugo Chávez were torn down. (Runrun.es [ https://substack.com/redirect/322d91ba-8ded-4f20-9ee8-3ac858c73870?j=eyJ1IjoiMzFmdXh4In0.L520X0qdNbvIdiyCXDPuD09Laf4Pk

In [6]:
processed_data = []

for data in extracted_data:
    text = data['text']
    links = re.findall(r'(https?://[^\s]+)', text)
    cleaned_text = re.sub(r'(https?://[^\s]+)', '', text).strip()
    
    processed_data.append({'text': cleaned_text, 'links': links})

print(processed_data)



In [7]:
# Install required libraries
!pip install openai==0.28 python-dotenv tqdm



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [8]:
# Import required libraries
import openai
import os
from dotenv import load_dotenv
from tqdm.notebook import tqdm

In [9]:
# Load environment variables from .env file
load_dotenv()
openai.api_key = os.getenv('OPENAI_API_KEY')

In [12]:
# Function to determine the region of the text using GPT-4-turbo
def determine_region(text):
    response = openai.ChatCompletion.create(
        model="gpt-4-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that identifies regions based on text."},
            {"role": "user", "content": f"Which region does the following text belong to? The regions are: Americas, Europe, Asia, Middle East, Africa, and Other.\n\n{text}"}
        ]
    )
    region_response = response['choices'][0]['message']['content'].strip()
    if "Americas" in region_response:
        return "Americas"
    elif "Europe" in region_response:
        return "Europe"
    elif "Asia" in region_response:
        return "Asia"
    elif "Middle East" in region_response:
        return "Middle East"
    elif "Africa" in region_response:
        return "Africa"
    else:
        return "Other"


# Cluster the news articles by region
regions = {
    "Americas": [],
    "Europe": [],
    "Asia": [],
    "Middle East": [],
    "Africa": [],
    "Other": []
}

for data in tqdm(processed_data, desc="Clustering news by region"):
    text = data['text']
    links = data['links']
    
    # Determine the region of the text
    region = determine_region(text)
    
    # Add the text and links to the appropriate region
    regions[region].append({'text': text, 'links': links})
    
    # Print the region of the last classified article
    print(f"The last article was classified into {region} region")

# Print the number of articles in each region
for region, articles in regions.items():
    print(f"{region}: {len(articles)} articles")


Clustering news by region:   0%|          | 0/100 [00:00<?, ?it/s]

The last article was classified into Americas region
The last article was classified into Europe region
The last article was classified into Asia region
The last article was classified into Americas region
The last article was classified into Africa region
The last article was classified into Americas region
The last article was classified into Europe region
The last article was classified into Asia region
The last article was classified into Americas region
The last article was classified into Africa region
The last article was classified into Europe region
The last article was classified into Asia region
The last article was classified into Americas region
The last article was classified into Europe region
The last article was classified into Africa region
The last article was classified into Americas region
The last article was classified into Europe region
The last article was classified into Asia region
The last article was classified into Americas region
The last article was clas

In [13]:
# Initialize the formatted output string
formatted_output = ""

# Iterate over each region and its news items
for region, news_items in regions.items():
    if news_items:
        formatted_output += f"### {region}\n"
        for news in tqdm(news_items, desc=f"Summarizing articles in {region}"):
            # Summarize the text using GPT-4-turbo
            response = openai.ChatCompletion.create(
                model="gpt-4-turbo",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant that summarizes text into one or two sentences."},
                    {"role": "user", "content": f"Summarize the following text into one or two sentences:\n\n{news['text']}"}
                ]
            )
            summary = response['choices'][0]['message']['content'].strip()
            
            # Get the link
            link = news['links'][0] if news['links'] else ""
            
            # Add the summary and link to the formatted output
            formatted_output += f"- {summary} [Read more]({link})\n"

# Print the formatted output
print(formatted_output)


Summarizing articles in Americas:   0%|          | 0/30 [00:00<?, ?it/s]

Summarizing articles in Europe:   0%|          | 0/25 [00:00<?, ?it/s]

Summarizing articles in Asia:   0%|          | 0/25 [00:00<?, ?it/s]

Summarizing articles in Africa:   0%|          | 0/20 [00:00<?, ?it/s]

### Americas
- Protests erupted across Venezuela in response to electoral authorities announcing Nicolás Maduro's reelection without providing vote counts, leading to widespread unrest, violence, and international concerns over electoral integrity. Opposition claimed Maduro's victory was mathematically impossible based on their data, while the government dismissed the protests as a coup attempt, amidst international calls for a review of the election results. [Read more](https://latinamericadailybriefing.substack.com/p/protests-gather-steam-in-venezuela)
- The Venezuelan opposition claims to have evidence showing Edmundo Gonzalez overwhelmingly won the recent election with 6.3 million votes compared to Nicolas Maduro's 2.8 million, a result which the opposition states is mathematically impossible for Maduro to have overcome. They plan to publish this evidence online for international scrutiny, amidst widespread allegations of electoral fraud by the government agency CNE, which has decl

In [14]:
# Save formatted ouput in a text file
with open('news_summary.md', 'w') as file:
    file.write(formatted_output)