## Collecting Reddit data

In [3]:
import os
import sys
from datetime import datetime, timezone

import pandas as pd
import praw

# Allow import from parent directory (where config.py is)
sys.path.append(os.path.abspath(os.path.join('..')))

# Import credentials from config.py
from config import REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, YOUR_IDENTIFIER

# Search configuration: every keyword below is queried in every subreddit below.
keywords = [
    "usaid kenya", "usaid funding", "usaid budget cut", "kenya foreign aid",
    "usaid kenya funding cut", "usaid suspended funding", "development aid kenya",
    "kenya donor funding"
]

subreddits = ["Kenya", "Africa", "worldnews"]


# Start date: Jan 20, 2025 (Trump announcement).
# The cut-off is compared against `submission.created_utc` (a UTC epoch value), so it is
# built from a timezone-aware UTC datetime. A naive `datetime(...).timestamp()` is
# interpreted in the machine's local timezone, which would shift the cut-off by the
# local UTC offset.
start_date = int(datetime(2025, 1, 20, tzinfo=timezone.utc).timestamp())

# Authenticate with Reddit using PRAW (credentials are imported from config.py)
reddit = praw.Reddit(
    client_id=REDDIT_CLIENT_ID,
    client_secret=REDDIT_CLIENT_SECRET,
    user_agent=REDDIT_USER_AGENT
)

# Scrape submissions from the selected subreddits and keywords.
# There is no de-duplication here: a submission that matches several keywords is
# recorded once per matching keyword, with the `keyword` column telling the rows apart.
posts = []

for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    print(f"[...] Searching r/{subreddit_name}")
    for keyword in keywords:
        # Newest-first search, capped at 100 results per (subreddit, keyword) pair
        for submission in subreddit.search(keyword, sort="new", limit=100):
            if submission.created_utc < start_date:
                continue  # older than the cut-off

            # Aware UTC conversion; replaces the deprecated datetime.utcfromtimestamp()
            created_at = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
            posts.append({
                "title": submission.title,
                "selftext": submission.selftext,
                "subreddit": submission.subreddit.display_name,
                "author": str(submission.author),
                "created_utc": created_at.strftime('%Y-%m-%d %H:%M:%S'),
                "url": submission.url,
                "score": submission.score,
                "num_comments": submission.num_comments,
                "keyword": keyword
            })

# Convert to DataFrame (one row per collected post)
df_reddit = pd.DataFrame(posts)

# Save to CSV in the raw data folder.
# A relative path replaces the previous absolute per-user Windows path so the cell can
# run on other machines and operating systems.
# NOTE(review): assumes the notebook runs from a folder one level below the project root
# (the same '..' that is added to sys.path for config.py) and that data/raw lives at
# that root -- confirm against the repository layout.
output_path = os.path.join("..", "data", "raw", "Agatha_reddit.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_reddit.to_csv(output_path, index=False)

df_reddit.head()

[...] Searching r/Kenya
[...] Searching r/Africa
[...] Searching r/worldnews


Unnamed: 0,title,selftext,subreddit,author,created_utc,url,score,num_comments,keyword
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,Kenya,muerki,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...,3,5,usaid kenya
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Kenya,Morio_anzenza,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...,169,95,usaid kenya
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,Kenya,vindtar,2025-04-05 19:09:10,https://www.reddit.com/r/Kenya/comments/1jsb14...,2,2,usaid kenya
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",Kenya,Gold_Smart,2025-03-25 08:18:04,https://www.reddit.com/r/Kenya/comments/1jjehw...,13,20,usaid kenya
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,Kenya,westmaxia,2025-03-08 08:08:58,https://www.reddit.com/r/Kenya/comments/1j6cjz...,1,6,usaid kenya


In [4]:
# Quick size check of the Reddit frame built above: (number of rows, number of columns)
df_reddit.shape

(466, 9)

## Data collection from NewsAPI.org

In [None]:
import requests
import pandas as pd
import os
import sys
import time
from datetime import datetime, timedelta, timezone
from newspaper import Article  # newspaper3k import

# Allow import from parent directory (to access config.py)
sys.path.append(os.path.abspath(os.path.join('..')))

# Import credentials
from config import NEWS_API_KEY, YOUR_IDENTIFIER

# Keywords relevant to USAID and Kenya; each one is sent to NewsAPI as a separate query
keywords = [
    "usaid kenya",
    "usaid funding",
    "usaid budget cut",
    "kenya foreign aid",
    "usaid suspended funding",
    "development aid kenya",
    "kenya donor funding",
    "foreign aid cut",
    "foreign aid withdrawal",
    "us foreign aid kenya",
    "funding reduction kenya"
]

# Time window: last 30 days (NewsAPI free tier limit)
END_DATE = datetime.now(timezone.utc)
START_DATE = END_DATE - timedelta(days=30)

from_date = START_DATE.strftime('%Y-%m-%d')
to_date = END_DATE.strftime('%Y-%m-%d')

# NewsAPI endpoint
url = "https://newsapi.org/v2/everything"
articles = []

# Seconds to wait for NewsAPI before giving up; without a timeout requests.get()
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# Fetch metadata from NewsAPI
for keyword in keywords:
    params = {
        'q': keyword,
        'from': from_date,
        'to': to_date,
        'language': 'en',
        'sortBy': 'relevancy',
        'pageSize': 100,
        'apiKey': NEWS_API_KEY,
    }

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A network failure for one keyword must not discard what was already collected
        print(f"[ERROR] Request failed for '{keyword}': {exc}")
        continue

    if response.status_code == 200:
        data = response.json()
        for article in data.get('articles', []):
            articles.append({
                "keyword": keyword,
                "source": article["source"]["name"],
                "author": article["author"],
                "title": article["title"],
                "description": article["description"],
                "publishedAt": article["publishedAt"],
                "url": article["url"]
            })
    else:
        print(f"[ERROR] Failed to fetch articles for '{keyword}' (status code: {response.status_code})")

# Use newspaper3k to enrich with full text.
# `articles` is not de-duplicated, so a URL returned for more than one keyword would be
# downloaded again for each of them. Successful parses are cached per URL; failed URLs
# are not cached, so they are still retried when they come up again.
enriched_articles = []
parsed_by_url = {}

for art in articles:
    # A dedicated name keeps `url` (the NewsAPI endpoint) from being overwritten
    article_url = art['url']

    if article_url not in parsed_by_url:
        try:
            news_article = Article(article_url)
            news_article.download()
            news_article.parse()
            news_article.nlp()

            parsed_by_url[article_url] = {
                "author": news_article.authors,
                "title": news_article.title,
                "publishedAt": news_article.publish_date,
                "summary": news_article.summary,
                "text": news_article.text,
            }
            time.sleep(1)  # pause between downloads
        except Exception as e:
            # Best effort: report the failure and move on to the next article
            print(f"[ERROR] Failed to process URL: {article_url}\nReason: {e}")
            continue

    # Same column order as before: keyword, source, parsed fields, url
    enriched_articles.append({
        "keyword": art['keyword'],
        "source": art['source'],
        **parsed_by_url[article_url],
        "url": article_url
    })

# Save final DataFrame
df_enriched = pd.DataFrame(enriched_articles)

# A relative path replaces the previous absolute per-user Windows path so the cell can
# run on other machines and operating systems.
# NOTE(review): assumes the notebook runs from a folder one level below the project root
# (the same '..' that is added to sys.path for config.py) and that data/raw lives at
# that root -- confirm against the repository layout.
output_path = os.path.join("..", "data", "raw", "Agatha_news_fulltext.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_enriched.to_csv(output_path, index=False)

# Show preview
df_enriched.head()




[ERROR] Failed to process URL: https://www.forbes.com/sites/daniellenierenberg/2025/06/06/food-safety-depends-on-every-link-in-the-supply-chain/
Reason: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/daniellenierenberg/2025/06/06/food-safety-depends-on-every-link-in-the-supply-chain/ on URL https://www.forbes.com/sites/daniellenierenberg/2025/06/06/food-safety-depends-on-every-link-in-the-supply-chain/
[ERROR] Failed to process URL: https://techcabal.com/2025/06/02/kenya-health-data-usaid-trump-funding/
Reason: Article `download()` failed with HTTPSConnectionPool(host='techcabal.com', port=443): Read timed out. (read timeout=7) on URL https://techcabal.com/2025/06/02/kenya-health-data-usaid-trump-funding/
[ERROR] Failed to process URL: https://www.forbes.com/sites/kellyphillipserb/2025/06/12/house-moves-to-claw-back-money-from-public-broadcasting-and-foreign-aid/
Reason: Article `download()` failed with 403 Client Err



[ERROR] Failed to process URL: https://www.forbes.com/sites/taxnotes/2025/06/10/past-irs-commissioners-analyze-agency-changes-under-trump/
Reason: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/taxnotes/2025/06/10/past-irs-commissioners-analyze-agency-changes-under-trump/ on URL https://www.forbes.com/sites/taxnotes/2025/06/10/past-irs-commissioners-analyze-agency-changes-under-trump/
[ERROR] Failed to process URL: https://www.newsweek.com/elon-musks-legacy-dogewhat-did-he-do-2079124
Reason: Article `download()` failed with 403 Client Error: Forbidden for url: https://www.newsweek.com/elon-musks-legacy-dogewhat-did-he-do-2079124 on URL https://www.newsweek.com/elon-musks-legacy-dogewhat-did-he-do-2079124
[ERROR] Failed to process URL: https://www.foxnews.com/politics/house-advances-trumps-9-4b-spending-cuts-package-targeting-npr-pbs-usaid
Reason: Article `download()` failed with 403 Client Error: Forbidden 1002 for ur

Unnamed: 0,keyword,source,author,title,publishedAt,summary,text,url
0,usaid kenya,Al Jazeera English,"[Madison Czopek, Amy Sherman]",Has DOGE really saved the US government $180bn?,2025-06-06 00:00:00,President Donald Trump and adviser Elon Musk c...,Elon Musk first claimed the Department of Gove...,https://www.aljazeera.com/news/2025/6/6/has-do...
1,usaid kenya,Daily Signal,"[Mike Gonzalez, .Wp-Block-Co-Authors-Plus-Coau...",Congress Should Quickly Approve Trump’s Rescis...,2025-06-10 00:00:00,President Donald Trump‘s rescission legislatio...,President Donald Trump‘s rescission legislatio...,https://www.dailysignal.com/2025/06/10/congres...
2,usaid kenya,Defense One,"[Meghann Myers, Staff Reporter]","AFRICOM asks for help deterring terrorism, aft...",2025-05-29 21:15:17+00:00,“It is the epicenter of terrorism on the globe...,Deterring the spread of terrorism in Africa an...,https://www.defenseone.com/threats/2025/05/afr...
3,usaid kenya,Thisamericanlife.org,[],Some Things We Don't Do Anymore,2025-06-06 09:29:47-04:00,Two Americans moved to Eswatini when that coun...,Two Americans moved to Eswatini when that coun...,https://www.thisamericanlife.org/862/some-thin...
4,usaid kenya,Biztoc.com,[],BizToc,,"Tech stocks, led by Nvidia and Microsoft, drov...",President Trump abruptly terminated all U.S. t...,https://biztoc.com/x/6c16ca23e701790a


In [5]:
import requests
import pandas as pd
import os
import sys
from datetime import datetime, timedelta, timezone

# Allow import from parent directory (to access config.py)
sys.path.append(os.path.abspath(os.path.join('..')))

# Import credentials
from config import NEWS_API_KEY, YOUR_IDENTIFIER

# Keywords (relevant to USAID and Kenya); each one is sent to NewsAPI as a separate query
keywords = [
    "usaid kenya",
    "usaid funding",
    "usaid budget cut",
    "kenya foreign aid",
    "usaid suspended funding",
    "development aid kenya",
    "kenya donor funding",
    "foreign aid cut",
    "foreign aid withdrawal",
    "us foreign aid kenya",
    "funding reduction kenya"
]

# Time window: last 30 days (NewsAPI free tier limit)
END_DATE = datetime.now(timezone.utc)
START_DATE = END_DATE - timedelta(days=30)

from_date = START_DATE.strftime('%Y-%m-%d')
to_date = END_DATE.strftime('%Y-%m-%d')

# NewsAPI endpoint
url = "https://newsapi.org/v2/everything"

# Seconds to wait for NewsAPI before giving up; without a timeout requests.get()
# can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# List to hold articles (one dict per article per keyword; not de-duplicated)
articles = []

# Loop over each keyword and collect articles
for keyword in keywords:
    params = {
        'q': keyword,
        'from': from_date,
        'to': to_date,
        'language': 'en',
        'sortBy': 'relevancy',
        'pageSize': 100,  # Max per request
        'apiKey': NEWS_API_KEY,
    }

    try:
        response = requests.get(url, params=params, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # A network failure for one keyword must not discard what was already collected
        print(f"[ERROR] Request failed for '{keyword}': {exc}")
        continue

    if response.status_code == 200:
        data = response.json()
        for article in data.get('articles', []):
            articles.append({
                "keyword": keyword,
                "source": article["source"]["name"],
                "author": article["author"],
                "title": article["title"],
                "description": article["description"],
                "content": article["content"],
                "publishedAt": article["publishedAt"],
                "url": article["url"]
            })
    else:
        print(f"[ERROR] Failed to fetch articles for '{keyword}' (status code: {response.status_code})")

# Convert list of articles to DataFrame
df_news = pd.DataFrame(articles)

# Save to CSV.
# A relative path replaces the previous absolute per-user Windows path so the cell can
# run on other machines and operating systems.
# NOTE(review): assumes the notebook runs from a folder one level below the project root
# (the same '..' that is added to sys.path for config.py) and that data/raw lives at
# that root -- confirm against the repository layout.
output_path = os.path.join("..", "data", "raw", "Agatha_news.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_news.to_csv(output_path, index=False)

df_news.head()


Unnamed: 0,keyword,source,author,title,description,content,publishedAt,url
0,usaid kenya,Al Jazeera English,Al Jazeera,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,2025-06-06T11:21:51Z,https://www.aljazeera.com/news/2025/6/6/has-do...
1,usaid kenya,CleanTechnica,Guest Contributor,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,2025-05-26T17:13:41Z,https://cleantechnica.com/2025/05/26/the-life-...
2,usaid kenya,ProPublica,by Brett Murphy and Anna Maria Barry-Jester,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \n...,ProPublica is a nonprofit newsroom that invest...,2025-05-28T18:45:00Z,https://www.propublica.org/article/trump-usaid...
3,usaid kenya,Daily Signal,Mike Gonzalez,Congress Should Quickly Approve Trump’s Rescis...,President Donald Trump‘s rescission legislatio...,President Donald Trumps rescission legislation...,2025-06-10T12:00:00Z,https://www.dailysignal.com/2025/06/10/congres...
4,usaid kenya,Forbes,"Danielle Nierenberg, Contributor, \n Danielle ...",Food Safety Depends On Every Link In The Suppl...,Almost 1 in 10 people globally fall ill from c...,Colorful fish and vegetables can be purchased ...,2025-06-06T13:55:41Z,https://www.forbes.com/sites/daniellenierenber...


In [6]:
# Quick size check of the NewsAPI frame built above: (number of rows, number of columns)
df_news.shape

(592, 8)