In [3]:
import os
import sys
from datetime import datetime, timezone

import pandas as pd
import praw

# ✅ Allow import from parent directory (where config.py is)
sys.path.append(os.path.abspath(os.path.join('..')))

# ✅ Import credentials from config.py
from config import REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT, YOUR_IDENTIFIER

# 🗝️ Confirm config loaded
#print(f"[INFO] Running as: {YOUR_IDENTIFIER}")

# 📆 Step 2: Define keywords, subreddits, and date filter
# Search terms; every term is run against every subreddit listed below.
keywords = [
    "usaid kenya",
    "usaid funding",
    "usaid budget cut",
    "kenya foreign aid",
    "usaid kenya funding cut",
    "usaid suspended funding",
    "development aid kenya",
    "kenya donor funding",
]

subreddits = ["Kenya", "Africa", "worldnews"]

# Start date: Jan 20, 2025 (Trump announcement).
# The cutoff is compared against Reddit's `created_utc` (a UTC epoch value), so
# it must be built from a timezone-aware UTC datetime. A naive
# datetime(...).timestamp() is interpreted in the machine's local timezone and
# would shift the cutoff by the local UTC offset.
start_date = int(datetime(2025, 1, 20, tzinfo=timezone.utc).timestamp())

# 📲 Step 3: Authenticate with Reddit using PRAW
# Gather the credentials imported from config.py, then build the PRAW client
# from them in a single call.
_reddit_auth = {
    "client_id": REDDIT_CLIENT_ID,
    "client_secret": REDDIT_CLIENT_SECRET,
    "user_agent": REDDIT_USER_AGENT,
}
reddit = praw.Reddit(**_reddit_auth)

# 📥 Step 4: Scrape submissions from selected subreddits and keywords
# One record (dict) per matching submission, accumulated across all
# subreddit/keyword combinations.
# NOTE: a submission that matches several keywords is appended once per
# keyword (only the "keyword" field differs), so de-duplicate downstream if
# unique posts are required.
posts = []

for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    print(f"[...] Searching r/{subreddit_name}")
    for keyword in keywords:
        for submission in subreddit.search(keyword, sort="new", limit=100):
            # Ignore anything posted before the cutoff.
            if submission.created_utc < start_date:
                continue

            # `created_utc` is a UTC epoch value; convert it with an explicit
            # UTC tzinfo (datetime.utcfromtimestamp is deprecated since
            # Python 3.12). The formatted string is unchanged.
            created_at = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)

            posts.append({
                "title": submission.title,
                "selftext": submission.selftext,
                "subreddit": submission.subreddit.display_name,
                "author": str(submission.author),
                "created_utc": created_at.strftime('%Y-%m-%d %H:%M:%S'),
                "url": submission.url,
                "score": submission.score,
                "num_comments": submission.num_comments,
                "keyword": keyword
            })

# 🧾 Step 5: Convert to DataFrame
# Fix the column list explicitly so that an empty scrape still produces a
# DataFrame (and a CSV header) with the expected schema.
post_columns = [
    "title", "selftext", "subreddit", "author", "created_utc",
    "url", "score", "num_comments", "keyword",
]
df_reddit = pd.DataFrame(posts, columns=post_columns)

# Step 6: Save to CSV in the raw data folder.
# config.py is imported from the parent of the working directory, so the
# project root is resolved the same way here instead of hard-coding an absolute
# path that only exists on one machine.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
output_path = os.path.join(project_root, "data", "raw", "Agatha_reddit.csv")
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_reddit.to_csv(output_path, index=False)

# Final message (uses df_reddit; the earlier draft referenced an undefined `df`).
print(f"[✔] Scraped {len(df_reddit)} posts since Jan 20, 2025. Data saved to:\n{output_path}")
df_reddit.head()

[...] Searching r/Kenya
[...] Searching r/Africa
[...] Searching r/worldnews


Unnamed: 0,title,selftext,subreddit,author,created_utc,url,score,num_comments,keyword
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,Kenya,muerki,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...,3,5,usaid kenya
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Kenya,Morio_anzenza,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...,169,95,usaid kenya
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,Kenya,vindtar,2025-04-05 19:09:10,https://www.reddit.com/r/Kenya/comments/1jsb14...,2,2,usaid kenya
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",Kenya,Gold_Smart,2025-03-25 08:18:04,https://www.reddit.com/r/Kenya/comments/1jjehw...,13,20,usaid kenya
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,Kenya,westmaxia,2025-03-08 08:08:58,https://www.reddit.com/r/Kenya/comments/1j6cjz...,1,6,usaid kenya


In [4]:
# Display the (rows, columns) dimensions of the scraped Reddit DataFrame.
df_reddit.shape

(466, 9)

## Data collection from NewsAPI.org

In [None]:
import requests
import json
import os
import sys
from datetime import datetime, timedelta, timezone # Import timezone for UTC
import pandas as pd # Import pandas for DataFrame creation
import time # Import time for sleep

# --- Path Configuration for finding config.py ---
# config.py is expected in the project root, which is the parent of the current
# working directory, so we need to go up one level to find the project root.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

# Add project_root to sys.path to allow importing from config.py
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# --- Import API credentials and personal identifier from config.py ---
try:
    from config import NEWS_API_KEY, YOUR_IDENTIFIER
    print("API Key and Identifier loaded from config.py")
except ImportError:
    print("Error: config.py not found or missing NEWS_API_KEY.")
    print("Please ensure config.py is in the project root and defines NEWS_API_KEY and YOUR_IDENTIFIER.")
    # Stop here: everything below needs NEWS_API_KEY / YOUR_IDENTIFIER, and
    # continuing would only fail later with a less helpful NameError.
    raise
    
# --- Configuration for News API Data Collection ---
NEWS_API_BASE_URL = "https://newsapi.org/v2/everything"

# Each phrase is sent to the endpoint as its own query.
SEARCH_KEYWORDS = [
    "USAID Kenya funding cuts", "USAID Kenya aid",
    "US aid Kenya", "American aid Kenya",
    "Kenya development aid", "foreign aid Kenya cuts",
    "USAID Kenya health", "USAID Kenya education",
]

# Collection window: the 30 days ending now, expressed in UTC.
# (The NewsAPI free tier typically only reaches about one month back on the
# 'everything' endpoint, hence the 30-day look-back.)
_LOOKBACK_WINDOW = timedelta(days=30)
END_DATE = datetime.now(timezone.utc)
START_DATE = END_DATE - _LOOKBACK_WINDOW

# Upper bound on articles requested per keyword in one API call
# (max for a single request to the 'everything' endpoint).
ARTICLES_PER_QUERY = 100

# Raw News data is written under <project_root>/data/raw; create the folder
# up front so later writes cannot fail on a missing directory.
DATA_OUTPUT_DIR = os.path.join(project_root, "data", "raw")
os.makedirs(DATA_OUTPUT_DIR, exist_ok=True)

def _flatten_article(article):
    """Reduce one NewsAPI article payload to the flat record kept by this notebook.

    Missing fields (including a missing or null 'source') become None instead of
    raising, so one malformed article does not discard the rest of the batch.
    """
    source = article.get('source') or {}
    return {
        'source_id': source.get('id'),
        'source_name': source.get('name'),
        'author': article.get('author'),
        'title': article.get('title'),
        'description': article.get('description'),
        'url': article.get('url'),
        'urlToImage': article.get('urlToImage'),
        'publishedAt': article.get('publishedAt'),
        'content': article.get('content')
    }


def collect_news_data_in_notebook(api_key, query, from_date, to_date, page_size=ARTICLES_PER_QUERY, timeout=30):
    """
    Collects news articles from NewsAPI.org based on a search query.

    Parameters:
        api_key:   NewsAPI key. Sent in the 'X-Api-Key' header rather than the
                   query string so it cannot end up in the request URL that
                   `requests` includes in the error messages printed below.
        query:     Search phrase passed as the 'q' parameter.
        from_date: Start of the window; a datetime formatted as
                   'YYYY-MM-DDTHH:MM:SS' with no UTC offset appended.
        to_date:   End of the window, formatted the same way.
        page_size: Number of articles requested in the single call made.
        timeout:   Seconds to wait for the HTTP request before giving up, so a
                   stalled connection cannot hang the notebook indefinitely.

    Returns:
        A list of dictionaries (articles). Errors are reported with print() and
        result in whatever was collected so far (possibly an empty list); no
        exception is propagated to the caller.
    """
    articles_data = []

    # Format dates for API request
    from_date_str = from_date.strftime('%Y-%m-%dT%H:%M:%S')
    to_date_str = to_date.strftime('%Y-%m-%dT%H:%M:%S')

    params = {
        'q': query,
        'from': from_date_str,
        'to': to_date_str,
        'language': 'en',
        'sortBy': 'relevancy',
        'pageSize': page_size
    }
    headers = {'X-Api-Key': api_key}

    print(f"Searching NewsAPI for: '{query}' from {from_date_str} to {to_date_str}...")
    try:
        response = requests.get(NEWS_API_BASE_URL, params=params, headers=headers, timeout=timeout)
        response.raise_for_status()  # Raise an exception for HTTP errors (4xx or 5xx)

        data = response.json()

        if data.get('status') == 'ok':
            articles = data.get('articles') or []
            print(f"  Found {len(articles)} articles for '{query}'.")
            articles_data.extend(_flatten_article(article) for article in articles)
        else:
            print(f"  NewsAPI error for '{query}': {data.get('message', 'Unknown error')}")

    except requests.exceptions.RequestException as req_err:
        print(f"  Error collecting for '{query}': {req_err}")
    except Exception as e:
        # Deliberate best-effort: report and return what was gathered so the
        # remaining keywords can still be processed by the caller.
        print(f"  An unexpected error occurred for '{query}': {e}")

    return articles_data

# --- Main execution in notebook cell ---
all_articles_raw = []

# Query NewsAPI once per keyword and pool every hit into a single list.
for keyword in SEARCH_KEYWORDS:
    all_articles_raw.extend(
        collect_news_data_in_notebook(NEWS_API_KEY, keyword, START_DATE, END_DATE, ARTICLES_PER_QUERY)
    )
    # Brief pause between requests to stay clear of rate limits;
    # increase it if the API starts rejecting calls.
    time.sleep(1)

print(f"\nTotal articles collected (before deduplication): {len(all_articles_raw)}")

# Key the articles by URL: an article returned by several searches collapses
# to one entry (the last occurrence wins), and records with no URL are dropped.
unique_articles_dict = {
    article['url']: article
    for article in all_articles_raw
    if article.get('url')
}

final_articles_list = list(unique_articles_dict.values())
print(f"Collected {len(final_articles_list)} unique articles.")

# Persist the raw records as JSON, named after the collector and the current
# local time so repeated runs do not overwrite each other.
collector_id = YOUR_IDENTIFIER
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
json_filename = os.path.join(DATA_OUTPUT_DIR, f"{collector_id}_news_data_{timestamp}.json")

with open(json_filename, 'w', encoding='utf-8') as out_file:
    json.dump(final_articles_list, out_file, ensure_ascii=False, indent=4)
print(f"\nRaw News data saved to: {json_filename}")

# Tabular view of the de-duplicated articles for the cells that follow.
news_df = pd.DataFrame(final_articles_list)


API Key and Identifier loaded from config.py
Searching NewsAPI for: 'USAID Kenya funding cuts' from 2025-05-08T07:09:23 to 2025-06-07T07:09:23...
  Found 14 articles for 'USAID Kenya funding cuts'.
Searching NewsAPI for: 'USAID Kenya aid' from 2025-05-08T07:09:23 to 2025-06-07T07:09:23...
  Found 19 articles for 'USAID Kenya aid'.
Searching NewsAPI for: 'US aid Kenya' from 2025-05-08T07:09:23 to 2025-06-07T07:09:23...
  Found 43 articles for 'US aid Kenya'.
Searching NewsAPI for: 'American aid Kenya' from 2025-05-08T07:09:23 to 2025-06-07T07:09:23...
  Found 16 articles for 'American aid Kenya'.
Searching NewsAPI for: 'Kenya development aid' from 2025-05-08T07:09:23 to 2025-06-07T07:09:23...
  Found 49 articles for 'Kenya development aid'.
Searching NewsAPI for: 'foreign aid Kenya cuts' from 2025-05-08T07:09:23 to 2025-06-07T07:09:23...
  Found 25 articles for 'foreign aid Kenya cuts'.
Searching NewsAPI for: 'USAID Kenya health' from 2025-05-08T07:09:23 to 2025-06-07T07:09:23...
  Foun

In [10]:
# Preview the first rows of the de-duplicated NewsAPI articles DataFrame.
news_df.head()

Unnamed: 0,source_id,source_name,author,title,description,url,urlToImage,publishedAt,content
0,,ProPublica,by Brett Murphy and Anna Maria Barry-Jester,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \n...,https://www.propublica.org/article/trump-usaid...,https://img.assets-d.propublica.org/v5/images/...,2025-05-28T18:45:00Z,ProPublica is a nonprofit newsroom that invest...
1,,Yahoo Entertainment,TAIWO ADEBAYO,Children die as USAID aid cuts snap a lifeline...,"Under the dappled light of a thatched shelter,...",https://www.yahoo.com/news/children-die-usaid-...,https://s.yimg.com/ny/api/res/1.2/dF26yqA3e5Sy...,2025-05-16T04:15:24Z,"DIKWA, Nigeria (AP) Under the dappled light of..."
2,,DW (English),Silja Fröhlich,How hard are the USAID cuts hitting Africa?,"HIV patients are losing access to medication, ...",https://www.dw.com/en/how-hard-are-usaid-cuts-...,https://static.dw.com/image/72449345_6.jpg,2025-05-11T06:30:00Z,For the inhabitants of remote South African vi...
3,,DW (English),Silja Fröhlich,How hard are USAID cuts hitting Africa's healt...,"HIV patients are losing access to medication, ...",https://www.dw.com/en/how-hard-are-usaid-cuts-...,https://static.dw.com/image/72449345_6.jpg,2025-05-11T06:30:00Z,For the inhabitants of remote South African vi...
4,,Boston Herald,Associated Press,Children die as USAID aid cuts snap a lifeline...,"Globally, 50% of the therapeutic foods for tre...",https://www.bostonherald.com/2025/05/16/usaid-...,https://www.bostonherald.com/wp-content/upload...,2025-05-16T18:59:12Z,"By TAIWO ADEBAYO\r\nDIKWA, Nigeria (AP) Under ..."
