### Data Collection 


#### 1. Reddit Scraping

In [8]:

import os
import praw
import pandas as pd

# Collect Reddit submissions matching the USAID query from a few subreddits
# and save them as CSV and line-delimited JSON.

# ✅ Output directory: set USAID_RAW_DATA_DIR to run on another machine;
#    when the variable is unset the original project location is used.
output_dir = os.environ.get(
    "USAID_RAW_DATA_DIR",
    r"N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data",
)
os.makedirs(output_dir, exist_ok=True)

# 🔐 Reddit API credentials are read from the environment so they never end up
#    in the notebook or in version control. The key pair that used to be
#    hardcoded here should be treated as compromised and regenerated.
client_id = os.environ.get("REDDIT_CLIENT_ID")
client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
user_agent = os.environ.get("REDDIT_USER_AGENT", 'usaidKenyaScraper by u/IngenuityStunning997')

if not client_id or not client_secret:
    raise RuntimeError(
        "Set the REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET environment variables "
        "before running this cell."
    )

# ✅ Initialize Reddit instance (read-only: no username/password supplied)
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# ✅ Define search parameters
subreddits = ['Kenya', 'development', 'worldnews']
query = "USAID funding OR donor cuts OR USAID health"
limit = 1000  # upper bound requested per subreddit; the API may return fewer

# ✅ Collect posts: one record per submission returned by each subreddit search
posts = []
for subreddit in subreddits:
    for submission in reddit.subreddit(subreddit).search(query, limit=limit):
        posts.append({
            'title': submission.title,
            'score': submission.score,
            'url': submission.url,
            # created_utc is a Unix timestamp in seconds
            'created': pd.to_datetime(submission.created_utc, unit='s'),
            'subreddit': submission.subreddit.display_name,
            'selftext': submission.selftext
        })

# ✅ Convert to DataFrame. Columns are pinned so that an empty search result
#    still produces files with the expected header/schema.
post_columns = ['title', 'score', 'url', 'created', 'subreddit', 'selftext']
df = pd.DataFrame(posts, columns=post_columns)

# ✅ Save as CSV and JSON (one JSON object per line)
filename = "reddit_usaid_kenya"
csv_path = os.path.join(output_dir, f"{filename}.csv")
json_path = os.path.join(output_dir, f"{filename}.json")

df.to_csv(csv_path, index=False)
df.to_json(json_path, orient="records", lines=True)

print(f"✅ Saved {len(df)} Reddit posts to:")
print(f"   📄 CSV:  {csv_path}")
print(f"   📄 JSON: {json_path}")


✅ Saved 17 Reddit posts to:
   📄 CSV:  N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data\reddit_usaid_kenya.csv
   📄 JSON: N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data\reddit_usaid_kenya.json


In [None]:
# Archived earlier version of the Reddit scraper, kept for reference only.
# The code is wrapped in a string literal, so nothing in this cell executes.
# Credentials were removed from the archived text: even disabled code must not
# carry secrets, because the notebook file itself gets shared and committed.
'''

import praw
import pandas as pd
import os

# Credentials are read from the environment (never hardcode them)
client_id = os.environ["REDDIT_CLIENT_ID"]
client_secret = os.environ["REDDIT_CLIENT_SECRET"]
user_agent = 'usaidKenyaScraper by u/IngenuityStunning997'

# Reddit instance
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# Define your search parameters
subreddits = ['Kenya', 'development', 'worldnews']  # Can add more
query = "USAID funding OR donor cuts OR health"
limit = 1000

# Fetch posts
posts = []
for subreddit in subreddits:
    for submission in reddit.subreddit(subreddit).search(query, limit=limit):
        posts.append({
            'title': submission.title,
            'score': submission.score,
            'url': submission.url,
            'created': pd.to_datetime(submission.created_utc, unit='s'),
            'subreddit': submission.subreddit.display_name,
            'selftext': submission.selftext
        })

# Convert to DataFrame
df = pd.DataFrame(posts)

# Save to CSV or JSON
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
filename = "reddit_usaid_kenya"
csv_path = os.path.join(output_dir, f"{filename}.csv")
df.to_csv(csv_path, index=False)
print(f"✅ Saved {len(df)} Reddit posts to: {csv_path}")

'''

✅ Saved 163 Reddit posts to: output\reddit_usaid_kenya.csv


#### 2. NewsAPI.org

In [27]:
import requests
import pandas as pd
import os
from datetime import datetime, timedelta

# Query NewsAPI's /v2/everything endpoint for recent "USAID Kenya" articles
# and save the returned metadata (no full text) to CSV.

# 🔐 NewsAPI key is read from the environment so it is never stored in the
#    notebook. The key that used to be hardcoded here should be regenerated.
api_key = os.environ.get("NEWSAPI_KEY")
if not api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable before running this cell.")

# ✅ Limit to the last 30 days (today plus the 29 preceding days)
today = datetime.today()
from_date = (today - timedelta(days=29)).strftime('%Y-%m-%d')
to_date = today.strftime('%Y-%m-%d')

# 🔍 Search parameters
params = {
    'q': 'USAID Kenya',
    'from': from_date,
    'to': to_date,
    'language': 'en',
    'sortBy': 'relevancy',
    'pageSize': 100
}

# ✅ Headers to avoid 426 errors
headers = {
    'User-Agent': 'Mozilla/5.0',
    'X-Api-Key': api_key
}

# 🌐 Endpoint
url = 'https://newsapi.org/v2/everything'

# 📁 Output directory: set USAID_RAW_DATA_DIR to run on another machine;
#    when the variable is unset the original project location is used.
output_dir = os.environ.get(
    "USAID_RAW_DATA_DIR",
    r"N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data",
)
os.makedirs(output_dir, exist_ok=True)

# 📥 Fetch and process articles.
# An explicit timeout is required: without one, requests can wait forever on
# an unresponsive server and hang the kernel.
response = requests.get(url, params=params, headers=headers, timeout=30)

if response.status_code == 200:
    articles = response.json().get("articles", [])
    news_data = []

    for article in articles:
        # .get() keeps one malformed record from aborting the whole cell;
        # missing fields become None (empty cells in the CSV).
        news_data.append({
            "source": (article.get("source") or {}).get("name"),
            "author": article.get("author"),
            "title": article.get("title"),
            "description": article.get("description"),
            "url": article.get("url"),
            "publishedAt": article.get("publishedAt"),
            "content": article.get("content")
        })

    # Save to DataFrame and CSV
    df_news = pd.DataFrame(news_data)
    filename = "news_usaid_kenya_recent"
    csv_path = os.path.join(output_dir, f"{filename}.csv")
    df_news.to_csv(csv_path, index=False)
    print(f"📰 Saved {len(df_news)} news articles to: {csv_path}")

else:
    print(f"❌ Failed to fetch articles. Status code: {response.status_code}")
    print(f"💡 Response content: {response.text}")


📰 Saved 27 news articles to: N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data\news_usaid_kenya_recent.csv


Combine NewsAPI (to get article URLs) with newspaper3k (to download each article and extract its full text)

In [28]:
import requests
import pandas as pd
import os
from datetime import datetime, timedelta
from newspaper import Article
import time

# Fetch "USAID Kenya" article metadata from NewsAPI, then download each
# article URL with newspaper3k to extract its full text, and save to CSV.

# 🔐 NewsAPI key is read from the environment so it is never stored in the
#    notebook. The key that used to be hardcoded here should be regenerated.
api_key = os.environ.get("NEWSAPI_KEY")
if not api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable before running this cell.")

# 📆 Limit dates for Free Plan (last 30 days: today plus the 29 preceding days)
today = datetime.today()
from_date = (today - timedelta(days=29)).strftime('%Y-%m-%d')
to_date = today.strftime('%Y-%m-%d')

# 🌐 API query
params = {
    'q': 'USAID Kenya',
    'from': from_date,
    'to': to_date,
    'language': 'en',
    'sortBy': 'relevancy',
    'pageSize': 100
}
headers = {
    'User-Agent': 'Mozilla/5.0',
    'X-Api-Key': api_key
}
url = 'https://newsapi.org/v2/everything'

# 📁 Storage path: set USAID_RAW_DATA_DIR to run on another machine;
#    when the variable is unset the original project location is used.
output_dir = os.environ.get(
    "USAID_RAW_DATA_DIR",
    r"N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data",
)
os.makedirs(output_dir, exist_ok=True)

# 🔍 Fetch article metadata.
# An explicit timeout is required: without one, requests can wait forever on
# an unresponsive server and hang the kernel.
response = requests.get(url, params=params, headers=headers, timeout=30)

if response.status_code == 200:
    articles = response.json().get("articles", [])
    enriched_data = []

    for idx, article in enumerate(articles):
        print(f"🔍 Extracting full text for article {idx+1}/{len(articles)}")
        article_url = article.get("url")

        try:
            news_article = Article(article_url)
            news_article.download()
            news_article.parse()

            full_text = news_article.text
            enriched_data.append({
                "source": (article.get("source") or {}).get("name"),
                "author": article.get("author"),
                "title": article.get("title"),
                "description": article.get("description"),
                "url": article_url,
                "publishedAt": article.get("publishedAt"),
                # NewsAPI's "content" field is stored under "summary" here
                "summary": article.get("content"),
                "full_text": full_text
            })

        except Exception as e:
            # Best-effort scrape: a failed download/parse (e.g. HTTP 403)
            # drops that article and the loop moves on to the next one.
            print(f"⚠️ Skipping article due to error: {e}")

        finally:
            # Be nice to servers. The pause lives in `finally` so it also
            # applies after a failed request, not only after a success.
            time.sleep(1)

    # 💾 Save to CSV
    df_full = pd.DataFrame(enriched_data)
    csv_path = os.path.join(output_dir, "news_usaid_kenya_fulltext.csv")
    df_full.to_csv(csv_path, index=False, encoding="utf-8")
    print(f"✅ Saved {len(df_full)} articles with full text to: {csv_path}")

else:
    print(f"❌ Failed to fetch articles. Status code: {response.status_code}")
    print(f"💡 Response: {response.text}")


🔍 Extracting full text for article 1/27
🔍 Extracting full text for article 2/27
🔍 Extracting full text for article 3/27
🔍 Extracting full text for article 4/27
🔍 Extracting full text for article 5/27
🔍 Extracting full text for article 6/27
🔍 Extracting full text for article 7/27
🔍 Extracting full text for article 8/27
🔍 Extracting full text for article 9/27
⚠️ Skipping article due to error: Article `download()` failed with 403 Client Error: Max restarts limit reached for url: https://www.forbes.com/sites/sverrealvik/2025/05/20/africa-needs-more-renewables-so-why-is-it-investing-in-fossil-fuels/ on URL https://www.forbes.com/sites/sverrealvik/2025/05/20/africa-needs-more-renewables-so-why-is-it-investing-in-fossil-fuels/
🔍 Extracting full text for article 10/27
🔍 Extracting full text for article 11/27
🔍 Extracting full text for article 12/27
🔍 Extracting full text for article 13/27
🔍 Extracting full text for article 14/27
🔍 Extracting full text for article 15/27
⚠️ Skipping article due 

#### Twitter Scraping