### Data Collection 


#### 1. Reddit Scraping

In [8]:

import os
import praw
import pandas as pd

# Search three subreddits for USAID-related posts via PRAW and save the
# results as both CSV and JSON-lines files in the raw-data directory.

# ✅ Output directory: overridable via USAID_RAW_DATA_DIR, defaults to the
#    original project location so existing runs keep writing to the same place.
output_dir = os.environ.get(
    "USAID_RAW_DATA_DIR",
    r"N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data",
)
os.makedirs(output_dir, exist_ok=True)

# ✅ Credentials come from the environment so they are never stored in the notebook.
#    SECURITY: the client id/secret that used to be written here were exposed in
#    the notebook and should be revoked and regenerated in the Reddit app settings.
client_id = os.environ.get("REDDIT_CLIENT_ID")
client_secret = os.environ.get("REDDIT_CLIENT_SECRET")
if not client_id or not client_secret:
    raise RuntimeError(
        "Set the REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET environment variables "
        "before running this cell."
    )
user_agent = os.environ.get(
    "REDDIT_USER_AGENT", 'usaidKenyaScraper by u/IngenuityStunning997'
)

# ✅ Initialize Reddit instance
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# ✅ Define search parameters
subreddits = ['Kenya', 'development', 'worldnews']
query = "USAID funding OR donor cuts OR USAID health"
limit = 1000  # upper bound passed to each subreddit search

# ✅ Collect posts: one record per submission, across all subreddits
posts = []
for subreddit in subreddits:
    for submission in reddit.subreddit(subreddit).search(query, limit=limit):
        posts.append({
            'title': submission.title,
            'score': submission.score,
            'url': submission.url,
            # created_utc is converted from epoch seconds to a pandas Timestamp
            'created': pd.to_datetime(submission.created_utc, unit='s'),
            'subreddit': submission.subreddit.display_name,
            'selftext': submission.selftext
        })

# ✅ Convert to DataFrame (built once from the list of records)
df = pd.DataFrame(posts)

# ✅ Save as CSV and JSON (JSON is written as one record per line)
filename = "reddit_usaid_kenya"
csv_path = os.path.join(output_dir, f"{filename}.csv")
json_path = os.path.join(output_dir, f"{filename}.json")

df.to_csv(csv_path, index=False)
df.to_json(json_path, orient="records", lines=True)

print(f"✅ Saved {len(df)} Reddit posts to:")
print(f"   📄 CSV:  {csv_path}")
print(f"   📄 JSON: {json_path}")


✅ Saved 17 Reddit posts to:
   📄 CSV:  N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data\reddit_usaid_kenya.csv
   📄 JSON: N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data\reddit_usaid_kenya.json


In [None]:
# Disabled earlier draft of the Reddit scraper. The whole cell body is a single
# triple-quoted string literal, so nothing below executes.
# SECURITY: the hardcoded Reddit credentials that used to be embedded in this
# string were redacted; the draft now reads them from environment variables.
'''

import praw
import pandas as pd
import os

# Credentials are read from the environment (never hardcode them)
client_id = os.environ["REDDIT_CLIENT_ID"]
client_secret = os.environ["REDDIT_CLIENT_SECRET"]
user_agent = 'usaidKenyaScraper by u/IngenuityStunning997'

# Reddit instance
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# Define your search parameters
subreddits = ['Kenya', 'development', 'worldnews']  # Can add more
query = "USAID funding OR donor cuts OR health"
limit = 1000

# Fetch posts
posts = []
for subreddit in subreddits:
    for submission in reddit.subreddit(subreddit).search(query, limit=limit):
        posts.append({
            'title': submission.title,
            'score': submission.score,
            'url': submission.url,
            'created': pd.to_datetime(submission.created_utc, unit='s'),
            'subreddit': submission.subreddit.display_name,
            'selftext': submission.selftext
        })

# Convert to DataFrame
df = pd.DataFrame(posts)

# Save to CSV or JSON
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)
filename = "reddit_usaid_kenya"
csv_path = os.path.join(output_dir, f"{filename}.csv")
df.to_csv(csv_path, index=False)
print(f"✅ Saved {len(df)} Reddit posts to: {csv_path}")

'''

✅ Saved 163 Reddit posts to: output\reddit_usaid_kenya.csv


#### 2. NewsAPI.org

In [27]:
import requests
import pandas as pd
import os
from datetime import datetime, timedelta

# Query the NewsAPI "everything" endpoint for recent "USAID Kenya" articles and
# save the returned article metadata to a CSV file in the raw-data directory.

# 📌 NewsAPI key is read from the environment so it is never stored in the notebook.
#    SECURITY: the key that used to be written here was exposed in the notebook
#    and should be revoked and regenerated.
api_key = os.environ.get("NEWSAPI_KEY")
if not api_key:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable before running this cell.")

# ✅ Date window: from 29 days before today up to today
today = datetime.today()
from_date = (today - timedelta(days=29)).strftime('%Y-%m-%d')
to_date = today.strftime('%Y-%m-%d')

# 🔍 Search parameters
params = {
    'q': 'USAID Kenya',
    'from': from_date,
    'to': to_date,
    'language': 'en',
    'sortBy': 'relevancy',
    'pageSize': 100
}

# ✅ Headers to avoid 426 errors
headers = {
    'User-Agent': 'Mozilla/5.0',
    'X-Api-Key': api_key
}

# 🌐 Endpoint
url = 'https://newsapi.org/v2/everything'

# 📁 Output directory: overridable via USAID_RAW_DATA_DIR, defaults to the
#    original project location.
output_dir = os.environ.get(
    "USAID_RAW_DATA_DIR",
    r"N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data",
)
os.makedirs(output_dir, exist_ok=True)

# 📥 Fetch and process articles
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, params=params, headers=headers, timeout=30)

if response.status_code == 200:
    articles = response.json().get("articles", [])
    news_data = []

    for article in articles:
        # .get() keeps one incomplete article record from aborting the whole run;
        # missing fields are stored as None (empty cells in the CSV).
        source = article.get("source") or {}
        news_data.append({
            "source": source.get("name"),
            "author": article.get("author"),
            "title": article.get("title"),
            "description": article.get("description"),
            "url": article.get("url"),
            "publishedAt": article.get("publishedAt"),
            "content": article.get("content")
        })

    # Save to DataFrame and CSV
    df_news = pd.DataFrame(news_data)
    filename = "news_usaid_kenya_recent"
    csv_path = os.path.join(output_dir, f"{filename}.csv")
    df_news.to_csv(csv_path, index=False)
    print(f"📰 Saved {len(df_news)} news articles to: {csv_path}")

else:
    # Non-200 responses are reported rather than raised, so the notebook keeps running.
    print(f"❌ Failed to fetch articles. Status code: {response.status_code}")
    print(f"💡 Response content: {response.text}")


📰 Saved 27 news articles to: N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\mj 001 raw data\news_usaid_kenya_recent.csv


Combine NewsAPI (to get article URLs) with newspaper3k

News Headline Scraping (Basic Demo)

In [None]:
# Basic demo: scrape <h2> headline text from a topic page and save it as
# JSON-lines.
# NOTE: this cell relies on `requests`, `pd`, `os` and `output_dir` defined by
# an earlier cell; run those cells first.

# BeautifulSoup is used below but was never imported in this notebook, which
# makes the cell fail with NameError on a fresh kernel.
from bs4 import BeautifulSoup

# Example from The Standard (adjust for real pages)
url = "https://www.standardmedia.co.ke/topic/usaid"
# A timeout keeps the cell from hanging indefinitely if the server never answers.
page = requests.get(url, timeout=30)

if page.status_code == 200:
    soup = BeautifulSoup(page.content, 'html.parser')

    # Collect headlines, skipping tags whose text is empty after stripping
    articles = soup.find_all("h2")  # You may need to update this tag
    news_data = [{"headline": a.text.strip()} for a in articles if a.text.strip()]

    # Save to JSON (one record per line)
    df_news = pd.DataFrame(news_data)
    news_path = os.path.join(output_dir, "news_usaid_kenya.json")
    df_news.to_json(news_path, orient="records", lines=True)
    print(f"✅ Saved {len(df_news)} news headlines to: {news_path}")
else:
    # Do not parse an error page as if it were a list of headlines.
    print(f"❌ Failed to fetch headlines. Status code: {page.status_code}")


#### Twitter Scraping

In [None]:
import requests
import pandas as pd
import os
import time

# Page through the Twitter API v2 recent-search endpoint (up to 10 pages of
# 100 tweets) and save the collected tweets as JSON-lines and CSV.

# === Your Bearer Token ===
# Read from the TWITTER_BEARER_TOKEN environment variable so the real token is
# never pasted into the notebook; falls back to the original placeholder.
BEARER_TOKEN = os.environ.get("TWITTER_BEARER_TOKEN", "YOUR_BEARER_TOKEN_HERE")

# === Query and output settings ===
query = "(USAID funding OR donor cuts OR USAID health) Kenya lang:en"
output_dir = "./twitter_data"  # Customize your storage directory
filename = "tweets_usaid_kenya"  # Filename without extension

os.makedirs(output_dir, exist_ok=True)  # Ensure dir exists

# === Twitter API v2 endpoint and headers ===
search_url = "https://api.twitter.com/2/tweets/search/recent"
headers = {"Authorization": f"Bearer {BEARER_TOKEN}"}

# === Search parameters ===
params = {
    "query": query,
    "max_results": 100,
    "tweet.fields": "created_at,author_id,text",
}

# === Collect tweets ===
all_tweets = []

for _ in range(10):  # Up to 1000 tweets
    # A timeout keeps the loop from hanging indefinitely on an unresponsive server.
    response = requests.get(search_url, headers=headers, params=params, timeout=30)
    if response.status_code != 200:
        # Stop paging on the first failed request; tweets gathered so far are kept.
        print("❌ Error:", response.status_code, response.text)
        break

    data = response.json()
    for tweet in data.get("data", []):
        all_tweets.append({
            "date": tweet["created_at"],
            "author_id": tweet["author_id"],
            "text": tweet["text"]
        })

    # The response's meta.next_token drives pagination; no token means no more pages.
    next_token = data.get("meta", {}).get("next_token")
    if not next_token:
        break
    params["next_token"] = next_token
    time.sleep(1)  # Prevent rate limit

# === Save collected tweets ===
if all_tweets:
    df_tweets = pd.DataFrame(all_tweets)

    # Save as JSON
    json_path = os.path.join(output_dir, f"{filename}.json")
    df_tweets.to_json(json_path, orient="records", lines=True)
    print(f"✅ Saved JSON: {json_path}")

    # Optional: Save as CSV
    csv_path = os.path.join(output_dir, f"{filename}.csv")
    df_tweets.to_csv(csv_path, index=False)
    print(f"📄 Also saved as CSV: {csv_path}")
else:
    print("⚠️ No tweets collected.")
