### USAID FUNDING CUTS SENTIMENT ANALYSIS

### Introduction

.....(light intro text)
.....(TBD)

### Data 

This is the data preparation phase of the project. The dataset used here is compiled from two primary sources: Reddit (via web scraping) and NewsAPI (via API calls). Each contributor collected data independently from these platforms, targeting topics relevant to the analysis. Below, we begin by importing the collected datasets, merging them, and performing initial cleaning steps to prepare the data for further exploration and modeling.



#### Data Importation

##### news_data

In [2]:
import os
import pandas as pd

# Directory that holds the raw news CSV exports
folder_path = r'N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\raw\news_data'

# Keep only the directory entries whose name ends in ".csv"
csv_files = list(filter(lambda name: name.endswith('.csv'), os.listdir(folder_path)))

# Print the header row of every file so the column schemas can be compared
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    try:
        # nrows=0 parses the header line only; no data rows are loaded
        df = pd.read_csv(file_path, nrows=0)
        print(f"Columns in {file}:")
        print(df.columns.tolist())
        print("-" * 50)
    except Exception as err:
        # Report the unreadable file and carry on with the next one
        print(f"Error reading {file}: {err}")


Columns in Agatha_news.csv:
['keyword', 'source', 'author', 'title', 'description', 'content', 'publishedAt', 'url']
--------------------------------------------------
Columns in cecilia.newsapi.csv:
['keyword', 'source', 'title', 'description', 'url', 'publishedAt']
--------------------------------------------------
Columns in gnews_usaid_kenya_full.csv:
['title', 'url', 'published_date', 'source', 'text']
--------------------------------------------------
Columns in gnews_usaid_kenya_full_en_sw.csv:
['title', 'url', 'published_date', 'source', 'language', 'text']
--------------------------------------------------
Columns in leo_newsapi_articles.csv:
['source', 'author', 'title', 'description', 'content', 'url', 'published_at']
--------------------------------------------------
Columns in leo_newsapi_articles_enriched.csv:
['source', 'author', 'title', 'description', 'content', 'url', 'published_at', 'full_text']
--------------------------------------------------
Columns in Mbego_news

##### reddit_data

In [4]:
import os
import pandas as pd

# Directory that holds the raw Reddit CSV exports (reddit_data)
folder_path = r'N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\raw\reddit_data'

# Gather the names of all entries in that directory ending in ".csv"
csv_files = []
for entry in os.listdir(folder_path):
    if entry.endswith('.csv'):
        csv_files.append(entry)

# Show the header row of each file; only the column names are needed here
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    try:
        # nrows=0 reads just the header, so the frame has columns but no rows
        df = pd.read_csv(file_path, nrows=0)
        print(f"Columns in {file}:")
        print(df.columns.tolist())
        print("-" * 50)
    except Exception as err:
        # A file that cannot be parsed is reported and skipped
        print(f"Error reading {file}: {err}")


Columns in Agatha_reddit.csv:
['title', 'selftext', 'subreddit', 'author', 'created_utc', 'url', 'score', 'num_comments', 'keyword']
--------------------------------------------------
Columns in cecilia.redditsubs.csv:
['subreddit', 'keyword', 'title', 'text', 'date_posted', 'upvotes', 'comments', 'url', 'permalink']
--------------------------------------------------
Columns in cecilia.reddit_nbo_ke_africa.csv:
['subreddit', 'keyword', 'title', 'text', 'date_posted', 'upvotes', 'comments', 'url', 'permalink']
--------------------------------------------------
Columns in leo_reddit_posts.csv:
['subreddit', 'search_term', 'title', 'text', 'created_utc', 'created_date', 'score', 'num_comments', 'permalink', 'url']
--------------------------------------------------
Columns in Mbego_reddit_usaid_kenya.csv:
['title', 'score', 'url', 'created', 'subreddit', 'selftext']
--------------------------------------------------
Columns in Mbego_reddit_usaid_kenya2.csv:
['title', 'score', 'url', 'creat

#### Data Merging 

##### news_data



In [None]:
import os
import pandas as pd

# Folder containing all News CSVs.
# NOTE(review): this is a relative path, so it is resolved against the kernel's
# current working directory — confirm that is where the news CSVs live.
folder_path = 'news_data'

# Final save location
save_path = r"N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\processed"

# All .csv files in the news_data folder. os.listdir() returns names in
# arbitrary order, so sort them (case-insensitively) to keep the row order of
# the merged file reproducible from run to run.
news_files = sorted(
    (f for f in os.listdir(folder_path) if f.endswith('.csv')),
    key=str.lower,
)

# Define the final standardized columns
standard_news_cols = [
    'keyword', 'source', 'author', 'title', 'description', 'content',
    'summary', 'full_text', 'publishedAt', 'url', 'language'
]

# Aligned per-file frames are collected here and concatenated once after the
# loop; concatenating inside the loop re-copies the growing frame every time.
news_frames = []

# Loop through each file
for file in news_files:
    file_path = os.path.join(folder_path, file)
    try:
        df = pd.read_csv(file_path)

        # Rename columns to match standard (returns a new frame, no in-place edit)
        df = df.rename(columns={
            'published_at': 'publishedAt',
            'published_date': 'publishedAt',
            'text': 'content',
        })

        # Add missing columns
        for col in standard_news_cols:
            if col not in df.columns:
                df[col] = pd.NA

        # Align column order; columns outside the standard schema are dropped
        aligned = df[standard_news_cols]

        # If a file holds both an alias and its target (e.g. 'text' and
        # 'content'), the rename yields duplicate column names. Reject the file
        # here so it is skipped like any other per-file failure instead of
        # breaking the final concatenation.
        if aligned.columns.duplicated().any():
            raise ValueError("duplicate column names after renaming")

        news_frames.append(aligned)

        print(f"✅ Merged: {file}")
    except Exception as e:
        # Best-effort merge: report the failing file and continue with the rest
        print(f"❌ Error processing {file}: {e}")

# Build the master DataFrame; fall back to an empty frame with the standard
# columns when no file could be merged (pd.concat rejects an empty list).
if news_frames:
    merged_news_df = pd.concat(news_frames, ignore_index=True)
else:
    merged_news_df = pd.DataFrame(columns=standard_news_cols)

# Save merged file (create the target directory first if it does not exist)
os.makedirs(save_path, exist_ok=True)
output_path = os.path.join(save_path, 'all_news_merged.csv')
merged_news_df.to_csv(output_path, index=False)

print(f"\n✅ All News files merged and saved to '{output_path}'")


##### reddit_data

In [None]:
import os
import pandas as pd

# Folder containing all Reddit CSVs.
# This must be a raw string: in a normal string literal the Windows path below
# is a SyntaxError ('\U' starts a unicode escape), and '\a' / '\r' would be
# turned into control characters.
folder_path = r'N:\Moringa\afterM\Leo NLP 004 USAID 01.06.2025\USAID-Kenya-Sentiment-Analysis\data\raw\reddit_data'
reddit_files = [
    'Agatha_reddit.csv',
    'cecilia.redditsubs.csv',
    'cecilia.reddit_nbo_ke_africa.csv',
    'leo_reddit_posts.csv',
    'Mbego_reddit_usaid_kenya.csv',
    'Mbego_reddit_usaid_kenya2.csv',
    'reddit_usaid_sentiment.csv',
    'ruth_reddit.csv'
]

# standard columns
standard_cols = [
    'title', 'selftext', 'subreddit', 'author', 'created_utc',
    'created_date', 'score', 'num_comments', 'keyword', 'search_term',
    'date_posted', 'upvotes', 'comments', 'url', 'permalink'
]

# Aligned per-file frames are collected here and concatenated once after the
# loop; concatenating inside the loop re-copies the growing frame every time.
reddit_frames = []

# Load and align each file
for file in reddit_files:
    file_path = os.path.join(folder_path, file)
    try:
        df = pd.read_csv(file_path)

        # Rename common variations (returns a new frame, no in-place edit)
        df = df.rename(columns={
            'text': 'selftext',
            'created': 'created_utc',
        })

        # Add missing columns with NaNs
        for col in standard_cols:
            if col not in df.columns:
                df[col] = pd.NA

        # Keep only standard columns (in that order)
        aligned = df[standard_cols]

        # If a file holds both an alias and its target (e.g. 'text' and
        # 'selftext'), the rename yields duplicate column names. Reject the
        # file here so it is skipped like any other per-file failure instead of
        # breaking the final concatenation.
        if aligned.columns.duplicated().any():
            raise ValueError("duplicate column names after renaming")

        reddit_frames.append(aligned)

        print(f"Merged: {file}")
    except Exception as e:
        # Best-effort merge: report the failing file and continue with the rest
        print(f"❌ Error reading {file}: {e}")

# Build the master DataFrame; fall back to an empty frame with the standard
# columns when no file could be merged (pd.concat rejects an empty list).
if reddit_frames:
    merged_df = pd.concat(reddit_frames, ignore_index=True)
else:
    merged_df = pd.DataFrame(columns=standard_cols)

# Save final merged file. `save_path` is the processed-data folder assigned in
# the news merging cell earlier in this notebook.
os.makedirs(save_path, exist_ok=True)
reddit_output_path = os.path.join(save_path, 'mbego_all_reddit_merged.csv')
merged_df.to_csv(reddit_output_path, index=False)

# Report the path that was actually written (the previous message named a
# different file than the one saved).
print(f"\n✅ All Reddit files merged and saved to '{reddit_output_path}'")




#### Data Understanding 