### Merge Reddit Data

In [None]:
import pandas as pd
import os
import glob

# Define paths
reddit_raw_path = r"C:\Users\user\Desktop\USAID-Kenya-Sentiment-Analysis\USAID-Kenya-Sentiment-Analysis\data\raw\reddit_data"
output_file = r"C:\Users\user\Desktop\USAID-Kenya-Sentiment-Analysis\USAID-Kenya-Sentiment-Analysis\data\processed\Agatha_merged_reddit_dataset.csv"

# Define final Reddit column structure
final_columns = [
    "source", "subreddit", "post_title", "text", "author",
    "keyword", "published_date", "url"
]


def parse_published_date(values: pd.Series) -> pd.Series:
    """Convert a raw date column to pandas datetimes.

    Args:
        values: The raw ``published_date`` column of one input file.

    Returns:
        A datetime Series of the same length; entries that cannot be
        parsed become ``NaT``.
    """
    is_numeric = pd.api.types.is_numeric_dtype(values)
    if is_numeric and not pd.api.types.is_bool_dtype(values):
        # Stringifying a number (e.g. "1700000000.0") and parsing it as a
        # date yields NaT, so numeric columns are decoded as epoch seconds.
        # NOTE(review): assumes numeric date columns hold Unix epoch seconds,
        # which is how Reddit's API reports `created_utc` -- confirm against
        # the raw CSVs. The result is timezone-naive.
        return pd.to_datetime(values, unit="s", errors="coerce")

    # Text dates: trim whitespace and treat empty strings as missing.
    cleaned = values.astype(str).str.strip().replace('', pd.NA)
    return pd.to_datetime(cleaned, errors='coerce')


def normalize_reddit_frame(df: pd.DataFrame, columns=None) -> pd.DataFrame:
    """Map one raw Reddit export onto the standard column layout.

    Args:
        df: Raw frame as read from a single CSV file. It is not modified.
        columns: Output column names, in order. Defaults to ``final_columns``.

    Returns:
        A new frame holding exactly ``columns``: ``source`` is set to
        ``'Reddit'``, ``published_date`` is parsed to datetime, and any
        column absent from the input is filled with ``None``.
    """
    if columns is None:
        columns = final_columns

    # Rename only when the target name is free; renaming onto an existing
    # column would create duplicate labels and break the column selection.
    renames = {}
    if 'title' in df.columns and 'post_title' not in df.columns:
        renames['title'] = 'post_title'
    if 'selftext' in df.columns and 'text' not in df.columns:
        renames['selftext'] = 'text'
    normalized = df.rename(columns=renames)

    # Source of the publication date, in order of preference.
    for date_col in ('date_posted', 'created_utc'):
        if date_col in normalized.columns:
            normalized['published_date'] = normalized[date_col]
            break

    # Add static source column
    normalized['source'] = 'Reddit'

    # Add missing final columns with None
    for col in columns:
        if col not in normalized.columns:
            normalized[col] = None

    normalized['published_date'] = parse_published_date(
        normalized['published_date']
    )

    # Keep only final standardized columns, in the standard order
    return normalized[list(columns)]


# Load and normalize all Reddit files (sorted so the row order is reproducible)
reddit_files = sorted(glob.glob(os.path.join(reddit_raw_path, "*.csv")))
reddit_dfs = [normalize_reddit_frame(pd.read_csv(file)) for file in reddit_files]

if reddit_dfs:
    # Combine all rows
    reddit_df = pd.concat(reddit_dfs, ignore_index=True)

    # Save to the output location, creating the folder if necessary.
    # dirname is empty when output_file is a bare file name, and
    # os.makedirs('') raises, hence the guard.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    reddit_df.to_csv(output_file, index=False)
else:
    # pd.concat([]) raises an opaque ValueError; report the real problem and
    # do not overwrite an existing output file with an empty dataset.
    print(f"No CSV files found in {reddit_raw_path!r}; nothing was merged or saved.")
    reddit_df = pd.DataFrame(columns=final_columns)

# Show the first five rows of the merged dataset
reddit_df.head()


  df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
  df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')
  df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')


Unnamed: 0,source,subreddit,post_title,text,author,keyword,published_date,url
0,Reddit,Kenya,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,muerki,usaid kenya,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Reddit,Kenya,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Morio_anzenza,usaid kenya,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,Reddit,Kenya,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,vindtar,usaid kenya,2025-04-05 19:09:10,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Reddit,Kenya,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",Gold_Smart,usaid kenya,2025-03-25 08:18:04,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Reddit,Kenya,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,westmaxia,usaid kenya,2025-03-08 08:08:58,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [13]:
# Dimensions of the merged dataset as a (rows, columns) tuple
reddit_df.shape

(1293, 8)

In [22]:
# Per-column count of missing values (NaN / None / NaT) in the merged dataset
missing_mask = reddit_df.isna()
missing_mask.sum()


source              0
subreddit           0
post_title          0
text              389
author            827
keyword           547
published_date    347
url                 0
dtype: int64