# USAID-Kenya-Sentiment-Analysis

#    Merging Reddit CSV Files

### Import neccessary libraries

In [1]:
import pandas as pd
import glob
import os


### Folder path

In [2]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data'


### CSV file paths

In [3]:
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
csv_files

['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Agatha_reddit.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.redditsubs.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.reddit_nbo_ke_africa.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\leo_reddit_posts.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya2.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_sentiment.csv',
 'C:/Users/hp/Deskto

### Reading the files and combining them into one dataframe

In [4]:

preferred_columns = ['post_title', 'text', 'keyword', 'published_date', 'url']
all_dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    
    # Rename common column variants
    if 'title' in df.columns:
        df.rename(columns={'title': 'post_title'}, inplace=True)
    if 'selftext' in df.columns and 'text' not in df.columns:
        df.rename(columns={'selftext': 'text'}, inplace=True)
    if 'date_posted' in df.columns:
        df.rename(columns={'date_posted': 'published_date'}, inplace=True)
    elif 'created_utc' in df.columns:
        df.rename(columns={'created_utc': 'published_date'}, inplace=True)
    
    # Add missing columns
    for col in preferred_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

    # Track file source
    df['source_file'] = os.path.basename(file)
    
    # Keep only standardized columns
    df = df[preferred_columns + ['source_file']]

    all_dfs.append(df)

# Combine all
combined_df = pd.concat(all_dfs, ignore_index=True)
combined_df.head()


Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05 19:09:10,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25 08:18:04,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08 08:08:58,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [5]:
combined_df.shape #check the shape of the dataset

(1306, 6)

In [6]:
combined_df.isna().sum() #check for missing values

post_title          0
text              398
keyword           564
published_date    197
url                 0
source_file         0
dtype: int64

In [7]:
combined_df.columns

Index(['post_title', 'text', 'keyword', 'published_date', 'url',
       'source_file'],
      dtype='object')

In [8]:
# Extract date and time
combined_df['time'] = combined_df['published_date'].dt.time
combined_df['published_date'] = combined_df['published_date'].dt.date

#Drop the time column
combined_df.drop(columns='time', inplace=True)
combined_df.head()

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [9]:
combined_df.sample(10) #random sample of 10 rows

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
381,How to make US foreign aid work for Africa and...,,kenya foreign aid,2025-02-03,https://www.semafor.com/article/02/03/2025/how...,Agatha_reddit.csv
107,I might be wrong but..,Trump's decision to phase out illegal immigran...,kenya foreign aid,2025-01-24,https://www.reddit.com/r/Kenya/comments/1i95i0...,Agatha_reddit.csv
847,Why are we playing so safe?,I recently came across a group of guys online ...,,1970-01-01,https://www.reddit.com/r/Kenya/comments/1jx7it...,leo_reddit_posts.csv
744,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,,1970-01-01,https://www.reddit.com/r/Kenya/comments/1jtcvb...,leo_reddit_posts.csv
322,Are there Kenyans who are into FIRE (Financial...,"FIRE stands for Financial Independence, Retire...",kenya donor funding,2025-05-19,https://www.reddit.com/r/Kenya/comments/1kqdr0...,Agatha_reddit.csv
401,Kenya proposes to South Africa.... 🇰🇪🇿🇦,"In December of 2019, two strangers were among ...",development aid kenya,2025-05-05,https://www.reddit.com/gallery/1kf99fq,Agatha_reddit.csv
1299,That world happiness survey is complete crap,"I usually do not do this, as this does not dir...",,2025-04-20,https://nypost.com/2017/03/22/that-world-happi...,ruth_reddit.csv
643,SCOTUS pauses order requiring Trump admin to p...,,"foreign aid, foreign aid",2025-02-27,https://www.wusa9.com/article/news/politics/fe...,cecilia.redditsubs.csv
848,Surviving in the city under the sun,"Hi cousins, I’m a Law student from trying to m...",,1970-01-01,https://www.reddit.com/r/Kenya/comments/1l6fbf...,leo_reddit_posts.csv
539,Ukraine reels from ‘worst-case scenario’ suspe...,,"foreign aid, foreign aid",2025-01-29,https://www.theguardian.com/world/2025/jan/29/...,cecilia.redditsubs.csv


### Save the final merged dataframe to csv

In [10]:
# Save to CSV 
if not combined_df.empty:
    output_filename = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv"
    combined_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f" Results successfully saved to:\n{output_filename}")
else:
    print(" No data to save. The DataFrame is empty.")


 Results successfully saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv


# Merging News CSV Files

### Import necessary libraries

In [11]:
import pandas as pd
import glob
import os


### Folder path

In [12]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'


### CSV file paths

In [13]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'
omit_file = os.path.join(folder_path, 'Agatha_news_fulltext.csv')

news_csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
news_csv_files = [f for f in news_csv_files if os.path.normpath(f) != os.path.normpath(omit_file)]

news_csv_files


['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Agatha_news.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\cecilia.newsapi.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\leo_newsapi_articles_enriched.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_fulltext.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_recent.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\newsapi_usaid_articles.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\ruth_news.csv']

### Reading the files and combining them into one dataframe

In [14]:
import pandas as pd
import os

news_columns = ['title', 'description', 'text', 'url', 'keyword', 'published_date', 'source_file']
all_news_dfs = []

for file in news_csv_files:  
    df = pd.read_csv(file)

    # Rename content to text 
    if 'content' in df.columns and 'text' not in df.columns:
        df.rename(columns={'content': 'text'}, inplace=True)

    # Rename publishedAt to published_date
    if 'publishedAt' in df.columns:
        df.rename(columns={'publishedAt': 'published_date'}, inplace=True)

    # Add missing columns
    for col in news_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')



    # source file info
    df['source_file'] = os.path.basename(file)

    # standardized columns
    df = df[news_columns]
    all_news_dfs.append(df)

# Combine all
news_combined_df = pd.concat(all_news_dfs, ignore_index=True)

# Save to CSV
output_path = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv"
if not news_combined_df.empty:
    news_combined_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"NewsAPI data saved to:\n{output_path}")
else:
    print("No NewsAPI data to save. DataFrame is empty.")


NewsAPI data saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv


In [15]:
news_combined_df.shape

(2638, 7)

In [16]:
news_combined_df.sample(10)

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
308,Global Times: Readers’ Reflections: China is r...,Global Times highlights China’s Global South t...,"Beijing, China, June 04, 2025 (GLOBE NEWSWIRE)...",https://www.globenewswire.com/news-release/202...,development aid kenya,2025-06-04 13:41:00+00:00,Agatha_news.csv
1244,It's Elon Musk's last official day in DC. Here...,Elon Musk is leaving DC. People across the wor...,President Donald Trump is set to give Elon Mus...,https://www.businessinsider.com/elon-musk-leav...,budget cuts,2025-05-30 15:47:01+00:00,cecilia.newsapi.csv
937,"Debunking Vance, Rubio claim that only 12 cent...",U.S. President Donald Trump's allies often cit...,Claim:\r\nPeople in need have received only 12...,https://www.snopes.com//fact-check/vance-rubio...,aid budget,2025-06-11 21:12:00+00:00,cecilia.newsapi.csv
1323,Taiwan eyeing new INGO center to promote track...,Taiwan's foreign ministry is looking for a loc...,ROC Central News Agency\r\n06/03/2025 03:25 PM...,https://www.globalsecurity.org/wmd/library/new...,NGO funding,2025-06-04 07:54:37+00:00,cecilia.newsapi.csv
1619,UK and allies will sanction far-right Israeli ...,The countries will freeze assets and impose tr...,The United Kingdom and some of its allies will...,https://www.aljazeera.com/news/2025/6/10/uk-an...,foreign aid,2025-06-10 14:56:42+00:00,cecilia.newsapi.csv
283,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,development aid kenya,2025-06-06 11:21:51+00:00,Agatha_news.csv
2586,War at the Top,The falling out between Donald Trump and Elon ...,Photograph Source: The White House – Public Do...,https://www.counterpunch.org/2025/06/16/war-at...,,2025-06-16 05:56:36+00:00,newsapi_usaid_articles.csv
1600,‘Piracy’: World reacts to Israel’s seizure of ...,Governments and NGOs condemn Israel's intercep...,"Israel has intercepted a Gaza-bound aid ship, ...",https://www.aljazeera.com/news/2025/6/9/piracy...,foreign aid,2025-06-09 12:00:22+00:00,cecilia.newsapi.csv
1034,Between Washington and Beijing,"Amid Trump’s tariffs, Africa faces trade disru...","Amid Trump’s tariffs, Africa faces trade disru...",https://africasacountry.com/2025/06/between-wa...,Trump Kenya,2025-06-04 12:14:54+00:00,cecilia.newsapi.csv
286,"AFRICOM asks for help deterring terrorism, aft...",Groups like al-Shabaab see USAID withdrawal as...,Deterring the spread of terrorism in Africa an...,https://www.defenseone.com/threats/2025/05/afr...,development aid kenya,2025-05-29 21:15:17+00:00,Agatha_news.csv


In [17]:
news_combined_df.isna().sum()

title               0
description        16
text               25
url                 2
keyword           259
published_date     99
source_file         0
dtype: int64

### FINAL REDDIT DATA TO BE USED ~ mbego_all_reddit_merged

In [18]:
import pandas as pd
import seaborn as sns

In [19]:
reddit_df = pd.read_csv('C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/mbego_all_reddit_merged.csv')
reddit_df.head()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,Kenya,muerki,4/15/2025 13:16,,3.0,5.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jzrn2...,
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Kenya,Morio_anzenza,4/7/2025 4:21,,169.0,95.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jtcvb...,
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,Kenya,vindtar,4/5/2025 19:09,,2.0,2.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jsb14...,
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",Kenya,Gold_Smart,3/25/2025 8:18,,13.0,20.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jjehw...,
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,Kenya,westmaxia,3/8/2025 8:08,,1.0,6.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1j6cjz...,


In [20]:
reddit_df.tail()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
1284,Weekly Sub-Saharan Africa Security Situation a...,#Somalia 🇸🇴\r\n#Sudan 🇸🇩\r\nDemocratic Republi...,Africa,,4/18/2025 14:09,,3.0,2.0,,,,,,https://open.substack.com/pub/hasretkargin/p/w...,
1285,No evidence that Burkina Faso paid off all its...,,Africa,,4/18/2025 8:23,,52.0,25.0,,,,,,https://www.reuters.com/fact-check/burkina-fas...,
1286,Ghana orders foreigners to exit gold market by...,Ghana has ordered foreigners to exit its gold ...,Africa,,4/17/2025 17:59,,101.0,12.0,,,,,,https://eastleighvoice.co.ke/west%20african/13...,
1287,Unending Frustration Regarding Sudan War.,https://www.reuters.com/world/britain-boosts-a...,Africa,,4/16/2025 19:33,,11.0,8.0,,,,,,https://www.reddit.com/gallery/1k0t8ed,
1288,Tanzania's Authoritarian Government Has Just B...,Tanzania's main opposition party has been barr...,Africa,,4/14/2025 11:31,,52.0,14.0,,,,,,https://www.reddit.com/r/Africa/comments/1jywl...,


In [21]:
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1289 non-null   object 
 1   selftext      901 non-null    object 
 2   subreddit     1289 non-null   object 
 3   author        466 non-null    object 
 4   created_utc   1289 non-null   object 
 5   created_date  150 non-null    object 
 6   score         1013 non-null   float64
 7   num_comments  833 non-null    float64
 8   keyword       742 non-null    object 
 9   search_term   150 non-null    object 
 10  date_posted   0 non-null      float64
 11  upvotes       276 non-null    float64
 12  comments      276 non-null    float64
 13  url           1289 non-null   object 
 14  permalink     426 non-null    object 
dtypes: float64(5), object(10)
memory usage: 151.2+ KB


In [22]:
reddit_df.shape

(1289, 15)

In [23]:
reddit_df.columns

Index(['title', 'selftext', 'subreddit', 'author', 'created_utc',
       'created_date', 'score', 'num_comments', 'keyword', 'search_term',
       'date_posted', 'upvotes', 'comments', 'url', 'permalink'],
      dtype='object')

In [24]:
reddit_df.dtypes

title            object
selftext         object
subreddit        object
author           object
created_utc      object
created_date     object
score           float64
num_comments    float64
keyword          object
search_term      object
date_posted     float64
upvotes         float64
comments        float64
url              object
permalink        object
dtype: object

In [25]:
reddit_df.isna().sum()

title              0
selftext         388
subreddit          0
author           823
created_utc        0
created_date    1139
score            276
num_comments     456
keyword          547
search_term     1139
date_posted     1289
upvotes         1013
comments        1013
url                0
permalink        863
dtype: int64

In [26]:
reddit_df.describe()

Unnamed: 0,score,num_comments,date_posted,upvotes,comments
count,1013.0,833.0,0.0,276.0,276.0
mean,344.626851,31.34934,,337.518116,35.905797
std,3106.309319,68.060435,,1623.914761,73.531775
min,0.0,0.0,,0.0,0.0
25%,3.0,4.0,,6.75,5.0
50%,10.0,9.0,,43.5,13.0
75%,54.0,31.0,,111.25,31.25
max,79088.0,706.0,,22208.0,558.0


## Data Cleaning

### Dropping Unneccessary Columns

In [27]:
reddit_df.drop(columns=['subreddit', 'author','created_date','score','num_comments','search_term','date_posted','upvotes','comments','permalink'], inplace=True)
reddit_df.head()

Unnamed: 0,title,selftext,created_utc,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [28]:
reddit_df.columns

Index(['title', 'selftext', 'created_utc', 'keyword', 'url'], dtype='object')

### Checking for missing values

In [29]:
reddit_df.isna().sum()

title            0
selftext       388
created_utc      0
keyword        547
url              0
dtype: int64

In [30]:
reddit_df[['selftext', 'title','url']].sample(10)

Unnamed: 0,selftext,title,url
195,It has come to light that USAID had been fundi...,USAID wasn't worth it.,https://i.redd.it/cxv28znrhohe1.jpeg
399,What's your take on the new policy by Donald T...,Interest post from X,https://i.redd.it/9y5f6d1s4dfe1.png
577,How are things going at development banks - es...,Are development banks also affected by the US ...,https://www.reddit.com/r/InternationalDev/comm...
322,"FIRE stands for Financial Independence, Retire...",Are there Kenyans who are into FIRE (Financial...,https://www.reddit.com/r/Kenya/comments/1kqdr0...
726,"Guys with the recent halting of funds for HIV,...",USAID HIV FUND CUTS,https://www.reddit.com/r/Kenya/comments/1j0am0...
170,(**Update:** He broke up with me so you guys d...,Would you take her back?,https://www.reddit.com/r/Kenya/comments/1ji01m...
1142,Tenderpreneurship is awful. Governments buildi...,Kenyan politicians build stuff that don’t work,https://www.reddit.com/r/Kenya/comments/1lb7mu...
1047,,Hong Kong's biggest pro-democracy party moves ...,https://apnews.com/article/hong-kong-china-dem...
137,My neighbour’s wife was a very big shot in USA...,USAID Repercussions + Economy,https://www.reddit.com/r/Kenya/comments/1kmhn8...
207,Wagwan wadau!\r\n\r\nSo I just realized that s...,How do I cut off friends?,https://www.reddit.com/r/Kenya/comments/1i8yu6...


In [31]:
reddit_df.rename(columns={'title':'post_title','selftext':'text','created_utc':'published_date'},inplace=True)
reddit_df.head()

Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [35]:
# # Convert to datetime if not already
# reddit_df['published_date'] = pd.to_datetime(reddit_df['published_date'])

# # Extract time and drop it
# reddit_df['time'] = reddit_df['published_date'].dt.time
# reddit_df['published_date'] = reddit_df['published_date'].dt.date
# reddit_df.drop(columns='time', inplace=True)


**parse** - analyze and convert a piece of data (like a string) into a different, more useful format.

The below code cleans the published_date column in the data. Some dates are written as normal dates (eg "2025-04-15") and others are written as numbers (UNIX timestamps). The code checks each value and tries to convert it into a proper date. If it can’t, it leaves it empty. After that, it removes the time part and keeps just the date. This helps make sure all the dates are in the same clean format.

In [38]:
import pandas as pd

# Function to handle both datetime strings and UNIX timestamps
def parse_mixed_dates(val):
    try:
        # Parse as a regular datetime string
        return pd.to_datetime(val)
    except:
        try:
            # Parse as a UNIX timestamp
            return pd.to_datetime(float(val), unit='s')
        except:
            return pd.NaT  # Return NaT if all parsing fails

# Apply the parsing function
reddit_df['published_date'] = reddit_df['published_date'].apply(parse_mixed_dates)

# Check how many dates failed to parse
print("Unparsed dates:", reddit_df['published_date'].isna().sum())

# Extract date only (remove time)
reddit_df['published_date'] = reddit_df['published_date'].dt.date
reddit_df.head()

Unparsed dates: 167


Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [39]:
reddit_df['published_date'].isna().sum() #check the missing values in date

167

In [41]:
reddit_df.dropna(subset=['published_date'], inplace=True) #drop missing values in date

In [42]:
reddit_df.shape #check the shape of the dataset

(1122, 5)

In [43]:
reddit_df.isna().sum() #check for missing values

post_title          0
text              380
published_date      0
keyword           380
url                 0
dtype: int64

### Fill missing values in text with an empty string

I filled missing values in text with an empty string because these are likely link posts where the author did not add body text.

In [44]:
reddit_df['text'] = reddit_df['text'].fillna('')
reddit_df.isna().sum()

post_title          0
text                0
published_date      0
keyword           380
url                 0
dtype: int64

### Fill missing values in keyword with a placeholder 'Unknown'

In [47]:
reddit_df['keyword'] = reddit_df['keyword'].fillna('unknown')
reddit_df['keyword'].sample(10)

263       development aid kenya
251       development aid kenya
168     usaid kenya funding cut
38            kenya foreign aid
974                     unknown
202     usaid kenya funding cut
1093                    unknown
1231                    unknown
411       development aid kenya
1134                    unknown
Name: keyword, dtype: object

In [48]:
reddit_df.isna().sum()

post_title        0
text              0
published_date    0
keyword           0
url               0
dtype: int64

Missing values filled with an empty string and placeholder to maintain the integrity of the data and maintain a good amount of rows

### Check for duplicates 

In [53]:
reddit_df.duplicated().sum()

75

In [57]:
reddit_df.drop_duplicates(inplace=True) #drop all duplicates

In [58]:
reddit_df.duplicated().sum() #confirm that all duplicates have been dropped

0

In [59]:
reddit_df.shape #shape of the new data after cleaning

(1047, 5)