# USAID-Kenya-Sentiment-Analysis

#    Merging Reddit CSV Files

### Import neccessary libraries

In [1]:
import pandas as pd
import glob
import os


### Folder path

In [2]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data'


### CSV file paths

In [3]:
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
csv_files

['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Agatha_reddit.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.redditsubs.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.reddit_nbo_ke_africa.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\leo_reddit_posts.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya2.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_sentiment.csv',
 'C:/Users/hp/Deskto

### Reading the files and combining them into one dataframe

In [4]:

preferred_columns = ['post_title', 'text', 'keyword', 'published_date', 'url']
all_dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    
    # Rename common column variants
    if 'title' in df.columns:
        df.rename(columns={'title': 'post_title'}, inplace=True)
    if 'selftext' in df.columns and 'text' not in df.columns:
        df.rename(columns={'selftext': 'text'}, inplace=True)
    if 'date_posted' in df.columns:
        df.rename(columns={'date_posted': 'published_date'}, inplace=True)
    elif 'created_utc' in df.columns:
        df.rename(columns={'created_utc': 'published_date'}, inplace=True)
    
    # Add missing columns
    for col in preferred_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

    # Track file source
    df['source_file'] = os.path.basename(file)
    
    # Keep only standardized columns
    df = df[preferred_columns + ['source_file']]

    all_dfs.append(df)

# Combine all
combined_df = pd.concat(all_dfs, ignore_index=True)
combined_df.head()


Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05 19:09:10,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25 08:18:04,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08 08:08:58,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [5]:
combined_df.shape #check the shape of the dataset

(1306, 6)

In [6]:
combined_df.isna().sum() #check for missing values

post_title          0
text              398
keyword           564
published_date    197
url                 0
source_file         0
dtype: int64

In [7]:
combined_df.columns

Index(['post_title', 'text', 'keyword', 'published_date', 'url',
       'source_file'],
      dtype='object')

In [8]:
# Extract date and time
combined_df['time'] = combined_df['published_date'].dt.time
combined_df['published_date'] = combined_df['published_date'].dt.date

#Drop the time column
combined_df.drop(columns='time', inplace=True)
combined_df.head()

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [9]:
combined_df.sample(10) #random sample of 10 rows

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
617,US to finish $671 million in foreign aid payme...,,"foreign aid, foreign aid",2025-03-21,https://www.reuters.com/world/us/us-finish-671...,cecilia.redditsubs.csv
1015,"Senator Bob Corker says, USAID Contracting in ...",,,NaT,http://sputniknews.com/us/20150117/1017005125....,Mbego_reddit_usaid_kenya2.csv
1011,USAID Says Building Of 'Cuban Twitter' Was Par...,,,NaT,http://www.npr.org/blogs/thetwo-way/2014/04/07...,Mbego_reddit_usaid_kenya2.csv
596,House passes rescission package to claw back $...,,"foreign aid, foreign aid",2025-06-12,https://www.cbsnews.com/news/house-vote-rescis...,cecilia.redditsubs.csv
237,Racism - how often do you as a Kenyan feel rac...,I have lived half of my life in Kenya. I've ju...,development aid kenya,2025-05-19,https://www.reddit.com/r/Kenya/comments/1kq6n2...,Agatha_reddit.csv
259,Do you think this applies to Kenya? Why or why...,Kenya is the most developed nation in East Afr...,development aid kenya,2025-04-03,https://i.redd.it/05b87nicklse1.jpeg,Agatha_reddit.csv
1299,That world happiness survey is complete crap,"I usually do not do this, as this does not dir...",,2025-04-20,https://nypost.com/2017/03/22/that-world-happi...,ruth_reddit.csv
863,Africa’s main problem is being a sellout,"Guys hear me out, in all the major historical ...",,1970-01-01,https://www.reddit.com/r/Kenya/comments/1k5zel...,leo_reddit_posts.csv
1225,Men’s Mental Health Matters,Male suicide is still one of the most ignored ...,,2025-06-09,https://www.reddit.com/r/Kenya/comments/1l70xu...,ruth_reddit.csv
675,How ending foreign aid could damage U.S. natio...,,"foreign aid, foreign aid",2025-02-06,https://www.wbur.org/onpoint/2025/02/05/ending...,cecilia.redditsubs.csv


### Save the final merged dataframe to csv

In [10]:
# Save to CSV 
if not combined_df.empty:
    output_filename = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv"
    combined_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f" Results successfully saved to:\n{output_filename}")
else:
    print(" No data to save. The DataFrame is empty.")


 Results successfully saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv


# Merging News CSV Files

### Import necessary libraries

In [11]:
import pandas as pd
import glob
import os


### Folder path

In [12]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'


### CSV file paths

In [13]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'
omit_file = os.path.join(folder_path, 'Agatha_news_fulltext.csv')

news_csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
news_csv_files = [f for f in news_csv_files if os.path.normpath(f) != os.path.normpath(omit_file)]

news_csv_files


['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Agatha_news.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\cecilia.newsapi.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\leo_newsapi_articles_enriched.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_fulltext.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_recent.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\newsapi_usaid_articles.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\ruth_news.csv']

### Reading the files and combining them into one dataframe

In [14]:
import pandas as pd
import os

news_columns = ['title', 'description', 'text', 'url', 'keyword', 'published_date', 'source_file']
all_news_dfs = []

for file in news_csv_files:  
    df = pd.read_csv(file)

    # Rename content to text 
    if 'content' in df.columns and 'text' not in df.columns:
        df.rename(columns={'content': 'text'}, inplace=True)

    # Rename publishedAt to published_date
    if 'publishedAt' in df.columns:
        df.rename(columns={'publishedAt': 'published_date'}, inplace=True)

    # Add missing columns
    for col in news_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')



    # source file info
    df['source_file'] = os.path.basename(file)

    # standardized columns
    df = df[news_columns]
    all_news_dfs.append(df)

# Combine all
news_combined_df = pd.concat(all_news_dfs, ignore_index=True)

# Save to CSV
output_path = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv"
if not news_combined_df.empty:
    news_combined_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"NewsAPI data saved to:\n{output_path}")
else:
    print("No NewsAPI data to save. DataFrame is empty.")


NewsAPI data saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv


In [15]:
news_combined_df.shape

(2638, 7)

In [16]:
news_combined_df.sample(10)

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
1244,It's Elon Musk's last official day in DC. Here...,Elon Musk is leaving DC. People across the wor...,President Donald Trump is set to give Elon Mus...,https://www.businessinsider.com/elon-musk-leav...,budget cuts,2025-05-30 15:47:01+00:00,cecilia.newsapi.csv
1588,The military's role in LA as protests persist....,A curfew has been enforced in downtown LA as a...,Good morning. You're reading the Up First news...,https://www.npr.org/2025/06/11/g-s1-72078/up-f...,foreign aid,2025-06-11 11:03:43+00:00,cecilia.newsapi.csv
1031,Cloudy skies can’t dim joy as thousands fill W...,"Gray skies and drizzle gave way to sunshine, m...","Gray skies and drizzle gave way to sunshine, m...",https://japantoday.com/category/world/cloudy-s...,Trump Kenya,2025-06-08 02:00:56+00:00,cecilia.newsapi.csv
1330,Brussels bets on securitisation reboot to boos...,Critics warn the move is unlikely to help plug...,The European Commission proposed sweeping chan...,https://www.euractiv.com/section/economy-jobs/...,NGO funding,2025-06-17 15:20:01+00:00,cecilia.newsapi.csv
1982,EXCLUSIVE: State Department Rebuts Democrat Cl...,FIRST ON THE DAILY SIGNAL—Congressional Democr...,FIRST ON THE DAILY SIGNALCongressional Democra...,https://www.dailysignal.com/2025/06/11/exclusi...,State Department cuts,2025-06-11 14:00:00+00:00,cecilia.newsapi.csv
148,Trump-Musk feud escalates: What happened? And ...,Elon Musk suggests Donald Trump should be impe...,"Washington, DC The ties between United States ...",https://www.aljazeera.com/news/2025/6/6/trump-...,usaid budget cut,2025-06-06 00:57:34+00:00,Agatha_news.csv
1307,Beijing-linked entities fund anti-ICE nonprofi...,"The Chinese-American Planning Council (CPC), a...",<ul><li>The Chinese-American Planning Council ...,https://www.naturalnews.com/2025-05-25-china-l...,NGO funding,2025-05-25 06:00:00+00:00,cecilia.newsapi.csv
926,"Trump’s big, beautiful bill, explained in 5 ch...",The fight over President Donald Trump’s so-cal...,"President Donald Trump, joined by Speaker of t...",https://www.vox.com/trump-administration/41582...,aid budget,2025-06-06 22:51:42+00:00,cecilia.newsapi.csv
1319,Cash for sharks: the unintended consequences o...,Listen to conservation scientist Hollie Booth ...,"As Jaws marks its 50th anniversary this year, ...",https://theconversation.com/cash-for-sharks-th...,NGO funding,2025-06-12 09:23:44+00:00,cecilia.newsapi.csv
1998,The billionaire breakup,Plus: Pink slips and red flags.,"Good morning, Quartz readers!\r\nHeres what yo...",https://qz.com/emails/daily-brief/1851783777/t...,State Department cuts,2025-06-05 22:00:00+00:00,cecilia.newsapi.csv


In [17]:
news_combined_df.isna().sum()

title               0
description        16
text               25
url                 2
keyword           259
published_date     99
source_file         0
dtype: int64

### FINAL REDDIT DATA TO BE USED ~ mbego_all_reddit_merged

In [18]:
import pandas as pd
import seaborn as sns

In [19]:
reddit_df = pd.read_csv('C:/Users/hp/Desktop/USAID backuppp for me/mbego_all_reddit_merged.csv')
reddit_df.head()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,Kenya,muerki,4/15/2025 13:16,,3.0,5.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jzrn2...,
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Kenya,Morio_anzenza,4/7/2025 4:21,,169.0,95.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jtcvb...,
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,Kenya,vindtar,4/5/2025 19:09,,2.0,2.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jsb14...,
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",Kenya,Gold_Smart,3/25/2025 8:18,,13.0,20.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jjehw...,
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,Kenya,westmaxia,3/8/2025 8:08,,1.0,6.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1j6cjz...,


In [20]:
reddit_df.tail()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
1284,Weekly Sub-Saharan Africa Security Situation a...,#Somalia 🇸🇴\r\n#Sudan 🇸🇩\r\nDemocratic Republi...,Africa,,4/18/2025 14:09,,3.0,2.0,,,,,,https://open.substack.com/pub/hasretkargin/p/w...,
1285,No evidence that Burkina Faso paid off all its...,,Africa,,4/18/2025 8:23,,52.0,25.0,,,,,,https://www.reuters.com/fact-check/burkina-fas...,
1286,Ghana orders foreigners to exit gold market by...,Ghana has ordered foreigners to exit its gold ...,Africa,,4/17/2025 17:59,,101.0,12.0,,,,,,https://eastleighvoice.co.ke/west%20african/13...,
1287,Unending Frustration Regarding Sudan War.,https://www.reuters.com/world/britain-boosts-a...,Africa,,4/16/2025 19:33,,11.0,8.0,,,,,,https://www.reddit.com/gallery/1k0t8ed,
1288,Tanzania's Authoritarian Government Has Just B...,Tanzania's main opposition party has been barr...,Africa,,4/14/2025 11:31,,52.0,14.0,,,,,,https://www.reddit.com/r/Africa/comments/1jywl...,


In [21]:
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1289 non-null   object 
 1   selftext      901 non-null    object 
 2   subreddit     1289 non-null   object 
 3   author        466 non-null    object 
 4   created_utc   1289 non-null   object 
 5   created_date  150 non-null    object 
 6   score         1013 non-null   float64
 7   num_comments  833 non-null    float64
 8   keyword       742 non-null    object 
 9   search_term   150 non-null    object 
 10  date_posted   0 non-null      float64
 11  upvotes       276 non-null    float64
 12  comments      276 non-null    float64
 13  url           1289 non-null   object 
 14  permalink     426 non-null    object 
dtypes: float64(5), object(10)
memory usage: 151.2+ KB


In [22]:
reddit_df.shape

(1289, 15)

In [23]:
reddit_df.columns

Index(['title', 'selftext', 'subreddit', 'author', 'created_utc',
       'created_date', 'score', 'num_comments', 'keyword', 'search_term',
       'date_posted', 'upvotes', 'comments', 'url', 'permalink'],
      dtype='object')

In [24]:
reddit_df.dtypes

title            object
selftext         object
subreddit        object
author           object
created_utc      object
created_date     object
score           float64
num_comments    float64
keyword          object
search_term      object
date_posted     float64
upvotes         float64
comments        float64
url              object
permalink        object
dtype: object

In [25]:
reddit_df.isna().sum()

title              0
selftext         388
subreddit          0
author           823
created_utc        0
created_date    1139
score            276
num_comments     456
keyword          547
search_term     1139
date_posted     1289
upvotes         1013
comments        1013
url                0
permalink        863
dtype: int64

In [26]:
reddit_df.describe()

Unnamed: 0,score,num_comments,date_posted,upvotes,comments
count,1013.0,833.0,0.0,276.0,276.0
mean,344.626851,31.34934,,337.518116,35.905797
std,3106.309319,68.060435,,1623.914761,73.531775
min,0.0,0.0,,0.0,0.0
25%,3.0,4.0,,6.75,5.0
50%,10.0,9.0,,43.5,13.0
75%,54.0,31.0,,111.25,31.25
max,79088.0,706.0,,22208.0,558.0


## Data Cleaning

### Dropping Unneccessary Columns

In [27]:
reddit_df.drop(columns=['subreddit', 'author','created_date','score','num_comments','search_term','date_posted','upvotes','comments','permalink'], inplace=True)
reddit_df.head()

Unnamed: 0,title,selftext,created_utc,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [28]:
reddit_df.columns

Index(['title', 'selftext', 'created_utc', 'keyword', 'url'], dtype='object')

### Checking for missing values

In [29]:
reddit_df.isna().sum()

title            0
selftext       388
created_utc      0
keyword        547
url              0
dtype: int64

In [30]:
reddit_df[['selftext', 'title','url']].sample(10)

Unnamed: 0,selftext,title,url
330,TL;DR: Kenya's development challenges are deep...,Vietnam gas CEO,https://www.reddit.com/r/Kenya/comments/1k7ego...
706,Israel has received $310B dollars since its fo...,Why does the US Taxpayer give Israel so much m...,https://www.reddit.com/r/AskConservatives/comm...
673,,Don’t Gut USAID: Trump Should Refashion the Fo...,https://www.foreignaffairs.com/united-states/a...
618,,Trump aides circulate plan for complete revamp...,https://www.politico.com/news/2025/03/19/trump...
1085,,US Congressman Scott Perry Accuses USAID of Fu...,https://streetsofkante.com/us-congressman-scot...
814,"Hey r/kenya,\r\n\r\nIt’s with a heavy heart th...",Closing My Electronics Repair Shop in Nairobi...,https://www.reddit.com/r/Kenya/comments/1k4lkw...
152,Someone on a different group (different websit...,"USAID left a month ago, do we have ARVs in Kenya?",https://www.reddit.com/r/Kenya/comments/1jzrn2...
995,,In a meeting held at the US Consulate in Erbil...,https://www.kurdistan24.net/en/news/d432ab5b-6...
919,Are you still in contact with the organisation...,EX-USAID people!! Let's talk,https://www.reddit.com/r/Kenya/comments/1jsb14...
826,I'm sure most of us are fully aware of the fru...,Kenyan Startup environment and what can be done,https://www.reddit.com/r/Kenya/comments/1jtthj...


In [31]:
reddit_df.rename(columns={'title':'post_title','selftext':'text','created_utc':'published_date'},inplace=True)
reddit_df.head()

Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


**parse** - analyze and convert a piece of data (like a string) into a different, more useful format.

The below code cleans the published_date column in the data. Some dates are written as normal dates (eg "2025-04-15") and others are written as numbers (UNIX timestamps). The code checks each value and tries to convert it into a proper date. If it can’t, it leaves it empty. After that, it removes the time part and keeps just the date. This helps make sure all the dates are in the same clean format.

In [32]:
import pandas as pd

# Function to handle both datetime strings and UNIX timestamps
def parse_mixed_dates(val):
    try:
        # Parse as a regular datetime string
        return pd.to_datetime(val)
    except:
        try:
            # Parse as a UNIX timestamp
            return pd.to_datetime(float(val), unit='s')
        except:
            return pd.NaT  # Return NaT if all parsing fails

# Apply the parsing function
reddit_df['published_date'] = reddit_df['published_date'].apply(parse_mixed_dates)

# Check how many dates failed to parse
print("Unparsed dates:", reddit_df['published_date'].isna().sum())

# Extract date only (remove time)
reddit_df['published_date'] = reddit_df['published_date'].dt.date
reddit_df.head()

Unparsed dates: 0


Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [33]:
reddit_df['published_date'].isna().sum() #check the missing values in date

0

In [34]:
reddit_df.dropna(subset=['published_date'], inplace=True) #drop missing values in date

In [35]:
reddit_df.shape #check the shape of the dataset

(1289, 5)

In [36]:
reddit_df.isna().sum() #check for missing values

post_title          0
text              388
published_date      0
keyword           547
url                 0
dtype: int64

### Fill missing values in text with an empty string

I filled missing values in text with an empty string because these are likely link posts where the author did not add body text.

In [37]:
reddit_df['text'] = reddit_df['text'].fillna('')
reddit_df.isna().sum()

post_title          0
text                0
published_date      0
keyword           547
url                 0
dtype: int64

### Fill missing values in keyword with a placeholder 'Unknown'

In [38]:
reddit_df['keyword'] = reddit_df['keyword'].fillna('unknown')
reddit_df['keyword'].sample(10)

858                      unknown
368             usaid budget cut
1203                     unknown
1071                     unknown
750                      unknown
507                        USAID
578     foreign aid, foreign aid
537     foreign aid, foreign aid
1189                     unknown
1241                     unknown
Name: keyword, dtype: object

In [39]:
reddit_df.isna().sum()

post_title        0
text              0
published_date    0
keyword           0
url               0
dtype: int64

Missing values filled with an empty string and placeholder to maintain the integrity of the data and maintain a good amount of rows

### Check for duplicates 

In [40]:
reddit_df.duplicated().sum()

145

In [41]:
reddit_df.drop_duplicates(inplace=True) #drop all duplicates

In [42]:
reddit_df.duplicated().sum() #confirm that all duplicates have been dropped

0

In [43]:
reddit_df.shape #shape of the new data after cleaning

(1144, 5)

In [44]:
reddit_df['post_title'].sample(10)

993     No alternative but to disband Hong Kong’s Demo...
1262               A liberal democracy isn’t the only way
1136    Looking for freelance software developers and ...
421     World Health Organization warns of possible tu...
674             What the data says about U.S. foreign aid
552     According to a White House insider via Politic...
911     Saw almost an entire department in a hospital ...
984     China warns Hong Kong's last major opposition ...
1113                               "Wapi hawa wanafunzi?"
581               Message from Ken Jackson to USAID staff
Name: post_title, dtype: object

## Sentiment Analysis Data Cleaning

Creating a new column 'full_text' that combines 'post_title','text' and the empty string '', to help in sentiment analysis

In [45]:
reddit_df['full_text'] = reddit_df['post_title'] + ' ' + reddit_df['text']
reddit_df.head() #sample top 5 rows after combining the columns

Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,"USAID left a month ago, do we have ARVs in Ken..."
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Classism in r/Kenya and r/nairobi The classism...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,EX-USAID people!! Let's talk Are you still in ...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,Why western powers back Israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Is kenya capable of funding its needs now that...


In [46]:
reddit_df['full_text'].isna().sum() #confirm that there are no missing values in the dataset

0

### Lowercasing 

Converting all text to lowercase

In [47]:
reddit_df['full_text'] = reddit_df['full_text'].str.lower()
reddit_df['full_text'].sample(10)

597    house approves trump’s request to cut funding ...
484    ignorance or malice? i have just watched the m...
315                                  the star today eh! 
956    canada calls usaid shutdown a ‘dangerous retre...
465    trump says he could withhold aid from jordan a...
430    geneva plans to pay ngo wages after us foreign...
578    the stop work orders are blatantly illegal. wh...
395    trump to cut off funding for south africa over...
227    kenyans in diaspora i have recently been diggi...
212    economy for the experts in matters economy and...
Name: full_text, dtype: object

### Translate all text to English

In [48]:
!pip install langdetect #detect which language the text are in


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#detect': Expected package name at the start of dependency specifier
    #detect
    ^


In [49]:
from langdetect import detect

# Language detection
reddit_df['language'] = reddit_df['full_text'].apply(lambda x: detect(x) if pd.notnull(x) else 'unknown')
reddit_df['language'].value_counts()

en    1128
tl       3
et       2
fr       2
sw       2
ca       2
sl       1
da       1
nl       1
es       1
id       1
Name: language, dtype: int64

In [50]:
# View full text rows where the language is not English
non_english_df = reddit_df[reddit_df['language'] != 'en']

# Display the full_text column of non-English rows
print(non_english_df[['language', 'full_text']])


     language                                          full_text
28         tl  ...na bado mnasema ruto must go bila tangible ...
175        tl  ...na bado mnasema ruto must go bila tangible ...
236        sl                                design job nairobi 
273        tl  ...na bado mnasema ruto must go bila tangible ...
301        et  just seeing ki, sa, che, ko, la, si...pure tri...
634        fr  supreme court rules trump administration must ...
638        fr  supreme court denies trump administration requ...
664        da  judge orders us to restore funds for foreign a...
853        sw  drunk and orderly it so happened that one time...
903        nl  us strike kills 16 afghan policemen in helmand...
943        et                            joblessness is looming 
1036       ca  russia expels usaid for 'political meddling' -...
1039       ca              us protest at usaid bolivia decision 
1061       es     chaos erupts as caravan reaches mexico border 
1100       id  filling kr

In [51]:
! pip install deep-translator #Translates text from one language to another



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#Translates': Expected package name at the start of dependency specifier
    #Translates
    ^


In [52]:
from deep_translator import GoogleTranslator

# Make a copy of non-English rows
non_english_df = reddit_df[reddit_df['language'] != 'en'].copy()

# Translate each full_text
non_english_df['translated_text'] = non_english_df['full_text'].apply(
    lambda text: GoogleTranslator(source='auto', target='en').translate(text)
)

# Preview translations
print(non_english_df[['language', 'full_text', 'translated_text']].head())


    language                                          full_text  \
28        tl  ...na bado mnasema ruto must go bila tangible ...   
175       tl  ...na bado mnasema ruto must go bila tangible ...   
236       sl                                design job nairobi    
273       tl  ...na bado mnasema ruto must go bila tangible ...   
301       et  just seeing ki, sa, che, ko, la, si...pure tri...   

                                       translated_text  
28   ... and you still say Ruto must go without tan...  
175  ... and you still say Ruto must go without tan...  
236                                 design job nairobi  
273  ... and you still say Ruto must go without tan...  
301  just seeing ki, sa, che, ko, la, si ... pure t...  


From the above preview, shows that the text with both english and swahili was able to be translated as shown in index 28

In [53]:
reddit_df['language'].value_counts() #check which other languages have been detected

en    1128
tl       3
et       2
fr       2
sw       2
ca       2
sl       1
da       1
nl       1
es       1
id       1
Name: language, dtype: int64

In [54]:
# Drop all non-English rows
reddit_df = reddit_df[reddit_df['language'] == 'en'].copy()

# Reset the index
reddit_df.reset_index(drop=True, inplace=True)

# Preview the cleaned data
print(reddit_df[['language', 'full_text']].head())


  language                                          full_text
0       en  usaid left a month ago, do we have arvs in ken...
1       en  classism in r/kenya and r/nairobi the classism...
2       en  ex-usaid people!! let's talk are you still in ...
3       en  why western powers back israel no matter what ...
4       en  is kenya capable of funding its needs now that...


In [55]:
reddit_df['language'].value_counts() #confirm that all rows are in English only

en    1128
Name: language, dtype: int64

In [56]:
reddit_df.sample(10)

Unnamed: 0,post_title,text,published_date,keyword,url,full_text,language
1062,Weekly Sub-Saharan Africa Security Situation a...,,2025-06-06,unknown,https://open.substack.com/pub/hasretkargin/p/w...,weekly sub-saharan africa security situation a...,en
529,Trump and Musk refusing to pay USAID's bills t...,,2025-03-02,"USAID, foreign aid, foreign aid",https://www.msnbc.com/opinion/msnbc-opinion/do...,trump and musk refusing to pay usaid's bills t...,en
870,Cambodian PM calls US democracy 'bloody and br...,,2017-08-25,unknown,https://www.rt.com/news/400811-cambodia-us-ngo...,cambodian pm calls us democracy 'bloody and br...,en
1054,At least 100 people killed in gunmen attack in...,* At least 100 people have been killed in a no...,2025-06-15,unknown,https://www.aljazeera.com/news/2025/6/14/at-le...,at least 100 people killed in gunmen attack in...,en
268,🤔 What If Kenya Allowed British Rule Beyond 19...,Hear me out: The white man stayed past 1963. I...,2025-03-19,development aid kenya,https://www.reddit.com/r/Kenya/comments/1jeqhx...,🤔 what if kenya allowed british rule beyond 19...,en
256,Do you think this applies to Kenya? Why or why...,Kenya is the most developed nation in East Afr...,2025-04-03,development aid kenya,https://i.redd.it/05b87nicklse1.jpeg,do you think this applies to kenya? why or why...,en
484,Meanwhile... Africans continue to celebrate Tr...,You can't make these stuff up!,2025-01-21,"foreign aid, foreign aid",https://www.reddit.com/r/Kenya/comments/1i6g3b...,meanwhile... africans continue to celebrate tr...,en
975,"I hope you see this, shit hurts and I'm missin...",My heart is fr breaking waah. Why is this one ...,2025-06-15,unknown,https://www.reddit.com/r/Kenya/comments/1lc1bh...,"i hope you see this, shit hurts and i'm missin...",en
429,Iran welcomes Trump’s move to cut foreign aid ...,,2025-02-05,kenya foreign aid,https://www.pbs.org/newshour/world/iran-welcom...,iran welcomes trump’s move to cut foreign aid ...,en
323,New to Trading with Old Mutual (Kenya) – Best ...,Hey fellow traders! I just opened an Old Mutua...,2025-04-29,kenya donor funding,https://www.reddit.com/r/Kenya/comments/1kar8p...,new to trading with old mutual (kenya) – best ...,en


In [57]:
reddit_df.drop('language', axis=1, inplace=True) #drop the language column as it is unnecessary now
reddit_df.head()

Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,"usaid left a month ago, do we have arvs in ken..."
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,classism in r/kenya and r/nairobi the classism...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,ex-usaid people!! let's talk are you still in ...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,why western powers back israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,is kenya capable of funding its needs now that...


In [59]:
reddit_df.shape

(1128, 6)

### Removing Punctuation Marks

In [58]:
import re
import string

# Function to remove punctuation
def remove_punctuation(text):
    return re.sub(f"[{re.escape(string.punctuation)}]", "", str(text))

# Apply to the 'full_text' column
reddit_df['full_text'] = reddit_df['full_text'].apply(remove_punctuation)

reddit_df.head()


Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,usaid left a month ago do we have arvs in keny...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,classism in rkenya and rnairobi the classism i...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,exusaid people lets talk are you still in cont...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,why western powers back israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,is kenya capable of funding its needs now that...


### Removing Stop Words,Emojis,Non-Emoji Symbols and Lemmatization

In [65]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to remove emojis, emoticons, and other symbols
def remove_emojis_symbols(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002700-\U000027BF"  # dingbats
        u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        u"\U0001FA70-\U0001FAFF"  # extended pictographic symbols
        u"\U00002500-\U00002BEF"  # misc symbols
        u"\U0000200D"             # zero width joiner
        u"\u2600-\u26FF"          # misc symbols
        u"\u2700-\u27BF"
        u"\uFE0F"                 # variation selector
        u"\u3030"
        u"\u00A9"                 # ©
        u"\u00AE"                 # ®
        u"\u2122"                 # ™
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Final cleaning function
def clean_text(text):
    text = remove_emojis_symbols(text)
    words = text.split()
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(cleaned_words)

# Apply to reddit_df
reddit_df['cleaned_text'] = reddit_df['full_text'].apply(clean_text)

# Preview sample
reddit_df.sample(10)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,post_title,text,published_date,keyword,url,full_text,cleaned_text
279,Is the Kenyan Digital Nomad Visa Official Yet?,"Friends, I've been seeing conflicting informat...",2025-02-24,development aid kenya,https://www.reddit.com/r/Kenya/comments/1iwvya...,is the kenyan digital nomad visa official yet ...,kenyan digital nomad visa official yet friend ...
418,"As Trump guts foreign aid, China and others le...",,2025-03-31,kenya foreign aid,https://www.nbcnews.com/news/world/myanmar-ear...,as trump guts foreign aid china and others lea...,trump gut foreign aid china others lead myanma...
631,US foreign aid funded legal advice for trans a...,,2025-02-10,"foreign aid, foreign aid",https://www.telegraph.co.uk/us/politics/2025/0...,us foreign aid funded legal advice for trans a...,u foreign aid funded legal advice trans asylum...
648,Guards at Islamic State camp left unpaid after...,,2025-01-29,"foreign aid, foreign aid",https://www.telegraph.co.uk/world-news/2025/01...,guards at islamic state camp left unpaid after...,guard islamic state camp left unpaid trump for...
999,From debugging stress to catching vibes at the...,"So today I was deep in code, chasing a stubbor...",2025-06-12,unknown,https://www.reddit.com/r/Kenya/comments/1l9s1m...,from debugging stress to catching vibes at the...,debugging stress catching vibe swamp life plan...
721,Not all skin folks are kin folks: Black Americ...,\r\nThis is an example of one of the many blac...,2025-04-30,unknown,https://i.redd.it/hf94fjtmdzxe1.png,not all skin folks are kin folks black america...,skin folk kin folk black american state operat...
813,Ignorance or malice?,I have just watched the MP of Homa Bay defendi...,2025-02-12,unknown,https://www.reddit.com/r/Kenya/comments/1inj4k...,ignorance or malice i have just watched the mp...,ignorance malice watched mp homa bay defending...
605,Chief Justice John Roberts pauses order for Tr...,,2025-02-27,"foreign aid, foreign aid",https://edition.cnn.com/2025/02/26/politics/su...,chief justice john roberts pauses order for tr...,chief justice john robert pause order trump ad...
771,Is it time to come to countenance that grass i...,\r\n\r\nThe Kenyan government has been lying ...,2025-05-16,unknown,https://www.reddit.com/r/Kenya/comments/1knxk0...,is it time to come to countenance that grass i...,time come countenance grass isn’t greener side...
90,The disease that's likely to kill you,,2025-02-19,kenya foreign aid,https://i.redd.it/xxoeuq65c1ke1.jpeg,the disease thats likely to kill you,disease thats likely kill


In [66]:
reddit_df.iloc[268]['cleaned_text']


'kenya allowed british rule beyond 1963 let’s talk hear white man stayed past 1963 it’s controversial thought begs question—how would kenya look today allowed british govern u year would developed south africa vibrant hong kong can’t help wonder truly ready independence honestly strongly believe weren’t—and maybe still aren’t freedom premature move let brit continue pulling string sake development i’m opening floor debate convince otherwise good people let’s talk kenya history development freedomdebate'

In [68]:
reddit_df.iloc[268]['full_text']


'🤔 what if kenya allowed british rule beyond 1963 let’s talk about it hear me out the white man stayed past 1963 it’s a controversial thought but it begs the question—how would kenya look today if we allowed the british to govern us for a few more years would we be as developed as south africa or as vibrant as hong kong  \r\n\r\ni can’t help but wonder if we were truly ready for independence honestly i strongly believe we weren’t—and maybe still aren’t was freedom a premature move should we have let the brits continue pulling the strings for the sake of development  \r\n\r\ni’m opening the floor for debate convince me otherwise good people let’s talk 🧐 kenya history development freedomdebate'