# USAID-Kenya-Sentiment-Analysis

#    Merging Reddit CSV Files

### Import neccessary libraries

In [1]:
import pandas as pd
import glob
import os


### Folder path

In [2]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data'


### CSV file paths

In [3]:
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
csv_files

['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Agatha_reddit.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.redditsubs.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.reddit_nbo_ke_africa.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\leo_reddit_posts.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya2.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_sentiment.csv',
 'C:/Users/hp/Deskto

### Reading the files and combining them into one dataframe

In [4]:

preferred_columns = ['post_title', 'text', 'keyword', 'published_date', 'url']
all_dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    
    # Rename common column variants
    if 'title' in df.columns:
        df.rename(columns={'title': 'post_title'}, inplace=True)
    if 'selftext' in df.columns and 'text' not in df.columns:
        df.rename(columns={'selftext': 'text'}, inplace=True)
    if 'date_posted' in df.columns:
        df.rename(columns={'date_posted': 'published_date'}, inplace=True)
    elif 'created_utc' in df.columns:
        df.rename(columns={'created_utc': 'published_date'}, inplace=True)
    
    # Add missing columns
    for col in preferred_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

    # Track file source
    df['source_file'] = os.path.basename(file)
    
    # Keep only standardized columns
    df = df[preferred_columns + ['source_file']]

    all_dfs.append(df)

# Combine all
combined_df = pd.concat(all_dfs, ignore_index=True)
combined_df.head()


Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05 19:09:10,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25 08:18:04,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08 08:08:58,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [5]:
combined_df.shape #check the shape of the dataset

(1306, 6)

In [6]:
combined_df.isna().sum() #check for missing values

post_title          0
text              398
keyword           564
published_date    197
url                 0
source_file         0
dtype: int64

In [7]:
combined_df.columns

Index(['post_title', 'text', 'keyword', 'published_date', 'url',
       'source_file'],
      dtype='object')

In [8]:
# Extract date and time
combined_df['time'] = combined_df['published_date'].dt.time
combined_df['published_date'] = combined_df['published_date'].dt.date

#Drop the time column
combined_df.drop(columns='time', inplace=True)
combined_df.head()

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [9]:
combined_df.sample(10) #random sample of 10 rows

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
165,Tech Startups,We all saw that Maas post (Mzungu as a Service...,usaid kenya funding cut,2025-03-28,https://www.reddit.com/r/Kenya/comments/1jlt9j...,Agatha_reddit.csv
1052,Russian ships carrying stolen Ukrainian grains...,,,NaT,https://markets.businessinsider.com/news/commo...,Mbego_reddit_usaid_kenya2.csv
352,UAE purchase/lease of galana kulalu,This article paints a rosy picture but isn't t...,kenya donor funding,2025-03-06,https://www.reddit.com/r/Kenya/comments/1j4v1q...,Agatha_reddit.csv
452,Funding cut for team tracking Ukrainian abduct...,,usaid kenya funding cut,2025-03-15,https://www.pravda.com.ua/eng/news/2025/03/15/...,Agatha_reddit.csv
391,Why?,\r\n\r\n\r\n\r\n\r\nWhy???\r\n\r\nhttps://www....,usaid kenya funding cut,2025-03-14,https://www.reddit.com/gallery/1jbgdq7,Agatha_reddit.csv
1090,USAID HIV FUND CUTS,"Guys with the recent halting of funds for HIV,...",,1970-01-01,https://www.reddit.com/r/Kenya/comments/1j0am0...,reddit_usaid_sentiment.csv
358,Advice for every kenyan who works for an NGO (...,I hate to be the bearer of bad news but the re...,kenya donor funding,2025-02-06,https://www.reddit.com/r/Kenya/comments/1iiyyk...,Agatha_reddit.csv
747,USAID Repercussions + Economy,My neighbour’s wife was a very big shot in USA...,,1970-01-01,https://www.reddit.com/r/Kenya/comments/1kmhn8...,leo_reddit_posts.csv
1096,Economy,For the experts in matters economy and finance...,,1970-01-01,https://www.reddit.com/r/Kenya/comments/1jsyty...,reddit_usaid_sentiment.csv
738,My experience with the USAID Money.,The list of companies who've been getting mone...,"USAID, USAID money, donors, NGOs",2025-02-08,https://www.reddit.com/r/nairobi/comments/1ikl...,cecilia.reddit_nbo_ke_africa.csv


### Save the final merged dataframe to csv

In [10]:
# Save to CSV 
if not combined_df.empty:
    output_filename = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv"
    combined_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f" Results successfully saved to:\n{output_filename}")
else:
    print(" No data to save. The DataFrame is empty.")


 Results successfully saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv


# Merging News CSV Files

### Import necessary libraries

In [11]:
import pandas as pd
import glob
import os


### Folder path

In [12]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'


### CSV file paths

In [13]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'
omit_file = os.path.join(folder_path, 'Agatha_news_fulltext.csv')

news_csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
news_csv_files = [f for f in news_csv_files if os.path.normpath(f) != os.path.normpath(omit_file)]

news_csv_files


['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Agatha_news.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\cecilia.newsapi.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\leo_newsapi_articles_enriched.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_fulltext.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_recent.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\newsapi_usaid_articles.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\ruth_news.csv']

### Reading the files and combining them into one dataframe

In [14]:
import pandas as pd
import os

news_columns = ['title', 'description', 'text', 'url', 'keyword', 'published_date', 'source_file']
all_news_dfs = []

for file in news_csv_files:  
    df = pd.read_csv(file)

    # Rename content to text 
    if 'content' in df.columns and 'text' not in df.columns:
        df.rename(columns={'content': 'text'}, inplace=True)

    # Rename publishedAt to published_date
    if 'publishedAt' in df.columns:
        df.rename(columns={'publishedAt': 'published_date'}, inplace=True)

    # Add missing columns
    for col in news_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')



    # source file info
    df['source_file'] = os.path.basename(file)

    # standardized columns
    df = df[news_columns]
    all_news_dfs.append(df)

# Combine all
news_combined_df = pd.concat(all_news_dfs, ignore_index=True)

# Save to CSV
output_path = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv"
if not news_combined_df.empty:
    news_combined_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"NewsAPI data saved to:\n{output_path}")
else:
    print("No NewsAPI data to save. DataFrame is empty.")


NewsAPI data saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv


In [15]:
news_combined_df.shape

(2638, 7)

In [16]:
news_combined_df.sample(10)

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
2543,Elizabeth Warren demands answers over reports ...,Elizabeth Warren demands answers over reports ...,Elizabeth Warren at the US Capitol on 11 Febru...,https://www.newsbreak.com/share/4062317831155-...,,2025-06-22 14:02:34+00:00,newsapi_usaid_articles.csv
1923,US vetoes Security Council resolution demandin...,A draft resolution calling for an immediate an...,By Vibhu Mishra4 June 2025 - A draft resolutio...,https://www.globalsecurity.org/military/librar...,aid withdrawal,2025-06-05 07:30:42+00:00,cecilia.newsapi.csv
648,Trump Administration Sends $9.4 Billion Rescis...,The Trump administration sent a $9.4 billion r...,The Trump administration sent a $9.4 billion r...,https://www.breitbart.com/politics/2025/06/03/...,USAID,2025-06-04 03:18:10+00:00,cecilia.newsapi.csv
971,House GOP Passes DOGE Cuts by a Whisker After ...,The House Republicans successfully passed a re...,The House Republicans successfully passed a re...,https://www.dailysignal.com/2025/06/12/house-g...,aid budget,2025-06-12 20:01:08+00:00,cecilia.newsapi.csv
2348,Charlize Theron's Andy Fights Quynh in a 500 Y...,Netflix has released a new clip from the upcom...,Netflix has released a new clip from the upcom...,https://geektyrant.com/news/charlize-therons-a...,NGOs,2025-06-09 21:00:00+00:00,cecilia.newsapi.csv
396,Trump’s Liquidation of U.S Global Leadership,Some geopolitical aspects of how Trump's mazim...,"Yves here, It is striking to see the degree to...",https://www.nakedcapitalism.com/2025/05/trumps...,foreign aid cut,2025-05-27 07:00:05+00:00,Agatha_news.csv
2373,"In Europe, the ground is being prepared for an...",The genocide denial and Holocaust revisionism ...,"On April 15, Austrian Nobel laureate Peter Han...",https://www.aljazeera.com/opinions/2025/6/14/i...,NGOs,2025-06-14 13:08:28+00:00,cecilia.newsapi.csv
1052,Congress Should Quickly Approve Trump’s Rescis...,President Donald Trump‘s rescission legislatio...,President Donald Trumps rescission legislation...,https://www.dailysignal.com/2025/06/10/congres...,USAID Kenya,2025-06-10 12:00:00+00:00,cecilia.newsapi.csv
900,Trump Is Trying To Defund An Agency That Saves...,The White House wants to slash the $85 million...,President Trump isn't just trying to get rid o...,https://www.forbes.com/sites/thomasbrewster/20...,aid budget,2025-06-13 18:05:21+00:00,cecilia.newsapi.csv
301,Bill Gates to give most of $200 billion fund t...,Bill Gates has urged African leaders to join h...,US billionaire Bill Gates on Tuesday announced...,https://www.dw.com/en/bill-gates-to-give-most-...,development aid kenya,2025-06-03 07:09:00+00:00,Agatha_news.csv


In [17]:
news_combined_df.isna().sum()

title               0
description        16
text               25
url                 2
keyword           259
published_date     99
source_file         0
dtype: int64

## FINAL REDDIT DATA TO BE USED ~ mbego_all_reddit_merged

In [18]:
import pandas as pd
import seaborn as sns

In [19]:
reddit_df = pd.read_csv('C:/Users/hp/Desktop/USAID backuppp for me/mbego_all_reddit_merged.csv')
reddit_df.head()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,Kenya,muerki,4/15/2025 13:16,,3.0,5.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jzrn2...,
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Kenya,Morio_anzenza,4/7/2025 4:21,,169.0,95.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jtcvb...,
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,Kenya,vindtar,4/5/2025 19:09,,2.0,2.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jsb14...,
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",Kenya,Gold_Smart,3/25/2025 8:18,,13.0,20.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jjehw...,
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,Kenya,westmaxia,3/8/2025 8:08,,1.0,6.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1j6cjz...,


In [20]:
reddit_df.tail()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
1284,Weekly Sub-Saharan Africa Security Situation a...,#Somalia 🇸🇴\r\n#Sudan 🇸🇩\r\nDemocratic Republi...,Africa,,4/18/2025 14:09,,3.0,2.0,,,,,,https://open.substack.com/pub/hasretkargin/p/w...,
1285,No evidence that Burkina Faso paid off all its...,,Africa,,4/18/2025 8:23,,52.0,25.0,,,,,,https://www.reuters.com/fact-check/burkina-fas...,
1286,Ghana orders foreigners to exit gold market by...,Ghana has ordered foreigners to exit its gold ...,Africa,,4/17/2025 17:59,,101.0,12.0,,,,,,https://eastleighvoice.co.ke/west%20african/13...,
1287,Unending Frustration Regarding Sudan War.,https://www.reuters.com/world/britain-boosts-a...,Africa,,4/16/2025 19:33,,11.0,8.0,,,,,,https://www.reddit.com/gallery/1k0t8ed,
1288,Tanzania's Authoritarian Government Has Just B...,Tanzania's main opposition party has been barr...,Africa,,4/14/2025 11:31,,52.0,14.0,,,,,,https://www.reddit.com/r/Africa/comments/1jywl...,


In [21]:
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1289 non-null   object 
 1   selftext      901 non-null    object 
 2   subreddit     1289 non-null   object 
 3   author        466 non-null    object 
 4   created_utc   1289 non-null   object 
 5   created_date  150 non-null    object 
 6   score         1013 non-null   float64
 7   num_comments  833 non-null    float64
 8   keyword       742 non-null    object 
 9   search_term   150 non-null    object 
 10  date_posted   0 non-null      float64
 11  upvotes       276 non-null    float64
 12  comments      276 non-null    float64
 13  url           1289 non-null   object 
 14  permalink     426 non-null    object 
dtypes: float64(5), object(10)
memory usage: 151.2+ KB


In [22]:
reddit_df.shape

(1289, 15)

In [23]:
reddit_df.columns

Index(['title', 'selftext', 'subreddit', 'author', 'created_utc',
       'created_date', 'score', 'num_comments', 'keyword', 'search_term',
       'date_posted', 'upvotes', 'comments', 'url', 'permalink'],
      dtype='object')

In [24]:
reddit_df.dtypes

title            object
selftext         object
subreddit        object
author           object
created_utc      object
created_date     object
score           float64
num_comments    float64
keyword          object
search_term      object
date_posted     float64
upvotes         float64
comments        float64
url              object
permalink        object
dtype: object

In [25]:
reddit_df.isna().sum()

title              0
selftext         388
subreddit          0
author           823
created_utc        0
created_date    1139
score            276
num_comments     456
keyword          547
search_term     1139
date_posted     1289
upvotes         1013
comments        1013
url                0
permalink        863
dtype: int64

In [26]:
reddit_df.describe()

Unnamed: 0,score,num_comments,date_posted,upvotes,comments
count,1013.0,833.0,0.0,276.0,276.0
mean,344.626851,31.34934,,337.518116,35.905797
std,3106.309319,68.060435,,1623.914761,73.531775
min,0.0,0.0,,0.0,0.0
25%,3.0,4.0,,6.75,5.0
50%,10.0,9.0,,43.5,13.0
75%,54.0,31.0,,111.25,31.25
max,79088.0,706.0,,22208.0,558.0


## Data Cleaning

### Dropping Unneccessary Columns

In [27]:
reddit_df.drop(columns=['subreddit', 'author','created_date','score','num_comments','search_term','date_posted','upvotes','comments','permalink'], inplace=True)
reddit_df.head()

Unnamed: 0,title,selftext,created_utc,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [28]:
reddit_df.columns

Index(['title', 'selftext', 'created_utc', 'keyword', 'url'], dtype='object')

### Checking for missing values

In [29]:
reddit_df.isna().sum()

title            0
selftext       388
created_utc      0
keyword        547
url              0
dtype: int64

In [30]:
reddit_df[['selftext', 'title','url']].sample(10)

Unnamed: 0,selftext,title,url
164,In reference to this context 👇😂https://www.red...,Ghosting,https://www.reddit.com/r/Kenya/comments/1jr25y...
1071,,China investigates Canadian couple suspected o...,http://www.cnn.com/2014/08/05/world/asia/china...
1047,,Hong Kong's biggest pro-democracy party moves ...,https://apnews.com/article/hong-kong-china-dem...
1064,,US government provides $173m Bangladesh to cou...,https://thewonderweb.org/us-government-provide...
742,My neighbour’s wife was a very big shot in USA...,USAID Repercussions + Economy,https://www.reddit.com/r/Kenya/comments/1kmhn8...
201,Project 2025 is a political plan developed by ...,Trump/ The Republicans are not guessing he is ...,https://www.reddit.com/r/Kenya/comments/1ii5ve...
500,,How to make US foreign aid work for Africa and...,https://www.semafor.com/article/02/03/2025/how...
383,What's your take on the new policy by Donald T...,Interest post from X,https://i.redd.it/9y5f6d1s4dfe1.png
796,I missed the damn SGR train from Nairobi to Vo...,Missed the SGR... Ended Up Discovering the Rea...,https://www.reddit.com/r/Kenya/comments/1kc5lp...
1145,If you've watched several of the biggest court...,You Cannot Give Up,https://www.reddit.com/r/Kenya/comments/1lb18c...


In [31]:
reddit_df.rename(columns={'title':'post_title','selftext':'text','created_utc':'published_date'},inplace=True)
reddit_df.head()

Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


**parse** - analyze and convert a piece of data (like a string) into a different, more useful format.

The below code cleans the published_date column in the data. Some dates are written as normal dates (eg "2025-04-15") and others are written as numbers (UNIX timestamps). The code checks each value and tries to convert it into a proper date. If it can’t, it leaves it empty. After that, it removes the time part and keeps just the date. This helps make sure all the dates are in the same clean format.

In [32]:
import pandas as pd

# Function to handle both datetime strings and UNIX timestamps
def parse_mixed_dates(val):
    try:
        # Parse as a regular datetime string
        return pd.to_datetime(val)
    except:
        try:
            # Parse as a UNIX timestamp
            return pd.to_datetime(float(val), unit='s')
        except:
            return pd.NaT  # Return NaT if all parsing fails

# Apply the parsing function
reddit_df['published_date'] = reddit_df['published_date'].apply(parse_mixed_dates)

# Check how many dates failed to parse
print("Unparsed dates:", reddit_df['published_date'].isna().sum())

# Extract date only (remove time)
reddit_df['published_date'] = reddit_df['published_date'].dt.date
reddit_df.head()

Unparsed dates: 0


Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [33]:
reddit_df['published_date'].isna().sum() #check the missing values in date

0

In [34]:
reddit_df.dropna(subset=['published_date'], inplace=True) #drop missing values in date

In [35]:
reddit_df.shape #check the shape of the dataset

(1289, 5)

In [36]:
reddit_df.isna().sum() #check for missing values

post_title          0
text              388
published_date      0
keyword           547
url                 0
dtype: int64

### Fill missing values in text with an empty string

I filled missing values in text with an empty string because these are likely link posts where the author did not add body text.

In [37]:
reddit_df['text'] = reddit_df['text'].fillna('')
reddit_df.isna().sum()

post_title          0
text                0
published_date      0
keyword           547
url                 0
dtype: int64

### Fill missing values in keyword with a placeholder 'Unknown'

In [38]:
reddit_df['keyword'] = reddit_df['keyword'].fillna('unknown')
reddit_df['keyword'].sample(10)

323          kenya donor funding
32              usaid budget cut
329          kenya donor funding
869                      unknown
1100                     unknown
948                      unknown
527     foreign aid, foreign aid
1230                     unknown
1139                     unknown
487                        USAID
Name: keyword, dtype: object

In [39]:
reddit_df.isna().sum()

post_title        0
text              0
published_date    0
keyword           0
url               0
dtype: int64

Missing values filled with an empty string and placeholder to maintain the integrity of the data and maintain a good amount of rows

### Check for duplicates 

In [40]:
reddit_df.duplicated().sum()

145

In [41]:
reddit_df.drop_duplicates(inplace=True) #drop all duplicates

In [42]:
reddit_df.duplicated().sum() #confirm that all duplicates have been dropped

0

In [43]:
reddit_df.shape #shape of the new data after cleaning

(1144, 5)

In [44]:
reddit_df['post_title'].sample(10)

409     ‘What a project, what a challenge!’: Africa’s ...
258      The Deplorable State of Web Development in Kenya
59                             Kenya Isn’t That Bad… Ama?
223                         Conspiracy theorists assemble
640     Supreme Court Rejects Trump on USAID Foreign-A...
1041    USAID work halted in Russia, dealing blow to O...
1215    At least 100 people killed in gunmen attack in...
207                             How do I cut off friends?
1129    I'm I the only one whose dreams and ambitions ...
918     Musk Vs USaid (Battle of the wealthiest man vs...
Name: post_title, dtype: object

## Sentiment Analysis Cleaning

Creating a new column 'full_text' that combines 'post_title','text' and the empty string '', to help in sentiment analysis

In [45]:
reddit_df['full_text'] = reddit_df['post_title'] + ' ' + reddit_df['text']
reddit_df.head() #sample top 5 rows after combining the columns

Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,"USAID left a month ago, do we have ARVs in Ken..."
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Classism in r/Kenya and r/nairobi The classism...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,EX-USAID people!! Let's talk Are you still in ...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,Why western powers back Israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Is kenya capable of funding its needs now that...


In [46]:
reddit_df['full_text'].isna().sum() #confirm that there are no missing values in the dataset

0

### Lowercasing 

Converting all text to lowercase

In [47]:
reddit_df['full_text'] = reddit_df['full_text'].str.lower()
reddit_df['full_text'].sample(10)

453     rohingya refugees in bangladesh brace for upco...
796     missed the sgr... ended up discovering the rea...
655     judge again orders us to unfreeze foreign aid,...
625     how foreign aid cuts are setting the stage for...
1237    defiant tanzanian opposition leader tundu liss...
866     starting a tech business here in kenya- which ...
574     accurate statistics the usual figure that i ha...
107     i might be wrong but.. trump's decision to pha...
215     usaid hiv fund cuts guys with the recent halti...
804     tech bros & sis! what’s the best path now? mor...
Name: full_text, dtype: object

### Translate all text to English

In [48]:
!pip install langdetect #detect which language the text are in


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#detect': Expected package name at the start of dependency specifier
    #detect
    ^


In [49]:
from langdetect import detect

# Language detection
reddit_df['language'] = reddit_df['full_text'].apply(lambda x: detect(x) if pd.notnull(x) else 'unknown')
reddit_df['language'].value_counts()

en    1125
sw       5
tl       3
et       2
id       2
fr       2
sl       1
da       1
nl       1
ca       1
es       1
Name: language, dtype: int64

In [50]:
# View full text rows where the language is not English
non_english_df = reddit_df[reddit_df['language'] != 'en']

# Display the full_text column of non-English rows
print(non_english_df[['language', 'full_text']])


     language                                          full_text
28         tl  ...na bado mnasema ruto must go bila tangible ...
175        tl  ...na bado mnasema ruto must go bila tangible ...
236        sl                                design job nairobi 
273        tl  ...na bado mnasema ruto must go bila tangible ...
301        et  just seeing ki, sa, che, ko, la, si...pure tri...
372        id  does kenya have a shared national identity? i ...
634        fr  supreme court rules trump administration must ...
638        fr  supreme court denies trump administration requ...
664        da  judge orders us to restore funds for foreign a...
852        sw  "wapi mtoto?" it was always an escape for me d...
853        sw  drunk and orderly it so happened that one time...
903        nl  us strike kills 16 afghan policemen in helmand...
943        et                            joblessness is looming 
1036       ca  russia expels usaid for 'political meddling' -...
1061       es     chaos e

In [51]:
! pip install deep-translator #Translates text from one language to another



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#Translates': Expected package name at the start of dependency specifier
    #Translates
    ^


In [52]:
from deep_translator import GoogleTranslator

# Make a copy of non-English rows
non_english_df = reddit_df[reddit_df['language'] != 'en'].copy()

# Translate each full_text
non_english_df['translated_text'] = non_english_df['full_text'].apply(
    lambda text: GoogleTranslator(source='auto', target='en').translate(text)
)

# Preview translations
print(non_english_df[['language', 'full_text', 'translated_text']].head())


    language                                          full_text  \
28        tl  ...na bado mnasema ruto must go bila tangible ...   
175       tl  ...na bado mnasema ruto must go bila tangible ...   
236       sl                                design job nairobi    
273       tl  ...na bado mnasema ruto must go bila tangible ...   
301       et  just seeing ki, sa, che, ko, la, si...pure tri...   

                                       translated_text  
28   ... and you still say Ruto must go without tan...  
175  ... and you still say Ruto must go without tan...  
236                                 design job nairobi  
273  ... and you still say Ruto must go without tan...  
301  just seeing ki, sa, che, ko, la, si ... pure t...  


From the above preview, shows that the text with both english and swahili was able to be translated as shown in index 28

In [53]:
reddit_df['language'].value_counts() #check which other languages have been detected

en    1125
sw       5
tl       3
et       2
id       2
fr       2
sl       1
da       1
nl       1
ca       1
es       1
Name: language, dtype: int64

In [54]:
# Drop all non-English rows
reddit_df = reddit_df[reddit_df['language'] == 'en'].copy()

# Reset the index
reddit_df.reset_index(drop=True, inplace=True)

# Preview the cleaned data
print(reddit_df[['language', 'full_text']].head())


  language                                          full_text
0       en  usaid left a month ago, do we have arvs in ken...
1       en  classism in r/kenya and r/nairobi the classism...
2       en  ex-usaid people!! let's talk are you still in ...
3       en  why western powers back israel no matter what ...
4       en  is kenya capable of funding its needs now that...


In [55]:
reddit_df['language'].value_counts() #confirm that all rows are in English only

en    1125
Name: language, dtype: int64

In [56]:
reddit_df.sample(10)

Unnamed: 0,post_title,text,published_date,keyword,url,full_text,language
961,Family Curse?,I''ll try to keep this as simple as possible.\...,2025-06-17,unknown,https://www.reddit.com/r/Kenya/comments/1ldecs...,family curse? i''ll try to keep this as simple...,en
599,Divided Supreme Court says judge can force Tru...,,2025-03-05,"foreign aid, foreign aid",https://www.washingtonpost.com/politics/2025/0...,divided supreme court says judge can force tru...,en
159,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya funding cut,https://www.reddit.com/r/Kenya/comments/1jsb14...,ex-usaid people!! let's talk are you still in ...,en
463,Economy,For the experts in matters economy and finance...,2025-04-06,USAID,https://www.reddit.com/r/Kenya/comments/1jsyty...,economy for the experts in matters economy and...,en
78,Kenya’s Foreign Policy: Vibes or Strategy?,"High folks, long time Reddit lurker over here....",2025-03-25,kenya foreign aid,https://youtu.be/n2vX71nH-8s?si=0h3QIjA_WM5I4k8A,kenya’s foreign policy: vibes or strategy? hig...,en
603,DOGE's Foreign Aid Cuts Have Sparked 'Total Ch...,,2025-03-04,"foreign aid, foreign aid",https://www.wired.com/story/rights-con-taipei-...,doge's foreign aid cuts have sparked 'total ch...,en
333,Sign the Petition to Declare Femicide a Crime ...,Why this petition matters\r\n\r\n\r\nStarted b...,2025-04-10,kenya donor funding,https://chng.it/4L2zDbNGMR,sign the petition to declare femicide a crime ...,en
232,Our education system,I saw this post on X and it has left me thinki...,2025-05-22,development aid kenya,https://i.redd.it/avdn059x6d2f1.png,our education system i saw this post on x and ...,en
1016,Marrying at 72,"Someone said their mom got married at 72, i do...",2025-06-11,unknown,https://www.reddit.com/r/Kenya/comments/1l8rbz...,marrying at 72 someone said their mom got marr...,en
1095,Ugandan Judge sentenced to over six years for ...,"Lydia Mugambe, 50, has been sentenced to six y...",2025-05-03,unknown,https://www.cps.gov.uk/cps/news/ugandan-judge-...,ugandan judge sentenced to over six years for ...,en


In [57]:
reddit_df.drop('language', axis=1, inplace=True) #drop the language column as it is unnecessary now
reddit_df.head()

Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,"usaid left a month ago, do we have arvs in ken..."
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,classism in r/kenya and r/nairobi the classism...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,ex-usaid people!! let's talk are you still in ...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,why western powers back israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,is kenya capable of funding its needs now that...


In [58]:
reddit_df.shape

(1125, 6)

### Removing Punctuation Marks

In [59]:
import re
import string

# Function to remove punctuation
def remove_punctuation(text):
    return re.sub(f"[{re.escape(string.punctuation)}]", "", str(text))

# Apply to the 'full_text' column
reddit_df['full_text'] = reddit_df['full_text'].apply(remove_punctuation)

reddit_df.head()


Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,usaid left a month ago do we have arvs in keny...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,classism in rkenya and rnairobi the classism i...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,exusaid people lets talk are you still in cont...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,why western powers back israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,is kenya capable of funding its needs now that...


### Removing Stop Words,Emojis,Non-Emoji Symbols and Lemmatization

In [60]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to remove emojis, emoticons, and other symbols
def remove_emojis_symbols(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002700-\U000027BF"  # dingbats
        u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        u"\U0001FA70-\U0001FAFF"  # extended pictographic symbols
        u"\U00002500-\U00002BEF"  # misc symbols
        u"\U0000200D"             # zero width joiner
        u"\u2600-\u26FF"          # misc symbols
        u"\u2700-\u27BF"
        u"\uFE0F"                 # variation selector
        u"\u3030"
        u"\u00A9"                 # ©
        u"\u00AE"                 # ®
        u"\u2122"                 # ™
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Final cleaning function
def clean_text(text):
    text = remove_emojis_symbols(text)
    words = text.split()
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(cleaned_words)

# Apply to reddit_df
reddit_df['cleaned_text'] = reddit_df['full_text'].apply(clean_text)

# Preview sample
reddit_df.sample(10)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,post_title,text,published_date,keyword,url,full_text,cleaned_text
637,Foreign Aid Freeze Leaves Millions Without H.I...,,2025-02-06,"foreign aid, foreign aid",https://www.nytimes.com/2025/02/05/health/trum...,foreign aid freeze leaves millions without hiv...,foreign aid freeze leaf million without hiv tr...
277,Unpopular opinion,Kenya will never become truly wealthy by relyi...,2025-02-25,development aid kenya,https://www.reddit.com/r/Kenya/comments/1ixyio...,unpopular opinion kenya will never become trul...,unpopular opinion kenya never become truly wea...
58,Kenya Isn’t That Bad… Ama?,So I have this friend who lives in Europe. Sin...,2025-04-25,kenya foreign aid,https://www.reddit.com/r/Kenya/comments/1k7s8a...,kenya isn’t that bad… ama so i have this frien...,kenya isn’t bad… ama friend life europe since ...
364,Title: I grew up idolizing France. Now I see t...,I want to believe we still have what it takes—...,2025-05-12,kenya foreign aid,https://www.reddit.com/r/Africa/comments/1kknb...,title i grew up idolizing france now i see the...,title grew idolizing france see whole system w...
223,Thinking of Importing from Alibaba to Kenya? H...,If you’re new to importing from Alibaba to Ken...,2025-06-09,development aid kenya,https://www.reddit.com/r/Kenya/comments/1l7f9r...,thinking of importing from alibaba to kenya he...,thinking importing alibaba kenya here’s beginn...
383,Why?,\r\n\r\n\r\n\r\n\r\nWhy???\r\n\r\nhttps://www....,2025-03-14,usaid kenya funding cut,https://www.reddit.com/gallery/1jbgdq7,why \r\n\r\n\r\n\r\n\r\nwhy\r\n\r\nhttpswwwbbc...,httpswwwbbccomnewsarticlesckg815277g5oamp make...
749,Is this a general erosion in culture? Or is it...,"Firstly, Happy mother's day to all the mums ho...",2025-05-11,unknown,https://www.reddit.com/r/Kenya/comments/1kjxv8...,is this a general erosion in culture or is it ...,general erosion culture moral decay firstly ha...
682,Kenyan Startup environment and what can be done,I'm sure most of us are fully aware of the fru...,2025-04-07,unknown,https://www.reddit.com/r/Kenya/comments/1jtthj...,kenyan startup environment and what can be don...,kenyan startup environment done im sure u full...
649,Stop-Work Order on US Foreign Aid Puts China F...,,2025-01-27,"foreign aid, foreign aid",https://www.justsecurity.org/106876/us-foreign...,stopwork order on us foreign aid puts china fi...,stopwork order u foreign aid put china first a...
728,"Kenyan comfort food - ugali, nyama na sukuma w...","\r\nI swear anytime anywhere, nyama ugali and ...",2025-05-28,unknown,https://www.reddit.com/r/Kenya/comments/1kxgbt...,kenyan comfort food ugali nyama na sukuma wik...,kenyan comfort food ugali nyama na sukuma wiki...


In [61]:
reddit_df.iloc[268]['cleaned_text']


'kenya allowed british rule beyond 1963 let’s talk hear white man stayed past 1963 it’s controversial thought begs question—how would kenya look today allowed british govern u year would developed south africa vibrant hong kong can’t help wonder truly ready independence honestly strongly believe weren’t—and maybe still aren’t freedom premature move let brit continue pulling string sake development i’m opening floor debate convince otherwise good people let’s talk kenya history development freedomdebate'

In [62]:
reddit_df.iloc[268]['full_text']


'🤔 what if kenya allowed british rule beyond 1963 let’s talk about it hear me out the white man stayed past 1963 it’s a controversial thought but it begs the question—how would kenya look today if we allowed the british to govern us for a few more years would we be as developed as south africa or as vibrant as hong kong  \r\n\r\ni can’t help but wonder if we were truly ready for independence honestly i strongly believe we weren’t—and maybe still aren’t was freedom a premature move should we have let the brits continue pulling the strings for the sake of development  \r\n\r\ni’m opening the floor for debate convince me otherwise good people let’s talk 🧐 kenya history development freedomdebate'

### Tokenization

In [63]:
import nltk
from nltk.tokenize import word_tokenize

# Download tokenizer data 
nltk.download('punkt')

#  tokenization function
def tokenize_text(text):
    return word_tokenize(text)

# Apply to cleaned text
reddit_df['tokens'] = reddit_df['cleaned_text'].apply(tokenize_text)

# display the first 5 rows
reddit_df[['cleaned_text', 'tokens']].head()
reddit_df.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,post_title,text,published_date,keyword,url,full_text,cleaned_text,tokens
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,usaid left a month ago do we have arvs in keny...,usaid left month ago arvs kenya someone differ...,"[usaid, left, month, ago, arvs, kenya, someone..."
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,classism in rkenya and rnairobi the classism i...,classism rkenya rnairobi classism im seeing su...,"[classism, rkenya, rnairobi, classism, im, see..."
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,exusaid people lets talk are you still in cont...,exusaid people let talk still contact organisa...,"[exusaid, people, let, talk, still, contact, o..."
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,why western powers back israel no matter what ...,western power back israel matter kenya avoid u...,"[western, power, back, israel, matter, kenya, ..."
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,is kenya capable of funding its needs now that...,kenya capable funding need usaid disbanded ken...,"[kenya, capable, funding, need, usaid, disband..."


## FINAL NEWS DATA TO BE USED ~ Cecilia_merged_news_dataset

In [64]:
import pandas as pd
import seaborn as sns

In [65]:
news_df = pd.read_csv('C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv')
news_df.head()

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06 11:21:51+00:00,Agatha_news.csv
1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,usaid kenya,2025-05-26 17:13:41+00:00,Agatha_news.csv
2,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \n...,ProPublica is a nonprofit newsroom that invest...,https://www.propublica.org/article/trump-usaid...,usaid kenya,2025-05-28 18:45:00+00:00,Agatha_news.csv
3,Congress Should Quickly Approve Trump’s Rescis...,President Donald Trump‘s rescission legislatio...,President Donald Trumps rescission legislation...,https://www.dailysignal.com/2025/06/10/congres...,usaid kenya,2025-06-10 12:00:00+00:00,Agatha_news.csv
4,Food Safety Depends On Every Link In The Suppl...,Almost 1 in 10 people globally fall ill from c...,Colorful fish and vegetables can be purchased ...,https://www.forbes.com/sites/daniellenierenber...,usaid kenya,2025-06-06 13:55:41+00:00,Agatha_news.csv


In [66]:
news_df.tail()

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
2633,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \r...,ProPublica is a nonprofit newsroom that invest...,https://www.propublica.org/article/trump-usaid...,,2025-05-28 18:45:00+00:00,ruth_news.csv
2634,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,,2025-05-26 17:13:41+00:00,ruth_news.csv
2635,Stakeholders’ perspectives on the status of fa...,Introduction Facility- and community-based dif...,Abstract\r\nIntroduction\r\nFacility- and comm...,https://journals.plos.org/plosone/article?id=1...,,2025-05-22 14:00:00+00:00,ruth_news.csv
2636,"Africa Needs More Renewables, So Why Is It Inv...",Sub-Saharan Africa has an energy funding deficit.,"LICHTENBURG, SOUTH AFRICA - MAY 8: A man is se...",https://www.forbes.com/sites/sverrealvik/2025/...,,2025-05-20 09:57:16+00:00,ruth_news.csv
2637,May 2025 Updates,Every month we send an email newsletter to our...,Every month we send an email newsletter to our...,https://blog.givewell.org/2025/05/19/may-2025-...,,2025-05-19 19:26:41+00:00,ruth_news.csv


In [67]:
news_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2638 entries, 0 to 2637
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   title           2638 non-null   object
 1   description     2622 non-null   object
 2   text            2613 non-null   object
 3   url             2636 non-null   object
 4   keyword         2379 non-null   object
 5   published_date  2539 non-null   object
 6   source_file     2638 non-null   object
dtypes: object(7)
memory usage: 144.4+ KB


In [68]:
news_df.shape

(2638, 7)

In [69]:
news_df.columns

Index(['title', 'description', 'text', 'url', 'keyword', 'published_date',
       'source_file'],
      dtype='object')

In [70]:
news_df.dtypes

title             object
description       object
text              object
url               object
keyword           object
published_date    object
source_file       object
dtype: object

In [71]:
news_df.isna().sum()

title               0
description        16
text               25
url                 2
keyword           259
published_date     99
source_file         0
dtype: int64

In [72]:
news_df.describe()

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
count,2638,2622,2613,2636,2379,2539,2638
unique,1453,1457,1445,1486,30,1341,7
top,"Death, Sexual Violence and Human Trafficking: ...",Elon Musk first claimed the department would m...,ProPublica is a nonprofit newsroom that invest...,https://www.propublica.org/article/trump-usaid...,foreign aid,2025-05-28 18:45:00+00:00,cecilia.newsapi.csv
freq,23,22,22,23,196,23,1787


## Data Cleaning

### Feature Engineering

##### Create column 'time' from 'published_date'

In [73]:
# Convert published_date to datetime format
news_df['published_date'] = pd.to_datetime(news_df['published_date'])

# Extract time into a new column 'time'
news_df['time'] = news_df['published_date'].dt.time

# Keep only the date (drop the time) in published_date
news_df['published_date'] = news_df['published_date'].dt.date

# Preview
news_df.head()


Unnamed: 0,title,description,text,url,keyword,published_date,source_file,time
0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06,Agatha_news.csv,11:21:51
1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,usaid kenya,2025-05-26,Agatha_news.csv,17:13:41
2,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \n...,ProPublica is a nonprofit newsroom that invest...,https://www.propublica.org/article/trump-usaid...,usaid kenya,2025-05-28,Agatha_news.csv,18:45:00
3,Congress Should Quickly Approve Trump’s Rescis...,President Donald Trump‘s rescission legislatio...,President Donald Trumps rescission legislation...,https://www.dailysignal.com/2025/06/10/congres...,usaid kenya,2025-06-10,Agatha_news.csv,12:00:00
4,Food Safety Depends On Every Link In The Suppl...,Almost 1 in 10 people globally fall ill from c...,Colorful fish and vegetables can be purchased ...,https://www.forbes.com/sites/daniellenierenber...,usaid kenya,2025-06-06,Agatha_news.csv,13:55:41


### Dropping Unneccessary Columns

In [74]:
news_df.drop(columns=['source_file', 'time'], inplace=True)
news_df.head()


Unnamed: 0,title,description,text,url,keyword,published_date
0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06
1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,usaid kenya,2025-05-26
2,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \n...,ProPublica is a nonprofit newsroom that invest...,https://www.propublica.org/article/trump-usaid...,usaid kenya,2025-05-28
3,Congress Should Quickly Approve Trump’s Rescis...,President Donald Trump‘s rescission legislatio...,President Donald Trumps rescission legislation...,https://www.dailysignal.com/2025/06/10/congres...,usaid kenya,2025-06-10
4,Food Safety Depends On Every Link In The Suppl...,Almost 1 in 10 people globally fall ill from c...,Colorful fish and vegetables can be purchased ...,https://www.forbes.com/sites/daniellenierenber...,usaid kenya,2025-06-06


### Checking for missing values

In [75]:
news_df.isna().sum() #check for missing values

title               0
description        16
text               25
url                 2
keyword           259
published_date     99
dtype: int64

In [76]:
# Fill columns with placeholders and empty string
news_df['description'].fillna(' ', inplace=True)
news_df['text'].fillna(' ', inplace=True)

# Drop rows where url, keyword or published_date is missing
news_df.dropna(subset=['url', 'published_date','keyword'], inplace=True)


In [77]:
news_df.isna().sum() #confirm that there are no more missing values and data has been cleaned well

title             0
description       0
text              0
url               0
keyword           0
published_date    0
dtype: int64

In [78]:
news_df.shape #check the shape of the data after dealing with missing values. 
#there is still a good amount of data left, since only around 300 rows were dropped.

(2379, 6)

### Check for duplicates

In [79]:
news_df.duplicated().sum() #check the number of uplicates in the data

98

#### Drop duplicates

In [80]:
news_df.drop_duplicates(inplace=True) #drop the duplicates

In [81]:
news_df.duplicated().sum() #confirm that all duplicates have been dropped, and the data is clean

0

In [82]:
news_df.shape #check the new shape of the data

(2281, 6)

## Sentiment Analysis Cleaning

### Feature Engineering

Creating a new column 'full_text' byy combining 'title', 'description', 'text' columns and the empty strings used in filling na

In [83]:
news_df['full_text'] = news_df['title'] + ' ' + news_df['description'] + news_df['text']
news_df.iloc[4]['full_text']  #sample check for row number 5

'Food Safety Depends On Every Link In The Supply Chain Almost 1 in 10 people globally fall ill from contaminated food every year. Making our food supply safer for communities depends on everyone.Colorful fish and vegetables can be purchased at a public market.\r\ngetty\r\nFor communities to be nourished, their food supply must be safe to eat.\r\nThis sounds obvious, but its worth repeating, becaus… [+4445 chars]'

### Lowercasing

Converting all text to lowercase

In [84]:
news_df['full_text'] = news_df['full_text'].str.lower()
news_df.iloc[4]['full_text'] #sample check for row number 5 and confirm that all text has been lowercased

'food safety depends on every link in the supply chain almost 1 in 10 people globally fall ill from contaminated food every year. making our food supply safer for communities depends on everyone.colorful fish and vegetables can be purchased at a public market.\r\ngetty\r\nfor communities to be nourished, their food supply must be safe to eat.\r\nthis sounds obvious, but its worth repeating, becaus… [+4445 chars]'

Cleaning the text and remove \r\n, backslashes \, and any similar unwanted characters

In [85]:
# Remove \r, \n, and backslashes from the 'text' column
news_df['full_text'] = news_df['full_text'].str.replace(r'\r', ' ', regex=True)
news_df['full_text'] = news_df['full_text'].str.replace(r'\n', ' ', regex=True)
news_df['full_text'] = news_df['full_text'].str.replace(r'\\', '', regex=True)


In [86]:
news_df.iloc[4]['full_text']

'food safety depends on every link in the supply chain almost 1 in 10 people globally fall ill from contaminated food every year. making our food supply safer for communities depends on everyone.colorful fish and vegetables can be purchased at a public market.  getty  for communities to be nourished, their food supply must be safe to eat.  this sounds obvious, but its worth repeating, becaus… [+4445 chars]'

### Translate all text to English

In [87]:
from langdetect import detect

# Language detection, detect what language the 'full_text' is in
news_df['language'] = news_df['full_text'].apply(lambda x: detect(x) if pd.notnull(x) else 'unknown')
news_df['language'].value_counts() 
#from the output, all 2281 rows are in English, so there is no need for translation to English

en    2281
Name: language, dtype: int64

### Removing Punctuation Marks

In [88]:
import re
import string

# Function to remove punctuation
def remove_punctuation(text):
    return re.sub(f"[{re.escape(string.punctuation)}]", "", str(text))

# Apply to the 'full_text' column
news_df['full_text'] = news_df['full_text'].apply(remove_punctuation)

news_df.iloc[4]['full_text'] #using column 5 as an example to see whether all punctuation marks have been removed and from my observation, all punctuation marks have been removed


'food safety depends on every link in the supply chain almost 1 in 10 people globally fall ill from contaminated food every year making our food supply safer for communities depends on everyonecolorful fish and vegetables can be purchased at a public market  getty  for communities to be nourished their food supply must be safe to eat  this sounds obvious but its worth repeating becaus… 4445 chars'

### Removing Stop Words,Emojis,Non-Emoji Symbols and Lemmatization

Removes emojis and special symbols – Eliminates non-textual elements to reduce noise.

Removes stopwords – Discards common words with little meaning to focus on key terms.

Lemmatizes words – Converts words to their base forms for standardization.


In [89]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download NLTK data
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to remove emojis, emoticons, and other symbols
def remove_emojis_symbols(text):
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002700-\U000027BF"  # dingbats
        u"\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
        u"\U0001FA70-\U0001FAFF"  # extended pictographic symbols
        u"\U00002500-\U00002BEF"  # misc symbols
        u"\U0000200D"             # zero width joiner
        u"\u2600-\u26FF"          # misc symbols
        u"\u2700-\u27BF"
        u"\uFE0F"                 # variation selector
        u"\u3030"
        u"\u00A9"                 # ©
        u"\u00AE"                 # ®
        u"\u2122"                 # ™
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)

# Final cleaning function
def clean_text(text):
    text = remove_emojis_symbols(text)
    words = text.split()
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(cleaned_words)

# Apply to news_df
news_df['cleaned_text'] = news_df['full_text'].apply(clean_text) #Stores cleaned text in a new column – Preserves original data while saving processed output.

# Preview sample
news_df.iloc[4]['cleaned_text']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


'food safety depends every link supply chain almost 1 10 people globally fall ill contaminated food every year making food supply safer community depends everyonecolorful fish vegetable purchased public market getty community nourished food supply must safe eat sound obvious worth repeating becaus… 4445 char'

## Tokenization

In [90]:
import nltk
from nltk.tokenize import word_tokenize

# Download tokenizer data 
nltk.download('punkt')

#  tokenization function
def tokenize_text(text):
    return word_tokenize(text)

# Apply to cleaned text
news_df['tokens'] = news_df['cleaned_text'].apply(tokenize_text)

# display row number 11
news_df[['cleaned_text', 'tokens']].head()
news_df.iloc[10]['cleaned_text']

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


'may 2025 update every month send email newsletter supporter sharing recent update work publish selected portion newsletter blog make news accessible people visit website key update latest insta…every month send email newsletter supporter sharing recent update work publish selected portion newsletter blog make news accessible people … 3507 char'

In [91]:
news_df.iloc[10]['tokens']

['may',
 '2025',
 'update',
 'every',
 'month',
 'send',
 'email',
 'newsletter',
 'supporter',
 'sharing',
 'recent',
 'update',
 'work',
 'publish',
 'selected',
 'portion',
 'newsletter',
 'blog',
 'make',
 'news',
 'accessible',
 'people',
 'visit',
 'website',
 'key',
 'update',
 'latest',
 'insta…every',
 'month',
 'send',
 'email',
 'newsletter',
 'supporter',
 'sharing',
 'recent',
 'update',
 'work',
 'publish',
 'selected',
 'portion',
 'newsletter',
 'blog',
 'make',
 'news',
 'accessible',
 'people',
 '…',
 '3507',
 'char']

This code below cleans tokens by:

Removing numbers and words like 'char'/'chars',

Splitting truncated words containing ellipses (e.g., "insta...every" → ["insta", "every"]),

Keeping only clean, meaningful tokens while preserving their order, ensuring it remains readable

In [92]:
def clean_tokens(tokens):
    """Remove unwanted tokens including numbers, 'char(s)', and truncated words"""
    cleaned = []
    for token in tokens:
        # Skip numbers, 'char'/'chars', and standalone ellipses
        if token.isdigit() or token in {'char', 'chars', '…'}:
            continue
        # Fix words with ellipses (e.g., 'insta…every' → 'insta', 'every')
        if '…' in token:
            parts = [p for p in token.split('…') if p]
            cleaned.extend(parts)
        else:
            cleaned.append(token)
    return cleaned

# Apply to tokens
news_df['cleaned_tokens'] = news_df['tokens'].apply(clean_tokens)

# Preview
print(news_df.iloc[10]['cleaned_tokens'])

['may', 'update', 'every', 'month', 'send', 'email', 'newsletter', 'supporter', 'sharing', 'recent', 'update', 'work', 'publish', 'selected', 'portion', 'newsletter', 'blog', 'make', 'news', 'accessible', 'people', 'visit', 'website', 'key', 'update', 'latest', 'insta', 'every', 'month', 'send', 'email', 'newsletter', 'supporter', 'sharing', 'recent', 'update', 'work', 'publish', 'selected', 'portion', 'newsletter', 'blog', 'make', 'news', 'accessible', 'people']


This code below cleans already tokenized text by:

Removing numbers and words like 'char'/'chars',

Splitting truncated words containing ellipses (e.g., "insta...every" → ["insta", "every"]),

Keeping only clean, meaningful tokens while preserving their order, ensuring it remains readable

In [93]:
import re

def clean_full_text(text):
    """
    Clean raw text by:
    1. Removing [NNN chars] or NNN chars patterns
    2. Fixing words with ellipses (insta…every → insta every)
    3. Removing standalone numbers and 'char'/'chars'
    4. Removing standalone ellipses
    """
    # Remove char count patterns
    text = re.sub(r"\[\+\d+\s*chars\]|\d+\s*chars", "", text)
    
    # Fix words with ellipses by replacing … with space
    text = re.sub(r"(\w)…(\w)", r"\1 \2", text)
    
    # Remove standalone numbers, 'char', 'chars'
    text = ' '.join(
        word for word in text.split() 
        if not (word.isdigit() or word in {'char', 'chars'})
    )
    
    # Remove any remaining standalone ellipses
    text = text.replace('…', '')
    
    return text.strip()

# Apply to full text column
news_df['cleaned_text'] = news_df['full_text'].apply(clean_full_text)

# Preview before/after
print("Original:", news_df.iloc[10]['full_text'])
print("Cleaned:", news_df.iloc[10]['cleaned_text'])

Original: may 2025 updates every month we send an email newsletter to our supporters sharing recent updates from our work we publish selected portions of the newsletter on our blog to make this news more accessible to people who visit our website for key updates from the latest insta…every month we send an email newsletter to our supporters sharing recent updates from our work we publish selected portions of the newsletter on our blog to make this news more accessible to people … 3507 chars
Cleaned: may updates every month we send an email newsletter to our supporters sharing recent updates from our work we publish selected portions of the newsletter on our blog to make this news more accessible to people who visit our website for key updates from the latest insta every month we send an email newsletter to our supporters sharing recent updates from our work we publish selected portions of the newsletter on our blog to make this news more accessible to people


In [94]:
news_df.head()

Unnamed: 0,title,description,text,url,keyword,published_date,full_text,language,cleaned_text,tokens,cleaned_tokens
0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06,has doge really saved the us government 180bn ...,en,has doge really saved the us government 180bn ...,"[doge, really, saved, u, government, 180bn, el...","[doge, really, saved, u, government, 180bn, el..."
1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,usaid kenya,2025-05-26,the life story of ecomobilus technologies limi...,en,the life story of ecomobilus technologies limi...,"[life, story, ecomobilus, technology, limited,...","[life, story, ecomobilus, technology, limited,..."
2,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \n...,ProPublica is a nonprofit newsroom that invest...,https://www.propublica.org/article/trump-usaid...,usaid kenya,2025-05-28,death sexual violence and human trafficking fa...,en,death sexual violence and human trafficking fa...,"[death, sexual, violence, human, trafficking, ...","[death, sexual, violence, human, trafficking, ..."
3,Congress Should Quickly Approve Trump’s Rescis...,President Donald Trump‘s rescission legislatio...,President Donald Trumps rescission legislation...,https://www.dailysignal.com/2025/06/10/congres...,usaid kenya,2025-06-10,congress should quickly approve trump’s rescis...,en,congress should quickly approve trump’s rescis...,"[congress, quickly, approve, trump, ’, s, resc...","[congress, quickly, approve, trump, ’, s, resc..."
4,Food Safety Depends On Every Link In The Suppl...,Almost 1 in 10 people globally fall ill from c...,Colorful fish and vegetables can be purchased ...,https://www.forbes.com/sites/daniellenierenber...,usaid kenya,2025-06-06,food safety depends on every link in the suppl...,en,food safety depends on every link in the suppl...,"[food, safety, depends, every, link, supply, c...","[food, safety, depends, every, link, supply, c..."


### Dropping columns ~ language, tokens

In [95]:
news_df.drop(columns=['language','tokens'],inplace=True)
news_df.head()

Unnamed: 0,title,description,text,url,keyword,published_date,full_text,cleaned_text,cleaned_tokens
0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06,has doge really saved the us government 180bn ...,has doge really saved the us government 180bn ...,"[doge, really, saved, u, government, 180bn, el..."
1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,usaid kenya,2025-05-26,the life story of ecomobilus technologies limi...,the life story of ecomobilus technologies limi...,"[life, story, ecomobilus, technology, limited,..."
2,"Death, Sexual Violence and Human Trafficking: ...",by Brett Murphy and Anna Maria Barry-Jester \n...,ProPublica is a nonprofit newsroom that invest...,https://www.propublica.org/article/trump-usaid...,usaid kenya,2025-05-28,death sexual violence and human trafficking fa...,death sexual violence and human trafficking fa...,"[death, sexual, violence, human, trafficking, ..."
3,Congress Should Quickly Approve Trump’s Rescis...,President Donald Trump‘s rescission legislatio...,President Donald Trumps rescission legislation...,https://www.dailysignal.com/2025/06/10/congres...,usaid kenya,2025-06-10,congress should quickly approve trump’s rescis...,congress should quickly approve trump’s rescis...,"[congress, quickly, approve, trump, ’, s, resc..."
4,Food Safety Depends On Every Link In The Suppl...,Almost 1 in 10 people globally fall ill from c...,Colorful fish and vegetables can be purchased ...,https://www.forbes.com/sites/daniellenierenber...,usaid kenya,2025-06-06,food safety depends on every link in the suppl...,food safety depends on every link in the suppl...,"[food, safety, depends, every, link, supply, c..."


In [96]:
news_df.iloc[10]['cleaned_tokens']

['may',
 'update',
 'every',
 'month',
 'send',
 'email',
 'newsletter',
 'supporter',
 'sharing',
 'recent',
 'update',
 'work',
 'publish',
 'selected',
 'portion',
 'newsletter',
 'blog',
 'make',
 'news',
 'accessible',
 'people',
 'visit',
 'website',
 'key',
 'update',
 'latest',
 'insta',
 'every',
 'month',
 'send',
 'email',
 'newsletter',
 'supporter',
 'sharing',
 'recent',
 'update',
 'work',
 'publish',
 'selected',
 'portion',
 'newsletter',
 'blog',
 'make',
 'news',
 'accessible',
 'people']

In [97]:
news_df.shape

(2281, 9)

## Exploratory Data Analysis