# USAID-Kenya-Sentiment-Analysis

#    Merging Reddit CSV Files

### Import neccessary libraries

In [1]:
import pandas as pd
import glob
import os


### Folder path

In [2]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data'


### CSV file paths

In [3]:
csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
csv_files

['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Agatha_reddit.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.redditsubs.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\cecilia.reddit_nbo_ke_africa.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\leo_reddit_posts.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\Mbego_reddit_usaid_kenya2.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_kenya.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/reddit_data\\reddit_usaid_sentiment.csv',
 'C:/Users/hp/Deskto

### Reading the files and combining them into one dataframe

In [4]:

preferred_columns = ['post_title', 'text', 'keyword', 'published_date', 'url']
all_dfs = []

for file in csv_files:
    df = pd.read_csv(file)
    
    # Rename common column variants
    if 'title' in df.columns:
        df.rename(columns={'title': 'post_title'}, inplace=True)
    if 'selftext' in df.columns and 'text' not in df.columns:
        df.rename(columns={'selftext': 'text'}, inplace=True)
    if 'date_posted' in df.columns:
        df.rename(columns={'date_posted': 'published_date'}, inplace=True)
    elif 'created_utc' in df.columns:
        df.rename(columns={'created_utc': 'published_date'}, inplace=True)
    
    # Add missing columns
    for col in preferred_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')

    # Track file source
    df['source_file'] = os.path.basename(file)
    
    # Keep only standardized columns
    df = df[preferred_columns + ['source_file']]

    all_dfs.append(df)

# Combine all
combined_df = pd.concat(all_dfs, ignore_index=True)
combined_df.head()


Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05 19:09:10,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25 08:18:04,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08 08:08:58,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [5]:
combined_df.shape #check the shape of the dataset

(1306, 6)

In [6]:
combined_df.isna().sum() #check for missing values

post_title          0
text              398
keyword           564
published_date    197
url                 0
source_file         0
dtype: int64

In [7]:
combined_df.columns

Index(['post_title', 'text', 'keyword', 'published_date', 'url',
       'source_file'],
      dtype='object')

In [8]:
# Extract date and time
combined_df['time'] = combined_df['published_date'].dt.time
combined_df['published_date'] = combined_df['published_date'].dt.date

#Drop the time column
combined_df.drop(columns='time', inplace=True)
combined_df.head()

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,usaid kenya,2025-04-05,https://www.reddit.com/r/Kenya/comments/1jsb14...,Agatha_reddit.csv
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",usaid kenya,2025-03-25,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,usaid kenya,2025-03-08,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Agatha_reddit.csv


In [9]:
combined_df.sample(10) #random sample of 10 rows

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
652,Federal judge rules Trump administration has t...,,"foreign aid, foreign aid",2025-02-25,https://abcnews.go.com/amp/Politics/federal-ju...,cecilia.redditsubs.csv
707,What is the point of America rebuilding Gaza i...,I am against all foreign aid unless there is s...,"foreign aid, foreign aid",2025-02-06,https://www.reddit.com/r/AskConservatives/comm...,cecilia.redditsubs.csv
684,Elon Musk's DOGE Is Still Blocking HIV/AIDS Re...,,"foreign aid, foreign aid",2025-02-04,https://www.wired.com/story/usaid-researchers-...,cecilia.redditsubs.csv
60,Africa doesn’t lack potential—it lacks a minds...,"""Kenya’s real problem is not a lack of money o...",kenya foreign aid,2025-04-25,https://www.reddit.com/r/Kenya/comments/1k7je9...,Agatha_reddit.csv
1071,China investigates Canadian couple suspected o...,,,NaT,http://www.cnn.com/2014/08/05/world/asia/china...,Mbego_reddit_usaid_kenya2.csv
1126,Me and ChatGPT,**Me:** **We seek the same type of love we wer...,,2025-06-17,https://www.reddit.com/r/Kenya/comments/1lduqq...,ruth_reddit.csv
520,So sad😂😬,It just hit me...our brave men and women in un...,"foreign aid, foreign aid",2025-01-24,https://www.reddit.com/r/nairobi/comments/1i8z...,cecilia.redditsubs.csv
264,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",development aid kenya,2025-03-25,https://www.reddit.com/r/Kenya/comments/1jjehw...,Agatha_reddit.csv
662,U.S. farmers face uncertainty from USAID forei...,,"USAID, foreign aid, foreign aid",2025-02-14,https://www.marketplace.org/2025/02/12/usaid-f...,cecilia.redditsubs.csv
804,Tech bros & sis! What’s the best path now? Mor...,Hey fam! So I’ve been diving into tech lately ...,,1970-01-01,https://www.reddit.com/r/Kenya/comments/1jrx4b...,leo_reddit_posts.csv


### Save the final merged dataframe to csv

In [10]:
# Save to CSV 
if not combined_df.empty:
    output_filename = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv"
    combined_df.to_csv(output_filename, index=False, encoding='utf-8')
    print(f" Results successfully saved to:\n{output_filename}")
else:
    print(" No data to save. The DataFrame is empty.")


 Results successfully saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_reddit_dataset.csv


# Merging News CSV Files

### Import necessary libraries

In [11]:
import pandas as pd
import glob
import os


### Folder path

In [12]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'


### CSV file paths

In [13]:
folder_path = 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data'
omit_file = os.path.join(folder_path, 'Agatha_news_fulltext.csv')

news_csv_files = glob.glob(os.path.join(folder_path, '*.csv'))
news_csv_files = [f for f in news_csv_files if os.path.normpath(f) != os.path.normpath(omit_file)]

news_csv_files


['C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Agatha_news.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\cecilia.newsapi.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\leo_newsapi_articles_enriched.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_fulltext.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\Mbego_news_usaid_kenya_recent.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\newsapi_usaid_articles.csv',
 'C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/raw/news_data\\ruth_news.csv']

### Reading the files and combining them into one dataframe

In [14]:
import pandas as pd
import os

news_columns = ['title', 'description', 'text', 'url', 'keyword', 'published_date', 'source_file']
all_news_dfs = []

for file in news_csv_files:  
    df = pd.read_csv(file)

    # Rename content to text 
    if 'content' in df.columns and 'text' not in df.columns:
        df.rename(columns={'content': 'text'}, inplace=True)

    # Rename publishedAt to published_date
    if 'publishedAt' in df.columns:
        df.rename(columns={'publishedAt': 'published_date'}, inplace=True)

    # Add missing columns
    for col in news_columns:
        if col not in df.columns:
            df[col] = None

    # Convert published_date to datetime
    df['published_date'] = pd.to_datetime(df['published_date'], errors='coerce')



    # source file info
    df['source_file'] = os.path.basename(file)

    # standardized columns
    df = df[news_columns]
    all_news_dfs.append(df)

# Combine all
news_combined_df = pd.concat(all_news_dfs, ignore_index=True)

# Save to CSV
output_path = "C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv"
if not news_combined_df.empty:
    news_combined_df.to_csv(output_path, index=False, encoding='utf-8')
    print(f"NewsAPI data saved to:\n{output_path}")
else:
    print("No NewsAPI data to save. DataFrame is empty.")


NewsAPI data saved to:
C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/Cecilia_merged_news_dataset.csv


In [15]:
news_combined_df.shape

(2638, 7)

In [16]:
news_combined_df.sample(10)

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
838,Massive Tech Layoffs May Be the Fault of a 201...,A new theory of the tech industry's economic w...,The good times in Silicon Valley are overat le...,https://gizmodo.com/massive-tech-layoffs-may-b...,funding cuts,2025-06-09 20:50:46+00:00,cecilia.newsapi.csv
1522,Trump’s Liquidation of U.S Global Leadership,Some geopolitical aspects of how Trump's mazim...,"Yves here, It is striking to see the degree to...",https://www.nakedcapitalism.com/2025/05/trumps...,USAID funding,2025-05-27 07:00:05+00:00,cecilia.newsapi.csv
65,MEPs urge Brussels to cut all funding for Buda...,Members of the European Parliament have urged ...,Over 20 members of the European Parliament hav...,https://www.rt.com/news/618035-hungary-eu-fund...,usaid funding,2025-05-23 09:30:13+00:00,Agatha_news.csv
1358,LGBT+ in Hungary: ‘I never went to the Pride M...,The attack on democracy in Hungary continues. ...,"For many years now, the government of Viktor O...",https://voxeurop.eu/en/lgbt-hungary-pride/,NGO funding,2025-05-30 08:07:08+00:00,cecilia.newsapi.csv
2154,6/10/25 ☀️ AM: ... But the reconciliation figh...,Punchbowl News:\n6/10/25 ☀️ AM: … But the reco...,memeorandum is an auto-generated summary of th...,https://www.memeorandum.com/250610/p9,end of USAID,2025-06-10 10:10:02+00:00,cecilia.newsapi.csv
694,Israeli forces seize Gaza-bound boat and detai...,Israeli forces said they took command of the v...,"AMMAN, Jordan Israeli forces seized a ship car...",https://www.npr.org/2025/06/09/nx-s1-5427962/i...,foreign aid,2025-06-09 08:44:38+00:00,cecilia.newsapi.csv
2630,Kenya plans to relocate health data from the U...,Kenya is preparing to move critical health dat...,Kenya is preparing to move critical health dat...,https://techcabal.com/2025/06/02/kenya-health-...,,2025-06-02 10:53:32+00:00,ruth_news.csv
1350,Turning used cooking oil into soap in a countr...,A group of young environmentalists in Honduras...,"Fritz PinnowReporter, Comayagua, Honduras\r\nH...",https://www.bbc.com/news/articles/c9djx7llj44o,NGO funding,2025-06-01 00:27:31+00:00,cecilia.newsapi.csv
799,Fentanyl deaths among the young are dropping. ...,Fentanyl and other street drugs killed more th...,"Justin Carlyle, age 23, photographed on the st...",https://www.npr.org/2025/06/10/1253920671/fent...,funding cuts,2025-06-10 22:41:10+00:00,cecilia.newsapi.csv
1571,What Trump’s Harvard Visa Restriction Means fo...,The U.S. President has launched his latest att...,President Donald Trump has escalated his stand...,https://time.com/7291333/harvard-university-tr...,foreign aid,2025-06-05 06:30:00+00:00,cecilia.newsapi.csv


In [17]:
news_combined_df.isna().sum()

title               0
description        16
text               25
url                 2
keyword           259
published_date     99
source_file         0
dtype: int64

### FINAL REDDIT DATA TO BE USED ~ mbego_all_reddit_merged

In [18]:
import pandas as pd
import seaborn as sns

In [19]:
reddit_df = pd.read_csv('C:/Users/hp/Desktop/DATA NEXUS PROJECTS/USAID-Kenya-Sentiment-Analysis/data/processed/individual datasets/mbego_all_reddit_merged.csv')
reddit_df.head()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,Kenya,muerki,4/15/2025 13:16,,3.0,5.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jzrn2...,
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Kenya,Morio_anzenza,4/7/2025 4:21,,169.0,95.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jtcvb...,
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,Kenya,vindtar,4/5/2025 19:09,,2.0,2.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jsb14...,
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",Kenya,Gold_Smart,3/25/2025 8:18,,13.0,20.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jjehw...,
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,Kenya,westmaxia,3/8/2025 8:08,,1.0,6.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1j6cjz...,


In [20]:
reddit_df.tail()

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
1284,Weekly Sub-Saharan Africa Security Situation a...,#Somalia 🇸🇴\r\n#Sudan 🇸🇩\r\nDemocratic Republi...,Africa,,4/18/2025 14:09,,3.0,2.0,,,,,,https://open.substack.com/pub/hasretkargin/p/w...,
1285,No evidence that Burkina Faso paid off all its...,,Africa,,4/18/2025 8:23,,52.0,25.0,,,,,,https://www.reuters.com/fact-check/burkina-fas...,
1286,Ghana orders foreigners to exit gold market by...,Ghana has ordered foreigners to exit its gold ...,Africa,,4/17/2025 17:59,,101.0,12.0,,,,,,https://eastleighvoice.co.ke/west%20african/13...,
1287,Unending Frustration Regarding Sudan War.,https://www.reuters.com/world/britain-boosts-a...,Africa,,4/16/2025 19:33,,11.0,8.0,,,,,,https://www.reddit.com/gallery/1k0t8ed,
1288,Tanzania's Authoritarian Government Has Just B...,Tanzania's main opposition party has been barr...,Africa,,4/14/2025 11:31,,52.0,14.0,,,,,,https://www.reddit.com/r/Africa/comments/1jywl...,


In [21]:
reddit_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         1289 non-null   object 
 1   selftext      901 non-null    object 
 2   subreddit     1289 non-null   object 
 3   author        466 non-null    object 
 4   created_utc   1289 non-null   object 
 5   created_date  150 non-null    object 
 6   score         1013 non-null   float64
 7   num_comments  833 non-null    float64
 8   keyword       742 non-null    object 
 9   search_term   150 non-null    object 
 10  date_posted   0 non-null      float64
 11  upvotes       276 non-null    float64
 12  comments      276 non-null    float64
 13  url           1289 non-null   object 
 14  permalink     426 non-null    object 
dtypes: float64(5), object(10)
memory usage: 151.2+ KB


In [22]:
reddit_df.shape

(1289, 15)

In [23]:
reddit_df.columns

Index(['title', 'selftext', 'subreddit', 'author', 'created_utc',
       'created_date', 'score', 'num_comments', 'keyword', 'search_term',
       'date_posted', 'upvotes', 'comments', 'url', 'permalink'],
      dtype='object')

In [24]:
reddit_df.dtypes

title            object
selftext         object
subreddit        object
author           object
created_utc      object
created_date     object
score           float64
num_comments    float64
keyword          object
search_term      object
date_posted     float64
upvotes         float64
comments        float64
url              object
permalink        object
dtype: object

In [25]:
reddit_df.isna().sum()

title              0
selftext         388
subreddit          0
author           823
created_utc        0
created_date    1139
score            276
num_comments     456
keyword          547
search_term     1139
date_posted     1289
upvotes         1013
comments        1013
url                0
permalink        863
dtype: int64

In [26]:
reddit_df.describe()

Unnamed: 0,score,num_comments,date_posted,upvotes,comments
count,1013.0,833.0,0.0,276.0,276.0
mean,344.626851,31.34934,,337.518116,35.905797
std,3106.309319,68.060435,,1623.914761,73.531775
min,0.0,0.0,,0.0,0.0
25%,3.0,4.0,,6.75,5.0
50%,10.0,9.0,,43.5,13.0
75%,54.0,31.0,,111.25,31.25
max,79088.0,706.0,,22208.0,558.0


## Data Cleaning

### Dropping Unneccessary Columns

In [27]:
reddit_df.drop(columns=['subreddit', 'author','created_date','score','num_comments','search_term','date_posted','upvotes','comments','permalink'], inplace=True)
reddit_df.head()

Unnamed: 0,title,selftext,created_utc,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [28]:
reddit_df.columns

Index(['title', 'selftext', 'created_utc', 'keyword', 'url'], dtype='object')

### Checking for missing values

In [29]:
reddit_df.isna().sum()

title            0
selftext       388
created_utc      0
keyword        547
url              0
dtype: int64

In [30]:
reddit_df[['selftext', 'title','url']].sample(10)

Unnamed: 0,selftext,title,url
484,I have just watched the MP of Homa Bay defendi...,Ignorance or malice?,https://www.reddit.com/r/Kenya/comments/1inj4k...
418,,USAID Ukraine office ordered to suspend fundin...,https://unn.ua/en/news/usaid-ukraine-office-or...
568,Has anyone here gotten any criteria for the fo...,Review statuses?,https://www.reddit.com/r/InternationalDev/comm...
294,I work for an international company that focus...,Epc projects,https://www.reddit.com/r/Kenya/comments/1ikmwy...
963,,America's European allies are trying to pry th...,https://apnews.com/article/usaid-trump-foreign...
443,,Canadian universities report jump in US applic...,https://www.reuters.com/world/americas/canadia...
468,,Bill Gates 'horrified' by Trump cuts to US aid...,https://www.semafor.com/article/05/08/2025/bil...
936,"At first glance, BRICS, the rise of the AfD, T...",The Israel-Gaza War: A Symptom of a Looming Gl...,https://www.reddit.com/r/Kenya/comments/1iygom...
570,Finally.,"Public Citizen sues the Trump Administration, ...",https://www.citizen.org/news/nonprofits-sue-tr...
557,Georgetown SFS GHD v. American NRSD - Internat...,Georgetown SFS GHD v. American University NRSD...,https://www.reddit.com/r/InternationalDev/comm...


In [31]:
reddit_df.rename(columns={'title':'post_title','selftext':'text','created_utc':'published_date'},inplace=True)
reddit_df.head()

Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,4/15/2025 13:16,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,4/7/2025 4:21,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,4/5/2025 19:09,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",3/25/2025 8:18,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,3/8/2025 8:08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


**parse** - analyze and convert a piece of data (like a string) into a different, more useful format.

The below code cleans the published_date column in the data. Some dates are written as normal dates (eg "2025-04-15") and others are written as numbers (UNIX timestamps). The code checks each value and tries to convert it into a proper date. If it can’t, it leaves it empty. After that, it removes the time part and keeps just the date. This helps make sure all the dates are in the same clean format.

In [32]:
import pandas as pd

# Function to handle both datetime strings and UNIX timestamps
def parse_mixed_dates(val):
    try:
        # Parse as a regular datetime string
        return pd.to_datetime(val)
    except:
        try:
            # Parse as a UNIX timestamp
            return pd.to_datetime(float(val), unit='s')
        except:
            return pd.NaT  # Return NaT if all parsing fails

# Apply the parsing function
reddit_df['published_date'] = reddit_df['published_date'].apply(parse_mixed_dates)

# Check how many dates failed to parse
print("Unparsed dates:", reddit_df['published_date'].isna().sum())

# Extract date only (remove time)
reddit_df['published_date'] = reddit_df['published_date'].dt.date
reddit_df.head()

Unparsed dates: 0


Unnamed: 0,post_title,text,published_date,keyword,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...


In [33]:
reddit_df['published_date'].isna().sum() #check the missing values in date

0

In [34]:
reddit_df.dropna(subset=['published_date'], inplace=True) #drop missing values in date

In [35]:
reddit_df.shape #check the shape of the dataset

(1289, 5)

In [36]:
reddit_df.isna().sum() #check for missing values

post_title          0
text              388
published_date      0
keyword           547
url                 0
dtype: int64

### Fill missing values in text with an empty string

I filled missing values in text with an empty string because these are likely link posts where the author did not add body text.

In [37]:
reddit_df['text'] = reddit_df['text'].fillna('')
reddit_df.isna().sum()

post_title          0
text                0
published_date      0
keyword           547
url                 0
dtype: int64

### Fill missing values in keyword with a placeholder 'Unknown'

In [38]:
reddit_df['keyword'] = reddit_df['keyword'].fillna('unknown')
reddit_df['keyword'].sample(10)

902                      unknown
1144                     unknown
627     foreign aid, foreign aid
896                      unknown
96             kenya foreign aid
235        development aid kenya
1037                     unknown
459      usaid kenya funding cut
1244                     unknown
932                      unknown
Name: keyword, dtype: object

In [39]:
reddit_df.isna().sum()

post_title        0
text              0
published_date    0
keyword           0
url               0
dtype: int64

Missing values filled with an empty string and placeholder to maintain the integrity of the data and maintain a good amount of rows

### Check for duplicates 

In [40]:
reddit_df.duplicated().sum()

145

In [41]:
reddit_df.drop_duplicates(inplace=True) #drop all duplicates

In [42]:
reddit_df.duplicated().sum() #confirm that all duplicates have been dropped

0

In [43]:
reddit_df.shape #shape of the new data after cleaning

(1144, 5)

In [44]:
reddit_df['post_title'].sample(10)

874                            Pitching for a writing gig
316     Alleged Abduction of Kenyan MP George Koimburi...
870     Looking for Entry-Level Opportunities in Procu...
1144                               'Misuse' of Billboards
903     US strike kills 16 Afghan policemen in Helmand...
1212                I don't know how to title this but...
1183    Called my ISP and spoke fluent Karen. Now my r...
90                       Increase Kenya military spending
147                                      Thieves. Rotten.
354     Kenya’s Economic Reality: Why Fighting Corrupt...
Name: post_title, dtype: object

## Sentiment Analysis Data Cleaning

Creating a new column 'full_text' that combines 'post_title','text' and the empty string '', to help in sentiment analysis

In [45]:
reddit_df['full_text'] = reddit_df['post_title'] + ' ' + reddit_df['text']
reddit_df.head() #sample top 5 rows after combining the columns

Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,"USAID left a month ago, do we have ARVs in Ken..."
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Classism in r/Kenya and r/nairobi The classism...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,EX-USAID people!! Let's talk Are you still in ...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,Why western powers back Israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,Is kenya capable of funding its needs now that...


In [46]:
reddit_df['full_text'].isna().sum() #confirm that there are no missing values in the dataset

0

### Lowercasing 

Converting all text to lowercase

In [47]:
reddit_df['full_text'] = reddit_df['full_text'].str.lower()
reddit_df['full_text'].sample(10)

4       is kenya capable of funding its needs now that...
837     alleged abduction of kenyan mp george koimburi...
892     does usaid's exit mean it's a free reign for t...
398     us congressman scott perry accuses usaid of fu...
839     affordable housing gets ksh 95b in development...
1242    not just for sierra leone, for all of africa i...
813     anyone here with experience in contract farmin...
1182    nairobi jobs i just came across this while scr...
396     interest post from x what's your take on the n...
93      raila's loss and kenya foreign policy in light...
Name: full_text, dtype: object

### Translate all text to English

In [48]:
!pip install langdetect #detect which language the text are in


[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#detect': Expected package name at the start of dependency specifier
    #detect
    ^


In [49]:
from langdetect import detect

# Language detection
reddit_df['language'] = reddit_df['full_text'].apply(lambda x: detect(x) if pd.notnull(x) else 'unknown')
reddit_df['language'].value_counts()

en    1129
tl       3
sw       3
et       2
fr       2
nl       2
sl       1
da       1
id       1
Name: language, dtype: int64

In [50]:
# View full text rows where the language is not English
non_english_df = reddit_df[reddit_df['language'] != 'en']

# Display the full_text column of non-English rows
print(non_english_df[['language', 'full_text']])


     language                                          full_text
28         tl  ...na bado mnasema ruto must go bila tangible ...
175        tl  ...na bado mnasema ruto must go bila tangible ...
236        sl                                design job nairobi 
273        tl  ...na bado mnasema ruto must go bila tangible ...
301        et  just seeing ki, sa, che, ko, la, si...pure tri...
634        fr  supreme court rules trump administration must ...
638        fr  supreme court denies trump administration requ...
661        nl  “independent media” disintegrate as foreign ai...
664        da  judge orders us to restore funds for foreign a...
852        sw  "wapi mtoto?" it was always an escape for me d...
853        sw  drunk and orderly it so happened that one time...
903        nl  us strike kills 16 afghan policemen in helmand...
943        et                            joblessness is looming 
1100       id  filling kra returns at ksh 35 usitikie kupata ...
1143       sw  about my o

In [51]:
! pip install deep-translator #Translates text from one language to another



[notice] A new release of pip is available: 24.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: '#Translates': Expected package name at the start of dependency specifier
    #Translates
    ^


In [52]:
from deep_translator import GoogleTranslator

# Make a copy of non-English rows
non_english_df = reddit_df[reddit_df['language'] != 'en'].copy()

# Translate each full_text
non_english_df['translated_text'] = non_english_df['full_text'].apply(
    lambda text: GoogleTranslator(source='auto', target='en').translate(text)
)

# Preview translations
print(non_english_df[['language', 'full_text', 'translated_text']].head())


    language                                          full_text  \
28        tl  ...na bado mnasema ruto must go bila tangible ...   
175       tl  ...na bado mnasema ruto must go bila tangible ...   
236       sl                                design job nairobi    
273       tl  ...na bado mnasema ruto must go bila tangible ...   
301       et  just seeing ki, sa, che, ko, la, si...pure tri...   

                                       translated_text  
28   ... and you still say Ruto must go without tan...  
175  ... and you still say Ruto must go without tan...  
236                                 design job nairobi  
273  ... and you still say Ruto must go without tan...  
301  just seeing ki, sa, che, ko, la, si ... pure t...  


From the above preview, shows that the text with both english and swahili was able to be translated as shown in index 28

In [53]:
reddit_df['language'].value_counts() #check which other languages have been detected

en    1129
tl       3
sw       3
et       2
fr       2
nl       2
sl       1
da       1
id       1
Name: language, dtype: int64

In [54]:
# Drop all non-English rows
reddit_df = reddit_df[reddit_df['language'] == 'en'].copy()

# Reset the index
reddit_df.reset_index(drop=True, inplace=True)

# Preview the cleaned data
print(reddit_df[['language', 'full_text']].head())


  language                                          full_text
0       en  usaid left a month ago, do we have arvs in ken...
1       en  classism in r/kenya and r/nairobi the classism...
2       en  ex-usaid people!! let's talk are you still in ...
3       en  why western powers back israel no matter what ...
4       en  is kenya capable of funding its needs now that...


In [55]:
reddit_df['language'].value_counts() #confirm that all rows are in English only

en    1129
Name: language, dtype: int64

In [56]:
reddit_df.sample(10)

Unnamed: 0,post_title,text,published_date,keyword,url,full_text,language
50,Racism - how often do you as a Kenyan feel rac...,I have lived half of my life in Kenya. I've ju...,2025-05-19,kenya foreign aid,https://www.reddit.com/r/Kenya/comments/1kq6n2...,racism - how often do you as a kenyan feel rac...,en
998,Mental Health Emergency Contacts and Support,Hello [r/Kenya](https://www.reddit.com/r/Kenya...,2025-06-12,unknown,https://www.reddit.com/r/Kenya/comments/1l9xsr...,mental health emergency contacts and support h...,en
381,"Serbia protests Kenya’s recognition of Kosovo,...",,2025-03-28,usaid kenya funding cut,https://www.theeastafrican.co.ke/tea/news/east...,"serbia protests kenya’s recognition of kosovo,...",en
512,Order Without America: How the International S...,"\[SS from essay by Ngaire Woods, Professor of ...",2025-04-24,"foreign aid, foreign aid",https://www.foreignaffairs.com/united-states/d...,order without america: how the international s...,en
227,Kenyan Universities Need to Start Teaching Res...,I’ve reviewed a lot of resumes from software d...,2025-05-31,development aid kenya,https://www.reddit.com/r/Kenya/comments/1kzlc7...,kenyan universities need to start teaching res...,en
1120,Why Are So Many Africans Always Fighting Villa...,"Any small thing that happens, village people. ...",2025-04-21,unknown,https://www.reddit.com/r/Africa/comments/1k46p...,why are so many africans always fighting villa...,en
1124,Weekly Sub-Saharan Africa Security Situation a...,#Somalia 🇸🇴\r\n#Sudan 🇸🇩\r\nDemocratic Republi...,2025-04-18,unknown,https://open.substack.com/pub/hasretkargin/p/w...,weekly sub-saharan africa security situation a...,en
1089,US Cuts $50M in Medical Aid to Zambia Over 'Sy...,,2025-05-10,unknown,https://www.verity.news/story/2025/us-slashes-...,us cuts $50m in medical aid to zambia over 'sy...,en
971,I'm I the only one whose dreams and ambitions ...,"Honestly, I am tired of myself. Last weekend I...",2025-06-16,unknown,https://www.reddit.com/r/Kenya/comments/1lcjtw...,i'm i the only one whose dreams and ambitions ...,en
1025,Selective outrage,at least 20 documented cases of deaths in poli...,2025-06-10,unknown,https://www.reddit.com/r/Kenya/comments/1l8a9e...,selective outrage at least 20 documented cases...,en


In [58]:
reddit_df.drop('language', axis=1, inplace=True) #drop the language column as it is unnecessary now
reddit_df.head()

Unnamed: 0,post_title,text,published_date,keyword,url,full_text
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,2025-04-15,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jzrn2...,"usaid left a month ago, do we have arvs in ken..."
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,2025-04-07,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jtcvb...,classism in r/kenya and r/nairobi the classism...
2,EX-USAID people!! Let's talk,Are you still in contact with the organisation...,2025-04-05,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jsb14...,ex-usaid people!! let's talk are you still in ...
3,Why western powers back Israel no matter what ...,"I don't care what good book you read, but it's...",2025-03-25,usaid kenya,https://www.reddit.com/r/Kenya/comments/1jjehw...,why western powers back israel no matter what ...
4,Is kenya capable of funding its needs now that...,How is kenya prepared to fill the vacuum of US...,2025-03-08,usaid kenya,https://www.reddit.com/r/Kenya/comments/1j6cjz...,is kenya capable of funding its needs now that...


### Removing Punctuation Marks