# Reddit Datasets 

In [3]:
import glob
from pathlib import Path

import pandas as pd

# Get all Reddit-related CSV files.
# The order glob returns depends on the file system, so sort the paths to keep
# the inspection output below in the same order on every run.
reddit_files = sorted(glob.glob("../data/processed/*reddit*.csv"))

# Dictionary to hold datasets: file name without the ".csv" suffix -> DataFrame
reddit_dfs = {}

# Load each file into a DataFrame.
# Path(...).stem replaces splitting on "/" so the key is also correct on
# Windows, where glob yields backslash-separated paths, and only the final
# suffix is stripped rather than every ".csv" occurrence in the name.
# NOTE(review): some of these files expose an "Unnamed: 0" column, which looks
# like a saved index -- confirm before dropping it or reading with index_col=0.
for file in reddit_files:
    name = Path(file).stem
    reddit_dfs[name] = pd.read_csv(file)

# Inspect each Reddit dataset.
# Labels go through print() so their newlines render as line breaks; display()
# on a plain string shows its quoted repr (e.g. '\nSample rows:').
# pandas objects still go through display() to keep the rich rendering.
for name, df in reddit_dfs.items():
    print(f"\n====== {name} ======")
    print(f"Shape: {df.shape}")
    print("Columns:", df.columns.tolist())
    print("\nMissing values:")
    display(df.isnull().sum())
    print("\nUnique values per column:")
    display(df.nunique())
    print("\nSample rows:")
    display(df.head(2))




'Shape: (1289, 3)'

'Columns:'

['Unnamed: 0', 'Title', 'Selftext']

'\nMissing values:\n'

Unnamed: 0      0
Title           0
Selftext      388
dtype: int64

'\nUnique values per column:\n'

Unnamed: 0    1289
Title          828
Selftext       543
dtype: int64

'\nSample rows:'

Unnamed: 0.1,Unnamed: 0,Title,Selftext
0,0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...
1,1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...




'Shape: (1289, 15)'

'Columns:'

['title',
 'selftext',
 'subreddit',
 'author',
 'created_utc',
 'created_date',
 'score',
 'num_comments',
 'keyword',
 'search_term',
 'date_posted',
 'upvotes',
 'comments',
 'url',
 'permalink']

'\nMissing values:\n'

title              0
selftext         388
subreddit          0
author           823
created_utc      276
created_date    1139
score            276
num_comments     456
keyword          547
search_term     1139
date_posted     1013
upvotes         1013
comments        1013
url                0
permalink        863
dtype: int64

'\nUnique values per column:\n'

title           828
selftext        543
subreddit        11
author          243
created_utc     729
created_date    101
score           235
num_comments    112
keyword          27
search_term      35
date_posted     106
upvotes         143
comments         76
url             839
permalink       316
dtype: int64

'\nSample rows:'

Unnamed: 0,title,selftext,subreddit,author,created_utc,created_date,score,num_comments,keyword,search_term,date_posted,upvotes,comments,url,permalink
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,Kenya,muerki,2025-04-15 13:16:53,,3.0,5.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jzrn2...,
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,Kenya,Morio_anzenza,2025-04-07 04:21:12,,169.0,95.0,usaid kenya,,,,,https://www.reddit.com/r/Kenya/comments/1jtcvb...,




'Shape: (1289, 5)'

'Columns:'

['post_title', 'text', 'keyword', 'published_date', 'url']

'\nMissing values:\n'

post_title          0
text              388
keyword           547
published_date    347
url                 0
dtype: int64

'\nUnique values per column:\n'

post_title        828
text              543
keyword            27
published_date    568
url               839
dtype: int64

'\nSample rows:'

Unnamed: 0,post_title,text,keyword,published_date,url
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15 13:16:53,https://www.reddit.com/r/Kenya/comments/1jzrn2...
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07 04:21:12,https://www.reddit.com/r/Kenya/comments/1jtcvb...




'Shape: (1289, 6)'

'Columns:'

['post_title', 'text', 'keyword', 'published_date', 'url', 'source_file']

'\nMissing values:\n'

post_title          0
text              388
keyword           547
published_date    180
url                 0
source_file         0
dtype: int64

'\nUnique values per column:\n'

post_title        828
text              543
keyword            27
published_date    142
url               839
source_file         8
dtype: int64

'\nSample rows:'

Unnamed: 0,post_title,text,keyword,published_date,url,source_file
0,"USAID left a month ago, do we have ARVs in Kenya?",Someone on a different group (different websit...,usaid kenya,2025-04-15,https://www.reddit.com/r/Kenya/comments/1jzrn2...,Agatha_reddit.csv
1,Classism in r/Kenya and r/nairobi,The classism I'm seeing in both subs is a good...,usaid kenya,2025-04-07,https://www.reddit.com/r/Kenya/comments/1jtcvb...,Agatha_reddit.csv




'Shape: (542, 6)'

'Columns:'

['subreddit', 'title', 'text', 'url', 'created_date', 'keyword']

'\nMissing values:\n'

subreddit         0
title             0
text              0
url               0
created_date    348
keyword           0
dtype: int64

'\nUnique values per column:\n'

subreddit         9
title           537
text            542
url             542
created_date    173
keyword          40
dtype: int64

'\nSample rows:'

Unnamed: 0,subreddit,title,text,url,created_date,keyword
0,Kenya,Saw almost an entire department in a hospital ...,Dump's orders taking effect. I am mad at our g...,https://www.reddit.com/r/Kenya/comments/1ia6x0...,2025-01-26 05:32:16,Unknown
1,Kenya,USAID HIV FUND CUTS,"Guys with the recent halting of funds for HIV,...",https://www.reddit.com/r/Kenya/comments/1j0am0...,2025-02-28 15:43:13,Unknown


# News Datasets

In [4]:

# Get all News-related CSV files.
# The order glob returns depends on the file system, so sort the paths to keep
# the inspection output below in the same order on every run.
news_files = sorted(glob.glob("../data/processed/*news*.csv"))

# Dictionary to hold datasets: file name without the ".csv" suffix -> DataFrame
news_dfs = {}

# Load each file into a DataFrame.
# Path(...).stem replaces splitting on "/" so the key is also correct on
# Windows, where glob yields backslash-separated paths, and only the final
# suffix is stripped rather than every ".csv" occurrence in the name.
# NOTE(review): some of these files expose an "Unnamed: 0" column, which looks
# like a saved index -- confirm before dropping it or reading with index_col=0.
for file in news_files:
    name = Path(file).stem
    news_dfs[name] = pd.read_csv(file)

# Inspect each News dataset.
# Labels go through print() so their newlines render as line breaks; display()
# on a plain string shows its quoted repr (e.g. '\nSample rows:').
# pandas objects still go through display() to keep the rich rendering.
for name, df in news_dfs.items():
    print(f"\n====== {name} ======")
    print(f"Shape: {df.shape}")
    print("Columns:", df.columns.tolist())
    print("\nMissing values:")
    display(df.isnull().sum())
    print("\nUnique values per column:")
    display(df.nunique())
    print("\nSample rows:")
    display(df.head(2))




'Shape: (2549, 7)'

'Columns:'

['title',
 'description',
 'text',
 'url',
 'keyword',
 'published_date',
 'source_file']

'\nMissing values:\n'

title               0
description        16
text               25
url                 2
keyword           170
published_date     99
source_file         0
dtype: int64

'\nUnique values per column:\n'

title             1410
description       1411
text              1401
url               1437
keyword             30
published_date      42
source_file          6
dtype: int64

'\nSample rows:'

Unnamed: 0,title,description,text,url,keyword,published_date,source_file
0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06,Agatha_news.csv
1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,usaid kenya,2025-05-26,Agatha_news.csv




'Shape: (471, 7)'

'Columns:'

['source', 'title', 'description', 'text', 'url', 'keyword', 'published_date']

'\nMissing values:\n'

source              0
title               0
description       385
text                0
url                 0
keyword             0
published_date    153
dtype: int64

'\nUnique values per column:\n'

source            177
title             446
description        85
text              454
url               471
keyword            11
published_date    225
dtype: int64

'\nSample rows:'

Unnamed: 0,source,title,description,text,url,keyword,published_date
0,Al Jazeera English,Has DOGE really saved the US government $180bn?,,Elon Musk first claimed the Department of Gove...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06 00:00:00
1,Daily Signal,Congress Should Quickly Approve Trump’s Rescis...,,President Donald Trump‘s rescission legislatio...,https://www.dailysignal.com/2025/06/10/congres...,usaid kenya,2025-06-10 00:00:00




'Shape: (2549, 6)'

'Columns:'

['title', 'description', 'text', 'url', 'keyword', 'published_date']

'\nMissing values:\n'

title               0
description        16
text               25
url                 2
keyword           170
published_date     99
dtype: int64

'\nUnique values per column:\n'

title             1410
description       1411
text              1401
url               1437
keyword             30
published_date    1295
dtype: int64

'\nSample rows:'

Unnamed: 0,title,description,text,url,keyword,published_date
0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,https://www.aljazeera.com/news/2025/6/6/has-do...,usaid kenya,2025-06-06 11:21:51+00:00
1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,https://cleantechnica.com/2025/05/26/the-life-...,usaid kenya,2025-05-26 17:13:41+00:00




'Shape: (2549, 4)'

'Columns:'

['Unnamed: 0', 'title', 'description', 'content']

'\nMissing values:\n'

Unnamed: 0      0
title           0
description    16
content        25
dtype: int64

'\nUnique values per column:\n'

Unnamed: 0     2549
title          1410
description    1406
content        1401
dtype: int64

'\nSample rows:'

Unnamed: 0.1,Unnamed: 0,title,description,content
0,0,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...
1,1,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...




'Shape: (2549, 11)'

'Columns:'

['keyword',
 'source',
 'author',
 'title',
 'description',
 'content',
 'summary',
 'full_text',
 'publishedAt',
 'url',
 'language']

'\nMissing values:\n'

keyword         170
source            0
author          245
title             0
description      16
content          25
summary        2525
full_text      2439
publishedAt       2
url               2
language       2549
dtype: int64

'\nUnique values per column:\n'

keyword          30
source          290
author          923
title          1410
description    1411
content        1401
summary          23
full_text       102
publishedAt    1391
url            1437
language          0
dtype: int64

'\nSample rows:'

Unnamed: 0,keyword,source,author,title,description,content,summary,full_text,publishedAt,url,language
0,usaid kenya,Al Jazeera English,Al Jazeera,Has DOGE really saved the US government $180bn?,Elon Musk first claimed the department would m...,President Donald Trump and adviser Elon Musk c...,,,2025-06-06T11:21:51Z,https://www.aljazeera.com/news/2025/6/6/has-do...,
1,usaid kenya,CleanTechnica,Guest Contributor,The Life Story of Ecomobilus Technologies Limi...,By Prof Geoffrey Gitau Here is a story showcas...,By Prof Geoffrey Gitau\r\nHere is a story show...,,,2025-05-26T17:13:41Z,https://cleantechnica.com/2025/05/26/the-life-...,
