1. Pre-processing: Data cleaning and preparing for modelling

- We will be working on Topic Modelling with BERTopic using the Kaggle news dataset, with news aricles from different mediums in the period 01.10.2023 - 29.11.2023

In [2]:
# libraries
import pandas as pd
import numpy as np
from langdetect import detect

In [41]:
# read csv file
all_news_df = pd.read_csv('/Users/gresasmolica/Downloads/data-5.csv', encoding='utf-8')

In [42]:
# explore data
all_news_df.head()
all_news_df.info()
all_news_df.describe()
all_news_df.columns

# check latest date
all_news_df['published_at'].max()
all_news_df['published_at'].min()

# check unique source_name
all_news_df['source_name'].nunique()

# drop unnecessary columns
all_news_df.drop(['author', 'source_id', 'source_name', 'description', 'full_content', 'url', 'url_to_image'], axis=1, inplace=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105375 entries, 0 to 105374
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   article_id    105375 non-null  int64 
 1   source_id     24495 non-null   object
 2   source_name   105375 non-null  object
 3   author        97156 non-null   object
 4   title         105335 non-null  object
 5   description   104992 non-null  object
 6   url           105375 non-null  object
 7   url_to_image  99751 non-null   object
 8   published_at  105375 non-null  object
 9   content       105375 non-null  object
 10  category      105333 non-null  object
 11  full_content  58432 non-null   object
dtypes: int64(1), object(11)
memory usage: 9.6+ MB


In [43]:
# filter english news content only
def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False
    
english_news_df = all_news_df[all_news_df['content'].apply(is_english)]

In [44]:
# filter english titles only
english_news_df = english_news_df[english_news_df['title'].apply(is_english)]

In [46]:
english_news_df

Unnamed: 0,article_id,title,published_at,content,category
0,89541,UN Chief Urges World To 'Stop The Madness' Of ...,2023-10-30 10:12:35.000000,UN Secretary-General Antonio Guterres urged th...,Nepal
2,89543,UN Chief Urges World to 'Stop the Madness' of ...,2023-10-30 10:53:30.000000,"Kathmandu, Nepal UN Secretary-General Antonio...",Nepal
3,89545,Sikkim warning: Hydroelectricity push must be ...,2023-10-06 01:20:24.000000,At least 14 persons lost their lives and more ...,Nepal
4,89547,"200 foreigners, dual nationals cut down in Ham...",2023-10-27 01:08:34.000000,"Scores of foreign citizens were killed, taken ...",Nepal
6,89551,Pro-Israel rallies allowed in India but Palest...,2023-10-25 09:58:17.000000,"New Delhi, India Israels relentless bombing of...",Nepal
...,...,...,...,...,...
105370,781108,"Have done no wrong, only did party work, says ...",2023-11-29 10:57:22,Karnataka Deputy Chief Minister D K Shivakumar...,Home
105371,781129,FC Barcelona Guarantees $77.6 Million Champion...,2023-11-29 08:41:18,FC Barcelona have guaranteed at least $767.6 m...,Home
105372,781235,Three hospitals ignored her gravely ill fiancé...,2023-11-29 10:01:12,The photo from David and Sarah Lubarsky's wedd...,Home
105373,781240,Kerber’s Farm: Bringing Farm To Table To Manha...,2023-11-29 13:44:33,Kerbers Farm: Bringing Farm To Table To Manhat...,Home


In [53]:
# drop null values
english_news_df.dropna()
english_news_df['category'].value_counts()

# print categories that have more than 500 news and less than 1000 news

print(english_news_df['category'].value_counts()[(english_news_df['category'].value_counts() > 1000)])

category
Stock                      3841
Health                     2507
Technology                 2290
Canada                     2288
Real estate                2286
Finance                    2146
News                       1762
COVID                      1735
Education                  1713
Food                       1549
Jobs                       1510
Weather                    1449
Travel                     1395
Cars                       1235
Science                    1209
Asia                       1163
India                      1136
Politics                   1112
America                    1097
Climate                    1075
Artificial Intelligence    1071
Fashion                    1060
Music                      1053
Sports                     1046
Relationships              1015
Love                       1004
Name: count, dtype: int64


In [54]:
# filter the category to keep: politics, europe, united states, climate, sustainability.

selected_categories = ['Politics', 'Europe', 'America', 'Climate', 'News', 'Ukraine', 'Palestine, State of', 'Israel', 'Russian Federation']

subset_english_news_df = english_news_df[english_news_df['category'].isin(selected_categories)]

subset_english_news_df

Unnamed: 0,article_id,title,published_at,content,category
298,103910,Vienna Jewish cemetery torched,2023-11-02 04:31:58.000000,The Jewish section of a major cemetery in the ...,Europe
856,104426,Sao Paulo Grand Prix first practice before qua...,2023-11-02 12:24:34.000000,"Hello, folks. We've had five of them already a...",News
961,96533,Uganda: Oil Pipeline Protests Stifled - HRW,2023-11-02 07:01:46.000000,"Environmental Defenders Face Harassment, Intim...",Climate
1782,96554,Africa: Giraffes Could Go Extinct - the 5 Bigg...,2023-11-02 04:30:50.000000,Giraffes are the world's tallest mammals and a...,Climate
2474,95207,Pro-Trump Pastor Hosts Book Burning During Chu...,2023-11-02 18:11:39.000000,"Tennessee pastor Greg Locke, an outspoken supp...",Politics
...,...,...,...,...,...
105356,780574,Joint statement by Joint Expeditionary Force m...,2023-11-29 02:47:13,Defence Secretary Grant Shapps met virtually w...,Europe
105357,780725,Google opens biggest European cyber centre,2023-11-29 13:22:51,"Malaga, Nov 29, 2023 -Google opened its larges...",Europe
105358,780803,Rapala VMC Corporation’s Financial Reporting i...,2023-11-29 14:00:00,"Rapala VMC Corporation, Financial calendar, No...",Europe
105359,780848,Pharming Group (NASDAQ:PHAR) Shares Gap Down t...,2023-11-29 13:02:41,Pharming Group (NASDAQ:PHAR – Get Free Report)...,Europe


In [1]:
#export to csv
subset_english_news_df.to_csv('/Users/gresasmolica/Desktop/Gresa Smolica/Hertie - MDS/Semester III/DL/Topic-Modelling-Tutorial/subset_english_news.csv', index=False)

NameError: name 'subset_english_news_df' is not defined