1. Pre-processing: Data cleaning and preparing for modelling

- We will be working on topic modelling with BERTopic using the Kaggle news dataset, which contains news articles from different media outlets published between 01.10.2023 and 29.11.2023.

In [3]:
# libraries
import pandas as pd
import numpy as np
from langdetect import detect

In [8]:
# load the raw news export into a DataFrame (file is read as UTF-8)
raw_news_csv = '/Users/gresasmolica/Downloads/data-5.csv'
all_news_df = pd.read_csv(raw_news_csv, encoding='utf-8')

In [9]:
# explore data
# A notebook cell renders only its *last* bare expression, so every intermediate
# result is shown explicitly here instead of being computed and thrown away.
# (`display` is the IPython builtin that is available inside notebook kernels.)
all_news_df.info()  # prints its own report
display(all_news_df.head())
display(all_news_df.describe())
display(all_news_df.columns)

# check earliest and latest publication date
# NOTE(review): `published_at` has dtype object, so min/max compare the raw values
# as stored rather than parsed datetimes -- confirm this ordering is what is wanted.
print('published_at range:', all_news_df['published_at'].min(), '->', all_news_df['published_at'].max())

# check number of unique source_name values
print('unique sources:', all_news_df['source_name'].nunique())

# drop unnecessary columns
# Re-assign the result instead of mutating with inplace=True: the frame bound to
# `all_news_df` is then always the product of an explicit statement.
all_news_df = all_news_df.drop(
    columns=['author', 'source_id', 'source_name', 'description', 'content', 'url', 'url_to_image']
)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105375 entries, 0 to 105374
Data columns (total 12 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   article_id    105375 non-null  int64 
 1   source_id     24495 non-null   object
 2   source_name   105375 non-null  object
 3   author        97156 non-null   object
 4   title         105335 non-null  object
 5   description   104992 non-null  object
 6   url           105375 non-null  object
 7   url_to_image  99751 non-null   object
 8   published_at  105375 non-null  object
 9   content       105375 non-null  object
 10  category      105333 non-null  object
 11  full_content  58432 non-null   object
dtypes: int64(1), object(11)
memory usage: 9.6+ MB


In [10]:
# filter english news content only
def is_english(text):
    """Return True when the language detector classifies ``text`` as English.

    Parameters
    ----------
    text : object
        Candidate text. Anything that is not a non-blank string (for example a
        NaN/None coming from a missing cell, or an empty string) is reported as
        not English without calling the detector.

    Returns
    -------
    bool
        True only if ``detect(text)`` returns ``'en'``; False for every other
        language and whenever detection fails.
    """
    # Missing or blank values cannot be classified; answer directly instead of
    # relying on the detector raising for them.
    if not isinstance(text, str) or not text.strip():
        return False
    try:
        # NOTE(review): langdetect's README describes detection as non-deterministic
        # unless DetectorFactory.seed is set -- consider seeding for reproducible runs.
        return detect(text) == 'en'
    except Exception:
        # Best effort: a detection failure means "not English". Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate, so a
        # long-running .apply() can still be interrupted.
        return False
    
# boolean mask: True where the full article text is detected as English
content_is_english = all_news_df['full_content'].apply(is_english)
english_news_df = all_news_df.loc[content_is_english]

In [11]:
# additionally require the title itself to be detected as English
title_is_english = english_news_df['title'].apply(is_english)
english_news_df = english_news_df.loc[title_is_english]

In [12]:
# preview the English-only frame; as the last expression of the cell it is
# rendered as the cell output (pandas truncates the middle rows)
english_news_df

Unnamed: 0,article_id,title,published_at,category,full_content
0,89541,UN Chief Urges World To 'Stop The Madness' Of ...,2023-10-30 10:12:35.000000,Nepal,UN Secretary-General Antonio Guterres urged th...
3,89545,Sikkim warning: Hydroelectricity push must be ...,2023-10-06 01:20:24.000000,Nepal,At least 14 persons lost their lives and more ...
6,89551,Pro-Israel rallies allowed in India but Palest...,2023-10-25 09:58:17.000000,Nepal,"India, the first non-Arab country to recognise..."
7,89555,No nation in the world is buying more planes t...,2023-11-02 05:48:58.000000,Nepal,Written by Alex Travelli and Hari Kumar No nat...
15,89570,China expands climate change surveillance on H...,2023-10-06 08:37:56.000000,Nepal,BEIJING: China has set up weather stations on ...
...,...,...,...,...,...
105370,781108,"Have done no wrong, only did party work, says ...",2023-11-29 10:57:22,Home,Karnataka Deputy Chief Minister D K Shivakumar...
105371,781129,FC Barcelona Guarantees $77.6 Million Champion...,2023-11-29 08:41:18,Home,FC Barcelona have guaranteed at least $767.6 m...
105372,781235,Three hospitals ignored her gravely ill fiancé...,2023-11-29 10:01:12,Home,The photo from David and Sarah Lubarsky's wedd...
105373,781240,Kerber’s Farm: Bringing Farm To Table To Manha...,2023-11-29 13:44:33,Home,Kerber’s Farm: Bringing Farm To Table To Manha...


In [13]:
# drop rows that contain null values
# DataFrame.dropna() returns a new frame and leaves the original untouched, so
# the result must be assigned back -- a bare call drops nothing.
english_news_df = english_news_df.dropna()

# count the articles per category once and reuse the result
category_counts = english_news_df['category'].value_counts()

# print the categories that have more than 1000 news articles
print(category_counts[category_counts > 1000])

category
Stock          3523
Canada         2175
Health         2014
Real estate    1943
Technology     1924
Finance        1673
News           1328
COVID          1315
Education      1274
India          1095
Food           1078
Name: count, dtype: int64


In [14]:
# keep only the politics / world-affairs categories listed below

selected_categories = ['Politics', 'Europe', 'America', 'Climate', 'News', 'Ukraine', 'Palestine, State of', 'Israel', 'Russian Federation']

# .copy() turns the filtered selection into an independent DataFrame, so columns
# can later be added to it without pandas raising SettingWithCopyWarning.
subset_english_news_df = english_news_df[english_news_df['category'].isin(selected_categories)].copy()

subset_english_news_df

Unnamed: 0,article_id,title,published_at,category,full_content
298,103910,Vienna Jewish cemetery torched,2023-11-02 04:31:58.000000,Europe,The Jewish section of a major cemetery in the ...
3651,95015,Bill Ackman says it's 'pathetic' that law firm...,2023-11-02 11:14:13.000000,Politics,The billionaire investorBill Ackmantook aim at...
3652,95016,Netanyahu is focused on his own political 'sur...,2023-11-02 15:58:14.000000,Politics,A 30-year veteran of the Israel Defense Forces...
3654,95018,Democrats sound alarms over No Labels third-pa...,2023-11-02 20:20:50.000000,Politics,Former House Speaker Nancy Pelosi is advocatin...
3656,95023,"Amid Hezbollah-Israel clashes, Christian villa...",2023-11-02 06:07:16.000000,Politics,"In Pictures At Lebanon’s border with Israel, r..."
...,...,...,...,...,...
105355,780389,Europe’s COP 29 Climate Change Goals Should In...,2023-11-29 13:54:00,Europe,Space-based solar power for Earth At this week...
105356,780574,Joint statement by Joint Expeditionary Force m...,2023-11-29 02:47:13,Europe,Defence Secretary Grant Shapps met virtually w...
105358,780803,Rapala VMC Corporation’s Financial Reporting i...,2023-11-29 14:00:00,Europe,"Rapala VMC Corporation, Financial calendar, N..."
105359,780848,Pharming Group (NASDAQ:PHAR) Shares Gap Down t...,2023-11-29 13:02:41,Europe,Pharming Group (NASDAQ:PHAR–Get Free Report)’s...


In [16]:
# write the category-filtered subset to disk, leaving the row index out of the file
subset_csv_path = '/Users/gresasmolica/Desktop/Gresa Smolica/Hertie - MDS/Semester III/DL/Deep-Learning-Tutorial/subset_english_news.csv'
subset_english_news_df.to_csv(subset_csv_path, index=False)

In [36]:
# prepare the data for topic modelling: remove stopwords, tokenize, lemmatize, remove punctuation, remove numbers, remove words with length less than 3

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
import string
import re

# fetch the NLTK resources used by the cleaning step (same order as before)
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# English stop-word list
stop_words = stopwords.words('english')

# characters treated as punctuation
punctuation = string.punctuation

# token normalisers: a WordNet lemmatizer and a Porter stemmer
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# create a function to clean the text
def clean_text(text):
    """Normalise raw article text into a space-separated string of stemmed tokens.

    Steps, in order: remove the characters in ``punctuation`` and lowercase the
    text, tokenize with ``word_tokenize``, drop tokens found in ``stop_words``,
    lemmatize and then stem each token, and finally keep only purely alphabetic
    tokens longer than two characters.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN/None from a missing cell) and
        empty strings yield ``""`` instead of raising.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``""`` if none survive.
    """
    # A missing cell arrives as NaN/None; iterating over it would raise TypeError.
    if not isinstance(text, str) or not text:
        return ""

    # remove punctuation in a single pass instead of a per-character Python loop
    text = text.translate(str.maketrans("", "", punctuation)).lower()
    # tokenize text
    tokens = word_tokenize(text)
    # remove stopwords; a set gives constant-time membership tests. It is rebuilt
    # on each call so later edits to `stop_words` are always picked up.
    # NOTE(review): punctuation is stripped before this filter, so "don't" reaches
    # it as "dont" -- confirm whether such contractions should also be removed.
    stop_set = set(stop_words)
    tokens = [word for word in tokens if word not in stop_set]
    # lemmatize and stem text
    tokens = [stemmer.stem(lemmatizer.lemmatize(word)) for word in tokens]
    # remove numbers and short words, lowercasing whatever is kept
    tokens = [word.lower() for word in tokens if word.isalpha() and len(word) > 2]
    # join all
    return " ".join(tokens)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/gresasmolica/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gresasmolica/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/gresasmolica/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [52]:
# preview the subset after preprocessing; the rendered output shows the added
# `preprocessed_content` column next to the raw `full_content`
subset_english_news_df

# optional export of the preprocessed frame (left disabled)
# subset_english_news_df.to_csv('/Users/gresasmolica/Desktop/Gresa Smolica/Hertie - MDS/Semester III/DL/Deep-Learning-Tutorial/preprocessed_subset_english_news.csv', index=False)

Unnamed: 0,title,published_at,category,full_content,preprocessed_content
298,Vienna Jewish cemetery torched,2023-11-02 04:31:58.000000,Europe,The Jewish section of a major cemetery in the ...,jewish section major cemeteri austrian capit s...
3651,Bill Ackman says it's 'pathetic' that law firm...,2023-11-02 11:14:13.000000,Politics,The billionaire investorBill Ackmantook aim at...,billionair investorbil ackmantook aim univers ...
3652,Netanyahu is focused on his own political 'sur...,2023-11-02 15:58:14.000000,Politics,A 30-year veteran of the Israel Defense Forces...,veteran israel defens forc former head countri...
3654,Democrats sound alarms over No Labels third-pa...,2023-11-02 20:20:50.000000,Politics,Former House Speaker Nancy Pelosi is advocatin...,former hous speaker nanci pelosi advoc thirdpa...
3656,"Amid Hezbollah-Israel clashes, Christian villa...",2023-11-02 06:07:16.000000,Politics,"In Pictures At Lebanon’s border with Israel, r...",pictur lebanon border israel resid christian v...
...,...,...,...,...,...
105355,Europe’s COP 29 Climate Change Goals Should In...,2023-11-29 13:54:00,Europe,Space-based solar power for Earth At this week...,spacebas solar power earth week climat chang c...
105356,Joint statement by Joint Expeditionary Force m...,2023-11-29 02:47:13,Europe,Defence Secretary Grant Shapps met virtually w...,defenc secretari grant shapp met virtual minis...
105358,Rapala VMC Corporation’s Financial Reporting i...,2023-11-29 14:00:00,Europe,"Rapala VMC Corporation, Financial calendar, N...",rapala vmc corpor financi calendar novemb eet ...
105359,Pharming Group (NASDAQ:PHAR) Shares Gap Down t...,2023-11-29 13:02:41,Europe,Pharming Group (NASDAQ:PHAR–Get Free Report)’s...,pharm group free report share price gap market...
