# Load libraries

In [9]:
%pip install scikit-learn
%pip install nltk
%pip install emoji

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [10]:
import pandas as pd
import numpy as np
import nltk
import ssl, certifi
ssl._create_default_https_context = lambda: ssl.create_default_context(cafile=certifi.where())


nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/Zapi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/Zapi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Zapi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load data

In [11]:
# Load the datasets
rappler_docs = pd.read_excel('data/rappler_old.xlsx')
youtube_docs = pd.read_excel('data/youtube_old.xlsx')

# Drop unnamed columns
rappler_docs = rappler_docs.loc[:, ~rappler_docs.columns.str.contains("^Unnamed")]
youtube_docs = youtube_docs.loc[:, ~youtube_docs.columns.str.contains("^Unnamed")]

# Parse datetimes
# Rappler: already has +08:00 offset, just parse and strip tz
rappler_docs['date_published'] = pd.to_datetime(
    rappler_docs['date_published'], errors='coerce'
).dt.tz_localize(None)

# YouTube: UTC ‚Üí Manila
youtube_docs['date_published'] = pd.to_datetime(
    youtube_docs['date_published'], errors='coerce', utc=True
).dt.tz_convert('Asia/Manila').dt.tz_localize(None)

# Add missing columns to Rappler
rappler_docs['like_count'] = pd.NA
rappler_docs['reply_parent_id'] = pd.NA

# Add source
rappler_docs['source'] = 'rappler'
youtube_docs['source'] = 'youtube'

# Reorder columns consistently
column_order = [
    "title", "link", "date_published", "text",
    "like_count", "reply_parent_id", "source"
]

rappler_docs = rappler_docs[column_order]
youtube_docs = youtube_docs[column_order]

# Combine datasets
corpus = pd.concat([rappler_docs, youtube_docs], ignore_index=True)

  rappler_docs['date_published'] = pd.to_datetime(


# Preprocess text

## Load stopwords

In [12]:
from pandas.errors import EmptyDataError
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

try:
    BASIC_STOPWORDS = list(
        pd.read_csv('basic_stopwords.txt', header=None).values.flatten()
    )
except (FileNotFoundError, EmptyDataError):
    BASIC_STOPWORDS = []

try:
    DOMAIN_STOPWORDS = list(
        pd.read_csv('domain_stopwords.txt', header=None).values.flatten()
    )
except (FileNotFoundError, EmptyDataError):
    DOMAIN_STOPWORDS = []

EN_STOPWORDS_LIST = stopwords.words('english')

EXTRA_STOPWORDS = [
    "ako","ikaw","siya","kami","tayo","kayo","sila",
    "ko","mo","niya","natin","namin","nila","kanila","atin","amin",
    "ang","ng","sa","kay","kina","para","mula","galing","ayon",
    "dahil","kung","kapag","bago","hanggang","habang","pagkatapos",
    "kaya","pero","ngunit","subalit","kahit","kasi","sapagkat",
    "ito","iyan","iyon","doon","dito","dyan","diyan","ngayon","noon",
    "mamaya","kanina","bukas","kahapon","palagi","lagi","minsan",
    "madalas","halos","lamang","lang","na","ay","din","rin","daw","raw",
    "pa","naman","nga","pala","yata","dapat","hindi","oo","opo","huwag",
    "wala","may","meron","saan","kailan","paano","ano","bakit","sino","alin",
    "lahat","iba","ibang","pareho","ganito","ganyan","ganun","ganoon","gayunman",
    "yan", "lahat", "walang", "pa", "ka", "ni", "po", "si", "lng", "nyo", "mga", "yung", "ba", "di",
    "nya", "pag", "nya", "yang", "eh", "mag", "yan", "puro", "mag",
]

# BASIC_STOPWORDS = BASIC_STOPWORDS + EXTRA_STOPWORDS

[nltk_data] Downloading package stopwords to /Users/Zapi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Clean corpus

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import emoji


def clean_corpus(corpus, text_column='text'):
  '''
  Clean the text data in the specified column of the DataFrame.
  '''
  cleaned_corpus = corpus.copy()

  # Force text_column as string
  cleaned_corpus['cleaned_text'] = cleaned_corpus[text_column].astype(str)

  # Transform into lowercase
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.lower()

  # Remove usernames, non-alphanumeric characters, and links
  # docs['cleaned_text'] = docs['cleaned_text'].str.replace(r'(@[A-Za-z0-9_]+)|([^A-Za-z0-9_ \t])|(\w+:\/\/\S+)', '')

  # # Lemmatize (by default, lemmatize nouns)
  # # Other options:
  # #   'v' for verbs
  # #   'a' for adjectives
  # #   'r' for adverbs
  # #   's' for satellites adjectives (adjectives that appear after verbs)
  # lemmatizer = WordNetLemmatizer()
  # cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
  #   lambda text: ' '.join(
  #     [lemmatizer.lemmatize(word, pos='n') for word in str(text).split()]
  #   )
  # )

  # # Stemmer
  # stemmer = PorterStemmer()
  # cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
  #     lambda text: ' '.join(
  #       [stemmer.stem(word) for word in str(text).split()]
  #     )
  # )

  # Remove non-alphanumeric characters FIRST
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\W', ' ', regex=True)

  # Remove numbers
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.replace(r'\d+', ' ', regex=True)

  # Remove emojis using emoji library
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      emoji.replace_emoji(text, replace=' ').split()
    )
  )

  # Remove trailing and leading whitespaces
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.strip()

  # Remove NLTK stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [
        word for word in text.split() if word not in EN_STOPWORDS_LIST
      ]
    )
  )

  # Remove basic stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in BASIC_STOPWORDS]
    )
  )

  # Remove domain stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in DOMAIN_STOPWORDS]
    )
  )

  # Remove extra stopwords
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].apply(
    lambda text: ' '.join(
      [word for word in text.split() if word not in EXTRA_STOPWORDS]
    )
  )

  # Remove trailing and leading whitespaces (final cleanup)
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].str.strip()

  # Remove NaN values
  cleaned_corpus['cleaned_text'] = cleaned_corpus['cleaned_text'].replace(np.nan, '', regex=True)

  return cleaned_corpus

In [14]:
cleaned_corpus = clean_corpus(corpus, text_column='text')
cleaned_corpus

Unnamed: 0,title,link,date_published,text,like_count,reply_parent_id,source,cleaned_text
0,[Rear View] Is Marcos looking for his Napoles?,2025-08-22T16:00:00+08:00,NaT,President Ferdinand Marcos Jr. is doubling dow...,,,rappler,president ferdinand marcos jr doubling pledge ...
1,Fixing the flood problem: What's in it for Ram...,2025-08-22T14:49:01+08:00,NaT,"MANILA, Philippines ‚Äì Filipino billionaire Ram...",,,rappler,manila philippines filipino billionaire ramon ...
2,Gardiola clan's DPWH deals hit billions after ...,2025-08-22T12:00:00+08:00,NaT,Two construction firms owned by Construction W...,,,rappler,two construction firms owned construction work...
3,[In This Economy] The hypocrisy in Marcos‚Äô new...,2025-08-22T10:44:11+08:00,NaT,President Ferdinand Marcos Jr. seems to be spe...,,,rappler,president ferdinand marcos jr seems spending i...
4,Which Bulacan towns got biggest slices of DPWH...,2025-08-22T08:00:00+08:00,NaT,"With every typhoon or heavy downpour, large pa...",,,rappler,every typhoon heavy downpour large parts bulac...
...,...,...,...,...,...,...,...,...
7859,Dahil after the typhoon wala na ang ebidensiya.,https://www.youtube.com/watch?v=QHKjGHbj-Gc&lc...,2025-08-11 14:29:49,Dahil after the typhoon wala na ang ebidensiya.,2,,youtube,typhoon ebidensiya
7860,Yabang mo kasi! Inuuna nyo impeachment sira ulo!,https://www.youtube.com/watch?v=QHKjGHbj-Gc&lc...,2025-08-11 14:28:26,Yabang mo kasi! Inuuna nyo impeachment sira ulo!,0,,youtube,yabang inuuna impeachment sira ulo
7861,"Gnyan klkaran ng kurakot s Dpwh ,lhat nyan my ...",https://www.youtube.com/watch?v=QHKjGHbj-Gc&lc...,2025-08-11 14:27:42,"Gnyan klkaran ng kurakot s Dpwh ,lhat nyan my ...",0,,youtube,gnyan klkaran kurakot dpwh lhat nyan lgay frm ...
7862,Magic ni Sec. Bonoan yan alam na alam nya laha...,https://www.youtube.com/watch?v=QHKjGHbj-Gc&lc...,2025-08-11 14:17:28,Magic ni Sec. Bonoan yan alam na alam nya laha...,1,,youtube,magic sec bonoan alam alam sinong politiko sin...


### Cleaning emojis

In [15]:
sample_sentence = "hello world @helloWorld üòÖ"
sample_sentence_2 = "Wait ko si dugong mag salita na JOKE LNGüòÅ‚ò∫Ô∏è<br>Kayu naman naniniwla agadüòÇ"

# Clean emojis
sample_sentence_without_emoji = emoji.replace_emoji(
  sample_sentence, replace=' EMOJI')
sample_sentence_2_without_emoji = emoji.replace_emoji(
  sample_sentence_2, replace=' EMOJI')

print(sample_sentence_2)
print(sample_sentence_2_without_emoji)

Wait ko si dugong mag salita na JOKE LNGüòÅ‚ò∫Ô∏è<br>Kayu naman naniniwla agadüòÇ
Wait ko si dugong mag salita na JOKE LNG EMOJI EMOJI<br>Kayu naman naniniwla agad EMOJI


In [16]:
cleaned_corpus.to_excel('data/cleaned_corpus.xlsx', index=False)