# Load libraries

In [1]:
%pip install scikit-learn
%pip install nltk
%pip install emoji

Note: you may need to restart the kernel to use updated packages.
Collecting nltk
  Using cached nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting click (from nltk)
  Using cached click-8.3.0-py3-none-any.whl.metadata (2.6 kB)
Using cached nltk-3.9.2-py3-none-any.whl (1.5 MB)
Using cached click-8.3.0-py3-none-any.whl (107 kB)
Installing collected packages: click, nltk
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m2/2[0m [nltk][32m1/2[0m [nltk]
[1A[2KSuccessfully installed click-8.3.0 nltk-3.9.2
Note: you may need to restart the kernel to use updated packages.
Collecting emoji
  Using cached emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Using cached emoji-2.15.0-py3-none-any.whl (608 kB)
Installing collected packages: emoji
Successfully installed emoji-2.15.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd
import numpy as np
import nltk
import ssl, certifi
def _certifi_https_context():
    """Build a default SSL context that trusts the CA bundle located by certifi."""
    return ssl.create_default_context(cafile=certifi.where())


# Swap ssl's default HTTPS context factory for the certifi-backed one before any
# NLTK download is attempted.
ssl._create_default_https_context = _certifi_https_context

# Fetch the NLTK resources used by this notebook.
for nltk_resource in ('wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /Users/Zapi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/Zapi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/Zapi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Load data

In [3]:
def _without_unnamed_columns(frame):
    """Return `frame` restricted to the columns whose name does not start with "Unnamed"."""
    return frame.loc[:, ~frame.columns.str.contains("^Unnamed")]


# Read both exports and discard their "Unnamed..." columns.
rappler_docs = _without_unnamed_columns(pd.read_excel('data/rappler_current.xlsx'))
youtube_docs = _without_unnamed_columns(pd.read_excel('data/youtube_current.xlsx'))

# Rappler timestamps: parse (unparseable values become NaT) and strip the
# timezone. The source values are expected to already carry a +08:00 offset.
rappler_published = pd.to_datetime(rappler_docs['date_published'], errors='coerce')
rappler_docs['date_published'] = rappler_published.dt.tz_localize(None)

# YouTube timestamps: parse as UTC, convert to Manila time, then strip the
# timezone so both sources hold naive timestamps.
youtube_published = pd.to_datetime(
    youtube_docs['date_published'], errors='coerce', utc=True
)
youtube_docs['date_published'] = (
    youtube_published.dt.tz_convert('Asia/Manila').dt.tz_localize(None)
)

# Give Rappler the two YouTube-only columns (all missing) and tag every row
# with the dataset it came from.
rappler_docs = rappler_docs.assign(
    like_count=pd.NA,
    reply_parent_id=pd.NA,
    source='rappler',
)
youtube_docs = youtube_docs.assign(source='youtube')

# Shared column layout for both frames.
column_order = [
    "title", "link", "date_published", "text",
    "like_count", "reply_parent_id", "source"
]
rappler_docs, youtube_docs = rappler_docs[column_order], youtube_docs[column_order]

# Stack the two sources into one corpus with a fresh 0..n-1 index.
corpus = pd.concat([rappler_docs, youtube_docs], ignore_index=True)

# Preprocess text

## Load stopwords

In [4]:
from pandas.errors import EmptyDataError
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')


def _load_stopwords(path):
    """Read a stop-word file into a list of plain, non-empty strings.

    The file is parsed with `pd.read_csv(header=None)`, so every field of every
    row counts as one entry. Fields are read as text with pandas' default
    NA detection switched off; otherwise entries such as `null`, `nan` or `NA`
    would be loaded as float NaN and numeric-looking entries as numbers, and
    none of them could ever match a string token.

    Parameters
    ----------
    path : str
        Location of the stop-word file.

    Returns
    -------
    list of str
        The entries with surrounding whitespace stripped; an empty list when
        the file is missing or empty.
    """
    try:
        table = pd.read_csv(path, header=None, dtype=str, keep_default_na=False)
    except (FileNotFoundError, EmptyDataError):
        # Best effort by design: a missing or empty list means "no extra stop words".
        return []
    cells = table.values.flatten()
    # The isinstance guard skips any non-string filler a parser may emit for
    # short rows; blank fields are dropped as well.
    return [cell.strip() for cell in cells if isinstance(cell, str) and cell.strip()]


BASIC_STOPWORDS = _load_stopwords('basic_stopwords.txt')
DOMAIN_STOPWORDS = _load_stopwords('domain_stopwords.txt')

EN_STOPWORDS_LIST = stopwords.words('english')

# Hand-maintained additional stop words, applied as a separate filter by the
# cleaning step.
EXTRA_STOPWORDS = [
    "ako","ikaw","siya","kami","tayo","kayo","sila",
    "ko","mo","niya","natin","namin","nila","kanila","atin","amin",
    "ang","ng","sa","kay","kina","para","mula","galing","ayon",
    "dahil","kung","kapag","bago","hanggang","habang","pagkatapos",
    "kaya","pero","ngunit","subalit","kahit","kasi","sapagkat",
    "ito","iyan","iyon","doon","dito","dyan","diyan","ngayon","noon",
    "mamaya","kanina","bukas","kahapon","palagi","lagi","minsan",
    "madalas","halos","lamang","lang","na","ay","din","rin","daw","raw",
    "pa","naman","nga","pala","yata","dapat","hindi","oo","opo","huwag",
    "wala","may","meron","saan","kailan","paano","ano","bakit","sino","alin",
    "lahat","iba","ibang","pareho","ganito","ganyan","ganun","ganoon","gayunman",
    "yan", "lahat", "walang", "pa", "ka", "ni", "po", "si", "lng", "nyo", "mga", "yung", "ba", "di",
    "nya", "pag", "nya", "yang", "eh", "mag", "yan", "puro", "mag",
]

# BASIC_STOPWORDS = BASIC_STOPWORDS + EXTRA_STOPWORDS

[nltk_data] Downloading package stopwords to /Users/Zapi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Clean corpus

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import emoji


def clean_corpus(corpus, text_column='text'):
  '''
  Return a copy of `corpus` with an added `cleaned_text` column.

  The text in `text_column` is processed in this order:
    1. missing values become the empty string, everything else is cast to str;
    2. lowercased;
    3. every non-word character (regex `\\W`) is replaced by a space;
    4. every run of digits is replaced by a space;
    5. emojis are replaced by a space and whitespace is collapsed;
    6. tokens found in EN_STOPWORDS_LIST, BASIC_STOPWORDS, DOMAIN_STOPWORDS
       or EXTRA_STOPWORDS are dropped.
  No lemmatization or stemming is applied.

  Parameters:
    corpus: DataFrame holding the documents; it is not modified.
    text_column: name of the column containing the raw text.

  Returns:
    A new DataFrame with all columns of `corpus` plus `cleaned_text`.

  Raises:
    KeyError: if `text_column` is not a column of `corpus`.
  '''
  cleaned_corpus = corpus.copy()

  raw_text = cleaned_corpus[text_column]

  # Blank out missing values explicitly. A plain astype(str) would turn them
  # into the literal words 'nan' / 'None', which would then survive the whole
  # pipeline as ordinary tokens.
  text = raw_text.astype(str).where(raw_text.notna(), '').str.lower()

  # Non-word characters first, then digits.
  # NOTE(review): usernames, links and HTML remnants are not removed as units;
  # their alphanumeric pieces (e.g. 'https', 'br') stay as tokens. Confirm
  # whether that is intended.
  text = text.str.replace(r'\W', ' ', regex=True)
  text = text.str.replace(r'\d+', ' ', regex=True)

  # One set for all four stop-word sources: filtering against their union gives
  # the same result as four successive list filters, with O(1) lookups.
  stop_words = set(EN_STOPWORDS_LIST).union(
    BASIC_STOPWORDS, DOMAIN_STOPWORDS, EXTRA_STOPWORDS
  )

  def _strip_emoji_and_stopwords(value):
    # Most emoji code points are presumably already gone after the `\W` pass;
    # this call is kept as a safety net. split()/join() also collapses repeated
    # whitespace and trims both ends.
    tokens = emoji.replace_emoji(value, replace=' ').split()
    return ' '.join(token for token in tokens if token not in stop_words)

  cleaned_corpus['cleaned_text'] = text.apply(_strip_emoji_and_stopwords)

  return cleaned_corpus

In [6]:
# Run the cleaning pipeline on the combined corpus, reading raw text from the
# 'text' column. The trailing bare expression makes the notebook render the
# resulting DataFrame as the cell output.
cleaned_corpus = clean_corpus(corpus, text_column='text')
cleaned_corpus

Unnamed: 0,title,link,date_published,text,like_count,reply_parent_id,source,cleaned_text
0,,https://www.rappler.com/philippines/dpwh-manue...,2025-08-31 17:02:21,,,,rappler,
1,,https://www.rappler.com/philippines/dpwh-suspe...,2025-09-03 11:20:56,,,,rappler,
2,,https://www.rappler.com/philippines/flood-cont...,2025-09-04 15:01:13,,,,rappler,
3,,https://www.rappler.com/philippines/coa-holds-...,2025-09-09 10:47:37,,,,rappler,
4,,https://www.rappler.com/philippines/visayas/ma...,2025-08-14 15:36:45,,,,rappler,
...,...,...,...,...,...,...,...,...
2145,Do u really have to investigate since it&#39;s...,https://www.youtube.com/watch?v=SpYDbT-PHeA&lc...,2025-11-14 22:08:07,Do u really have to investigate since it's cle...,0,,youtube,u really investigate since clear ghost project...
2146,They are govt officials that needs appearance ...,https://www.youtube.com/watch?v=SpYDbT-PHeA&lc...,2025-11-14 22:06:20,They are govt officials that needs appearance ...,0,,youtube,govt officials needs appearance investigtion
2147,Lagot kayong mga kurakot Kay sir sen general l...,https://www.youtube.com/watch?v=SpYDbT-PHeA&lc...,2025-11-14 20:56:23,Lagot kayong mga kurakot Kay sir sen general l...,1,,youtube,lagot kayong kurakot sir sen general lacson
2148,Dapat kasuhan na Sila nagaalburuto na Ang mamayan,https://www.youtube.com/watch?v=SpYDbT-PHeA&lc...,2025-11-14 20:49:15,Dapat kasuhan na Sila nagaalburuto na Ang mamayan,3,,youtube,kasuhan nagaalburuto mamayan


### Cleaning emojis

In [7]:
# Two sample comments used to eyeball what emoji replacement does.
sample_sentence = "hello world @helloWorld üòÖ"
sample_sentence_2 = "Wait ko si dugong mag salita na JOKE LNGüòÅ‚ò∫Ô∏è<br>Kayu naman naniniwla agadüòÇ"


def _mark_emojis(sentence):
  """Return `sentence` with each emoji swapped for the marker ' EMOJI'."""
  return emoji.replace_emoji(sentence, replace=' EMOJI')


sample_sentence_without_emoji = _mark_emojis(sample_sentence)
sample_sentence_2_without_emoji = _mark_emojis(sample_sentence_2)

# Show the second sample before and after replacement, one per line.
print(sample_sentence_2, sample_sentence_2_without_emoji, sep='\n')

Wait ko si dugong mag salita na JOKE LNGüòÅ‚ò∫Ô∏è<br>Kayu naman naniniwla agadüòÇ
Wait ko si dugong mag salita na JOKE LNG EMOJI EMOJI<br>Kayu naman naniniwla agad EMOJI


In [8]:
# Persist the cleaned corpus as an Excel workbook; index=False keeps the
# DataFrame's row index out of the file.
cleaned_corpus.to_excel('data/cleaned_corpus.xlsx', index=False)