#### Import Libraries

In [10]:
# Use Kernel "nlp_course_copy" for this notebook
import pandas as pd     # Pandas for dataframes
import re               # Re for regular expressions
import string           # String for string manipulation

from tqdm import tqdm   # Tqdm for progress bar

import nltk             # Python library for NLP
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize.treebank import TreebankWordDetokenizer

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [11]:
# Read the combined, annotated news-title dataset into a dataframe
raw_csv_path = '(A) Data/Need To Vote/Combined_News Content Title_800 Data.csv'
df = pd.read_csv(raw_csv_path)

df.head()

Unnamed: 0,News_Data,1st Annotator,2nd Annotator,3rd Annotator,Voting Result,Final Take,Labelling
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",Positive,Positive,Positive,Positive,Positive,1
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",Positive,Positive,Positive,Positive,Positive,1
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,Negative,Negative,Neutral,Negative,Negative,-1
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,Positive,Positive,Neutral,Positive,Positive,1
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",Negative,Negative,Neutral,Negative,Negative,-1


In [12]:
# (rows, columns) of the dataframe right after loading
df.shape

(832, 7)

In [13]:
# Drop the per-annotator and voting columns, leaving News_Data and Labelling
annotation_columns = [
    "1st Annotator",
    "2nd Annotator",
    "3rd Annotator",
    "Voting Result",
    "Final Take",
]
df = df.drop(columns=annotation_columns)

df.head()

Unnamed: 0,News_Data,Labelling
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",1
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",1
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,-1
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,1
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",-1


In [14]:
# (rows, columns) after the annotation columns were dropped
df.shape

(832, 2)

In [15]:
# Count the missing values in every column
df.isna().sum()

News_Data    0
Labelling    0
dtype: int64

#### Cleaning Text

In [16]:
# Cleaning Text
def cleaning_text(text):
    """Strip noise from a raw title and return a whitespace-normalised string.

    The steps run in an order that lets every pattern see the characters it
    relies on: literal escape sequences and URLs are removed *before*
    punctuation is stripped, because stripping punctuation first destroys the
    backslashes, dots and slashes those patterns need to match.

    Steps:
        1. Remove literal backslash escapes (backslash-n, backslash-x..., backslash-u...).
        2. Remove URLs starting with http://, https:// or www.
        3. Replace punctuation/special characters (including underscores) with spaces.
        4. Delete digits.
        5. Collapse a character repeated three or more times down to two.
        6. Collapse runs of whitespace and trim both ends.

    Args:
        text: Raw text. ``None`` and NaN are treated as empty text; any other
            non-string value is converted with ``str``.

    Returns:
        str: The cleaned text (original letter case is preserved).
    """
    # Missing values would make re.sub raise TypeError; treat them as empty text
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Literal escape sequences left in the text (a real backslash followed by n/x/u)
    text = re.sub(r'\\n', ' ', text)
    text = re.sub(r'\\[xu]\S+', ' ', text)

    # Remove URL (must happen while '://' and '.' are still present)
    text = re.sub(r'(?:https?://|www\.)\S+', ' ', text, flags=re.IGNORECASE)

    # Remove punctuation, numbers, and special characters
    # ('_' counts as a word character for \w, so it is listed explicitly)
    text = re.sub(r'[^\w\s]|_', ' ', text)
    text = re.sub(r'\d+', '', text)

    # Collapse runs of the same character: three or more repeats become two
    text = re.sub(r'(.)\1+', r'\1\1', text)

    # Remove extra white space
    return re.sub(r'\s+', ' ', text).strip()

# Applied to Content Column
# Run the cleaner over every raw title and store the result in a new column
cleaned_titles = df['News_Data'].apply(cleaning_text)
df['Cleaned_News-Data'] = cleaned_titles

df.head()

Unnamed: 0,News_Data,Labelling,Cleaned_News-Data
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",1,Olahraga Pilates Makin Populer Ini Deret Manfa...
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",1,Janice Tjen Luar Biasa Lolos di Nomor Final Ch...
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,-1,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,1,Pertamina Peduli Salurkan Bantuan untuk Korban...
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",-1,Cuaca Panas Hujan hingga Banjir Rob Bayangi Se...


In [17]:
# Spot-check a single cleaned title (row label 250)
df['Cleaned_News-Data'][250]

'Potret Politikus PDIP yang Laporkan Hoax Megawati Meninggal'

#### Case Folding

In [18]:
# Uppercase to lowercase
def case_folding_text(text):
  """Return ``text`` converted to ``str`` with every character lowercased."""
  as_string = str(text)
  return as_string.lower()

# Lowercase the cleaned titles, writing the result back to the same column
lowercased_titles = df['Cleaned_News-Data'].apply(case_folding_text)
df['Cleaned_News-Data'] = lowercased_titles
df.head()

Unnamed: 0,News_Data,Labelling,Cleaned_News-Data
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",1,olahraga pilates makin populer ini deret manfa...
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",1,janice tjen luar biasa lolos di nomor final ch...
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,-1,siapakah pasukan rsf yang diduga bantai ribuan...
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,1,pertamina peduli salurkan bantuan untuk korban...
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",-1,cuaca panas hujan hingga banjir rob bayangi se...


#### Tokenization

Tokenization is the process of breaking down raw text into smaller units called tokens, such as words, subwords, or characters, to make it understandable for computers in Natural Language Processing (NLP). 

In [19]:
def tokenizing_text(data):
    """Split ``data`` into a list of tokens with NLTK's ``word_tokenize``."""
    tokens = word_tokenize(data)
    return tokens

# Tokenize every cleaned, lowercased title into a list of words
token_lists = df['Cleaned_News-Data'].apply(tokenizing_text)
df['Tokenized'] = token_lists
df.head()

Unnamed: 0,News_Data,Labelling,Cleaned_News-Data,Tokenized
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",1,olahraga pilates makin populer ini deret manfa...,"[olahraga, pilates, makin, populer, ini, deret..."
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",1,janice tjen luar biasa lolos di nomor final ch...,"[janice, tjen, luar, biasa, lolos, di, nomor, ..."
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,-1,siapakah pasukan rsf yang diduga bantai ribuan...,"[siapakah, pasukan, rsf, yang, diduga, bantai,..."
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,1,pertamina peduli salurkan bantuan untuk korban...,"[pertamina, peduli, salurkan, bantuan, untuk, ..."
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",-1,cuaca panas hujan hingga banjir rob bayangi se...,"[cuaca, panas, hujan, hingga, banjir, rob, bay..."


In [None]:
# TODO: Normalization step is not implemented yet (placeholder cell, never executed)

#### StopWords Removal

Some words like "the" and "and" appear so frequently, and in so many documents, that we needn't bother counting them. Also, it may make sense to only record the root of a word, say `cat` in place of both `cat` and `cats`. This will shrink our vocab array and improve performance.

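Stop words are common words like "the," "a," and "is" that are removed in Natural Language Processing (NLP) to improve efficiency and accuracy by focusing on more meaningful terms. This notebook uses NLTK's Indonesian stop-word list, which removes words such as "yang", "di", and "untuk" from the titles.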

In [20]:
_INDONESIAN_STOPWORDS = None  # filled on first use so the corpus is read only once


def stopword_text(tokens, stopword=None):
    """Return the tokens that are not stop words, preserving their order.

    Args:
        tokens: Iterable of token strings.
        stopword: Optional collection of words to drop. When omitted, NLTK's
            Indonesian stop-word list is used; it is loaded once and reused by
            later calls instead of being re-read for every row.

    Returns:
        list: The tokens from ``tokens`` that are not in the stop-word set.
    """
    global _INDONESIAN_STOPWORDS

    if stopword is None:
        if _INDONESIAN_STOPWORDS is None:
            _INDONESIAN_STOPWORDS = frozenset(stopwords.words('indonesian'))
        stopword = _INDONESIAN_STOPWORDS
    elif not isinstance(stopword, (set, frozenset)):
        # Hash-based lookup instead of scanning a list for every token
        stopword = frozenset(stopword)

    filtered_tokens = [token for token in tokens if token not in stopword]
    return filtered_tokens

# Filter the stop words out of every token list
filtered_token_lists = df['Tokenized'].apply(stopword_text)
df['Tokenized_Stopword'] = filtered_token_lists

df.head()

Unnamed: 0,News_Data,Labelling,Cleaned_News-Data,Tokenized,Tokenized_Stopword
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",1,olahraga pilates makin populer ini deret manfa...,"[olahraga, pilates, makin, populer, ini, deret...","[olahraga, pilates, populer, deret, manfaatnya]"
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",1,janice tjen luar biasa lolos di nomor final ch...,"[janice, tjen, luar, biasa, lolos, di, nomor, ...","[janice, tjen, lolos, nomor, final, chennai, o..."
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,-1,siapakah pasukan rsf yang diduga bantai ribuan...,"[siapakah, pasukan, rsf, yang, diduga, bantai,...","[pasukan, rsf, diduga, bantai, ribuan, warga, ..."
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,1,pertamina peduli salurkan bantuan untuk korban...,"[pertamina, peduli, salurkan, bantuan, untuk, ...","[pertamina, peduli, salurkan, bantuan, korban,..."
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",-1,cuaca panas hujan hingga banjir rob bayangi se...,"[cuaca, panas, hujan, hingga, banjir, rob, bay...","[cuaca, panas, hujan, banjir, rob, bayangi, wi..."


#### Stemming

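Stemming is an NLP technique that reduces a word to its root or base form, called a "stem," by removing affixes such as prefixes and suffixes. For example, the Sastrawi stemmer used here turns "manfaatnya" into "manfaat" and "diduga" into "duga".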

In [21]:
_DEFAULT_STEMMER = None  # shared Sastrawi stemmer, created lazily on first use


def _get_default_stemmer():
    """Return the shared Sastrawi stemmer, building it only on the first call."""
    global _DEFAULT_STEMMER
    if _DEFAULT_STEMMER is None:
        _DEFAULT_STEMMER = StemmerFactory().create_stemmer()
    return _DEFAULT_STEMMER


def stemming_text(tokens, stemmer=None):
    """Stem every token with the Sastrawi library.

    The stemmer is created once and reused across calls. Building a new
    ``StemmerFactory`` and stemmer for every row repeats the same setup work
    on each call (presumably the main cost behind the ~10 minute run recorded
    for 832 titles - confirm by re-timing).

    Args:
        tokens: Iterable of token strings.
        stemmer: Optional object exposing ``stem(word)``. Defaults to a shared
            stemmer built from ``StemmerFactory().create_stemmer()``.

    Returns:
        list: One stem per input token, in the same order.
    """
    if stemmer is None:
        stemmer = _get_default_stemmer()
    stems = [stemmer.stem(token) for token in tokens]
    return stems

# Register progress_apply on pandas objects, then stem row by row with a progress bar
tqdm.pandas(desc="Stemming Progress: ")
stemmed_token_lists = df['Tokenized_Stopword'].progress_apply(stemming_text)
df['Tokenized_Stopword_Stemmed'] = stemmed_token_lists

df.head()

Stemming Progress: 100%|██████████| 832/832 [09:53<00:00,  1.40it/s]


Unnamed: 0,News_Data,Labelling,Cleaned_News-Data,Tokenized,Tokenized_Stopword,Tokenized_Stopword_Stemmed
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",1,olahraga pilates makin populer ini deret manfa...,"[olahraga, pilates, makin, populer, ini, deret...","[olahraga, pilates, populer, deret, manfaatnya]","[olahraga, pilates, populer, deret, manfaat]"
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",1,janice tjen luar biasa lolos di nomor final ch...,"[janice, tjen, luar, biasa, lolos, di, nomor, ...","[janice, tjen, lolos, nomor, final, chennai, o...","[janice, tjen, lolos, nomor, final, chennai, o..."
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,-1,siapakah pasukan rsf yang diduga bantai ribuan...,"[siapakah, pasukan, rsf, yang, diduga, bantai,...","[pasukan, rsf, diduga, bantai, ribuan, warga, ...","[pasu, rsf, duga, bantai, ribu, warga, sipil, ..."
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,1,pertamina peduli salurkan bantuan untuk korban...,"[pertamina, peduli, salurkan, bantuan, untuk, ...","[pertamina, peduli, salurkan, bantuan, korban,...","[pertamina, peduli, salur, bantu, korban, benc..."
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",-1,cuaca panas hujan hingga banjir rob bayangi se...,"[cuaca, panas, hujan, hingga, banjir, rob, bay...","[cuaca, panas, hujan, banjir, rob, bayangi, wi...","[cuaca, panas, hujan, banjir, rob, bayang, wil..."


#### Detokenization

In [22]:
def detokenize(tokens):
  """Join a token list back into a single string with NLTK's Treebank detokenizer."""
  detokenizer = TreebankWordDetokenizer()
  sentence = detokenizer.detokenize(tokens)
  return sentence

# Rebuild one text string per row from the stemmed tokens
detokenized_titles = df['Tokenized_Stopword_Stemmed'].apply(detokenize)
df['Detokenized'] = detokenized_titles
df.head()

Unnamed: 0,News_Data,Labelling,Cleaned_News-Data,Tokenized,Tokenized_Stopword,Tokenized_Stopword_Stemmed,Detokenized
0,"Olahraga Pilates Makin Populer, Ini Deret Manf...",1,olahraga pilates makin populer ini deret manfa...,"[olahraga, pilates, makin, populer, ini, deret...","[olahraga, pilates, populer, deret, manfaatnya]","[olahraga, pilates, populer, deret, manfaat]",olahraga pilates populer deret manfaat
1,"Janice Tjen Luar Biasa, Lolos di 2 Nomor Final...",1,janice tjen luar biasa lolos di nomor final ch...,"[janice, tjen, luar, biasa, lolos, di, nomor, ...","[janice, tjen, lolos, nomor, final, chennai, o...","[janice, tjen, lolos, nomor, final, chennai, o...",janice tjen lolos nomor final chennai open
2,Siapakah Pasukan RSF yang Diduga Bantai Ribuan...,-1,siapakah pasukan rsf yang diduga bantai ribuan...,"[siapakah, pasukan, rsf, yang, diduga, bantai,...","[pasukan, rsf, diduga, bantai, ribuan, warga, ...","[pasu, rsf, duga, bantai, ribu, warga, sipil, ...",pasu rsf duga bantai ribu warga sipil sudan
3,Pertamina Peduli Salurkan Bantuan untuk Korban...,1,pertamina peduli salurkan bantuan untuk korban...,"[pertamina, peduli, salurkan, bantuan, untuk, ...","[pertamina, peduli, salurkan, bantuan, korban,...","[pertamina, peduli, salur, bantu, korban, benc...",pertamina peduli salur bantu korban bencana su...
4,"Cuaca Panas, Hujan hingga Banjir Rob Bayangi S...",-1,cuaca panas hujan hingga banjir rob bayangi se...,"[cuaca, panas, hujan, hingga, banjir, rob, bay...","[cuaca, panas, hujan, banjir, rob, bayangi, wi...","[cuaca, panas, hujan, banjir, rob, bayang, wil...",cuaca panas hujan banjir rob bayang wilayah


In [23]:
# Write the dataframe, including every intermediate preprocessing column, to CSV.
# NOTE(review): the list-valued columns (e.g. Tokenized) are presumably written as
# their text representation - confirm that downstream readers parse them back.
output_path = r'(A) Data/PreProcessed_News Content Title_800 Data.csv'
df.to_csv(output_path, index=False)  # index=False: the row index is not written as a column
print(f"Completed PreProcessed data saved to {output_path}")

Completed PreProcessed data saved to (A) Data/PreProcessed_News Content Title_800 Data.csv
