## Preprocessing Hasil Crawling

### Preprocessing Hasil Crawling PTA

In [1]:
!pip install pandas nltk spacy Sastrawi pyspellchecker
!python -m spacy download en_core_web_sm

Collecting pandas
  Downloading pandas-2.3.2-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting spacy
  Using cached spacy-3.8.7-cp313-cp313-win_amd64.whl.metadata (28 kB)
Collecting Sastrawi
  Using cached Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Collecting pyspellchecker
  Using cached pyspellchecker-0.8.3-py3-none-any.whl.metadata (9.5 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.3.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting click (from nltk)
  Using cached click-8.2.1-py3-none-any.whl.metadata (2.5 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.2-py3-none-any.whl.metadata (5.6 kB)
Collecting regex>=2021.8.3 (from nltk)
  Using cached reg

In [None]:
import pandas as pd
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import spacy

# Download the NLTK stopword lists (a one-time step)
nltk.download('stopwords')

# === Stopwords (Indonesian and English) ===
stopwords_id, stopwords_en = (
    set(stopwords.words(language)) for language in ("indonesian", "english")
)

# === Indonesian stemmer ===
factory = StemmerFactory()
stemmer_id = factory.create_stemmer()

# === English spell checker ===
spell_en = SpellChecker(language="en")

# === spaCy English model ===
nlp = spacy.load("en_core_web_sm")

# === Load dataset ===
pta_all = pd.read_csv("Data/pta_all.csv")

# ========================
# Fungsi Preprocessing Indo
# ========================
def preprocess_text_id(text):
    """Clean one Indonesian text and return its stemmed tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, and the result is split on whitespace.
    Words found in ``stopwords_id`` are dropped and each remaining word is
    passed through ``stemmer_id.stem``.

    Returns:
        list of str. The list is empty when ``text`` is missing
        (``pd.isna``) or when no word survives the cleaning.
    """
    if pd.isna(text):
        return []
    letters_only = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    kept = []
    for word in letters_only.split():
        # Stopwords are checked on the raw word, before stemming.
        if word in stopwords_id:
            continue
        kept.append(stemmer_id.stem(word))
    return kept

# ========================
# Fungsi Preprocessing English (versi aman)
# ========================
def preprocess_text_en(text):
    """Clean one English text and return its lemmatised tokens.

    Steps, in order: lowercase and replace every character that is not an
    ASCII letter or whitespace with a space; split on whitespace; replace each
    word by ``spell_en.correction(word)`` (keeping the original word when the
    checker returns ``None``); drop words found in ``stopwords_en``; run the
    remaining words through the spaCy pipeline ``nlp`` and collect
    ``token.lemma_``.

    Returns:
        list of str. The list is empty when ``text`` is missing
        (``pd.isna``) or when no word survives the cleaning.
    """
    if pd.isna(text):
        return []
    # (2) Remove punctuation and symbols
    text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    # (5) Tokenise
    tokens = text.split()
    if not tokens:
        return []

    # (3) Spelling normalisation. Each distinct word is looked up only once
    # per call, so repeated words do not pay for the correction again.
    corrections = {}
    for word in tokens:
        if word not in corrections:
            fixed = spell_en.correction(word)
            # The checker can return None; fall back to the original word.
            corrections[word] = word if fixed is None else str(fixed)
    tokens = [corrections[word] for word in tokens]

    # (1) Stopword removal
    tokens = [w for w in tokens if w not in stopwords_en]
    if not tokens:
        return []

    # (4) Lemmatise with spaCy. `tokens` is non-empty here, so the joined
    # string is never blank.
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

# === Apply to the PTA abstract columns ===
pta_all["abstrak_id_clean"] = pta_all["abstrak_id"].apply(preprocess_text_id)
pta_all["abstrak_en_clean"] = pta_all["abstrak_en"].apply(preprocess_text_en)

# === Save the result ===
# Written under Data/ (the folder the input was read from) because the cell
# that inspects this output reads "Data/preprocessing_pta_all.csv".
pta_all.to_csv("Data/preprocessing_pta_all.csv", index=False)

# Sample of the result
print("Jumlah data total:", len(pta_all))
pta_all[["abstrak_id_clean", "abstrak_en_clean"]].head()


[nltk_data] Downloading package stopwords to C:\Users\INFINIX
[nltk_data]     EBC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Jumlah data total: 14664


Unnamed: 0,abstrak_id_clean,abstrak_en_clean
0,"[abstrak, implementasi, fungsi, legislasi, dpr...","[abstract, implementation, legislation, parlia..."
1,"[badan, usaha, milik, negara, bumn, badan, usa...","[state, own, enterprise, business, entity, par..."
2,"[narkoba, henti, henti, dengar, media, televis...","[drug, case, endlessly, hear, television, radi..."
3,"[produk, elektronik, benda, gerak, hasil, pros...","[electronic, product, object, move, production..."
4,[],[]


In [7]:
import pandas as pd

# Display options so the table renders more neatly
pd.options.display.max_colwidth = 100  # at most 100 characters of text per column
pd.options.display.max_rows = 20       # show at most 20 rows

# === Read the preprocessing output ===
# NOTE(review): on_bad_lines="skip" silently drops rows the parser cannot read.
df_pre = pd.read_csv("Data/preprocessing_pta_all.csv", engine="python", on_bad_lines="skip")

# === Take 20 random rows (prodi, abstrak_id_clean, abstrak_en_clean) ===
contoh = df_pre.loc[:, ["prodi", "abstrak_id_clean", "abstrak_en_clean"]].sample(n=20, random_state=42)

# === Show the table in the notebook ===
contoh


Unnamed: 0,prodi,abstrak_id_clean,abstrak_en_clean
3207,Ilmu Kelautan,"['teliti', 'rumus', 'produksi', 'tingkat', 'minta', 'hambat', 'produksi', 'tidak', 'rempa', 'mou...","['study', 'problem', 'formulation', 'production', 'increase', 'high', 'permintaanpun', 'occur', ..."
4256,Manajemen,"['abstrak', 'dasar', 'hasil', 'observasi', 'teliti', 'tenaga', 'didik', 'milik', 'motivasi', 'ke...","['abstract', 'job', 'placement', 'urgent', 'matter', 'motivate', 'staff', 'work', 'place', 'yet'..."
8830,Ilmu Komunikasi,"['abstrak', 'skripsi', 'judul', 'strategi', 'komunikasi', 'bpjs', 'sehat', 'puas', 'serta', 'bpj...","['abstract', 'thesis', 'entitle', 'communication', 'strategy', 'boy', 'kesehatan', 'boys', 'kese..."
6853,Teknik Informatika,"['teknologi', 'mobile', 'game', 'kembang', 'pesat', 'mobile', 'game', 'milik', 'minat', 'kalang'...","['mobile', 'game', 'technology', 'grow', 'rapidly', 'mobile', 'game', 'lot', 'enthusiast', 'vari..."
2495,Agribisnis,"['lurah', 'tal', 'lurah', 'kota', 'diri', 'sentra', 'buat', 'takwa', 'jajan', 'khas', 'kota', 'd...","['tinalan', 'urban', 'village', 'village', 'locate', 'city', 'keri', 'center', 'make', 'tofu', '..."
1547,Teknologi Industri Pertanian,"['kendali', 'mutu', 'proses', 'produksi', 'kerupuk', 'pul', 'tuju', 'mengi', 'dentifikasi', 'fak...","['quality', 'control', 'production', 'process', 'aim', 'wheeze', 'puli', 'cracker', 'identify', ..."
10778,Ekonomi Syariah,"['skripsi', 'judul', 'analis', 'pengaruh', 'strategi', 'srgmentasi', 'targeting', 'positioning',...","['study', 'entitle', 'analysis', 'influence', 'segmentation', 'target', 'position', 'strategy', ..."
11821,Pgsd,"['teliti', 'tuju', 'nilai', 'nilai', 'karakter', 'ekstrakurikuler', 'pencak', 'silat', 'tapak', ...","['purpose', 'research', 'understand', 'martial', 'art', 'tapas', 'ekstracuricullar', 'build', 's..."
2980,Agroteknologi,"['abstrak', 'asap', 'cair', 'tempurung', 'kelapa', 'hasil', 'kondensasi', 'pirolisis', 'bahan', ...","['abstract', 'liquid', 'smoke', 'coconut', 'shell', 'result', 'condensation', 'pyrolysis', 'orga..."
2081,Agribisnis,"['teliti', 'kabupaten', 'bangkal', 'tuju', 'karakteristik', 'responden', 'karakteristik', 'usaha...","['study', 'aim', 'analyze', 'characteristic', 'respondent', 'business', 'characteristic', 'jasmi..."


### Preprocessing Hasil Crawling PTA (Teknik)

In [None]:
import pandas as pd
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from spellchecker import SpellChecker
import spacy

# Download the NLTK stopword lists (a one-time step)
nltk.download('stopwords')

# === Stopwords (Indonesian and English) ===
stopwords_id, stopwords_en = (
    set(stopwords.words(language)) for language in ("indonesian", "english")
)

# === Indonesian stemmer ===
factory = StemmerFactory()
stemmer_id = factory.create_stemmer()

# === English spell checker ===
spell_en = SpellChecker(language="en")

# === spaCy English model ===
nlp = spacy.load("en_core_web_sm")

# === Load dataset ===
pta_all = pd.read_csv("Data/pta_all.csv")

# === Study programmes (prodi) of the Faculty of Engineering ===
prodi_teknik = [
    "Teknik Industri",
    "Teknik Informatika",
    "Manajemen Informatika",
    "Teknik Multimedia Dan Jaringan",
    "Mekatronika",
    "Teknik Elektro",
    "Sistem Informasi",
    "Teknik Mesin",
    "Teknik Mekatronika",
]

# === Keep only Faculty of Engineering rows ===
# .copy() gives an independent frame, so new columns can be added to it later.
pta_teknik = pta_all.loc[pta_all["prodi"].isin(prodi_teknik)].copy()

# ========================
# Fungsi Preprocessing Indo
# ========================
def preprocess_text_id(text):
    """Turn one Indonesian text into a list of stemmed, stopword-free tokens.

    Missing input (``pd.isna``) yields an empty list. Otherwise the text is
    lowercased, anything that is not an ASCII letter or whitespace becomes a
    space, the text is split on whitespace, words in ``stopwords_id`` are
    removed and the rest are stemmed with ``stemmer_id.stem``.

    Returns:
        list of str, possibly empty.
    """
    if pd.isna(text):
        return []
    cleaned = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    # Filter on the raw word, then stem what is left.
    return [
        stemmer_id.stem(word)
        for word in cleaned.split()
        if word not in stopwords_id
    ]

# ========================
# Fungsi Preprocessing English
# ========================
def preprocess_text_en(text):
    """Turn one English text into a list of spell-corrected, lemmatised tokens.

    Missing input (``pd.isna``) yields an empty list. Otherwise the text is
    lowercased, anything that is not an ASCII letter or whitespace becomes a
    space, and the text is split on whitespace. Every word is replaced by
    ``spell_en.correction(word)`` (or kept as-is when that returns ``None``),
    words in ``stopwords_en`` are removed, and the remainder is joined and
    passed through the spaCy pipeline ``nlp`` to collect ``token.lemma_``.

    Returns:
        list of str, possibly empty.
    """
    if pd.isna(text):
        return []
    # (2) Remove punctuation and symbols
    text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    # (5) Tokenise
    tokens = text.split()
    if not tokens:
        return []
    # (3) Spelling normalisation: look each distinct word up once per call
    # instead of once per occurrence.
    corrected = {}
    for word in dict.fromkeys(tokens):
        suggestion = spell_en.correction(word)
        # The checker may return None; keep the original word in that case.
        corrected[word] = suggestion if suggestion is not None else word
    # (1) Stopword removal, applied to the corrected spelling
    tokens = [corrected[w] for w in tokens if corrected[w] not in stopwords_en]
    if not tokens:
        return []
    # (4) Lemmatise with spaCy (only reached when tokens remain)
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

# === Apply to the Faculty of Engineering abstracts ===
pta_teknik["abstrak_id_clean"] = pta_teknik["abstrak_id"].apply(preprocess_text_id)
pta_teknik["abstrak_en_clean"] = pta_teknik["abstrak_en"].apply(preprocess_text_en)

# === Save the result (without the index column) ===
pta_teknik.to_csv(path_or_buf="preprocessing_pta_teknik.csv", index=False)

# Sample of the result: row count, then the first five rows
print("Jumlah data Fakultas Teknik:", pta_teknik.shape[0])
display(pta_teknik.loc[:, ["prodi", "abstrak_id_clean", "abstrak_en_clean"]].head(5))


[nltk_data] Downloading package stopwords to C:\Users\INFINIX
[nltk_data]     EBC\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Jumlah data Fakultas Teknik: 2289


Unnamed: 0,prodi,abstrak_id_clean,abstrak_en_clean
6092,Teknik Industri,"[portofolio, kumpul, saham, milik, investor, s...","[portfolio, collection, stock, own, investor, ..."
6093,Teknik Industri,"[pt, abc, usaha, gerak, bidang, manufaktur, ka...","[pt, arc, company, engage, manufacture, wood, ..."
6094,Teknik Industri,"[bangkal, salah, kabupaten, milik, potensi, al...","[bangkalan, one, district, potential, natural,..."
6095,Teknik Industri,"[simulasi, duplikasi, abstraksi, hidup, nyata,...","[simulation, duplication, abstraction, real, l..."
6096,Teknik Industri,"[puas, tingkat, asa, layan, banding, kerja, ha...","[satisfaction, feel, level, someone, service, ..."


### Preprocessing Hasil Crawling Web Berita

In [5]:
import pandas as pd
import re
import nltk
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords

# Download the NLTK stopword lists (a one-time step)
nltk.download('stopwords')

# === Indonesian stemmer & stopwords ===
factory = StemmerFactory()
stemmer_id = factory.create_stemmer()
stopwords_id = {*stopwords.words("indonesian")}

# === Load dataset ===
tempo_berita = pd.read_csv("Data/tempo_berita.csv")

# ========================
# Fungsi Preprocessing Indo
# ========================
def preprocess_text_id(text):
    """Clean one Indonesian text and return its stemmed tokens.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is replaced by a space, and the result is split on whitespace.
    Words found in ``stopwords_id`` are dropped and each remaining word is
    passed through ``stemmer_id.stem``.

    Returns:
        list of str. Missing input (``pd.isna``) gives an empty list, not an
        empty string, so every call returns the same type and the resulting
        column does not mix strings with lists.
    """
    if pd.isna(text):
        return []
    # Remove symbols and punctuation
    text = re.sub(r"[^a-zA-Z\s]", " ", text.lower())
    # Tokenise, drop stopwords, then stem with Sastrawi
    return [
        stemmer_id.stem(w)
        for w in text.split()
        if w not in stopwords_id
    ]

# === Apply to the Tempo news headline and body columns ===
tempo_berita["judul_clean"] = tempo_berita["judul_berita"].apply(preprocess_text_id)
tempo_berita["isi_clean"] = tempo_berita["isi_berita"].apply(preprocess_text_id)

# === Save the result (without the index column) ===
tempo_berita.to_csv(path_or_buf="preprocessing_berita.csv", index=False)

# Sample of the result: first five rows of the cleaned columns
tempo_berita.loc[:, ["judul_clean", "isi_clean"]].head(5)


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,judul_clean,isi_clean
0,"[roy, suryo, alas, gelar, bedah, buku, jokowi,...","[tulis, bukujokowi, s, white, paperyaituroy, s..."
1,"[fadli, zon, gugat, adil, perkosa, massal]","[koalisi, masyarakat, sipil, lawan, impunitas,..."
2,"[jusuf, kalla, tuntut, demo, cermin, kondisi, ...","[mantan, wakil, presidenjusuf, kallamengatakan..."
3,"[mu, ti, sekolah, kembali, smart, tv, kenan]","[menteri, didik, dasar, tengah, abdul, mu, ti,..."
4,"[fraksi, gerindra, undur, rahayu, saraswati, m...","[fraksi, gerindra, aku, kaget, putus, undur, r..."
