## 1. Import Library
- pandas: Digunakan untuk manipulasi dan analisis data, khususnya dalam bentuk tabel (DataFrame). 
- nltk: Toolkit untuk pemrosesan teks (NLP), seperti tokenisasi, stemming, lemmatization, dan penghilangan stopwords. 
- string: Modul bawaan untuk manipulasi teks, misalnya penghapusan tanda baca. 
- re: Modul untuk operasi regular expressions, digunakan untuk pencarian atau manipulasi pola teks.
- CountVectorizer mengubah teks menjadi vektor numerik di mana setiap kolom mewakili sebuah kata unik (disebut term) dari seluruh korpus (kumpulan teks).

In [1]:
import pandas as pd
import nltk
import string
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SLM\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SLM\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SLM\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 2. Load dataset

In [3]:
# membaca dataset bola dan health menggunakan modul read_CSV
bola = pd.read_csv('bola.csv')
health = pd.read_csv('health.csv')

# menggabungkan kedua dataset menngunakan concat
berita = pd.concat([bola, health])

# menyimpan dataset yang digabung menjadi file csv baru
berita.to_csv('berita.csv')

In [4]:
# data csv sebelumnya disimpan pada variable berita
berita = pd.read_csv('berita.csv')

# menampilkan data brita indeks teratas
berita.head()

Unnamed: 0.1,Unnamed: 0,judul,waktu,url,konten,kategori
0,0,MotoGP San Marino 2024: Ketika Marquez Menang ...,"- 09/09/2024, 05:00 WIB",https://www.kompas.com/motogp/read/2024/09/09/...,KOMPAS.com - Marc Marquez sukses memenangi Mot...,Bola
1,1,"Klasemen PON XXI 2024: Futsal Gagal Emas, Jati...","- 09/09/2024, 07:59 WIB",https://www.kompas.com/sports/read/2024/09/09/...,KOMPAS.com - Tim futsal Jawa Timur (Jatim) gag...,Bola
2,2,Hasil One Pride MMA 82: Frans Sormin Bekuk Jak...,"- 09/09/2024, 14:31 WIB",https://www.kompas.com/sports/read/2024/09/09/...,KOMPAS.com - Frans Lincol Sormin mengalahkan J...,Bola
3,3,"Indonesia Vs Australia, Kata Manajer Timnas so...","- 09/09/2024, 10:29 WIB",https://bola.kompas.com/read/2024/09/09/102929...,KOMPAS.com - Timnas Indonesia belum menentukan...,Bola
4,4,Dampak Kehadiran Marteen Paes untuk Persaingan...,"- 09/09/2024, 11:30 WIB",https://bola.kompas.com/read/2024/09/09/113000...,KOMPAS.com - Penampilan penjaga gawang timnas ...,Bola


## 3. Preprocess Dataset

In [5]:
def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # Hapus angka
    text = re.sub(r'\d+', '', text)
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Hapus tanda baca
    tokens = [word for word in tokens if word.isalnum()]
    
    # Stopword removal
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    
    # Gabungkan kembali token menjadi string
    return ' '.join(tokens)

In [6]:
# Preprocessing seluruh data
berita['processed_text'] = berita['konten'].apply(preprocess_text)

In [7]:
berita['processed_text'][0]

'marc marquez sukses memenangi motogp san marino pebalap beralias baby alien itu mengakui adanya bantuan hujan rentang sepekan ini terasa begitu indah bagi pebalap gresini racing marc marquez bagaimana tidak setelah sempat melalui tiga tahun tanpa kemenangan pada balapan motogp marquez langsung berjaya dan finis pertama dalam dua seri beruntun yakni di aragon dan san marino kemenangannya di motogp aragon pekan lalu mengakhiri penantian marquez selama hari untuk berdiri di podium pertama selang sepekan marquez kembali naik podium tertinggi di balapan motogp san marino yang digelar di sirkuit misano minggu baca juga klasemen motogp usai gp san marino marquez bagnaia tempel martin di puncak marquez finis lebih dari tiga detik di depan francesco bagnaia ducati lenovo team yang memulai balapan dari posisi terdepan pebalap asal cervera itu melesat dari posisi sembilan di grid dan memimpin balapan saat lintasan basah sesaat akibat guyuran hujan ringan bantuan hujan ini diakui oleh marquez pem

## 4. Pemodelan Vector Space

In [8]:
# menyimpan modul CounVectorizer kedalam variable vectorizer
vectorizer = CountVectorizer()

# melakukan daftar terms dari dataset yang telah di-preprocessed kemudian mengubahnya (tranformasi) ke dalam bentuk numerik menggunakan fit_transform()
X_tf = vectorizer.fit_transform(berita['processed_text'])

# mengonversi hasil dari vectorizer dari bentuk sparse matrix ke dalam bentuk DataFrame
df_tf = pd.DataFrame(X_tf.toarray(), columns=vectorizer.get_feature_names_out())

# Inisialisasi TfidfTransformer
tfidf_transformer = TfidfTransformer()

# Transformasi dari TF ke TF-IDF
X_tfidf = tfidf_transformer.fit_transform(X_tf)

# Mengonversi hasil TF-IDF ke DataFrame
df_tfidf = pd.DataFrame(X_tfidf.toarray(), columns=vectorizer.get_feature_names_out())

In [10]:
# menyimpan model ke dalam file CSV baru
df_tf.to_csv('VSM.csv')
df_tfidf.to_csv('VSM dengan Pembobotan.csv')

## 5. Hasil

In [38]:
dfTF = pd.read_csv('VSM.csv')
dfTF = dfTF.drop("Unnamed: 0", axis='columns')
dfTF

Unnamed: 0,abdominal,abdullah,abdurachman,abnormal,absen,abstrak,academy,acara,aceh,achilles,...,yuli,yulia,zambo,zat,zedoaria,ziad,zink,zirkzee,zona,zymuno
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
96,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
97,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
98,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
dfTFIDF = pd.read_csv('VSM dengan Pembobotan.csv')
dfTFIDF = dfTFIDF.drop("Unnamed: 0", axis='columns')
dfTFIDF

Unnamed: 0,abdominal,abdullah,abdurachman,abnormal,absen,abstrak,academy,acara,aceh,achilles,...,yuli,yulia,zambo,zat,zedoaria,ziad,zink,zirkzee,zona,zymuno
0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
1,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
2,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
3,0.000000,0.04922,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.065383,0.0
4,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.044749,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
96,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
97,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
98,0.049152,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0
