<a href="https://colab.research.google.com/github/MichaelArgs/Tugas-PenambanganData/blob/main/Klasifikasi_Berita.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Persiapan dan Import Library

In [12]:

# Bagian 1: Import Library yang Diperlukan
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
import re

# Download the NLTK resources (for use on Google Colab).
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## 2. Preprocessing Data

In [13]:

# Section 2: data preprocessing
# Load the dataset from the CSV file into a DataFrame.
DATASET_PATH = 'dataset.csv'
data = pd.read_csv(DATASET_PATH)

def clean_text(text):
    """
    Normalize a raw text string for tokenization.

    Steps, in order:
      1. remove every run of digits,
      2. remove punctuation (anything that is not a word character or
         whitespace, plus the underscore, which ``\\w`` would otherwise keep),
      3. collapse whitespace runs to a single space and trim both ends,
      4. lowercase the result.

    Parameters
    ----------
    text : str or any
        The raw text. Non-string values (e.g. ``None`` or the float NaN that
        pandas uses for missing cells) are treated as empty text instead of
        raising ``TypeError`` inside ``re.sub``.

    Returns
    -------
    str
        The cleaned text; ``''`` for missing or non-string input.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'\d+', '', text)
    # `\w` matches '_', so it has to be listed explicitly to be stripped.
    text = re.sub(r'[^\w\s]|_', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Clean the article text (the 'Content' column is overwritten with the cleaned version).
data['Content'] = data['Content'].apply(clean_text)

# Tokenization: split each cleaned article into a list of word tokens.
data['Tokens'] = data['Content'].apply(word_tokenize)

# Stopword removal, using NLTK's 'indonesian' stopword list.
stop_words = set(stopwords.words('indonesian'))


def _remove_stopwords(tokens):
    """Return the tokens that are not present in ``stop_words``."""
    return [token for token in tokens if token not in stop_words]


data['Filtered'] = data['Tokens'].apply(_remove_stopwords)

# Stemming.
# NOTE(review): PorterStemmer implements the English Porter algorithm while the
# stopword list above is Indonesian -- confirm this stemmer is really intended.
stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the stem of every token, in the original order."""
    return [stemmer.stem(token) for token in tokens]


data['Stemmed'] = data['Filtered'].apply(_stem_tokens)

# Re-join the stemmed tokens into one space-separated string per document.
data['Processed_Text'] = data['Stemmed'].apply(' '.join)

# Show the first rows of the preprocessing result.
data.head()


Unnamed: 0,Category,Content,Tokens,Filtered,Stemmed,Processed_Text
0,Edukasi,memaknai hari pahlawan setiap tanggal november...,"[memaknai, hari, pahlawan, setiap, tanggal, no...","[memaknai, pahlawan, tanggal, november, salah,...","[memaknai, pahlawan, tanggal, novemb, salah, m...",memaknai pahlawan tanggal novemb salah masyara...
1,Edukasi,pemerintah mempunyai peran yang besar dalam me...,"[pemerintah, mempunyai, peran, yang, besar, da...","[pemerintah, peran, menggerakkan, kegiatan, ek...","[pemerintah, peran, menggerakkan, kegiatan, ek...",pemerintah peran menggerakkan kegiatan ekonomi...
2,Edukasi,virus sering diperdebatkan statusnya sebagai m...,"[virus, sering, diperdebatkan, statusnya, seba...","[virus, diperdebatkan, statusnya, makhluk, hid...","[viru, diperdebatkan, statusnya, makhluk, hidu...",viru diperdebatkan statusnya makhluk hidup lan...
3,Edukasi,cermin cembung atau yang juga kerap disebut ju...,"[cermin, cembung, atau, yang, juga, kerap, dis...","[cermin, cembung, kerap, cermin, lengkung, sal...","[cermin, cembung, kerap, cermin, lengkung, sal...",cermin cembung kerap cermin lengkung salah jen...
4,Edukasi,ekologi adalah ilmu yang membahas tentang hubu...,"[ekologi, adalah, ilmu, yang, membahas, tentan...","[ekologi, ilmu, membahas, hubungan, organisme,...","[ekolog, ilmu, membaha, hubungan, organism, li...",ekolog ilmu membaha hubungan organism lingkung...


## 3. TF-IDF Vectorization

In [14]:

# Section 3: TF-IDF vectorization
# Turn the preprocessed text into TF-IDF features, capped at MAX_TFIDF_FEATURES terms.
# NOTE(review): the vectorizer is fitted on every row of `data`; the train/test split
# is only made afterwards from `tfidf_matrix`, so held-out documents also contribute
# to the fitted vocabulary and IDF weights.
MAX_TFIDF_FEATURES = 10000
tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TFIDF_FEATURES)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Processed_Text'])

# Tabulate the TF-IDF terms (columns) and their weights per document (rows).
# NOTE(review): `toarray()` converts the whole matrix although only 10 rows are shown.
tfidf_words = tfidf_vectorizer.get_feature_names_out()
tfidf_weights = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(data=tfidf_weights, columns=tfidf_words)
tfidf_df.head(10)


Unnamed: 0,aa,aan,aarhu,aaron,ab,abad,abang,abde,abdul,abdullah,...,zombi,zona,zone,zonta,zoom,zrf,zuckerberg,zulfiadi,zulha,zulkifli
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0523,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 4. Implementasi Algoritma


In [15]:
# Encode the category names as integer labels and store them in a 'Label' column.
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(data['Category'])
data['Label'] = encoded_labels

# Split the TF-IDF features and labels into train and test sets
# (20% held out for testing, fixed seed so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_matrix,
    data['Label'],
    test_size=0.2,
    random_state=42,
)

# ==============================================
# VSM (Vector Space Model) approach
# ==============================================
# Read the document to classify. The encoding is given explicitly so the text is
# decoded the same way on every platform instead of depending on the locale default.
with open('Test2.txt', 'r', encoding='utf-8') as file:
    test_doc = file.read()

# Apply the same preprocessing steps as for the dataset:
# clean -> tokenize -> remove stopwords -> stem -> re-join.
cleaned_test_doc = clean_text(test_doc)
tokenized_test_doc = word_tokenize(cleaned_test_doc)
filtered_test_doc = [word for word in tokenized_test_doc if word not in stop_words]
stemmed_test_doc = [stemmer.stem(word) for word in filtered_test_doc]
normalized_test_doc = ' '.join(stemmed_test_doc)

# Vectorize the test document with the already-fitted TF-IDF vectorizer.
test_vector = tfidf_vectorizer.transform([normalized_test_doc])

# Cosine similarity between the test document and every training document.
# The result has a single row (one query document), so the most similar training
# document is looked up on that row explicitly rather than on the flattened matrix.
cos_similarities = cosine_similarity(test_vector, X_train)
predicted_index = np.argmax(cos_similarities[0])

# `predicted_index` is a position within X_train. y_train comes from splitting a
# pandas Series, so it is converted to an array and indexed by position
# (avoids a KeyError from label-based lookup).
y_train_array = np.array(y_train)
predicted_label = y_train_array[predicted_index]

# Convert the predicted integer label back to the original category name.
predicted_category = label_encoder.inverse_transform([predicted_label])[0]
print(f"Prediksi Kategori untuk Dokumen Uji (VSM): {predicted_category}")

# ==============================================
# Random Forest approach
# ==============================================
# RandomForestClassifier is already imported in the notebook's import cell,
# so it is not imported again here.

# Train the Random Forest model on the training split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict the category of the test document.
rf_predicted_label = rf_classifier.predict(test_vector)[0]
rf_predicted_category = label_encoder.inverse_transform([rf_predicted_label])[0]
print(f"Prediksi Kategori untuk Dokumen Uji (Random Forest): {rf_predicted_category}")

# ==============================================
# KNN (K-Nearest Neighbors) approach
# ==============================================
# KNeighborsClassifier is already imported in the notebook's import cell,
# so it is not imported again here.

# Train the KNN model on the training split.
knn_classifier = KNeighborsClassifier(n_neighbors=5)
knn_classifier.fit(X_train, y_train)

# Predict the category of the test document.
knn_predicted_label = knn_classifier.predict(test_vector)[0]
knn_predicted_category = label_encoder.inverse_transform([knn_predicted_label])[0]
print(f"Prediksi Kategori untuk Dokumen Uji (KNN): {knn_predicted_category}")


Prediksi Kategori untuk Dokumen Uji (VSM): Politik
Prediksi Kategori untuk Dokumen Uji (Random Forest): Politik
Prediksi Kategori untuk Dokumen Uji (KNN): Ekonomi


## 5. Evaluasi Model

In [16]:
def print_classification_report(title, y_true_names, y_pred_encoded):
    """
    Print a titled classification report for one model and return its
    predictions as category names.

    Parameters
    ----------
    title : str
        Heading printed before the report.
    y_true_names : array-like of str
        True category names of the test documents.
    y_pred_encoded : array-like of int
        Predicted labels as encoded by ``label_encoder``.

    Returns
    -------
    numpy.ndarray
        The predictions converted back to category names.
    """
    y_pred_names = label_encoder.inverse_transform(y_pred_encoded)
    print(title)
    # `labels` is passed explicitly so the report always covers every known class.
    # With `target_names` alone, classification_report raises ValueError when a
    # class is missing from both the test labels and the predictions.
    print(classification_report(
        y_true_names,
        y_pred_names,
        labels=label_encoder.classes_,
        target_names=label_encoder.classes_,
    ))
    return y_pred_names


# True categories of the test split, shared by all three evaluations.
y_true_labels = label_encoder.inverse_transform(y_test)

# ==============================================
# VSM (Vector Space Model) evaluation
# ==============================================
# Cosine similarity between every test document (rows) and every training
# document (columns). Note: this rebinds `cos_similarities`.
cos_similarities = cosine_similarity(X_test, X_train)

# For each test document, take the label of the most similar training document.
predicted_indices = cos_similarities.argmax(axis=1)

# Index by position through an array (avoids a KeyError from label-based lookup
# on the pandas Series).
y_train_array = np.array(y_train)
y_pred_vsm = y_train_array[predicted_indices]

y_pred_labels_vsm = print_classification_report(
    "Evaluasi VSM (Vector Space Model):", y_true_labels, y_pred_vsm
)

# ==============================================
# Random Forest evaluation
# ==============================================
rf_y_pred = rf_classifier.predict(X_test)
rf_y_pred_labels = print_classification_report(
    "\nEvaluasi Random Forest:", y_true_labels, rf_y_pred
)

# ==============================================
# KNN (K-Nearest Neighbors) evaluation
# ==============================================
knn_y_pred = knn_classifier.predict(X_test)
knn_y_pred_labels = print_classification_report(
    "\nEvaluasi KNN:", y_true_labels, knn_y_pred
)


Evaluasi VSM (Vector Space Model):
              precision    recall  f1-score   support

     Edukasi       0.86      0.79      0.83        24
     Ekonomi       0.69      0.69      0.69        13
   GayaHidup       0.94      0.75      0.83        20
     Hiburan       0.95      0.91      0.93        23
    Olahraga       0.93      1.00      0.96        13
    Otomotif       0.88      0.95      0.91        22
     Politik       0.83      0.86      0.84        22
   Teknologi       0.81      0.91      0.86        23

    accuracy                           0.86       160
   macro avg       0.86      0.86      0.86       160
weighted avg       0.87      0.86      0.86       160


Evaluasi Random Forest:
              precision    recall  f1-score   support

     Edukasi       0.91      0.88      0.89        24
     Ekonomi       0.65      0.85      0.73        13
   GayaHidup       0.85      0.85      0.85        20
     Hiburan       0.96      0.96      0.96        23
    Olahraga      