<a href="https://colab.research.google.com/github/MichaelArgs/Tugas-PenambanganData/blob/main/Klasifikasi_Berita.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 1. Persiapan dan Import Library

In [1]:

# Bagian 1: Import Library yang Diperlukan
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from nltk.stem import PorterStemmer
import re

# Fetch the NLTK data this notebook relies on (needed on a fresh runtime
# such as Google Colab): 'punkt_tab' backs word_tokenize and 'stopwords'
# provides the stopword lists.
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## 2. Preprocessing Data

In [2]:

# Section 2: Data preprocessing
# Load the dataset. 'dataset.csv' must exist in the working directory,
# otherwise pd.read_csv raises FileNotFoundError. Later cells read its
# 'Content' and 'Category' columns.
data = pd.read_csv('dataset.csv')

def clean_text(text):
    """
    Normalize raw text ahead of tokenization.

    Removes digits and punctuation (underscores included), collapses every
    run of whitespace to a single space, trims both ends and lowercases
    the result.

    Args:
        text: The raw text. Any non-string value (for example None, or the
            float NaN pandas uses for a missing cell) is treated as empty
            text instead of raising TypeError.

    Returns:
        The cleaned, lowercased string; '' when the input is not a string.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'\d+', '', text)
    # `\w` treats '_' as a word character, so it has to be removed explicitly.
    text = re.sub(r'[^\w\s]|_', '', text)
    # Deleting digits/punctuation can leave whitespace at the ends; trim it.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Normalise the raw article text in place
data['Content'] = data['Content'].apply(clean_text)

# Split every cleaned article into word tokens
data['Tokens'] = data['Content'].apply(word_tokenize)

# Stopword removal: keep only tokens that are not Indonesian stopwords
stop_words = set(stopwords.words('indonesian'))


def _drop_stopwords(tokens):
    """Return the tokens that are not present in ``stop_words``."""
    return [token for token in tokens if token not in stop_words]


data['Filtered'] = data['Tokens'].apply(_drop_stopwords)

# Stemming
# NOTE(review): PorterStemmer targets English, while the stopword list above
# is Indonesian -- confirm this pairing is intended.
stemmer = PorterStemmer()


def _stem_tokens(tokens):
    """Return the stemmed form of every token, in order."""
    return [stemmer.stem(token) for token in tokens]


data['Stemmed'] = data['Filtered'].apply(_stem_tokens)

# Re-join the stems into one space-separated string per document
data['Processed_Text'] = data['Stemmed'].apply(' '.join)

# Preview the preprocessing result
data.head()


FileNotFoundError: [Errno 2] No such file or directory: 'dataset.csv'

## 3. TF-IDF Vectorization

In [None]:

# Section 3: TF-IDF vectorization
# Fit a TF-IDF model (max_features=10000) on the preprocessed documents
tfidf_vectorizer = TfidfVectorizer(max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(data['Processed_Text'])

# Build a table of the TF-IDF weights with one column per vocabulary term
tfidf_weights = tfidf_matrix.toarray()
tfidf_words = tfidf_vectorizer.get_feature_names_out()
tfidf_df = pd.DataFrame(data=tfidf_weights, columns=tfidf_words)

# Preview the first ten rows
tfidf_df.head(n=10)


## 4. Implementasi Algoritma


In [None]:

# Section 4: Vector Space Model (VSM) -- classify one document by copying the
# label of its most cosine-similar training document.
# Encode the category names as integer labels
label_encoder = LabelEncoder()
data['Label'] = label_encoder.fit_transform(data['Category'])

# Split the dataset into train (80%) and test (20%) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, data['Label'], test_size=0.2, random_state=42)

# Read the document to classify. The encoding is passed explicitly so the
# text is decoded the same way on every platform instead of depending on the
# locale default (assumes Test2.txt is UTF-8 -- confirm).
with open('Test2.txt', 'r', encoding='utf-8') as file:
    test_doc = file.read()

# Apply the same preprocessing steps that were applied to the dataset
cleaned_test_doc = clean_text(test_doc)
tokenized_test_doc = word_tokenize(cleaned_test_doc)
filtered_test_doc = [word for word in tokenized_test_doc if word not in stop_words]
stemmed_test_doc = [stemmer.stem(word) for word in filtered_test_doc]
normalized_test_doc = ' '.join(stemmed_test_doc)

# Vectorize the document with the already-fitted TF-IDF model
test_vector = tfidf_vectorizer.transform([normalized_test_doc])

# Cosine similarity against every training document; the result has a single
# row, so the flat argmax is the index of the best-matching training document.
cos_similarities = cosine_similarity(test_vector, X_train)
predicted_index = np.argmax(cos_similarities)

# When no similarity is positive (e.g. an empty document or one with no
# in-vocabulary term), argmax falls back to index 0 and the prediction is
# arbitrary. Say so instead of reporting it silently.
if cos_similarities.max() <= 0:
    print("Peringatan: dokumen uji tidak memiliki kemiripan dengan data latih; prediksi tidak dapat diandalkan.")

# y_train keeps the original DataFrame index after the split, so convert it
# to an array for positional lookup (avoids KeyError).
y_train_array = np.array(y_train)
predicted_label = y_train_array[predicted_index]

# Map the predicted integer label back to its category name
predicted_category = label_encoder.inverse_transform([predicted_label])[0]
print(f"Prediksi Kategori untuk Dokumen Uji: {predicted_category}")


## 5. Evaluasi Model

In [None]:

# Section 5: Model evaluation
# NOTE(review): tfidf_vectorizer was fitted on the whole dataset before the
# train/test split, so its IDF statistics include the test documents; the
# scores below may therefore be somewhat optimistic.

# Cosine similarity between every test document and every training document
cos_similarities = cosine_similarity(X_test, X_train)

# For each test document, take the position of its most similar training document
predicted_indices = cos_similarities.argmax(axis=1)

# y_train keeps the original DataFrame index after the split, so convert it
# to an array for positional lookup (avoids KeyError).
y_train_array = np.array(y_train)
y_pred = y_train_array[predicted_indices]

# Convert predicted and true integer labels back to category names
y_pred_labels = label_encoder.inverse_transform(y_pred)
y_true_labels = label_encoder.inverse_transform(y_test)

# Show the classification report. `labels` is passed explicitly: without it,
# classification_report infers the classes from y_true/y_pred and raises
# ValueError when a category is missing from the test split, because the
# inferred classes no longer match the length of `target_names`.
report = classification_report(
    y_true_labels,
    y_pred_labels,
    labels=label_encoder.classes_,
    target_names=label_encoder.classes_,
)
print("Classification Report:")
print(report)
