Tahapan untuk text clustering

1. Load data
2. Preprocessing (hapus karakter yang tidak diperlukan dengan regex, stopword removal, stemming atau lemmatization)
3. Ekstraksi fitur dengan TF-IDF atau metode lain
4. Masukkan fitur ke algoritma clustering

# Kumpulan Library

In [None]:
import pandas as pd
from nltk import sent_tokenize 
from nltk import word_tokenize 
from nltk.corpus import stopwords
import regex as re

# Load data

In [None]:
# Read the exercise workbook into a DataFrame.
data = pd.read_excel("ExerciseSAClass.xlsx")

data

# Keep only the rows whose 'Language' column equals 'Spanish'.
filtered_data = data.loc[data['Language'] == 'Spanish']

filtered_data

# Project onto the identifier, text, language and sentiment columns.
selected_columns = filtered_data[
    ['Post ID', 'Post description', 'Language', 'Sentiment']
]

selected_columns

# Preprocessing

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
import regex as re

# NLTK's Spanish stopword list, held in a set so membership tests are cheap.
stop_words = {*stopwords.words('spanish')}

def preprocess(doc):
    """Clean one raw post and return it as a space-separated string of words.

    Steps: strip URLs, tokenize with ``word_tokenize``, lowercase, remove
    digits, replace every remaining non-letter with a space, and finally drop
    the words found in the module-level ``stop_words`` set.

    Letters are matched Unicode-aware rather than with an ASCII-only
    ``[^a-zA-Z ]`` class, so accented characters (``á``, ``ñ``, ``ü`` ...) are
    kept. An ASCII-only filter would split "año" into "a o" and would prevent
    accented stopwords such as "más" from ever matching ``stop_words``.

    Args:
        doc: Raw text of a post. Non-string values (for example the NaN that
            pandas uses for empty cells) are treated as empty text.

    Returns:
        The cleaned text, or an empty string when nothing is left.
    """
    import unicodedata

    # Empty spreadsheet cells arrive as float NaN; re.sub would raise on them.
    if not isinstance(doc, str):
        return ""

    # Compose "letter + combining accent" pairs into single code points so the
    # letter filter below sees one letter instead of a letter plus a symbol.
    doc = unicodedata.normalize("NFC", doc)
    doc = re.sub(r'http\S+|www\S+|t\.co\S+', '', doc)

    words = []
    for token in word_tokenize(doc):
        token = token.strip().lower()  # case folding
        token = re.sub(r'[0-9]+', '', token)  # drop digits without splitting the word
        # [\W\d_] matches everything that is not a Unicode letter; such
        # characters become a space so the pieces can be split apart below.
        token = re.sub(r'[\W\d_]+', ' ', token)
        words.extend(token.split())

    return " ".join(word for word in words if word not in stop_words)

# Run every post description through preprocess(), keeping the row order.
docs_clear = [preprocess(text) for text in selected_columns["Post description"]]

docs_clear

# Stopwords dan Stemming Bahasa Indonesia

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from nltk.tokenize import word_tokenize
# Build the Sastrawi stopword remover from its factory.
factory = StopWordRemoverFactory()
stopword = factory.create_stop_word_remover()

# Strip stopwords from each cleaned document, one output string per input.
stop_docs = [stopword.remove(text) for text in docs_clear]

stop_docs

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Sastrawi stemmer instance created through its factory.
stemmer = StemmerFactory().create_stemmer()

# Stem every stopword-filtered document; the result stays a list of strings.
stemmed_docs = list(map(stemmer.stem, stop_docs))

# Stopwords dan Stemming Bahasa Asing

In [None]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# English stopword set and Porter stemmer from NLTK.
# NOTE(review): the data loaded earlier is filtered to Spanish posts, while
# this cell applies English stopwords and an English stemmer - confirm that
# this is intended.
a = set(stopwords.words('english'))

stemmer = PorterStemmer()

# For each document: tokenize the lowercased text, drop stopwords, stem the
# rest. Every entry is a list of stems (not a joined string).
stop_stem_docs = [
    [stemmer.stem(token) for token in word_tokenize(text.lower()) if token not in a]
    for text in docs_clear
]

stop_stem_docs


# Ekstraksi Fitur

## TF IDF

In [None]:
# representasi vektor dengan VSM-TFIDF
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import cluster

# TF-IDF vectorizer restricted by document frequency: min_df=2, max_df=0.50.
tfidf_vectorizer = TfidfVectorizer(min_df=2, max_df=0.50)

# Learn the vocabulary from the stemmed documents and build the feature matrix.
X = tfidf_vectorizer.fit_transform(stemmed_docs)

print(X)

## BoW

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts with the same document-frequency limits as the TF-IDF
# section (min_df=2, max_df=0.50).
bow_vectorizer = CountVectorizer(min_df=2, max_df=0.50)

X_bow = bow_vectorizer.fit_transform(stemmed_docs)

# Show the counts as a dense array.
print(X_bow.toarray())

## Word2Vec

# Model atau algoritma clustering

## KMEANS

In [None]:
from sklearn import cluster
from sklearn.metrics import silhouette_score

# Cluster count and random seed; `seed` is also read by the K-Means++ section.
k = 3
seed = 99

# K-Means with randomly chosen initial centroids, fitted on X.
km = cluster.KMeans(init='random', n_clusters=k, random_state=seed).fit(X)

# Cluster index for every row of X.
C_km = km.predict(X)

sil_score = silhouette_score(X, labels=C_km)

print("Cluster assignment:", C_km)
print("Silhouette Score:", sil_score)


## KMEANS ++

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# K-Means with k-means++ initialisation, 3 clusters, seeded with `seed`
# (defined in the K-Means section).
kmeans_pp = KMeans(init='k-means++', n_clusters=3, random_state=seed).fit(X)

# Cluster index for every row of X, then the silhouette of that labelling.
C_kmeans_pp = kmeans_pp.predict(X)
sil_score_kmeans_pp = silhouette_score(X, labels=C_kmeans_pp)

print("Cluster assignment (KMeans++):", C_kmeans_pp)
print("Silhouette Score (KMeans++):", sil_score_kmeans_pp)


## DBSCAN

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

# Density-based clustering of the rows of X.
dbscan = DBSCAN(eps=0.5, min_samples=5)
C_dbscan = dbscan.fit_predict(X)

# DBSCAN labels noise points with -1. Noise is not a cluster, so it must not
# count towards the "more than one cluster" check, and it is left out of the
# silhouette computation (otherwise all outliers are scored as one cluster).
is_clustered = C_dbscan != -1
n_clusters_dbscan = len(set(C_dbscan[is_clustered]))

if n_clusters_dbscan > 1:
    sil_score_dbscan = silhouette_score(X[is_clustered], C_dbscan[is_clustered])
    print("Silhouette Score (DBSCAN):", sil_score_dbscan)
else:
    print("DBSCAN menghasilkan hanya satu cluster atau outlier, Silhouette Score tidak dapat dihitung.")

print("Cluster assignment (DBSCAN):", C_dbscan)


## Fuzzy C Means

In [None]:
import numpy as np
import skfuzzy as fuzz
from sklearn.metrics import silhouette_score

# skfuzzy's cmeans needs a dense 2-D array laid out as (features, samples).
# The vectorizer output is a scipy sparse matrix, so densify it before
# transposing. NOTE: this materialises the full matrix in memory.
X_fcm = X.toarray() if hasattr(X, "toarray") else np.asarray(X)

cntr, u, u0, d, jm, p, fpc = fuzz.cluster.cmeans(X_fcm.T, c=3, m=2, error=0.005, maxiter=1000, init=None)

# Hard assignment: each sample goes to the cluster with the highest membership.
C_fcm = np.argmax(u, axis=0)

print("Cluster assignment (Fuzzy C-Means):", C_fcm)

# silhouette_score needs between 2 and n_samples - 1 distinct labels; the hard
# assignment can collapse into a single cluster, so check before scoring.
if 1 < len(set(C_fcm)) < len(C_fcm):
    sil_score_fcm = silhouette_score(X, C_fcm)
    print("Silhouette Score (Fuzzy C-Means):", sil_score_fcm)
else:
    print("Fuzzy C-Means menghasilkan hanya satu cluster, Silhouette Score tidak dapat dihitung.")


## Hierarchical

In [None]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_score

# AgglomerativeClustering does not accept scipy sparse input, so convert the
# vectorizer output to a dense array first.
# NOTE: this materialises the full matrix in memory.
X_agg = X.toarray() if hasattr(X, "toarray") else X

# Hierarchical (bottom-up) clustering cut at 3 clusters.
agg_clustering = AgglomerativeClustering(n_clusters=3)
C_agg = agg_clustering.fit_predict(X_agg)

sil_score_agg = silhouette_score(X, C_agg)

print("Cluster assignment (Agglomerative):", C_agg)
print("Silhouette Score (Agglomerative):", sil_score_agg)


## Mean Shift

In [None]:
from sklearn.cluster import MeanShift
from sklearn.metrics import silhouette_score

# MeanShift does not accept scipy sparse input, so convert the vectorizer
# output to a dense array first.
# NOTE: this materialises the full matrix in memory.
X_ms = X.toarray() if hasattr(X, "toarray") else X

# Mean Shift picks the number of clusters itself (bandwidth is estimated).
mean_shift = MeanShift()
C_ms = mean_shift.fit_predict(X_ms)

print("Cluster assignment (Mean Shift):", C_ms)

# The number of clusters is not fixed in advance; silhouette_score needs
# between 2 and n_samples - 1 distinct labels, so check before scoring.
if 1 < len(set(C_ms)) < len(C_ms):
    sil_score_ms = silhouette_score(X, C_ms)
    print("Silhouette Score (Mean Shift):", sil_score_ms)
else:
    print("Mean Shift menghasilkan jumlah cluster yang tidak valid, Silhouette Score tidak dapat dihitung.")


# Evaluasi Score