In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from nltk.corpus import stopwords

# Custom list of Malay stopwords.
# Entries must be lowercase and unique: preprocess_text compares the
# lowercased token against this list, so a capitalised entry would never match.
malay_stopwords = [
    "saya", "anda", "kamu", "ini", "itu", "ke", "di", "dengan", "untuk", "oleh",
    "pada", "sebagai", "akan", "adalah", "dan", "juga", "atau", "tetapi", "kerana", "aku", "kau", "yang",
    "ni", "memang", "paling", "tu", "nauzubillahiminzalik", "amin", "assalamualaikum",
    "semua", "kot", "pun", "masih", "dalam", "tau",
    # Add more stopwords as needed (keep the trailing comma on the last entry
    # so adjacent string literals are not silently concatenated)
]

# Preprocess function with custom stopwords
def preprocess_text(text):
    """Tokenize *text*, drop Malay stopwords, lemmatize, and re-join.

    Args:
        text: Raw document string.

    Returns:
        The surviving lemmatized tokens joined by single spaces. An input
        made up only of stopwords yields an empty string.
    """
    # Lowercase the stopwords as well as the tokens so matching is
    # case-insensitive on both sides; previously a capitalised entry in
    # malay_stopwords could never match. A set also gives O(1) lookups.
    # Rebuilt on every call so later additions to the list are picked up.
    stopword_set = {word.lower() for word in malay_stopwords}

    # Tokenization
    tokens = word_tokenize(text)

    # NOTE(review): WordNetLemmatizer is backed by the English WordNet, so
    # it probably leaves most Malay tokens unchanged - confirm this step is
    # intended.
    lemmatizer = WordNetLemmatizer()

    # Stopword removal and lemmatization in a single pass
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token.lower() not in stopword_set
    )

file_path = "DATA.txt"
# Decode explicitly so the result does not depend on the platform's default
# locale encoding. NOTE(review): assumes DATA.txt is UTF-8 - confirm.
with open(file_path, "r", encoding="utf-8") as file:
    file_contents = file.read()

# Split the text into individual documents, one per line. Blank lines are
# skipped: they would otherwise become empty documents that still get
# vectorized, clustered and printed.
documents = [line for line in file_contents.splitlines() if line.strip()]

# Preprocess the documents
preprocessed_documents = [preprocess_text(doc) for doc in documents]

# Number of groups K-means partitions the documents into
num_clusters = 5

# Turn the preprocessed documents into a TF-IDF weighted term matrix
vectorizer = TfidfVectorizer()
feature_matrix = vectorizer.fit_transform(preprocessed_documents)

# Build the seeded K-means model and fit it on the TF-IDF matrix
# (fit() returns the fitted estimator itself, so the calls can be chained)
kmeans = KMeans(n_clusters=num_clusters, random_state=42).fit(feature_matrix)

# One integer cluster index per document, in document order
cluster_labels = kmeans.labels_


# Display names looked up by cluster index.
# NOTE(review): these names are hard-coded and paired with cluster indices
# purely by position; nothing here derives them from the cluster contents -
# confirm the pairing is meaningful.
cluster_words = ["Food", "Family", "Friends", "Self","Partner"]

# Print a two-column table: document text (padded to 35 chars) and its
# cluster name
print(f"{'Document':<35s} Cluster")
print("-" * 50)
for doc, label in zip(documents, cluster_labels):
    cluster_label = cluster_words[label]
    print(f"{doc:<35s} {cluster_label}")



Document                            Cluster
--------------------------------------------------
Kisah yang akan aku kongsikan bukan kisah diri sendiri. Tetapi, adik bongsu aku. Family
Dia ni student di sebuah IPTA di Selangor & duduk di rumah sewa berdekatan dengan IPTA tersebut lah. Itu je yang aku boleh kongsi. Family
Sifat adik bongsu aku ni agak manja. Biasalah bongsu. Tapi yang aku boleh banggakan, adik aku memang seorang yang sangat pembersih. Family
Bukan aku nak backup adik aku sendiri or what. Memang dia ni jenis penggeli dengan semua benda. Spesies aku (kakak dia) jugak la itu geli ini geli. Partner
Nak dijadikan cerita, lately ni almost setiap hari dia akan whatsapp dalam group family kami mengadu yang dia stress. Partner
Stress sebab apa?                   Self
Sebab dapat housemate yang pengotor nauzubillahiminzalik. Family
Korang bayangkan, anak dara. ANAK DARA gais. Punyalah pengotor & pemalas, sampah dapur dibiarkan sampai berulat-ulat merayap satu rumah. Friends
Bila di