In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import pandas as pd

# Load the preprocessed review data
df = pd.read_csv('data/preprocessed/preprocessed_genshin.csv')

# Empty text fields are read back from CSV as NaN, and TfidfVectorizer refuses
# NaN documents. Treat missing snippets as empty strings so vectorization and
# the later per-row similarity calls do not fail; this is a no-op when every
# snippet is already a string.
df['snippet'] = df['snippet'].fillna('').astype(str)

# Represent each snippet as a TF-IDF vector (English stop words removed,
# vocabulary capped at the 1000 highest-frequency terms)
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
tfidf_matrix = tfidf.fit_transform(df['snippet'])

In [2]:
# Number of clusters to fit (adjust to suit the data)
n_clusters = 5

# K-means with k-means++ initialisation and a fixed seed so runs are repeatable
kmeans = KMeans(init='k-means++', random_state=42, n_clusters=n_clusters)

# Fit on the TF-IDF matrix and store each row's cluster label
df['cluster'] = kmeans.fit(tfidf_matrix).labels_

# Show the cluster sizes before any rebalancing
print("Initial Cluster Distribution:", df['cluster'].value_counts(), sep="\n")

# Reassign rows between clusters so every cluster size lands in [min_size, max_size]
def balance_clusters(df, min_size=150, max_size=200, n_clusters=5):
    """Move rows out of oversized clusters into undersized ones.

    On each pass, every cluster larger than ``max_size`` gives up its excess
    rows (a random sample drawn with ``random_state=42``) to the first
    undersized cluster that currently holds fewer than ``max_size`` rows.
    The whole excess goes to that single recipient, so the recipient may
    itself overshoot ``max_size`` and be trimmed on a later pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``'cluster'`` column. The column is modified in place
        and the same object is returned.
    min_size, max_size : int
        Inclusive bounds on the desired number of rows per cluster.
    n_clusters : int
        Currently unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        ``df`` with updated ``'cluster'`` labels.

    Notes
    -----
    The bounds cannot always be met (for example when every cluster is
    oversized, or when only undersized clusters remain). When a pass moves
    nothing, the function warns and returns the best effort instead of
    looping forever.
    """
    import warnings

    while True:
        # Recount on every pass so decisions reflect the latest assignments
        cluster_sizes = df['cluster'].value_counts()

        large_clusters = cluster_sizes[cluster_sizes > max_size].index
        small_clusters = cluster_sizes[cluster_sizes < min_size].index

        # Every cluster is within bounds: nothing left to do
        if large_clusters.empty and small_clusters.empty:
            break

        moved_any = False
        for large_cluster in large_clusters:
            large_cluster_data = df[df['cluster'] == large_cluster]
            # Number of rows this cluster holds above the upper bound
            excess_data_count = len(large_cluster_data) - max_size

            # Fixed seed keeps the choice of moved rows reproducible
            data_to_move = large_cluster_data.sample(n=excess_data_count, random_state=42)

            # Hand the whole excess to the first undersized cluster with room
            for small_cluster in small_clusters:
                if len(df[df['cluster'] == small_cluster]) < max_size:
                    df.loc[data_to_move.index, 'cluster'] = small_cluster
                    moved_any = True
                    break  # only one recipient per oversized cluster

        # No donor/recipient pair was available, so the sizes cannot change any
        # further; stop rather than spin on the same state indefinitely.
        if not moved_any:
            warnings.warn(
                "balance_clusters could not bring every cluster within "
                f"[{min_size}, {max_size}]; returning the best-effort assignment."
            )
            break

    return df

# Rebalance so that each cluster ends up with 35-40 rows
df = balance_clusters(df, n_clusters=5, min_size=35, max_size=40)

# Show the cluster sizes after rebalancing
print("\nBalanced Cluster Distribution:", df['cluster'].value_counts(), sep="\n")

# Menampilkan data beberapa baris sebagai contoh
# print("\nSample of balanced data:")
# print(df.head())

Initial Cluster Distribution:
cluster
2    56
4    48
0    39
3    34
1    22
Name: count, dtype: int64

Balanced Cluster Distribution:
cluster
4    40
1    40
3    40
2    40
0    39
Name: count, dtype: int64


In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Similarity between the query snippet and one stored snippet
def text_similarity(snippet1, snippet2):
    """Return the TF-IDF cosine similarity between two texts.

    A fresh ``TfidfVectorizer`` (English stop words removed) is fitted on just
    the two texts, and the cosine similarity of the two resulting rows is
    returned.

    If the vectorizer cannot build a vocabulary from the pair (for example
    both texts are empty or consist only of stop words) scikit-learn raises
    ``ValueError``; in that case the texts share no usable terms and ``0.0``
    is returned instead of propagating the error.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = tfidf.fit_transform([snippet1, snippet2])
    except ValueError:
        # Nothing to compare on; without this guard one such pair would abort
        # a whole DataFrame.apply() scoring pass.
        return 0.0

    # cosine_similarity returns a 1x1 matrix for two single-row inputs
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return similarity[0][0]

# Main recommendation entry point, driven by a query snippet
def recommend_by_snippet(snippet, df, n_recommendations=5):
    """Recommend reviews from the cluster of the best-matching snippet.

    Every row of ``df`` is scored against the query with ``text_similarity``.
    The cluster of the highest-scoring row is selected, and the top
    ``n_recommendations`` rows of that cluster (by score) are returned.

    Parameters
    ----------
    snippet : str
        Query text; lower-cased and stripped before scoring.
    df : pandas.DataFrame
        Must contain ``'title'``, ``'snippet'``, ``'rating'`` and ``'cluster'``
        columns. It is not modified: scores are stored on a copy.
    n_recommendations : int
        Maximum number of rows to return; fewer are returned when the chosen
        cluster is smaller.

    Returns
    -------
    pandas.DataFrame
        The ``'title'``, ``'snippet'`` and ``'rating'`` columns of the
        recommended rows. Empty when ``df`` has no rows.
    """
    # Normalise casing and trim surrounding whitespace in the query
    snippet = snippet.lower().strip()

    # Score on a copy so the caller's frame does not silently gain a column
    scores = df['snippet'].apply(lambda text: text_similarity(snippet, text))
    scored = df.assign(snippet_similarity=scores)

    # Nothing to rank: return an empty result rather than failing on iloc[0]
    if len(scored) == 0:
        return scored[['title', 'snippet', 'rating']]

    # The best-matching row decides which cluster to recommend from
    similar_snippets = scored.sort_values(by='snippet_similarity', ascending=False)
    cluster = similar_snippets.iloc[0]['cluster']

    # Rank the members of that cluster by similarity to the query
    cluster_data = scored[scored['cluster'] == cluster]
    cluster_data = cluster_data.sort_values(by='snippet_similarity', ascending=False)

    # head() already caps at the cluster size when it is smaller than requested
    recommendations = cluster_data.head(n_recommendations)
    return recommendations[['title', 'snippet', 'rating']]

# Example lookup driven by a single query snippet
snippet = "story"
recommendations = recommend_by_snippet(snippet=snippet, df=df)
print(recommendations)

                     title                                            snippet  \
103             Freydy Voo  started first release really love graphics cha...   
78   Farrell Adra Khalfani  hard give good rating things worth playing gen...   
9         Olivia Staringer  uninteresting characters uninteresting dialog ...   
67               Sean Arce  pointless playing play games coupon gone might...   
158                  Sam B  update gi offers incredible openworld recent s...   

     rating  
103     5.0  
78      2.0  
9       1.0  
67      1.0  
158     5.0  


In [4]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

# Similarity between the query snippet and one stored snippet
def text_similarity(snippet1, snippet2):
    """Return the TF-IDF cosine similarity between two texts.

    The vectorizer (English stop words removed) is fitted on only the two
    texts passed in. When no vocabulary can be built from them (for example
    both are empty or contain only stop words) scikit-learn raises
    ``ValueError``; ``0.0`` is returned in that case because the texts share
    no usable terms.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    try:
        tfidf_matrix = tfidf.fit_transform([snippet1, snippet2])
    except ValueError:
        # Avoid aborting a whole scoring pass because of one unusable pair
        return 0.0

    # Two single-row inputs give a 1x1 similarity matrix
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])
    return similarity[0][0]

# Main recommendation entry point, driven by a query snippet
def recommend_by_snippet(snippet, df, n_recommendations=5):
    """Recommend reviews from the cluster of the best-matching snippet.

    Every row of ``df`` is scored against the query with ``text_similarity``;
    the cluster of the highest-scoring row is selected and its members are
    ranked by score.

    Parameters
    ----------
    snippet : str
        Query text; lower-cased and stripped before scoring.
    df : pandas.DataFrame
        Must contain ``'title'``, ``'snippet'``, ``'rating'`` and ``'cluster'``
        columns. It is not modified: scores are stored on a copy.
    n_recommendations : int
        Maximum number of rows to recommend.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(recommendations, cluster_data)``: the ``'title'``, ``'snippet'``
        and ``'rating'`` columns of the top rows, and every row of the chosen
        cluster (including ``'snippet_similarity'``) sorted by score. Both are
        empty when ``df`` has no rows.
    """
    snippet = snippet.lower().strip()

    # Score on a copy so the caller's frame does not silently gain a column
    scores = df['snippet'].apply(lambda text: text_similarity(snippet, text))
    scored = df.assign(snippet_similarity=scores)

    # Nothing to rank: return empty results rather than failing on iloc[0]
    if len(scored) == 0:
        return scored[['title', 'snippet', 'rating']], scored

    # The best-matching row decides which cluster to recommend from
    similar_snippets = scored.sort_values(by='snippet_similarity', ascending=False)
    cluster = similar_snippets.iloc[0]['cluster']

    cluster_data = scored[scored['cluster'] == cluster]
    cluster_data = cluster_data.sort_values(by='snippet_similarity', ascending=False)

    # head() already caps at the cluster size when it is smaller than requested
    recommendations = cluster_data.head(n_recommendations)

    return recommendations[['title', 'snippet', 'rating']], cluster_data

# Precision, recall and F1 for one list of recommendations
def evaluate_recommendation(recommended, cluster_data):
    """Score recommendations against the members of a cluster.

    Every row of ``cluster_data`` counts as relevant. Items are identified by
    their ``'title'`` value, and titles are collected into sets, so rows that
    share a title count once.

    Returns
    -------
    tuple
        ``(precision, recall, f1_score)``. Precision is hits divided by the
        number of distinct recommended titles, recall is hits divided by the
        number of distinct relevant titles, and each value falls back to ``0``
        when its denominator is zero.
    """
    suggested = set(recommended['title'])
    relevant = set(cluster_data['title'])

    # Titles that were both recommended and relevant
    hits = len(suggested.intersection(relevant))

    precision = 0
    if len(suggested) > 0:
        precision = hits / len(suggested)

    recall = 0
    if len(relevant) > 0:
        recall = hits / len(relevant)

    # Harmonic mean of precision and recall, guarded against a zero sum
    if (precision + recall) > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0

    return precision, recall, f1_score

# Example evaluation run for a single query
snippet = "gensina"
recommended, cluster_data = recommend_by_snippet(snippet=snippet, df=df)
precision, recall, f1 = evaluate_recommendation(
    recommended=recommended,
    cluster_data=cluster_data,
)

# Report each metric on its own line, rounded to two decimals
print(
    f"Precision: {precision:.2f}",
    f"Recall: {recall:.2f}",
    f"F1-Score: {f1:.2f}",
    sep="\n",
)

Precision: 1.00
Recall: 0.13
F1-Score: 0.23
