In [None]:
import numpy as np
import pandas as pd

# Training Dataset

In [None]:
# Reviews used for topic clustering; the analysis cells below read the
# 'text' column of this frame.
DATASET_PATH = "/kaggle/input/dataset/dataset.csv"
final_df = pd.read_csv(DATASET_PATH)

# Clustering

In [None]:
# Dependency: sentence-transformers. On a fresh environment install it once with:
#   %pip install -q sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from typing import List, Dict
import torch
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score

class RestaurantTopicAnalyzer:
    """Tag restaurant reviews with the topics food, place, price and service.

    Each topic is described by a short English sentence. Reviews and topic
    descriptions are embedded with a SentenceTransformer model; a review is
    tagged with every topic whose cosine similarity to the review exceeds
    ``similarity_threshold`` and, in addition, with the topic of the K-Means
    cluster the review falls into.

    The K-Means model has one cluster per topic and is seeded with the topic
    description embeddings, so cluster ``i`` is interpreted as the ``i``-th
    topic (see ``cluster_to_category``).
    """

    # Model used when the caller does not name one. Passing ``None`` straight
    # to SentenceTransformer does not load any pretrained weights.
    DEFAULT_EMBEDDING_MODEL = 'all-MiniLM-L6-v2'

    def __init__(self, similarity_threshold=0.3, device=None, embedding_model=None):
        """Load the embedding model and prepare the seeded K-Means clusterer.

        Args:
            similarity_threshold: Minimum cosine similarity (exclusive) for a
                topic to be reported for a text.
            device: Torch device string. Defaults to ``'cuda'`` when available,
                otherwise ``'cpu'``.
            embedding_model: SentenceTransformer model name or path. Defaults
                to ``DEFAULT_EMBEDDING_MODEL``.
        """
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

        if embedding_model is None:
            embedding_model = self.DEFAULT_EMBEDDING_MODEL
        self.model = SentenceTransformer(embedding_model, device=self.device)
        self.similarity_threshold = similarity_threshold

        self.categories = {
            'food': (
                "discussion about food quality, taste, dishes, menu items, "
                "cooking, flavors, portions, ingredients, cuisine"
            ),
            'place': (
                "discussion about restaurant ambiance, atmosphere, decoration, "
                "location, cleanliness, seating, parking, venue"
            ),
            'price': (
                "discussion about costs, prices, value for money, expenses, "
                "affordability, budget, worth, deals"
            ),
            'service': (
                "discussion about staff behavior, waiting time, customer service, "
                "waiters, servers, attentiveness, hospitality"
            )
        }

        # Encoded once here and reused for every similarity computation.
        self.category_embeddings = self._encode(list(self.categories.values()))

        # n_init=1 because the initial centroids are given explicitly.
        self.kmeans = KMeans(
            n_clusters=len(self.categories),
            init=self.category_embeddings,
            n_init=1
        )

        # Cluster i was seeded from the i-th category description.
        self.cluster_to_category = dict(enumerate(self.categories))

    def _encode(self, texts: List[str], show_progress: bool = False) -> np.ndarray:
        """Embed a list of strings in one batched call; one row per text."""
        embeddings = self.model.encode(
            list(texts),
            convert_to_numpy=True,
            device=self.device,
            show_progress_bar=show_progress,
        )
        return np.asarray(embeddings)

    def _encode_corpus(self, texts) -> np.ndarray:
        """Stringify and embed a collection of texts, rejecting empty input."""
        corpus = [str(text) for text in texts]
        if not corpus:
            raise ValueError("Cannot process an empty collection of texts.")
        return self._encode(corpus, show_progress=True)

    def _category_similarities(self, embeddings: np.ndarray) -> np.ndarray:
        """Cosine similarity of every embedding (rows) to every category (columns)."""
        text_vectors = np.atleast_2d(np.asarray(embeddings, dtype=np.float64))
        category_vectors = np.asarray(self.category_embeddings, dtype=np.float64)
        dots = text_vectors @ category_vectors.T
        norms = np.outer(
            np.linalg.norm(text_vectors, axis=1),
            np.linalg.norm(category_vectors, axis=1),
        )
        # A zero-length vector has no direction; report similarity 0 instead of NaN.
        return np.divide(dots, norms, out=np.zeros_like(dots), where=norms > 0)

    def _select_topics(self, similarity_row, cluster) -> Dict[str, float]:
        """Pick the topics for one text from its similarities and cluster label.

        Topics above the threshold come first (in category order); the topic of
        the text's cluster is always included, appended last if it did not pass
        the threshold on its own.
        """
        scores = {
            category: float(score)
            for category, score in zip(self.categories, similarity_row)
        }
        relevant = {
            category: score
            for category, score in scores.items()
            if score > self.similarity_threshold
        }
        cluster_category = self.cluster_to_category[int(cluster)]
        if cluster_category not in relevant:
            relevant[cluster_category] = scores[cluster_category]
        return relevant

    def get_topics(self, text):
        """Return ``{topic: cosine similarity}`` for a single text.

        Requires the K-Means model to be fitted (``fit`` or
        ``analyze_dataframe``). This is a best-effort call: any failure is
        printed and an empty dict is returned instead of raising.
        """
        try:
            embedding = self._encode([text])
            cluster = self.kmeans.predict(embedding)[0]
            similarities = self._category_similarities(embedding)[0]
            return self._select_topics(similarities, cluster)

        except Exception as e:
            print(f"Error processing text: {text}")
            print(f"Error message: {str(e)}")
            return {}

    def fit(self, texts):
        """Fit the seeded K-Means model on the embeddings of ``texts``.

        Args:
            texts: Iterable of texts; each item is converted with ``str``.

        Returns:
            ``self``, to allow chaining.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        self.kmeans.fit(self._encode_corpus(texts))
        return self

    def analyze_dataframe(self, df, text_column='text'):
        """Fit on ``df[text_column]`` and return a copy of ``df`` with topic columns.

        Added columns: ``topic_<category>`` (0/1) and ``score_<category>``
        (cosine similarity, 0.0 when the topic was not selected) for every
        category, plus ``topic_count``, ``main_topics`` (comma-separated),
        ``primary_topic`` (selected topic with the highest similarity) and
        ``primary_score``.

        Rows are matched by position, so ``df`` may carry any index. The input
        frame is not modified.

        Raises:
            ValueError: If ``df`` has no rows.
        """
        # Encode every review exactly once and reuse the embeddings for
        # fitting, cluster assignment and similarity scoring.
        embeddings = self._encode_corpus(df[text_column])
        self.kmeans.fit(embeddings)

        print(f"Analyzing texts on {self.device}...")
        clusters = self.kmeans.predict(embeddings)
        similarities = self._category_similarities(embeddings)
        topics_per_row = [
            self._select_topics(similarity_row, cluster)
            for similarity_row, cluster in zip(similarities, clusters)
        ]

        result_df = df.copy()
        # Plain lists are assigned positionally, independent of the index labels.
        for category in self.categories:
            result_df[f'topic_{category}'] = [
                int(category in topics) for topics in topics_per_row
            ]
            result_df[f'score_{category}'] = [
                topics.get(category, 0.0) for topics in topics_per_row
            ]

        result_df['topic_count'] = [len(topics) for topics in topics_per_row]
        result_df['main_topics'] = [', '.join(topics) for topics in topics_per_row]

        # Every row has at least its cluster topic, so max() never sees an empty dict.
        best = [max(topics.items(), key=lambda item: item[1]) for topics in topics_per_row]
        result_df['primary_topic'] = [name for name, _ in best]
        result_df['primary_score'] = [score for _, score in best]

        return result_df

    def evaluate_clustering(self, texts):
        """Refit K-Means on ``texts`` and report internal clustering metrics.

        Note that this refits ``self.kmeans`` as a side effect.

        Returns:
            Dict with the keys ``'silhouette'``, ``'calinski_harabasz'`` and
            ``'davies_bouldin'``.

        Raises:
            ValueError: If ``texts`` is empty.
        """
        embeddings = self._encode_corpus(texts)

        self.kmeans.fit(embeddings)
        labels = self.kmeans.labels_

        scores = {
            'silhouette': float(silhouette_score(embeddings, labels)),
            'calinski_harabasz': float(calinski_harabasz_score(embeddings, labels)),
            'davies_bouldin': float(davies_bouldin_score(embeddings, labels))
        }

        print("\nClustering Evaluation Scores:")
        print(f"Silhouette Score: {scores['silhouette']:.4f}")
        print(f"Calinski Harabasz Score: {scores['calinski_harabasz']:.4f}")
        print(f"Davies Bouldin Score: {scores['davies_bouldin']:.4f}")

        return scores

In [None]:
# Main run: MiniLM embeddings on the GPU when one is available, CPU otherwise.
compute_device = 'cuda' if torch.cuda.is_available() else 'cpu'
analyzer = RestaurantTopicAnalyzer(
    embedding_model='all-MiniLM-L6-v2',
    device=compute_device,
    similarity_threshold=0.3,
)

In [None]:
# Fit the clusters on the reviews and attach the per-topic columns.
results_df = analyzer.analyze_dataframe(final_df, text_column='text')

In [None]:
# Internal clustering metrics for the same reviews (this call refits the K-Means model).
review_texts = final_df['text'].values
scores = analyzer.evaluate_clustering(review_texts)

In [None]:
# Persist the per-review topic assignments (the frame's index is written as the first column).
results_df.to_csv(path_or_buf="cluster-result.csv")

# Experiments

In [None]:
# Grid comparison of embedding models and similarity thresholds.
#
# The clustering metrics depend only on the embedding model, so each model is
# loaded and evaluated once. The threshold only changes how many topics are
# attached to a review, which is summarised per threshold as mean_topic_count.
#
# Experiment-local names are used on purpose so that `analyzer`, `results_df`
# and `scores` from the main run above are not overwritten.
evaluation_results = []
models = [
    'all-MiniLM-L6-v2',
    'all-mpnet-base-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'all-MiniLM-L12-v2'
]
thresholds = [0.2, 0.3, 0.4]

experiment_device = 'cuda' if torch.cuda.is_available() else 'cpu'

for model_name in models:
    experiment_analyzer = RestaurantTopicAnalyzer(
        similarity_threshold=thresholds[0],
        device=experiment_device,
        embedding_model=model_name
    )

    # Threshold-independent: computed once per model and shared by its rows.
    model_scores = experiment_analyzer.evaluate_clustering(final_df['text'].values)

    for threshold in thresholds:
        print(f"\nProcessing model: {model_name}, with threshold: {threshold}")
        experiment_analyzer.similarity_threshold = threshold
        experiment_results = experiment_analyzer.analyze_dataframe(final_df)

        row = dict(model_scores)
        row.update({
            'model': model_name,
            'threshold': threshold,
            'mean_topic_count': float(experiment_results['topic_count'].mean()),
        })
        evaluation_results.append(row)

comparison_df = pd.DataFrame(evaluation_results)
comparison_df.to_csv("Comparison.csv")
print("\nModel Comparison:")
print(comparison_df)