In [11]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import spacy
import swifter  # for optimized pandas operations
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import nltk
from concurrent.futures import ThreadPoolExecutor

# Download required NLTK data
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')

  from .autonotebook import tqdm as notebook_tqdm


In [12]:
# Read the data from a CSV file located relative to the working directory.
# Only the first 10,000 rows are loaded (nrows=10000) — presumably to keep the
# later per-review text preprocessing manageable; TODO confirm.
df = pd.read_csv('../data/raw/hotel_reviews.csv', nrows=10000)

In [13]:
# Create separate rows for positive and negative reviews
def split_reviews(df):
    """Reshape the reviews so positive and negative texts sit on separate rows.

    Every input row is emitted twice: once carrying its ``Positive_Review``
    (labelled ``'positive'``) and once carrying its ``Negative_Review``
    (labelled ``'negative'``). Rows whose text is one of the placeholders
    ``'No Negative'`` / ``'No Positive'`` are then discarded.

    Args:
        df: DataFrame providing ``Hotel_Name``, ``Positive_Review``,
            ``Negative_Review``, ``Review_Total_Positive_Word_Counts``,
            ``Review_Total_Negative_Word_Counts``, ``Reviewer_Score`` and
            ``Tags``. It is not modified.

    Returns:
        DataFrame with the columns ``Hotel_Name``, ``review_type``,
        ``review_text``, ``word_count``, ``Reviewer_Score`` and ``Tags``.
        The index comes from the stacked frame, so it has gaps where
        placeholder rows were removed.
    """
    output_columns = ['Hotel_Name', 'review_type', 'review_text', 'word_count',
                      'Reviewer_Score', 'Tags']
    placeholder_texts = ['No Negative', 'No Positive']

    # assign() works on a copy, leaving the caller's frame untouched
    positive_rows = df.assign(
        review_type='positive',
        review_text=df['Positive_Review'],
        word_count=df['Review_Total_Positive_Word_Counts'],
    )
    negative_rows = df.assign(
        review_type='negative',
        review_text=df['Negative_Review'],
        word_count=df['Review_Total_Negative_Word_Counts'],
    )

    # Positives first, negatives second, then trim to the relevant columns
    stacked = pd.concat([positive_rows, negative_rows], ignore_index=True)
    stacked = stacked[output_columns]

    is_placeholder = stacked['review_text'].isin(placeholder_texts)
    return stacked[~is_placeholder]

# Transform the data: one row per positive/negative review text, with the
# "No Negative" / "No Positive" placeholder rows removed by split_reviews.
transformed_df = split_reviews(df)

In [14]:
# Preview the first rows of the reshaped reviews.
transformed_df.head()

Unnamed: 0,Hotel_Name,review_type,review_text,word_count,Reviewer_Score,Tags
0,Hotel Arena,positive,Only the park outside of the hotel was beauti...,11,2.9,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
1,Hotel Arena,positive,No real complaints the hotel was great great ...,105,7.5,"[' Leisure trip ', ' Couple ', ' Duplex Double..."
2,Hotel Arena,positive,Location was good and staff were ok It is cut...,21,7.1,"[' Leisure trip ', ' Family with young childre..."
3,Hotel Arena,positive,Great location in nice surroundings the bar a...,26,3.8,"[' Leisure trip ', ' Solo traveler ', ' Duplex..."
4,Hotel Arena,positive,Amazing location and building Romantic setting,8,6.7,"[' Leisure trip ', ' Couple ', ' Suite ', ' St..."


In [16]:
# Load the spaCy 'en_core_web_sm' pipeline once at module level;
# preprocess_text() below calls this shared `nlp` object for every review.
nlp = spacy.load('en_core_web_sm')

def transform_dataframe(df):
    """Transform the dataframe to separate positive and negative reviews.

    The result stacks all positive reviews first and all negative reviews
    second. Every column is built in that same block layout so that each
    review text stays next to its own type label, word count and score.

    Args:
        df: DataFrame with the columns ``Positive_Review``,
            ``Negative_Review``, ``Review_Total_Positive_Word_Counts``,
            ``Review_Total_Negative_Word_Counts`` and ``Reviewer_Score``.
            It is not modified.

    Returns:
        A new DataFrame with the columns ``review_type``, ``review_text``,
        ``word_count`` and ``score``. Rows whose text is the placeholder
        ``'No Negative'`` or ``'No Positive'`` are dropped and the index is
        reset to 0..n-1.
    """
    n_reviews = len(df)

    # Create arrays for the new data
    data = {
        # Block layout (positives, then negatives) to line up with the
        # concatenated texts below. An alternating positive/negative pattern
        # would attach the wrong label to most rows.
        'review_type': ['positive'] * n_reviews + ['negative'] * n_reviews,
        'review_text': np.concatenate([df['Positive_Review'].values, df['Negative_Review'].values]),
        'word_count': np.concatenate([df['Review_Total_Positive_Word_Counts'].values,
                                    df['Review_Total_Negative_Word_Counts'].values]),
        # np.tile repeats the whole score column once per block (s0, s1, ...,
        # s0, s1, ...). np.repeat would interleave (s0, s0, s1, s1, ...) and
        # pair scores with the wrong reviews.
        'score': np.tile(df['Reviewer_Score'].values, 2)
    }

    # Create transformed dataframe
    transformed_df = pd.DataFrame(data)

    # Drop rows with "No Negative" or "No Positive"
    mask = ~transformed_df['review_text'].isin(['No Negative', 'No Positive'])
    transformed_df = transformed_df[mask].reset_index(drop=True)

    return transformed_df

def preprocess_text(text):
    """Preprocess text for aspect extraction.

    The text is lowercased and run through the module-level spaCy pipeline
    ``nlp``. Only tokens tagged as nouns or adjectives that are neither stop
    words nor punctuation are kept, and each is replaced by its lemma.

    Args:
        text: The raw review text. Non-string values (for example ``NaN`` or
            ``None`` standing in for a missing review) are accepted.

    Returns:
        The kept lemmas joined by single spaces; an empty string when nothing
        is kept or when ``text`` is not a string.
    """
    # A missing review is not a str and has no .lower(); treat it as empty
    # text instead of aborting the whole apply() over the column.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase and tokenize
    doc = nlp(text.lower())

    # Remove stopwords and punctuation, keep only nouns and adjectives
    kept_pos = {'NOUN', 'ADJ'}
    tokens = [token.lemma_ for token in doc
              if not token.is_stop and not token.is_punct
              and token.pos_ in kept_pos]

    return ' '.join(tokens)

def extract_aspects(df, n_clusters=5):
    """Cluster reviews into aspects with TF-IDF features and K-means.

    Note that ``df`` is modified in place: the columns ``processed_text``,
    ``aspect_cluster`` and ``aspect_terms`` are added to it.

    Args:
        df: DataFrame with a ``review_text`` column.
        n_clusters: Number of K-means clusters to fit.

    Returns:
        A tuple ``(df, cluster_terms, silhouette_avg)`` where ``df`` is the
        same (mutated) frame, ``cluster_terms`` maps each cluster id to its
        five highest-weighted centroid terms (strongest first), and
        ``silhouette_avg`` is the silhouette score of the clustering.
    """
    # Clean every review one by one, showing a tqdm progress bar
    tqdm.pandas(desc="Preprocessing texts")
    df['processed_text'] = df['review_text'].progress_apply(preprocess_text)

    # Turn the cleaned texts into a TF-IDF matrix (vocabulary capped at 1000)
    print("Performing TF-IDF vectorization...")
    tfidf = TfidfVectorizer(max_features=1000)
    doc_term_matrix = tfidf.fit_transform(df['processed_text'])

    # Fit K-means with a fixed seed and assign each review to a cluster
    print("Performing K-means clustering...")
    model = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    assignments = model.fit_predict(doc_term_matrix)

    # Quality of the clustering as a whole
    silhouette_avg = silhouette_score(doc_term_matrix, assignments)
    print(f"Silhouette Score: {silhouette_avg}")

    # For each centroid take the five largest weights, strongest term first
    vocabulary = tfidf.get_feature_names_out()
    cluster_terms = {
        cluster_id: [vocabulary[j] for j in centroid.argsort()[::-1][:5]]
        for cluster_id, centroid in enumerate(model.cluster_centers_)
    }

    # Attach the cluster id and its descriptive terms to every review
    df['aspect_cluster'] = assignments
    df['aspect_terms'] = df['aspect_cluster'].map(cluster_terms)

    return df, cluster_terms, silhouette_avg

# def main(input_file):
#     # Read the data
#     print("Reading data...")
#     df = pd.read_csv(input_file)
    
#     # Transform dataframe
#     print("Transforming dataframe...")
#     transformed_df = transform_dataframe(df)
    
#     # Extract aspects
#     print("Extracting aspects...")
#     final_df, cluster_terms, silhouette = extract_aspects(transformed_df)
    
#     return final_df, cluster_terms, silhouette

In [17]:
# Run the aspect-extraction pipeline on the reshaped reviews.
# extract_aspects assigns its new columns onto the frame it is given, so
# final_df is the same object as transformed_df with the extra columns added.
print("Extracting aspects...")
final_df, cluster_terms, silhouette = extract_aspects(transformed_df)

Extracting aspects...


Preprocessing texts: 100%|██████████| 17021/17021 [02:11<00:00, 129.63it/s]


Performing TF-IDF vectorization...
Performing K-means clustering...
Silhouette Score: 0.032755488641395276


In [19]:
# Print the top terms of each cluster (the candidate "aspects"), one line per
# cluster id.
for cluster, terms in cluster_terms.items():
    print(f"Cluster {cluster}: {', '.join(terms)}")

# Show the first rows of the final dataframe, restricted to the review text,
# its type, the assigned cluster id and that cluster's top terms.
print("\nSample of processed data:")
print(final_df[['review_text', 'review_type', 'aspect_cluster', 'aspect_terms']].head())

Cluster 0: location, good, great, excellent, staff
Cluster 1: staff, friendly, helpful, location, great
Cluster 2: room, bed, clean, comfortable, size
Cluster 3: small, room, bathroom, bed, bit
Cluster 4: breakfast, hotel, good, bed, location

Sample of processed data:
                                         review_text review_type  \
0   Only the park outside of the hotel was beauti...    positive   
1   No real complaints the hotel was great great ...    positive   
2   Location was good and staff were ok It is cut...    positive   
3   Great location in nice surroundings the bar a...    positive   
4    Amazing location and building Romantic setting     positive   

   aspect_cluster                             aspect_terms  
0               4  [breakfast, hotel, good, bed, location]  
1               4  [breakfast, hotel, good, bed, location]  
2               4  [breakfast, hotel, good, bed, location]  
3               4  [breakfast, hotel, good, bed, location]  
4               

In [21]:
# Inspect the last rows of the clustered reviews.
final_df.tail()

Unnamed: 0,Hotel_Name,review_type,review_text,word_count,Reviewer_Score,Tags,processed_text,aspect_cluster,aspect_terms
19992,Grand Royale London Hyde Park,negative,Dirty towels hotel misplaced our baggage Room...,20,4.2,"[' Leisure trip ', ' Couple ', ' Superior Doub...",dirty towel hotel baggage room small advertisi...,2,"[room, bed, clean, comfortable, size]"
19993,Grand Royale London Hyde Park,negative,Bed and bedroom too small Breakfast would be ...,13,7.9,"[' Leisure trip ', ' Couple ', ' Deluxe Double...",bed small breakfast nice,3,"[small, room, bathroom, bed, bit]"
19996,Grand Royale London Hyde Park,negative,Room size,3,7.9,"[' Leisure trip ', ' Group ', ' Superior Doubl...",room size,2,"[room, bed, clean, comfortable, size]"
19997,Grand Royale London Hyde Park,negative,Room very small Very cramped,7,5.8,"[' Leisure trip ', ' Couple ', ' Superior Doub...",room small,3,"[small, room, bathroom, bed, bit]"
19999,Grand Royale London Hyde Park,negative,Windows Were broken,5,3.8,"[' Leisure trip ', ' Family with young childre...",window,4,"[breakfast, hotel, good, bed, location]"


In [22]:
from collections import Counter
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
def label_clusters(df, cluster_terms, tfidf_matrix, kmeans):
    """Label each cluster with its predominant hotel aspect.

    For every cluster the words of its reviews are counted, each aspect is
    scored by how often its keywords occur, and the highest-scoring aspect
    becomes the cluster's label. The average silhouette value of the
    cluster's members is recorded alongside it.

    Args:
        df: DataFrame with the columns ``aspect_cluster`` and
            ``processed_text``. It is not modified.
        cluster_terms: Mapping of cluster id to that cluster's top terms.
        tfidf_matrix: Feature matrix the clustering was fitted on, passed to
            ``silhouette_samples``.
        kmeans: Fitted clustering model exposing ``labels_``.

    Returns:
        dict mapping each cluster id to a dict with the keys
        ``'predominant_aspect'``, ``'top_terms'`` and ``'avg_silhouette'``
        (the keys that ``print_cluster_analysis`` reads).
    """
    # Define common hotel aspects and their related keywords
    aspect_keywords = {
        'room': ['room', 'bed', 'bathroom', 'shower', 'furniture', 'clean'],
        'staff': ['staff', 'service', 'reception', 'employee', 'helpful'],
        'location': ['location', 'central', 'distance', 'transport', 'nearby'],
        'food': ['breakfast', 'restaurant', 'food', 'meal', 'dining'],
        'facilities': ['pool', 'gym', 'wifi', 'internet', 'parking']
    }

    # One silhouette value per review; averaged per cluster below
    sample_silhouette_values = silhouette_samples(tfidf_matrix, kmeans.labels_)

    aspect_labels = {}

    # For each cluster:
    for cluster_id, terms in cluster_terms.items():
        # Get all reviews in this cluster
        cluster_reviews = df[df['aspect_cluster'] == cluster_id]['processed_text']

        # Count word frequencies in the cluster
        term_counts = Counter(' '.join(cluster_reviews).split())

        # Calculate aspect scores based on keyword matches
        aspect_scores = {
            aspect: sum(term_counts.get(keyword, 0) for keyword in keywords)
            for aspect, keywords in aspect_keywords.items()
        }

        # Assign the most frequent aspect as the cluster label
        # (on ties the aspect listed first in aspect_keywords wins)
        predominant_aspect = max(aspect_scores.items(), key=lambda x: x[1])[0]

        # Keep the result; previously it was computed and then discarded
        aspect_labels[cluster_id] = {
            'predominant_aspect': predominant_aspect,
            'top_terms': terms,
            'avg_silhouette': float(np.mean(sample_silhouette_values[kmeans.labels_ == cluster_id]))
        }

    return aspect_labels

In [24]:
def evaluate_clustering(df, tfidf_matrix, kmeans, aspect_labels):
    """Summarise clustering quality overall and per cluster.

    Args:
        df: Not used by this implementation; kept in the signature.
        tfidf_matrix: Feature matrix the clustering was fitted on.
        kmeans: Fitted clustering model exposing ``labels_`` and
            ``n_clusters``.
        aspect_labels: Per-cluster label details; passed through unchanged
            under the ``'cluster_details'`` key.

    Returns:
        dict with the keys ``'overall_silhouette_score'``,
        ``'cluster_sizes'`` (cluster id -> number of members),
        ``'cluster_silhouette_scores'`` (cluster id -> mean silhouette value,
        NaN for a cluster without members) and ``'cluster_details'``.
        ``print_cluster_analysis`` reads the first two keys.
    """
    # Calculate overall silhouette score
    overall_score = silhouette_score(tfidf_matrix, kmeans.labels_)

    # Calculate silhouette score for each sample
    sample_silhouette_values = silhouette_samples(tfidf_matrix, kmeans.labels_)

    # Calculate size and average silhouette score per cluster
    cluster_sizes = {}
    cluster_silhouette_scores = {}
    for cluster_id in range(kmeans.n_clusters):
        mask = kmeans.labels_ == cluster_id
        cluster_sizes[cluster_id] = int(mask.sum())
        # Averaging an empty selection would emit a warning; report NaN directly
        if mask.any():
            cluster_silhouette_scores[cluster_id] = float(np.mean(sample_silhouette_values[mask]))
        else:
            cluster_silhouette_scores[cluster_id] = float('nan')

    # Hand the results back; previously they were computed and then discarded
    return {
        'overall_silhouette_score': overall_score,
        'cluster_sizes': cluster_sizes,
        'cluster_silhouette_scores': cluster_silhouette_scores,
        'cluster_details': aspect_labels,
    }

In [25]:
# NOTE(review): this cell defines evaluate_clustering a second time, right
# after an identical definition in the preceding cell; one of the two cells
# should be deleted.
def evaluate_clustering(df, tfidf_matrix, kmeans, aspect_labels):
    """Summarise clustering quality overall and per cluster.

    Args:
        df: Not used by this implementation; kept in the signature.
        tfidf_matrix: Feature matrix the clustering was fitted on.
        kmeans: Fitted clustering model exposing ``labels_`` and
            ``n_clusters``.
        aspect_labels: Per-cluster label details; passed through unchanged
            under the ``'cluster_details'`` key.

    Returns:
        dict with the keys ``'overall_silhouette_score'``,
        ``'cluster_sizes'`` (cluster id -> number of members),
        ``'cluster_silhouette_scores'`` (cluster id -> mean silhouette value,
        NaN for a cluster without members) and ``'cluster_details'``.
        ``print_cluster_analysis`` reads the first two keys.
    """
    # Calculate overall silhouette score
    overall_score = silhouette_score(tfidf_matrix, kmeans.labels_)

    # Calculate silhouette score for each sample
    sample_silhouette_values = silhouette_samples(tfidf_matrix, kmeans.labels_)

    # Calculate size and average silhouette score per cluster
    cluster_sizes = {}
    cluster_silhouette_scores = {}
    for cluster_id in range(kmeans.n_clusters):
        mask = kmeans.labels_ == cluster_id
        cluster_sizes[cluster_id] = int(mask.sum())
        # Averaging an empty selection would emit a warning; report NaN directly
        if mask.any():
            cluster_silhouette_scores[cluster_id] = float(np.mean(sample_silhouette_values[mask]))
        else:
            cluster_silhouette_scores[cluster_id] = float('nan')

    # Hand the results back; previously they were computed and then discarded
    return {
        'overall_silhouette_score': overall_score,
        'cluster_sizes': cluster_sizes,
        'cluster_silhouette_scores': cluster_silhouette_scores,
        'cluster_details': aspect_labels,
    }

In [26]:
def print_cluster_analysis(aspect_labels, evaluation):
    """Write a plain-text report of the clustering to standard output.

    Args:
        aspect_labels: Mapping of cluster id to a dict holding
            ``'predominant_aspect'``, ``'top_terms'`` and
            ``'avg_silhouette'``.
        evaluation: Dict holding ``'overall_silhouette_score'`` and
            ``'cluster_sizes'`` (cluster id -> number of reviews).

    Returns:
        None. The report is printed, one section per cluster in the
        iteration order of ``aspect_labels``.
    """
    # Headline figure first
    print(f"Overall Clustering Performance: {evaluation['overall_silhouette_score']:.3f}")
    print("\nCluster Analysis:")

    # Then one five-line section per cluster
    for cluster_id in aspect_labels:
        details = aspect_labels[cluster_id]
        print(f"\nCluster {cluster_id}:")
        print(f"Labeled as: {details['predominant_aspect']}")
        print(f"Key terms: {', '.join(details['top_terms'])}")
        print(f"Silhouette score: {details['avg_silhouette']:.3f}")
        print(f"Number of reviews: {evaluation['cluster_sizes'][cluster_id]}")

In [None]:
# ... (previous imports remain the same)
from collections import Counter
from sklearn.metrics import silhouette_samples
import matplotlib.pyplot as plt
import seaborn as sns

def label_clusters(df, cluster_terms, tfidf_matrix, kmeans):
    """
    Give every cluster a hotel-aspect label and record its silhouette quality.

    Side effect: a ``silhouette_score`` column holding the per-review
    silhouette value is written onto ``df``.

    Args:
        df: DataFrame with the columns ``aspect_cluster`` and
            ``processed_text``.
        cluster_terms: Mapping of cluster id to that cluster's top terms.
        tfidf_matrix: Feature matrix the clustering was fitted on.
        kmeans: Fitted clustering model exposing ``labels_``.

    Returns:
        dict mapping each cluster id to a dict with the keys
        ``'predominant_aspect'``, ``'top_terms'`` and ``'avg_silhouette'``.
    """
    # Keyword vocabulary that identifies each aspect category
    aspect_keywords = {
        'room': ['room', 'bed', 'bathroom', 'shower', 'furniture', 'clean'],
        'staff': ['staff', 'service', 'reception', 'employee', 'helpful'],
        'location': ['location', 'central', 'distance', 'transport', 'nearby'],
        'food': ['breakfast', 'restaurant', 'food', 'meal', 'dining'],
        'facilities': ['pool', 'gym', 'wifi', 'internet', 'parking']
    }

    # One silhouette value per review, also kept on the dataframe
    per_sample_scores = silhouette_samples(tfidf_matrix, kmeans.labels_)
    df['silhouette_score'] = per_sample_scores

    labelled = {}
    for cluster_id, terms in cluster_terms.items():
        # Word frequencies over all processed reviews of this cluster
        in_cluster = df['aspect_cluster'] == cluster_id
        cluster_text = ' '.join(df[in_cluster]['processed_text'])
        word_freq = Counter(cluster_text.split())

        # An aspect's score is the total number of occurrences of its keywords
        keyword_hits = {
            aspect: sum(word_freq.get(word, 0) for word in words)
            for aspect, words in aspect_keywords.items()
        }

        # Highest score wins; on ties the aspect listed first is kept
        best_aspect = max(keyword_hits, key=keyword_hits.get)

        member_scores = per_sample_scores[kmeans.labels_ == cluster_id]
        labelled[cluster_id] = {
            'predominant_aspect': best_aspect,
            'top_terms': terms,
            'avg_silhouette': np.mean(member_scores)
        }

    return labelled

def evaluate_clustering(df, tfidf_matrix, kmeans, aspect_labels):
    """
    Evaluate clustering performance and generate visualizations

    Args:
        df: DataFrame with an ``aspect_cluster`` column, used for the
            cluster sizes.
        tfidf_matrix: Feature matrix the clustering was fitted on.
        kmeans: Fitted clustering model exposing ``labels_``.
        aspect_labels: Mapping of cluster id to a dict holding
            ``'predominant_aspect'`` and ``'avg_silhouette'``.

    Returns:
        A tuple ``(evaluation, figure)``. ``evaluation`` is a dict with the
        keys ``'overall_silhouette_score'``, ``'cluster_sizes'`` and
        ``'cluster_details'``; ``figure`` is the matplotlib figure holding
        the two bar charts.
    """
    # Overall silhouette score
    overall_score = silhouette_score(tfidf_matrix, kmeans.labels_)

    # Cluster sizes
    cluster_sizes = df['aspect_cluster'].value_counts().to_dict()

    # Prepare evaluation results
    evaluation = {
        'overall_silhouette_score': overall_score,
        'cluster_sizes': cluster_sizes,
        'cluster_details': aspect_labels
    }

    # Generate visualization on explicitly created axes
    fig, (ax_scores, ax_sizes) = plt.subplots(1, 2, figsize=(12, 6))

    # Plot 1: Silhouette scores by cluster.
    # Several clusters can share one predominant aspect. Labelling bars by
    # aspect alone makes seaborn merge them into a single aggregated bar, so
    # the cluster id is included to keep one bar per cluster.
    bar_labels = [
        f"{cluster_id}: {details['predominant_aspect']}"
        for cluster_id, details in aspect_labels.items()
    ]
    scores = [details['avg_silhouette'] for details in aspect_labels.values()]
    sns.barplot(x=bar_labels, y=scores, ax=ax_scores)
    ax_scores.set_title('Average Silhouette Score by Aspect')
    ax_scores.set_xlabel('Cluster: aspect')
    ax_scores.set_ylabel('Average silhouette score')
    ax_scores.tick_params(axis='x', labelrotation=45)

    # Plot 2: Cluster sizes
    sns.barplot(x=list(cluster_sizes.keys()), y=list(cluster_sizes.values()), ax=ax_sizes)
    ax_sizes.set_title('Cluster Sizes')
    ax_sizes.set_xlabel('Cluster')
    ax_sizes.set_ylabel('Number of reviews')

    fig.tight_layout()

    return evaluation, fig

def main(input_file, nrows=None):
    """Run the full pipeline: load, reshape, preprocess, cluster, label, evaluate.

    Args:
        input_file: Path of the reviews CSV file to read.
        nrows: Optional number of rows to read from the file; ``None``
            (the default) reads the whole file.

    Returns:
        A tuple ``(transformed_df, aspect_labels, evaluation_results,
        evaluation_plot)``: the reshaped reviews with the added columns
        ``processed_text``, ``aspect_cluster`` and ``aspect_label`` (plus
        whatever ``label_clusters`` adds), the per-cluster label details, the
        evaluation dict and the evaluation figure.
    """
    # Read the data from the given file instead of relying on a notebook-level
    # `df` that happens to exist in the kernel
    print("Reading data...")
    df = pd.read_csv(input_file, nrows=nrows)

    # Transform dataframe
    print("Transforming dataframe...")
    transformed_df = transform_dataframe(df)

    # transform_dataframe does not create 'processed_text', so build it here
    # before vectorizing
    tqdm.pandas(desc="Preprocessing texts")
    transformed_df['processed_text'] = transformed_df['review_text'].progress_apply(preprocess_text)

    # Extract aspects (same settings as extract_aspects)
    print("Extracting aspects...")
    vectorizer = TfidfVectorizer(max_features=1000)
    tfidf_matrix = vectorizer.fit_transform(transformed_df['processed_text'])

    kmeans = KMeans(n_clusters=5, random_state=42, n_init=10)
    clusters = kmeans.fit_predict(tfidf_matrix)

    # Get cluster terms: the five highest-weighted centroid terms, strongest first
    feature_names = vectorizer.get_feature_names_out()
    cluster_terms = {
        i: [feature_names[idx] for idx in kmeans.cluster_centers_[i].argsort()[-5:][::-1]]
        for i in range(kmeans.n_clusters)
    }

    transformed_df['aspect_cluster'] = clusters

    # Label clusters and evaluate
    aspect_labels = label_clusters(transformed_df, cluster_terms, tfidf_matrix, kmeans)
    evaluation_results, evaluation_plot = evaluate_clustering(
        transformed_df, tfidf_matrix, kmeans, aspect_labels
    )

    # Add aspect labels to dataframe
    transformed_df['aspect_label'] = transformed_df['aspect_cluster'].map(
        lambda x: aspect_labels[x]['predominant_aspect']
    )

    return transformed_df, aspect_labels, evaluation_results, evaluation_plot

# Example usage
# Use the same relative location that the data-loading cell earlier in this
# notebook reads the reviews from.
file_path = '../data/raw/hotel_reviews.csv'
final_df, aspect_labels, evaluation, plot = main(file_path)

# Print evaluation results
print("\nClustering Evaluation:")
print(f"Overall Silhouette Score: {evaluation['overall_silhouette_score']:.3f}")
print("\nCluster Details:")
for cluster_id, details in aspect_labels.items():
    print(f"\nCluster {cluster_id}:")
    print(f"Predominant Aspect: {details['predominant_aspect']}")
    print(f"Top Terms: {', '.join(details['top_terms'])}")
    print(f"Average Silhouette Score: {details['avg_silhouette']:.3f}")

# Display evaluation visualizations.
# pyplot's show() is used instead of Figure.show(): the latter only emits a
# warning on non-GUI backends, whereas plt.show() renders the open figure.
plt.show()