In [None]:
import numpy as np
import pandas as pd
import re
import string
from collections import Counter
import math
from typing import List, Tuple, Dict, Any
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Download required NLTK data
def download_nltk_data():
    """Make sure the NLTK resources used by this notebook are installed.

    Each resource is first looked up in the local NLTK data path; only the
    ones that raise ``LookupError`` are downloaded.
    """
    for resource in ('punkt', 'punkt_tab', 'stopwords'):
        # punkt* resources are looked up under tokenizers/, the rest under corpora/.
        category = 'tokenizers' if 'punkt' in resource else 'corpora'
        resource_path = f'{category}/{resource}'

        try:
            nltk.data.find(resource_path)
        except LookupError:
            print(f"Downloading NLTK {resource}...")
            nltk.download(resource, quiet=True)

# Fetch any missing NLTK resources (punkt, punkt_tab, stopwords) as soon as
# this cell runs.
download_nltk_data()


In [None]:

class TextPreprocessor:
    """Text cleaning, tokenization, stopword removal and stemming for similarity analysis.

    Args:
        use_stemming: When True, every kept token is reduced to its Porter stem.
        remove_stopwords: When True, the NLTK English stopwords plus a set of
            domain words are dropped from the token stream.
        extra_stopwords: Optional iterable of domain words used instead of
            ``DOMAIN_STOPWORDS``. ``None`` keeps the default hotel-review list.
    """

    # Default domain words that are treated as stopwords for hotel reviews.
    DOMAIN_STOPWORDS = ('hotel', 'room', 'stay', 'stayed', 'night', 'nights')

    def __init__(self, use_stemming=True, remove_stopwords=True, extra_stopwords=None):
        self.use_stemming = use_stemming
        self.remove_stopwords = remove_stopwords

        # Define every attribute unconditionally so they always exist,
        # whichever combination of flags was requested.
        self.stemmer = None
        self.stop_words = set()
        self._domain_stop_stems = set()

        if use_stemming:
            self.stemmer = PorterStemmer()

        if remove_stopwords:
            chosen = self.DOMAIN_STOPWORDS if extra_stopwords is None else extra_stopwords
            domain_words = [word.lower() for word in chosen]

            self.stop_words = set(stopwords.words('english'))
            self.stop_words.update(domain_words)

            # Stemmed forms of the domain words, so that inflections such as
            # "rooms" (stem "room") are filtered as well as the listed words.
            if self.stemmer is not None:
                self._domain_stop_stems = {self.stemmer.stem(word) for word in domain_words}

    def clean_text(self, text: str) -> str:
        """Lower-case ``text`` and strip URLs, e-mails, long digit runs and symbols.

        Args:
            text: Raw text. Any non-string value (None, NaN, lists, ...) is
                treated as missing.

        Returns:
            The cleaned text with single spaces between words, or ``""`` when
            ``text`` is not a string.
        """
        # A str is never NA, so the type check alone covers missing values and
        # avoids the ambiguous-truth-value error pd.isna() gives for sequences.
        if not isinstance(text, str):
            return ""

        text = text.lower()

        # Normalize whitespace and newlines before pattern matching
        text = re.sub(r'\s+', ' ', text)

        # Remove URLs, emails, phone numbers
        text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
        text = re.sub(r'\S+@\S+', '', text)
        text = re.sub(r'\b\d{10,}\b', '', text)

        # Remove special characters but keep basic punctuation for sentence boundaries
        text = re.sub(r'[^\w\s\.\,\!\?\;\:]', ' ', text)

        # Collapse runs of the same punctuation mark ("!!!" -> "!")
        text = re.sub(r'([.,!?])\1+', r'\1', text)

        # The removals above leave runs of spaces behind; collapse them again.
        text = re.sub(r'\s+', ' ', text)

        return text.strip()

    def tokenize_and_process(self, text: str) -> List[str]:
        """Clean ``text`` and turn it into a list of filtered (and stemmed) tokens.

        Tokens shorter than two characters, purely numeric tokens and tokens
        made only of punctuation are dropped. Stopwords are removed and the
        remaining tokens stemmed according to the instance flags.

        Returns:
            The processed tokens in their original order; ``[]`` when nothing
            is left after cleaning.
        """
        cleaned_text = self.clean_text(text)
        if not cleaned_text:
            return []

        processed_tokens = []
        for token in word_tokenize(cleaned_text):
            if len(token) < 2 or token.isdigit():
                continue

            # Character-wise test: ``token in string.punctuation`` would be a
            # substring search and miss tokens such as "!?".
            if all(ch in string.punctuation for ch in token):
                continue

            if self.remove_stopwords and token in self.stop_words:
                continue

            if self.use_stemming:
                token = self.stemmer.stem(token)
                # Inflected domain words only match the stopword list once stemmed.
                if self.remove_stopwords and token in self._domain_stop_stems:
                    continue

            processed_tokens.append(token)

        return processed_tokens

    def preprocess(self, text: str) -> str:
        """Run the full pipeline and return the tokens joined by single spaces."""
        return ' '.join(self.tokenize_and_process(text))

In [None]:

class ManualTFIDFVectorizer:
    """Manual implementation of TF-IDF vectorization for educational purposes.

    Documents are expected to be already pre-processed strings; tokens are
    obtained by splitting on whitespace. TF is the relative frequency of a
    term inside a document and IDF is ``log(N / df)`` without smoothing, so a
    term that occurs in every document gets weight 0.

    Args:
        max_features: Keep at most this many terms, preferring the ones with
            the highest document frequency. A falsy value disables the limit.
        min_df: Minimum document frequency. An ``int`` is an absolute number
            of documents, a ``float`` a proportion of the corpus.
        max_df: Maximum document frequency, interpreted like ``min_df``.
    """

    def __init__(self, max_features=10000, min_df=2, max_df=0.95):
        self.max_features = max_features
        self.min_df = min_df
        self.max_df = max_df
        self.vocabulary = {}
        self.idf_values = {}
        self.feature_names = []
        self.n_documents = 0

    @staticmethod
    def _tokenize(doc) -> List[str]:
        """Split a pre-processed document on whitespace; non-strings yield no tokens."""
        return doc.split() if isinstance(doc, str) else []

    def _df_threshold(self, value):
        """Turn a min_df/max_df setting into a document-count threshold.

        Proportions are deliberately not truncated to an int: flooring a
        minimum (e.g. 0.5 * 3 -> 1) would admit terms below the requested share.
        """
        return value if isinstance(value, int) else value * self.n_documents

    def _compute_tf(self, text_tokens: List[str]) -> Dict[str, float]:
        """Return the relative frequency of every distinct token in a document."""
        total_tokens = len(text_tokens)
        if total_tokens == 0:
            return {}

        return {word: count / total_tokens for word, count in Counter(text_tokens).items()}

    def _compute_idf(self, documents: List[List[str]]):
        """Build the vocabulary and the IDF value of every kept term.

        Replaces ``vocabulary``, ``feature_names``, ``idf_values`` and
        ``n_documents`` with values derived from ``documents``.
        """
        self.n_documents = len(documents)

        # Document frequency: each document counts a term at most once.
        df_counts = Counter()
        for doc_tokens in documents:
            df_counts.update(set(doc_tokens))

        min_count = self._df_threshold(self.min_df)
        max_count = self._df_threshold(self.max_df)
        kept_terms = [
            (term, df) for term, df in df_counts.items()
            if min_count <= df <= max_count
        ]

        # Most frequent first; ties are broken alphabetically so the feature
        # order (and the max_features cut) does not depend on hash ordering.
        kept_terms.sort(key=lambda item: (-item[1], item[0]))
        if self.max_features and len(kept_terms) > self.max_features:
            kept_terms = kept_terms[:self.max_features]

        self.vocabulary = {term: idx for idx, (term, _) in enumerate(kept_terms)}
        self.feature_names = [term for term, _ in kept_terms]
        # Rebuilt from scratch so a refit never keeps terms from an earlier corpus.
        self.idf_values = {term: math.log(self.n_documents / df) for term, df in kept_terms}

    def fit_transform(self, documents: List[str]) -> np.ndarray:
        """Learn vocabulary and IDF from ``documents`` and return their TF-IDF matrix.

        Returns:
            Dense array of shape ``(len(documents), len(vocabulary))``.
        """
        # Materialize once: the documents are read for fitting and for transforming.
        documents = list(documents)
        self._compute_idf([self._tokenize(doc) for doc in documents])
        return self.transform(documents)

    def transform(self, documents: List[str]) -> np.ndarray:
        """Convert ``documents`` to TF-IDF vectors using the fitted vocabulary.

        Terms that are not in the vocabulary are ignored.

        Returns:
            Dense array of shape ``(len(documents), len(vocabulary))``.
        """
        documents = list(documents)
        tfidf_matrix = np.zeros((len(documents), len(self.vocabulary)))

        for doc_idx, doc in enumerate(documents):
            for term, tf in self._compute_tf(self._tokenize(doc)).items():
                term_idx = self.vocabulary.get(term)
                if term_idx is not None:
                    tfidf_matrix[doc_idx, term_idx] = tf * self.idf_values[term]

        return tfidf_matrix

    def get_feature_names_out(self) -> List[str]:
        """Return the vocabulary terms in column order."""
        return self.feature_names

In [None]:

class SimilarReviewsFinder:
    """Find the reviews of a corpus that are most similar to a query review.

    Reviews are pre-processed with ``TextPreprocessor``, vectorized with
    TF-IDF and compared with cosine similarity.

    Args:
        use_sklearn: Use scikit-learn's ``TfidfVectorizer`` when True,
            otherwise the ``ManualTFIDFVectorizer`` defined in this notebook.
        max_features: Upper bound on the vocabulary size passed to the vectorizer.
    """

    def __init__(self, use_sklearn=True, max_features=10000):
        self.use_sklearn = use_sklearn
        self.max_features = max_features
        self.preprocessor = TextPreprocessor(use_stemming=True, remove_stopwords=True)

        if use_sklearn:
            self.vectorizer = TfidfVectorizer(
                max_features=max_features,
                min_df=2,
                max_df=0.95,
                stop_words='english',
                ngram_range=(1, 2),  # Include bigrams
                lowercase=True
            )
        else:
            self.vectorizer = ManualTFIDFVectorizer(
                max_features=max_features,
                min_df=2,
                max_df=0.95
            )

        self.reviews = []          # original texts of the reviews kept in the matrix
        self.tfidf_matrix = None   # set by build_tfidf_matrix
        self.feature_names = []

    def load_reviews(self, filename: str) -> List[str]:
        """Read one review per non-empty line from a UTF-8 text file.

        Returns:
            The stripped, non-empty lines, or ``[]`` when the file does not exist.
        """
        print(f"Loading reviews from {filename}...")

        reviews = []
        try:
            with open(filename, 'r', encoding='utf-8') as f:
                for line_num, line in enumerate(f, 1):
                    line = line.strip()
                    if line:  # Skip empty lines
                        reviews.append(line)

                    # line_num counts every line read, including blank ones.
                    if line_num % 5000 == 0:
                        print(f"Loaded {line_num} reviews...")

        except FileNotFoundError:
            print(f"Error: File {filename} not found!")
            return []

        print(f"Successfully loaded {len(reviews)} reviews")
        return reviews

    def preprocess_reviews(self, reviews: List[str]) -> List[str]:
        """Run every review through the preprocessor, printing progress every 2000 items.

        Returns:
            One processed string per input review, in the same order.
        """
        print("Preprocessing reviews...")
        processed_reviews = []

        for i, review in enumerate(reviews):
            if i % 2000 == 0:
                print(f"Processed {i}/{len(reviews)} reviews")

            processed_reviews.append(self.preprocessor.preprocess(review))

        return processed_reviews

    def build_tfidf_matrix(self, reviews: List[str]):
        """Pre-process ``reviews`` and fit the TF-IDF matrix on the non-empty ones.

        Sets ``tfidf_matrix``, ``reviews`` (original texts aligned with the
        matrix rows) and ``feature_names``.

        Raises:
            ValueError: If no review has any content left after preprocessing.
        """
        print("Building TF-IDF matrix...")

        processed_reviews = self.preprocess_reviews(reviews)

        # Keep only reviews that still have content, remembering their positions
        # so the original texts can be aligned with the matrix rows.
        valid_indices = [i for i, review in enumerate(processed_reviews) if review.strip()]
        valid_reviews = [processed_reviews[i] for i in valid_indices]

        print(f"Using {len(valid_reviews)} valid reviews out of {len(reviews)}")

        if not valid_reviews:
            raise ValueError("No non-empty reviews left after preprocessing.")

        self.tfidf_matrix = self.vectorizer.fit_transform(valid_reviews)
        self.reviews = [reviews[i] for i in valid_indices]  # Keep original reviews

        if hasattr(self.vectorizer, 'get_feature_names_out'):
            self.feature_names = self.vectorizer.get_feature_names_out()
        else:
            # Fallback for vectorizers that only expose the older accessor.
            self.feature_names = self.vectorizer.get_feature_names()

        print(f"TF-IDF matrix shape: {self.tfidf_matrix.shape}")
        print(f"Vocabulary size: {len(self.feature_names)}")

    def find_similar_reviews(self, query_review: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """Return up to ``top_k`` reviews ranked by cosine similarity to the query.

        Reviews whose similarity is zero share no vocabulary term with the
        query and are not reported, so fewer than ``top_k`` results (possibly
        none) may be returned.

        Returns:
            A list of dicts with keys ``rank``, ``similarity_score``,
            ``review``, ``review_index`` (position in ``self.reviews``) and
            ``preview`` (first 200 characters).

        Raises:
            ValueError: If ``build_tfidf_matrix`` has not been called.
        """
        if self.tfidf_matrix is None:
            raise ValueError("TF-IDF matrix not built. Call build_tfidf_matrix first.")

        # A negative top_k would otherwise slice "[:top_k]" and return most of the corpus.
        if top_k <= 0:
            return []

        print(f"\nFinding {top_k} most similar reviews...")
        print(f"Query review: {query_review[:100]}...")

        processed_query = self.preprocessor.preprocess(query_review)
        if not processed_query.strip():
            print("Warning: Query review is empty after preprocessing")
            return []

        query_vector = self.vectorizer.transform([processed_query])
        similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()

        # Indices of the best matches, highest similarity first
        top_indices = np.argsort(similarities)[::-1][:top_k]

        results = []
        for idx in top_indices:
            score = float(similarities[idx])
            if score <= 0.0:
                # Sorted descending: everything from here on is unrelated.
                break

            idx = int(idx)
            original_review = self.reviews[idx]
            if len(original_review) > 200:
                preview = original_review[:200] + "..."
            else:
                preview = original_review

            results.append({
                'rank': len(results) + 1,
                'similarity_score': score,
                'review': original_review,
                'review_index': idx,
                'preview': preview
            })

        return results

    def analyze_query_terms(self, query_review: str, top_terms: int = 10) -> List[Tuple[str, float]]:
        """Return the query's vocabulary terms with the highest TF-IDF weights.

        Returns:
            Up to ``top_terms`` ``(term, score)`` pairs sorted by descending
            score; ``[]`` when the query is empty after preprocessing.
        """
        processed_query = self.preprocessor.preprocess(query_review)
        if not processed_query.strip():
            return []

        query_vector = self.vectorizer.transform([processed_query])

        # Column indices of the non-zero entries (terms present in the query)
        nonzero_indices = query_vector.nonzero()[1]
        term_scores = [(self.feature_names[i], query_vector[0, i]) for i in nonzero_indices]
        term_scores.sort(key=lambda pair: pair[1], reverse=True)

        return term_scores[:top_terms]

    def display_results(self, results: List[Dict[str, Any]], query_review: str):
        """Print the query, its most important terms and the ranked results."""
        print("\n" + "="*80)
        print("SIMILAR REVIEWS SEARCH RESULTS")
        print("="*80)

        print(f"\nQUERY REVIEW:")
        print(f"{query_review}\n")

        important_terms = self.analyze_query_terms(query_review)
        if important_terms:
            print("IMPORTANT QUERY TERMS (TF-IDF scores):")
            for term, score in important_terms:
                print(f"  {term}: {score:.4f}")
            print()

        print("MOST SIMILAR REVIEWS:")
        print("-" * 80)

        for result in results:
            print(f"RANK #{result['rank']} (Similarity: {result['similarity_score']:.4f})")
            print(f"Review Index: {result['review_index']}")
            print(f"Review: {result['preview']}")
            print("-" * 80)

    def get_vocabulary_stats(self) -> Dict[str, Any]:
        """Summarize the fitted vocabulary and TF-IDF matrix.

        Returns:
            A dict with ``vocabulary_size``, ``total_documents``,
            ``matrix_sparsity``, ``top_terms`` (20 terms with the largest
            summed TF-IDF) and ``avg_terms_per_doc``; ``{}`` when nothing has
            been fitted yet or the vocabulary is empty.
        """
        if self.tfidf_matrix is None or len(self.feature_names) == 0:
            return {}

        # Summed TF-IDF weight of every term across the corpus
        term_sums = np.array(self.tfidf_matrix.sum(axis=0)).flatten()
        term_stats = list(zip(self.feature_names, term_sums))
        term_stats.sort(key=lambda pair: pair[1], reverse=True)

        if hasattr(self.tfidf_matrix, 'nnz'):  # Sparse matrix
            nonzero_count = self.tfidf_matrix.nnz
            total_elements = self.tfidf_matrix.shape[0] * self.tfidf_matrix.shape[1]
            sparsity = 1 - (nonzero_count / total_elements)
            # Consecutive indptr differences give the stored entries per row.
            avg_terms = np.mean(np.diff(self.tfidf_matrix.indptr))
        else:  # Dense matrix
            nonzero_count = np.count_nonzero(self.tfidf_matrix)
            sparsity = 1 - (nonzero_count / self.tfidf_matrix.size)
            avg_terms = np.mean(np.count_nonzero(self.tfidf_matrix, axis=1))

        return {
            'vocabulary_size': len(self.feature_names),
            'total_documents': self.tfidf_matrix.shape[0],
            'matrix_sparsity': sparsity,
            'top_terms': term_stats[:20],
            'avg_terms_per_doc': avg_terms
        }


In [None]:

def main(reviews_path='reviews.txt', use_sklearn=True, interactive=True):
    """Demonstrate the similar reviews finder on a review file.

    Loads the reviews, builds the TF-IDF matrix, prints vocabulary statistics,
    runs a set of example queries and finally (optionally) lets the user type
    queries of their own.

    Args:
        reviews_path: Text file with one review per line.
        use_sklearn: Forwarded to ``SimilarReviewsFinder``.
        interactive: When False, no ``input()`` prompt is issued: the examples
            run without pausing and the interactive loop is skipped.
    """

    print("Similar Reviews Finder")
    print("=" * 50)

    # Initialize finder
    finder = SimilarReviewsFinder(use_sklearn=use_sklearn, max_features=10000)

    # Load reviews
    reviews = finder.load_reviews(reviews_path)
    if not reviews:
        print("No reviews loaded. Exiting.")
        return

    # Build TF-IDF matrix; fitting can fail on corpora with no usable terms
    try:
        finder.build_tfidf_matrix(reviews)
    except ValueError as e:
        print(f"Error: could not build TF-IDF matrix: {e}")
        return

    # Show vocabulary statistics (an empty dict means there is no vocabulary)
    stats = finder.get_vocabulary_stats()
    if not stats:
        print("Vocabulary is empty; nothing to search. Exiting.")
        return

    print(f"\nVOCABULARY STATISTICS:")
    print(f"Vocabulary size: {stats['vocabulary_size']}")
    print(f"Total documents: {stats['total_documents']}")
    print(f"Matrix sparsity: {stats['matrix_sparsity']:.4f}")
    print(f"Average terms per document: {stats['avg_terms_per_doc']:.2f}")

    print(f"\nTop 10 terms by total TF-IDF score:")
    for term, score in stats['top_terms'][:10]:
        print(f"  {term}: {score:.4f}")

    # Test with some example queries
    print("\n" + "="*50)
    print("TESTING WITH EXAMPLE QUERIES")
    print("="*50)

    example_queries = [
        "The hotel room was clean and comfortable with excellent service",
        "Terrible experience, dirty rooms and rude staff",
        "Great location near the beach, beautiful view from balcony",
        "The breakfast was amazing and the staff was very helpful",
        "Expensive parking and noisy rooms, not worth the money"
    ]

    for i, query in enumerate(example_queries, 1):
        print(f"\nEXAMPLE {i}:")
        results = finder.find_similar_reviews(query, top_k=3)
        finder.display_results(results, query)

        if interactive:
            try:
                input("Press Enter to continue to next example...")
            except EOFError:
                # No stdin available: finish the examples without prompting.
                interactive = False

    if interactive:
        print("\n" + "="*50)
        print("INTERACTIVE MODE")
        print("="*50)
        print("Enter your own review to find similar ones (or 'quit' to exit):")

        while True:
            try:
                user_query = input("\nEnter review: ").strip()
            except (EOFError, KeyboardInterrupt):
                # Closed stdin or Ctrl-C ends the session like 'quit' does.
                break

            if user_query.lower() in ['quit', 'exit', 'q']:
                break

            if not user_query:
                continue

            try:
                results = finder.find_similar_reviews(user_query, top_k=5)
                if results:
                    finder.display_results(results, user_query)
                else:
                    print("No similar reviews found.")

            except Exception as e:
                print(f"Error: {e}")

    print("\nThank you for using Similar Reviews Finder!")

# Start the demo only when this code runs as the main module, not when it is
# imported from elsewhere.
if __name__ == "__main__":
    main()