# Labeling

In [17]:
!pip install textblob
!pip install vaderSentiment
!pip install transformers
!pip install scikit-learn




In [18]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Text processing and ML
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
from scipy import stats
from scipy.stats import zscore
from sklearn.preprocessing import MinMaxScaler


In [19]:
df = pd.read_csv('yelp_review_preprocessed.csv')

# Convert date column to datetime
df['date'] = pd.to_datetime(df['date'])

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

Dataset shape: (7673, 19)
Columns: ['business_name', 'review_id', 'user_id', 'date', 'text', 'stars_review', 'stars_business_average', 'cleaned_text', 'original_length', 'cleaned_length', 'normalized_text', 'case_folded_text', 'tokens', 'tokens_ml', 'tokens_dl', 'tokens_bert', 'lemmatized_ml', 'lemmatized_dl', 'lemmatized_bert']


## Sentiment Labeling

In [20]:
def enhanced_sentiment_analysis_single_business(df):
    """
    Enhanced sentiment analysis optimized for single business dataset
    Combines multiple approaches for robust labeling
    """
    print("üé≠ Starting Enhanced Sentiment Analysis for Single Business...")

    df = df.copy()

    # Ensure we have text to analyze
    df['analysis_text'] = df['lemmatized_dl'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

    # ============ 1. VADER Sentiment ============
    print("üîç 1. Calculating VADER sentiment...")
    analyzer = SentimentIntensityAnalyzer()

    def get_vader_scores(text):
        scores = analyzer.polarity_scores(str(text))
        return scores['compound'], scores['pos'], scores['neg'], scores['neu']

    df[['vader_compound', 'vader_pos', 'vader_neg', 'vader_neu']] = df['analysis_text'].apply(
        lambda x: pd.Series(get_vader_scores(x))
    )

    # ============ 2. TextBlob Sentiment ============
    print("üîç 2. Calculating TextBlob sentiment...")
    def get_textblob_sentiment(text):
        blob = TextBlob(str(text))
        return blob.sentiment.polarity, blob.sentiment.subjectivity

    df[['textblob_polarity', 'textblob_subjectivity']] = df['analysis_text'].apply(
        lambda x: pd.Series(get_textblob_sentiment(x))
    )

    # ============ 3. Lexicon-Based Scoring ============
    print("üîç 3. Calculating lexicon-based scores...")

    # Enhanced sentiment lexicons
    positive_words = {
        'excellent', 'amazing', 'great', 'good', 'wonderful', 'fantastic', 'outstanding',
        'perfect', 'friendly', 'clean', 'tasty', 'love', 'nice', 'delicious', 'awesome',
        'fresh', 'perfectly', 'best', 'favorite', 'recommend', 'yummy', 'satisfied', 'crispy', 'juicy', 'tender', 'flavorful', 'savory', 'friendly'
    }

    negative_words = {
        'terrible', 'awful', 'horrible', 'bad', 'poor', 'disappointing', 'worst',
        'waste', 'rude', 'dirty', 'slow', 'cold', 'hate', 'overpriced', 'disgusting',
        'mediocre', 'undercooked', 'salty', 'bland', 'stale', 'expensive', 'greasy', 'cold', 'undercooked', 'noisy', 'rude', 'burnt'
    }

    def lexicon_sentiment_score(text):
        if not isinstance(text, str):
            return 0
        words = set(text.lower().split())
        pos_count = len(words.intersection(positive_words))
        neg_count = len(words.intersection(negative_words))
        total = pos_count + neg_count

        if total == 0:
            return 0
        return (pos_count - neg_count) / total

    df['lexicon_score'] = df['analysis_text'].apply(lexicon_sentiment_score)

    # ============ 4. Star Rating Integration ============
    print("üîç 4. Integrating star ratings...")

    # Convert stars to sentiment score (1-2: negative, 3: neutral, 4-5: positive)
    def stars_to_sentiment_score(stars):
        if stars >= 4:
            return 1.0
        elif stars <= 2:
            return -1.0
        else:
            return 0.0

    df['star_sentiment'] = df['stars_review'].apply(stars_to_sentiment_score)

    # ============ 5. Combined Sentiment Score ============
    print("üîç 5. Creating combined sentiment score...")

    # Weighted combination of all signals
    df['combined_sentiment'] = (
            0.3 * df['vader_compound'] +           # VADER is good for social media text
            0.2 * df['textblob_polarity'] +        # TextBlob for general sentiment
            0.2 * df['lexicon_score'] +            # Custom lexicon for restaurant context
            0.3 * df['star_sentiment']             # Direct user rating (most important)
    )

    # ============ 6. Final Sentiment Labeling ============
    print("üîç 6. Assigning final sentiment labels...")

    def assign_sentiment_label(row):
        combined = row['combined_sentiment']
        stars = row['stars_review']

        # Rule-based approach with confidence
        if combined >= 0.3 or stars >= 4:
            return 'positive'
        elif combined <= -0.3 or stars <= 2:
            return 'negative'
        else:
            return 'neutral'

    df['sentiment_label'] = df.apply(assign_sentiment_label, axis=1)

    # ============ 7. Sentiment Confidence ============
    print("üîç 7. Calculating sentiment confidence...")

    def calculate_confidence(row):
        # Higher confidence when all signals agree
        signals = [
            np.sign(row['vader_compound']),
            np.sign(row['textblob_polarity']),
            np.sign(row['lexicon_score']),
            np.sign(row['star_sentiment'])
        ]

        agreement = sum(1 for s in signals if s == np.sign(row['combined_sentiment']))
        return agreement / len(signals)

    df['sentiment_confidence'] = df.apply(calculate_confidence, axis=1)

    # ============ 8. Analysis ============
    print("\nüìä SENTIMENT ANALYSIS RESULTS:")
    print("=" * 50)
    sentiment_counts = df['sentiment_label'].value_counts()
    for label, count in sentiment_counts.items():
        percentage = (count / len(df)) * 100
        print(f"   {label.upper()}: {count} reviews ({percentage:.1f}%)")

    # Sentiment distribution by stars
    print("\n‚≠ê SENTIMENT BY STAR RATING:")
    sentiment_by_stars = pd.crosstab(df['stars_review'], df['sentiment_label'])
    print(sentiment_by_stars)

    return df

# Perform enhanced sentiment analysis
df = enhanced_sentiment_analysis_single_business(df)

üé≠ Starting Enhanced Sentiment Analysis for Single Business...
üîç 1. Calculating VADER sentiment...
üîç 2. Calculating TextBlob sentiment...
üîç 3. Calculating lexicon-based scores...
üîç 4. Integrating star ratings...
üîç 5. Creating combined sentiment score...
üîç 6. Assigning final sentiment labels...
üîç 7. Calculating sentiment confidence...

üìä SENTIMENT ANALYSIS RESULTS:
   POSITIVE: 6406 reviews (83.5%)
   NEGATIVE: 722 reviews (9.4%)
   NEUTRAL: 545 reviews (7.1%)

‚≠ê SENTIMENT BY STAR RATING:
sentiment_label  negative  neutral  positive
stars_review                                
1.0                   273        0         0
2.0                   449        0         0
3.0                     0      545       424
4.0                     0        0      2337
5.0                     0        0      3645


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7673 entries, 0 to 7672
Data columns (total 31 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   business_name           7673 non-null   object        
 1   review_id               7673 non-null   object        
 2   user_id                 7673 non-null   object        
 3   date                    7673 non-null   datetime64[ns]
 4   text                    7673 non-null   object        
 5   stars_review            7673 non-null   float64       
 6   stars_business_average  7673 non-null   float64       
 7   cleaned_text            7672 non-null   object        
 8   original_length         7673 non-null   int64         
 9   cleaned_length          7673 non-null   int64         
 10  normalized_text         7672 non-null   object        
 11  case_folded_text        7672 non-null   object        
 12  tokens                  7673 non-null   object  

In [22]:
df.head()

Unnamed: 0,business_name,review_id,user_id,date,text,stars_review,stars_business_average,cleaned_text,original_length,cleaned_length,...,vader_pos,vader_neg,vader_neu,textblob_polarity,textblob_subjectivity,lexicon_score,star_sentiment,combined_sentiment,sentiment_label,sentiment_confidence
0,Acme Oyster House,vHLTOsdILT7xgTu7TAWTUQ,417HF4q8ynnWtuJrkNax_g,2016-07-25 04:34:34,This place has amazing oysters and the BEST bl...,5.0,4.0,This place has amazing oysters and the BEST bl...,556,546,...,0.514,0.022,0.464,0.343162,0.531197,0,1.0,0.666652,positive,0.75
1,Acme Oyster House,I90lP6oPICTkrhCFGQt5tA,1UAb3zZQeGX6fzZax5DY1A,2016-12-19 20:27:16,OH MY!! A must try. We had no idea there would...,5.0,4.0,OH MY A must try We had no idea there would be...,425,407,...,0.41,0.091,0.499,0.214286,0.614286,0,1.0,0.633737,positive,0.75
2,Acme Oyster House,469eAl2fB069YTF_B5zW7w,p2kXD3gNu3N776C0WrmBjA,2018-08-23 20:58:39,The fried seafood was extremely hot. Very nice...,5.0,4.0,The fried seafood was extremely hot Very nice ...,530,498,...,0.421,0.0,0.579,0.360098,0.519608,0,1.0,0.66395,positive,0.75
3,Acme Oyster House,aPpHBDs7Jiiq0sb9YutOhQ,7cDhfvTSH1wTxEmXMj_ChQ,2013-06-24 18:07:12,I love this place. I wish my stay was longer s...,5.0,4.0,I love this place I wish my stay was longer so...,193,188,...,0.461,0.0,0.539,0.317273,0.490909,0,1.0,0.642065,positive,0.75
4,Acme Oyster House,k9OG5kA5ebruSx_f1T-P-A,7QTh-fkw9Nr2lO10-PV8yw,2010-10-06 08:03:20,"Loved the chargrilled oysters! I mean, seriou...",3.0,4.0,Loved the chargrilled oysters I mean seriously...,1286,1229,...,0.271,0.073,0.656,0.128603,0.601739,0,0.0,0.320831,positive,0.5


In [23]:
null_row = df[df[['cleaned_text', 'normalized_text', 'case_folded_text']].isna().any(axis=1)]
print("üîç Baris yang memiliki nilai NULL:")
null_row

üîç Baris yang memiliki nilai NULL:


Unnamed: 0,business_name,review_id,user_id,date,text,stars_review,stars_business_average,cleaned_text,original_length,cleaned_length,...,vader_pos,vader_neg,vader_neu,textblob_polarity,textblob_subjectivity,lexicon_score,star_sentiment,combined_sentiment,sentiment_label,sentiment_confidence
3787,Acme Oyster House,3Jji_9v3aoKe2Dbi44lWXg,eArpCCLM_Bx33KpevzNyZw,2014-04-30 03:33:43,„Ç´„Ç≠„Çí„Åü„Åè„Åï„ÇìÈ£ü„Åπ„Åæ„Åó„Åü„ÄÇÂÆâ„Åè„Å¶ÁæéÂë≥„Åó„ÅÑ„Åß„Åô„ÄÇ„Ç´„Ç≠„ÅØÁîü„Ç¨„Ç≠„Åß„ÄÅ‰∏ÄÂÄãÁ¥Ñ1„Éâ„É´„ÄÇË™øÁêÜ„Åó„Åü„Ç´„Ç≠„ÅØ„ÄÅ...,4.0,4.0,,141,0,...,0.0,0.0,1.0,0.0,0.0,0,1.0,0.3,positive,0.25


## Fake Review Detection

In [24]:
def enhanced_fake_review_detection_single_business(df):
    """
    Enhanced fake review detection with DUAL temporal analysis (daily + hourly spikes)
    Optimized for single business dataset
    """
    print("\nüïµÔ∏è Starting Enhanced Fake Review Detection with Dual Temporal Analysis...")

    df = df.copy()

    # ============ 1. DUAL TEMPORAL ANALYSIS ============
    print("‚è∞ 1. Analyzing DUAL temporal patterns (daily + hourly)...")

    # Convert to datetime and sort
    df['date'] = pd.to_datetime(df['date'])
    df = df.sort_values('date')

    # Create temporal features
    df['date_only'] = df['date'].dt.date
    df['hour'] = df['date'].dt.floor('H')  # Round to hour
    df['date_hour'] = df['date'].dt.strftime('%Y-%m-%d %H:00:00')  # Combine date and hour

    # DAILY analysis
    daily_counts = df.groupby('date_only').size()
    df['daily_count'] = df['date_only'].map(daily_counts)

    # HOURLY analysis - PERBAIKAN: perhatikan hari dan jam bersama-sama
    hourly_counts = df.groupby('date_hour').size()
    df['hourly_count'] = df['date_hour'].map(hourly_counts)

    # Calculate z-scores for both temporal patterns
    df['daily_spike_score'] = 0.0
    df['hourly_spike_score'] = 0.0

    # Daily z-score (if we have enough days)
    if len(daily_counts) > 3:
        try:
            daily_z = zscore(daily_counts)
            date_to_daily_z = dict(zip(daily_counts.index, daily_z))
            df['daily_spike_score'] = df['date_only'].map(lambda x: max(date_to_daily_z.get(x, 0), 0))
        except:
            print("   ‚ö†Ô∏è Daily z-score calculation failed")

    # Hourly z-score - PERBAIKAN: berdasarkan date_hour (kombinasi tanggal dan jam)
    if len(hourly_counts) > 5:
        try:
            hourly_z = zscore(hourly_counts)
            date_hour_to_hourly_z = dict(zip(hourly_counts.index, hourly_z))
            df['hourly_spike_score'] = df['date_hour'].map(lambda x: max(date_hour_to_hourly_z.get(x, 0), 0))
        except:
            print("   ‚ö†Ô∏è Hourly z-score calculation failed")

    # Combined temporal score (weighted)
    df['temporal_spike_score'] = (
            0.6 * df['daily_spike_score'] +  # Daily spikes are more significant
            0.4 * df['hourly_spike_score']   # Hourly spikes are supporting evidence
    )

    # ============ 2. USER BEHAVIOR ANALYSIS ============
    print("üë§ 2. Analyzing user behavior patterns...")

    user_stats = df.groupby('user_id').agg({
        'review_id': 'count',
        'date': ['min', 'max'],
        'stars_review': 'mean',
        'sentiment_label': lambda x: (x == 'positive').mean()
    }).round(3)

    # Flatten column names
    user_stats.columns = ['user_review_count', 'first_review', 'last_review',
                          'avg_user_rating', 'positive_ratio']

    # User activity metrics
    user_stats['activity_span_days'] = (user_stats['last_review'] - user_stats['first_review']).dt.days + 1
    user_stats['review_frequency'] = user_stats['user_review_count'] / user_stats['activity_span_days']

    # Detect suspicious user patterns
    user_stats['suspicious_user'] = (
            (user_stats['review_frequency'] > 3) |           # Too many reviews per day
            (user_stats['user_review_count'] > 50)           # Too many total reviews
    ).astype(int)

    # Merge user stats back to main dataframe
    df = df.merge(user_stats, on='user_id', how='left')

    # User behavior score
    df['user_behavior_score'] = (
            0.4 * MinMaxScaler().fit_transform(df[['user_review_count']]).flatten() +
            0.3 * MinMaxScaler().fit_transform(df[['review_frequency']]).flatten() +
            0.2 * df['suspicious_user']
    )

    # ============ 3. TEXT QUALITY & SIMILARITY ANALYSIS ============
    print("üìù 3. Analyzing text quality and similarities...")

    # Text length analysis
    df['text_length'] = df['lemmatized_dl'].apply(
        lambda x: len(x) if isinstance(x, list) else len(str(x).split())
    )

    # Text similarity analysis
    print("   üîç Calculating text similarities (this may take a while)...")

    df['text_similarity_score'] = 0.0

    try:
        # Use a sample for efficiency while maintaining representativeness
        sample_size = min(7673, len(df))
        sample_indices = np.random.choice(df.index, sample_size, replace=False)
        sample_texts = df.loc[sample_indices, 'analysis_text'].tolist()

        # TF-IDF vectorization
        vectorizer = TfidfVectorizer(
            max_features=5000,
            stop_words='english',
            min_df=2,
            max_df=0.8
        )

        tfidf_matrix = vectorizer.fit_transform(sample_texts)
        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Calculate similarity scores
        similarity_scores = np.zeros(len(df))
        for i, idx in enumerate(sample_indices):
            # Count reviews with high similarity (>0.85) excluding self
            similar_indices = np.where(similarity_matrix[i] > 0.85)[0]
            similar_count = len(similar_indices) - 1  # exclude self

            if similar_count > 0:
                # Normalize score based on number of similar reviews
                score = min(similar_count / 5.0, 1.0)  # cap at 1.0
                similarity_scores[idx] = score

        df['text_similarity_score'] = similarity_scores
        print(f"   ‚úÖ Text similarity analysis completed on {sample_size} samples")

    except Exception as e:
        print(f"   ‚ö†Ô∏è Text similarity calculation failed: {e}")
        df['text_similarity_score'] = 0.0

    # ============ 4. RATING ANOMALIES & PATTERNS ============
    print("‚≠ê 4. Detecting rating anomalies and patterns...")

    # Deviation from business average
    business_avg = df['stars_business_average'].iloc[0]
    df['rating_deviation'] = abs(df['stars_review'] - business_avg) / 4.0

    # Extreme rater detection
    user_rating_std = df.groupby('user_id')['stars_review'].std().fillna(0)
    df['user_rating_consistency'] = df['user_id'].map(user_rating_std)
    df['extreme_rater_score'] = ((df['user_rating_consistency'] < 0.5) &
                                 (df['user_review_count'] > 1)).astype(int)

    # Rating distribution analysis
    rating_counts = df['stars_review'].value_counts().sort_index()
    total_ratings = len(df)

    # Detect rating manipulation (too many 5-star or 1-star reviews from same user)
    user_rating_stats = df.groupby('user_id').agg({
        'stars_review': ['count', lambda x: (x == 5).sum(), lambda x: (x == 1).sum()]
    })
    user_rating_stats.columns = ['total', 'five_star_count', 'one_star_count']
    user_rating_stats['five_star_ratio'] = user_rating_stats['five_star_count'] / user_rating_stats['total']
    user_rating_stats['one_star_ratio'] = user_rating_stats['one_star_count'] / user_rating_stats['total']

    df['rating_manipulation_score'] = (
                                              df['user_id'].map(user_rating_stats['five_star_ratio']).fillna(0) +
                                              df['user_id'].map(user_rating_stats['one_star_ratio']).fillna(0)
                                      ) / 2

    # ============ 5. SENTIMENT-RATING CONSISTENCY ============
    print("üé≠ 5. Checking sentiment-rating consistency...")

    def sentiment_rating_consistency(row):
        sentiment = row['sentiment_label']
        stars = row['stars_review']

        # High inconsistency cases
        if (sentiment == 'positive' and stars <= 2) or (sentiment == 'negative' and stars >= 4):
            return 1.0
        # Medium inconsistency
        elif (sentiment == 'positive' and stars == 3) or (sentiment == 'negative' and stars == 3):
            return 0.5
        elif sentiment == 'neutral' and stars in [1, 5]:
            return 0.5
        else:
            return 0.0

    df['consistency_score'] = df.apply(sentiment_rating_consistency, axis=1)

    # ============ 6. COMBINE ALL SIGNALS ============
    print("üîó 6. Combining all detection signals with optimized weights...")

    # Optimized weights for single business analysis
    weights = {
        'temporal': 0.25,      # Daily + hourly spikes
        'user_behavior': 0.25,  # User patterns
        'text_similarity': 0.20, # Text copying
        'rating_anomaly': 0.15,  # Rating deviations
        'consistency': 0.15      # Sentiment-rating mismatch
    }

    # Calculate raw fake probability
    df['fake_probability_raw'] = (
            weights['temporal'] * MinMaxScaler().fit_transform(df[['temporal_spike_score']]).flatten() +
            weights['user_behavior'] * df['user_behavior_score'] +
            weights['text_similarity'] * df['text_similarity_score'] +
            weights['rating_anomaly'] * (
                    MinMaxScaler().fit_transform(df[['rating_deviation']]).flatten() * 0.6 +
                    df['extreme_rater_score'] * 0.2 +
                    MinMaxScaler().fit_transform(df[['rating_manipulation_score']]).flatten() * 0.12
            ) +
            weights['consistency'] * df['consistency_score']
    )

    # Scale to 0-1
    df['fake_probability'] = MinMaxScaler().fit_transform(df[['fake_probability_raw']])

    # ============ 7. FINAL FAKE LABEL ASSIGNMENT ============
    print("üè∑Ô∏è 7. Assigning final fake labels...")

    # Use 85th percentile as threshold (more conservative)
    threshold = df['fake_probability'].quantile(0.85)
    df['is_fake'] = (df['fake_probability'] >= threshold).astype(int)

    # ============ 8. ENHANCED EXPLAINABILITY ============
    print("üìã 8. Generating detailed explanations...")

    def generate_detailed_explanation(row):
        reasons = []

        # Temporal reasons
        if row['daily_spike_score'] > 1.5:
            reasons.append(f"High daily spike (z={row['daily_spike_score']:.2f})")
        elif row['daily_spike_score'] > 1.0:
            reasons.append(f"Medium daily spike (z={row['daily_spike_score']:.2f})")

        if row['hourly_spike_score'] > 2.0:
            reasons.append(f"High hourly spike (z={row['hourly_spike_score']:.2f})")
        elif row['hourly_spike_score'] > 1.5:
            reasons.append(f"Medium hourly spike (z={row['hourly_spike_score']:.2f})")

        # User behavior reasons
        if row['user_behavior_score'] > 0.8:
            reasons.append("Very suspicious user behavior")
        elif row['user_behavior_score'] > 0.6:
            reasons.append("Suspicious user behavior")

        if row['suspicious_user'] == 1:
            reasons.append("Flagged as suspicious user")

        # Text similarity reasons
        if row['text_similarity_score'] > 0.7:
            reasons.append(f"High text similarity ({row['text_similarity_score']:.2f})")
        elif row['text_similarity_score'] > 0.4:
            reasons.append(f"Medium text similarity ({row['text_similarity_score']:.2f})")

        # Rating anomaly reasons
        if row['rating_deviation'] > 0.75:
            reasons.append("Extreme rating deviation")
        elif row['rating_deviation'] > 0.5:
            reasons.append("High rating deviation")

        if row['extreme_rater_score'] == 1:
            reasons.append("Extreme rating pattern")

        if row['rating_manipulation_score'] > 0.8:
            reasons.append("Suspicious rating manipulation")

        # Consistency reasons
        if row['consistency_score'] > 0.7:
            reasons.append("High sentiment-rating mismatch")
        elif row['consistency_score'] > 0.4:
            reasons.append("Medium sentiment-rating mismatch")

        return "; ".join(reasons) if reasons else "Normal review pattern"

    df['fake_explanation'] = df.apply(generate_detailed_explanation, axis=1)

    # ============ 9. COMPREHENSIVE RESULTS ANALYSIS ============
    print("\nüîé COMPREHENSIVE FAKE REVIEW DETECTION RESULTS:")
    print("=" * 60)

    fake_count = df['is_fake'].sum()
    fake_percentage = (fake_count / len(df)) * 100

    print(f"   üìä Total reviews analyzed: {len(df):,}")
    print(f"   üö® Potential fake reviews: {fake_count} ({fake_percentage:.1f}%)")
    print(f"   üìà Detection threshold: {threshold:.3f}")
    print(f"   üìÖ Analysis period: {df['date'].min().strftime('%Y-%m-%d')} to {df['date'].max().strftime('%Y-%m-%d')}")

    # Fake reviews by sentiment
    print(f"\n   üé≠ FAKE REVIEWS BY SENTIMENT:")
    fake_by_sentiment = df[df['is_fake'] == 1]['sentiment_label'].value_counts()
    for sentiment, count in fake_by_sentiment.items():
        percentage = (count / fake_count) * 100 if fake_count > 0 else 0
        print(f"      {sentiment.upper()}: {count} reviews ({percentage:.1f}%)")

    # Fake reviews by rating
    print(f"\n   ‚≠ê FAKE REVIEWS BY STAR RATING:")
    fake_by_rating = df[df['is_fake'] == 1]['stars_review'].value_counts().sort_index()
    for stars, count in fake_by_rating.items():
        percentage = (count / fake_count) * 100 if fake_count > 0 else 0
        print(f"      {stars} stars: {count} reviews ({percentage:.1f}%)")

    # Top detection reasons
    print(f"\n   üîç TOP DETECTION REASONS:")
    explanation_counts = df[df['is_fake'] == 1]['fake_explanation'].value_counts().head(100)
    for reason, count in explanation_counts.items():
        print(f"      - {reason}: {count} reviews")

    return df, threshold

# Perform enhanced fake review detection
df, fake_threshold = enhanced_fake_review_detection_single_business(df)


üïµÔ∏è Starting Enhanced Fake Review Detection with Dual Temporal Analysis...
‚è∞ 1. Analyzing DUAL temporal patterns (daily + hourly)...
üë§ 2. Analyzing user behavior patterns...
üìù 3. Analyzing text quality and similarities...
   üîç Calculating text similarities (this may take a while)...
   ‚úÖ Text similarity analysis completed on 7673 samples
‚≠ê 4. Detecting rating anomalies and patterns...
üé≠ 5. Checking sentiment-rating consistency...
üîó 6. Combining all detection signals with optimized weights...
üè∑Ô∏è 7. Assigning final fake labels...
üìã 8. Generating detailed explanations...

üîé COMPREHENSIVE FAKE REVIEW DETECTION RESULTS:
   üìä Total reviews analyzed: 7,673
   üö® Potential fake reviews: 1151 (15.0%)
   üìà Detection threshold: 0.409
   üìÖ Analysis period: 2006-09-18 to 2022-01-17

   üé≠ FAKE REVIEWS BY SENTIMENT:
      POSITIVE: 846 reviews (73.5%)
      NEGATIVE: 260 reviews (22.6%)
      NEUTRAL: 45 reviews (3.9%)

   ‚≠ê FAKE REVIEWS BY STAR RA

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7673 entries, 0 to 7672
Data columns (total 59 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   business_name              7673 non-null   object        
 1   review_id                  7673 non-null   object        
 2   user_id                    7673 non-null   object        
 3   date                       7673 non-null   datetime64[ns]
 4   text                       7673 non-null   object        
 5   stars_review               7673 non-null   float64       
 6   stars_business_average     7673 non-null   float64       
 7   cleaned_text               7672 non-null   object        
 8   original_length            7673 non-null   int64         
 9   cleaned_length             7673 non-null   int64         
 10  normalized_text            7672 non-null   object        
 11  case_folded_text           7672 non-null   object        
 12  tokens

In [26]:
df.head()

Unnamed: 0,business_name,review_id,user_id,date,text,stars_review,stars_business_average,cleaned_text,original_length,cleaned_length,...,text_similarity_score,rating_deviation,user_rating_consistency,extreme_rater_score,rating_manipulation_score,consistency_score,fake_probability_raw,fake_probability,is_fake,fake_explanation
0,Acme Oyster House,DjEUxYAIbPmu9EnQMuso3A,Dd6ElS2Cng3Qag_h4IQC-Q,2006-09-18 22:01:13,"Their Fried Peace Maker Po-Boy was delicious, ...",3.0,4.0,Their Fried Peace Maker Po Boy was delicious E...,179,172,...,0.0,0.25,0.0,0,0.0,0.5,0.18,0.375372,0,Medium sentiment-rating mismatch
1,Acme Oyster House,vYwBMm1uK9VgA735nFMCLQ,yeGIAyHixJrIe-zmXiePWQ,2006-10-01 00:35:10,Full disclosure here. I designed the Acme Oyst...,4.0,4.0,Full disclosure here I designed the Acme Oyste...,931,910,...,0.0,0.0,0.0,0,0.0,0.0,0.075,0.072071,0,Normal review pattern
2,Acme Oyster House,0FgDHLa41Qy4U9kZi_qlYQ,LyXvIE71cMeiBaPZI_Yq2w,2006-10-05 18:25:34,Try to go when they aren't too crowded (this m...,4.0,4.0,Try to go when they are not too crowded this m...,538,519,...,0.0,0.0,0.0,0,0.0,0.0,0.075,0.072071,0,Normal review pattern
3,Acme Oyster House,3L4xptZk0kgN3W8JgnqQsg,O1U20igtZ9ROL9WxHq3eng,2006-11-22 01:17:44,i don't normally like raw oysters cause they t...,5.0,4.0,i do not normally like raw oysters because the...,390,383,...,0.0,0.25,0.0,0,0.5,0.0,0.123,0.210723,0,Normal review pattern
4,Acme Oyster House,DvvBJtYhNzDEpSHRdg-BUQ,LHTsHRVgnhkBwagj81kHkQ,2006-11-26 12:00:47,"dirty rice, po boys and oysters- the repeat bu...",5.0,4.0,dirty rice po boys and oysters the repeat but ...,220,214,...,0.0,0.25,0.0,0,0.5,0.0,0.123,0.210723,0,Normal review pattern


## Check

In [27]:
# 1Ô∏è‚É£ Cek baris yang ada missing value di kolom terkait
null_row = df[df[['cleaned_text', 'normalized_text', 'case_folded_text']].isna().any(axis=1)]
print("üîç Baris yang memiliki nilai NULL:")
null_row


üîç Baris yang memiliki nilai NULL:


Unnamed: 0,business_name,review_id,user_id,date,text,stars_review,stars_business_average,cleaned_text,original_length,cleaned_length,...,text_similarity_score,rating_deviation,user_rating_consistency,extreme_rater_score,rating_manipulation_score,consistency_score,fake_probability_raw,fake_probability,is_fake,fake_explanation
1964,Acme Oyster House,3Jji_9v3aoKe2Dbi44lWXg,eArpCCLM_Bx33KpevzNyZw,2014-04-30 03:33:43,„Ç´„Ç≠„Çí„Åü„Åè„Åï„ÇìÈ£ü„Åπ„Åæ„Åó„Åü„ÄÇÂÆâ„Åè„Å¶ÁæéÂë≥„Åó„ÅÑ„Åß„Åô„ÄÇ„Ç´„Ç≠„ÅØÁîü„Ç¨„Ç≠„Åß„ÄÅ‰∏ÄÂÄãÁ¥Ñ1„Éâ„É´„ÄÇË™øÁêÜ„Åó„Åü„Ç´„Ç≠„ÅØ„ÄÅ...,4.0,4.0,,141,0,...,0.0,0.0,0.0,0,0.0,0.0,0.162385,0.32449,0,Medium daily spike (z=1.25); High hourly spike...


In [28]:
# 2Ô∏è‚É£ Hapus baris tersebut
df = df.dropna(subset=['cleaned_text', 'normalized_text', 'case_folded_text']).reset_index(drop=True)
print(f"\n‚úÖ Baris NULL dihapus. Total data sekarang: {len(df)} baris.")


‚úÖ Baris NULL dihapus. Total data sekarang: 7672 baris.


In [29]:
output_path = "yelp_labeled_full.csv"

# Simpan ke CSV
df.to_csv(output_path, index=False, encoding='utf-8')
print(f"‚úÖ File berhasil disimpan ke: {output_path}")

‚úÖ File berhasil disimpan ke: yelp_labeled_full.csv


In [30]:
# Baca ulang
df_loaded = pd.read_csv("yelp_labeled_full.csv")

# Lihat info dasar
print("=== INFO DATASET ===")
print(df_loaded.info())


=== INFO DATASET ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7672 entries, 0 to 7671
Data columns (total 59 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   business_name              7672 non-null   object 
 1   review_id                  7672 non-null   object 
 2   user_id                    7672 non-null   object 
 3   date                       7672 non-null   object 
 4   text                       7672 non-null   object 
 5   stars_review               7672 non-null   float64
 6   stars_business_average     7672 non-null   float64
 7   cleaned_text               7672 non-null   object 
 8   original_length            7672 non-null   int64  
 9   cleaned_length             7672 non-null   int64  
 10  normalized_text            7672 non-null   object 
 11  case_folded_text           7672 non-null   object 
 12  tokens                     7672 non-null   object 
 13  tokens_ml                  

In [31]:
df_loaded.head()

Unnamed: 0,business_name,review_id,user_id,date,text,stars_review,stars_business_average,cleaned_text,original_length,cleaned_length,...,text_similarity_score,rating_deviation,user_rating_consistency,extreme_rater_score,rating_manipulation_score,consistency_score,fake_probability_raw,fake_probability,is_fake,fake_explanation
0,Acme Oyster House,DjEUxYAIbPmu9EnQMuso3A,Dd6ElS2Cng3Qag_h4IQC-Q,2006-09-18 22:01:13,"Their Fried Peace Maker Po-Boy was delicious, ...",3.0,4.0,Their Fried Peace Maker Po Boy was delicious E...,179,172,...,0.0,0.25,0.0,0,0.0,0.5,0.18,0.375372,0,Medium sentiment-rating mismatch
1,Acme Oyster House,vYwBMm1uK9VgA735nFMCLQ,yeGIAyHixJrIe-zmXiePWQ,2006-10-01 00:35:10,Full disclosure here. I designed the Acme Oyst...,4.0,4.0,Full disclosure here I designed the Acme Oyste...,931,910,...,0.0,0.0,0.0,0,0.0,0.0,0.075,0.072071,0,Normal review pattern
2,Acme Oyster House,0FgDHLa41Qy4U9kZi_qlYQ,LyXvIE71cMeiBaPZI_Yq2w,2006-10-05 18:25:34,Try to go when they aren't too crowded (this m...,4.0,4.0,Try to go when they are not too crowded this m...,538,519,...,0.0,0.0,0.0,0,0.0,0.0,0.075,0.072071,0,Normal review pattern
3,Acme Oyster House,3L4xptZk0kgN3W8JgnqQsg,O1U20igtZ9ROL9WxHq3eng,2006-11-22 01:17:44,i don't normally like raw oysters cause they t...,5.0,4.0,i do not normally like raw oysters because the...,390,383,...,0.0,0.25,0.0,0,0.5,0.0,0.123,0.210723,0,Normal review pattern
4,Acme Oyster House,DvvBJtYhNzDEpSHRdg-BUQ,LHTsHRVgnhkBwagj81kHkQ,2006-11-26 12:00:47,"dirty rice, po boys and oysters- the repeat bu...",5.0,4.0,dirty rice po boys and oysters the repeat but ...,220,214,...,0.0,0.25,0.0,0,0.5,0.0,0.123,0.210723,0,Normal review pattern


### Check review yang sama persis --> taunya dari pas preprocessing

In [34]:
df_loaded[df_loaded['text'].str.contains("I've never been more disrespected in my life", case=False, na=False)].T

Unnamed: 0,3846,3851,3937
business_name,Acme Oyster House,Acme Oyster House,Acme Oyster House
review_id,R0LuveEyadjqDBYEL4gqsw,NGff0grXrZMTBNqDkdFzJg,nY1p0rdzItxhzKJmyPknrQ
user_id,CaFbx2zBXiAUC-JmgsyDnQ,CaFbx2zBXiAUC-JmgsyDnQ,CaFbx2zBXiAUC-JmgsyDnQ
date,2016-10-23 21:01:00,2016-10-26 02:07:52,2016-12-04 01:17:31
text,"I've never been more disrespected in my life, ...","I've never been more disrespected in my life, ...","I've never been more disrespected in my life, ..."
stars_review,1.0,1.0,1.0
stars_business_average,4.0,4.0,4.0
cleaned_text,I have never been more disrespected in my life...,I have never been more disrespected in my life...,I have never been more disrespected in my life...
original_length,609,609,609
cleaned_length,590,590,590
