In [None]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import numpy as np
import matplotlib.pyplot as plt
import folium
import seaborn as sns
from folium.plugins import HeatMap
from tqdm import tqdm


Note: this notebook must be run on Kaggle via https://www.kaggle.com/code/stegosaurus3000/ml-sentiment (the input paths below are Kaggle dataset paths).

# Sentiment Analysis

In [None]:
# Listing-level table from the cleaned-for-ML dataset.
listings = pd.read_csv(
    filepath_or_buffer="/kaggle/input/cleaned-df-gesa-csv/airbnb_cleaned_for_ML.csv"
)

In [None]:
# Raw guest reviews from the Florence Airbnb dataset.
reviews = pd.read_csv(
    filepath_or_buffer='/kaggle/input/florence-airbnb-data/reviews.csv'
)

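The model can only handle 512 tokens at a time. The intended approach is to split long reviews into smaller parts, run sentiment analysis on each part using a pretrained multilingual transformer model, and average the results to get one sentiment score per review. Note that the implementation below simplifies this: it truncates each long review instead of splitting it, and assigns a single predicted class (1–5) per review.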

In [None]:
# Keep only the listing key and the review text; rows missing either
# value are dropped.
reviews_filtered = reviews.loc[:, ['listing_id', 'comments']].dropna(axis=0, how='any')

In [None]:
import re

def clean_text(text):
    """Normalize a single review before it is scored.

    Steps, in order:
      1. lowercase the text;
      2. replace HTML line-break tags (``<br>``, ``<br/>``, ``<br />``)
         with a space;
      3. delete every character that is not a word character, whitespace,
         or one of ``, . ! ?`` (this removes emojis and other symbols);
      4. collapse runs of whitespace to one space and strip both ends.

    Args:
        text: The raw review. Anything that is not a ``str`` (e.g. ``None``
            or a float NaN) is treated as an empty review.

    Returns:
        The cleaned string; ``""`` for non-string input.
    """
    # A non-string has no text to clean; return "" instead of failing on .lower().
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()
    # Line-break tags must become a space *before* the character filter below:
    # that filter would only delete '<', '/' and '>', leaving a literal "br"
    # glued between the neighbouring words.
    text = re.sub(r'<br\s*/?>', ' ', text)
    # Remove emojis and special characters (except basic punctuation)
    text = re.sub(r'[^\w\s,.!?]', '', text)
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Store the normalized text next to the raw review.
reviews_filtered['clean_comments'] = reviews_filtered['comments'].map(clean_text)

# Side-by-side view of the raw and cleaned text.
reviews_filtered.loc[:, ['comments', 'clean_comments']]


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gc

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using: {device}")

# The tokenizer and the classifier are both loaded from the same checkpoint name.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
model = model.to(device)
# Switch to evaluation mode: the model is only used for prediction here.
model.eval()

def _prepare_batch(batch_texts, max_chars):
    """Replace non-string entries with "" and cap each text at ``max_chars`` characters."""
    prepared = []
    for text in batch_texts:
        if not isinstance(text, str):
            prepared.append("")
        elif max_chars is None:
            prepared.append(text)
        else:
            prepared.append(text[:max_chars])
    return prepared


def get_sentiment_scores_fast(texts, batch_size=256, chunk_size=50000,
                              max_chars=512, max_length=512):
    """
    Score a collection of review texts with the sentiment classifier.

    The texts are processed in chunks of ``chunk_size`` and, inside each
    chunk, in batches of ``batch_size``. For every text the highest-scoring
    class is taken and shifted by one, so each score is a whole number
    returned as a float (1.0, 2.0, ...), not a continuous value.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        texts: Iterable of review texts. Entries that are not ``str``
            (e.g. ``None``) are scored as the empty string.
        batch_size: Number of texts per forward pass. Must be positive.
        chunk_size: Number of texts per chunk; progress is printed and memory
            is released after each chunk. Must be positive.
        max_chars: Character cap applied to each text before tokenization.
            NOTE(review): the default of 512 *characters* is likely much
            tighter than ``max_length`` tokens; pass ``None`` to skip it and
            let the tokenizer do the only truncation (slower, more memory).
        max_length: Token limit passed to the tokenizer (with truncation).

    Returns:
        A list of Python floats, one per input text, in input order.

    Raises:
        ValueError: If ``batch_size`` or ``chunk_size`` is not positive, or
            ``max_chars`` is negative.
    """
    # A negative step would make the range() loops silently yield nothing.
    if batch_size <= 0 or chunk_size <= 0:
        raise ValueError("batch_size and chunk_size must be positive integers")
    if max_chars is not None and max_chars < 0:
        raise ValueError("max_chars must be None or a non-negative integer")

    # Materialize generators and other iterables that lack len() or slicing.
    if not isinstance(texts, list):
        texts = list(texts)

    total_chunks = (len(texts) - 1) // chunk_size + 1
    use_cuda = torch.cuda.is_available()

    print(f"Processing {len(texts):,} reviews in chunks of {chunk_size:,}")

    all_scores = []

    # Process in chunks to avoid memory issues
    for chunk_start in range(0, len(texts), chunk_size):
        chunk_texts = texts[chunk_start:chunk_start + chunk_size]

        print(f"Processing chunk {chunk_start//chunk_size + 1}/{total_chunks}")

        # Process chunk in batches
        for i in tqdm(range(0, len(chunk_texts), batch_size), desc="Batches"):
            clean_batch = _prepare_batch(chunk_texts[i:i + batch_size], max_chars)

            # Tokenize; padding=True pads to the longest text in this batch
            inputs = tokenizer(
                clean_batch,
                padding=True,
                truncation=True,
                max_length=max_length,
                return_tensors="pt"
            ).to(device)

            # Predict
            with torch.no_grad():
                outputs = model(**inputs)

            # argmax over the raw logits picks the same class as argmax over
            # their softmax, so the softmax is skipped.
            predicted_classes = torch.argmax(outputs.logits, dim=-1)

            # Convert class index to score (index 0 -> 1.0); assumes the
            # classes are ordered from lowest to highest rating.
            all_scores.extend((predicted_classes + 1).float().cpu().tolist())

            # Clean GPU memory every 20 batches (i advances by batch_size)
            if use_cuda and i % (batch_size * 20) == 0:
                torch.cuda.empty_cache()

        # Clean memory after each chunk
        if use_cuda:
            torch.cuda.empty_cache()
        gc.collect()

        print(f"Chunk completed. Total processed: {len(all_scores):,}")

    return all_scores

# Score every cleaned review and attach the result to reviews_filtered.
print("Starting sentiment analysis on reviews_filtered...")

# The scorer receives the cleaned comments as a plain list, 50K per chunk.
sentiment_scores = get_sentiment_scores_fast(
    reviews_filtered['clean_comments'].tolist(), batch_size=256, chunk_size=50000
)

# One score per row, in the same order the comments were passed in.
reviews_filtered['sentiment_score'] = sentiment_scores

# Short summary of the scores that were just added.
print(
    "Sentiment scores added as float values (1.0 to 5.0)",
    "Score distribution:",
    f"Mean: {np.mean(sentiment_scores):.2f}",
    f"Min: {min(sentiment_scores):.1f}, Max: {max(sentiment_scores):.1f}",
    sep="\n",
)

# First few rows as a spot check.
print("\nSample results:")
print(reviews_filtered.loc[:, ['clean_comments', 'sentiment_score']].head())

In [None]:
# Write the scored reviews to the working directory, without the index column.
reviews_filtered.to_csv(
    path_or_buf='/kaggle/working/scored_reviews.csv',
    index=False,
)