# Notebook 1: Feature Engineering

This notebook loads the raw review data, performs comprehensive feature
engineering, and saves the resulting DataFrame to an intermediate file
for the next stage of the pipeline.

## 1. SETUP AND INSTALLATION

In [8]:
print("Starting setup and installation for Notebook 1...")

# Install all required packages
!pip install -q pandas scikit-learn nltk transformers torch tqdm gdown sentencepiece

# Import libraries
import json
import pandas as pd
import numpy as np
import os
import re
import pickle
from pathlib import Path
import warnings
from datasets import Dataset

# NLTK downloads
import nltk
nltk.download("stopwords", quiet=True)
nltk.download("wordnet", quiet=True)
nltk.download("punkt", quiet=True)
nltk.download('punkt_tab', quiet=True)

# Scikit-learn imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Transformers and Torch imports
import torch
from transformers import pipeline
from tqdm.auto import tqdm

# Other utilities
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

# Suppress warnings
warnings.filterwarnings('ignore')

print("\nSetup complete!")
print("-" * 80)

Starting setup and installation for Notebook 1...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).

Setup complete!
--------------------------------------------------------------------------------


## 2. CONFIGURATION

In [9]:
"""## 2. CONFIGURATION"""

print("Loading configuration...")

# --- Locations -------------------------------------------------------------
# Google Drive file ID of the raw review dump and the name it is stored under.
GOOGLE_DRIVE_FILE_ID = "1sDxExNX1y0RocYXrAE5kLjJc5cMWk0pT"
INPUT_FILENAME = "review-Wyoming_10.json"
# Directory on the mounted Drive that holds both the raw download and the output.
SAVE_DIR = "/content/drive/MyDrive/Tiktok_Hackaton/preprocessed_data"
OUTPUT_FILE = os.path.join(SAVE_DIR, "engineered_features.pkl")

# Create the save directory (and any missing parents) if it is not there yet.
os.makedirs(SAVE_DIR, exist_ok=True)

# --- Data processing -------------------------------------------------------
SAMPLE_SIZE = 50000  # Number of reviews to sample. Set to -1 to use all data.
CHUNK_SIZE = 32      # Batch size for sentiment analysis

# --- Feature engineering ---------------------------------------------------
NUM_TOPICS_LDA = 5   # number of LDA topics
NUM_KEYWORDS = 5     # keywords kept per review
SENTIMENT_MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

print("Configuration loaded.")
print("-" * 80)

Loading configuration...
Configuration loaded.
--------------------------------------------------------------------------------


## 3. DATA COLLECTION AND INITIAL PREPROCESSING

In [10]:
"""##3. DATA COLLECTION AND INITIAL PREPROCESSING"""

print("Loading and preprocessing initial data...")

def load_json_data(file_path):
    """Load a JSON Lines file (one JSON object per line) into a DataFrame.

    Blank lines are skipped silently. Lines that fail to parse are reported
    with their line number and skipped, so a few corrupt records do not
    abort the whole load.

    Args:
        file_path: Path of the file to read (opened as UTF-8 text).

    Returns:
        A DataFrame with one row per successfully parsed line. An empty
        DataFrame is returned when the file is missing or cannot be read.
    """
    data = []
    try:
        with open(file_path, "r", encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                if not line.strip():
                    # A blank line (e.g. a trailing newline) is not a record,
                    # so it must not be reported as a decode error.
                    continue
                try:
                    data.append(json.loads(line))
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON from line {line_num}: {e}")
    except FileNotFoundError:
        print(f"Error: The file at {file_path} was not found.")
        return pd.DataFrame()
    except Exception as e:
        # Any other read problem (permissions, encoding, ...) is reported and
        # turned into an empty result that the caller can check.
        print(f"Error reading file: {e}")
        return pd.DataFrame()

    df = pd.DataFrame(data)
    print(f"Loaded {len(df)} records from {file_path}")
    return df

# Download the data file to the correct directory
input_file_path = os.path.join(SAVE_DIR, INPUT_FILENAME)
print(f"Downloading {INPUT_FILENAME} to {input_file_path}...")
!gdown --id {GOOGLE_DRIVE_FILE_ID} -O {input_file_path} --quiet
print("Download complete.")

# Load the data
reviews_df = load_json_data(input_file_path)

if reviews_df.empty:
    raise ValueError("No data loaded. Please check the file and try again.")

print(f"Initial dataset shape: {reviews_df.shape}")

# Basic data validation
required_columns = ['text', 'rating']
missing_columns = [col for col in required_columns if col not in reviews_df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")

# 1. Handle missing values
initial_count = len(reviews_df)
reviews_df.dropna(subset=['text', 'rating'], inplace=True)
dropped_count = initial_count - len(reviews_df)
if dropped_count > 0:
    print(f"Dropped {dropped_count} rows with missing text or rating")

# 2. Convert 'time' to datetime
if 'time' in reviews_df.columns:
    reviews_df['time'] = pd.to_datetime(reviews_df['time'], unit='ms', errors='coerce')
else:
    reviews_df['time'] = pd.Timestamp.now()  # Default timestamp if missing

# 3. Basic text cleaning
reviews_df['cleaned_text'] = (reviews_df['text']
                              .astype(str)
                              .str.lower()
                              .str.strip())

print("Initial data loading and preprocessing complete.")
print("-" * 80)

Loading and preprocessing initial data...
Downloading review-Wyoming_10.json to /content/drive/MyDrive/Tiktok_Hackaton/preprocessed_data/review-Wyoming_10.json...
Download complete.
Loaded 324725 records from /content/drive/MyDrive/Tiktok_Hackaton/preprocessed_data/review-Wyoming_10.json
Initial dataset shape: (324725, 8)
Dropped 146557 rows with missing text or rating
Initial data loading and preprocessing complete.
--------------------------------------------------------------------------------


## 4. FEATURE ENGINEERING

In [11]:
"""##4. FEATURE ENGINEERING"""

print("Starting feature engineering...")

# Down-sample (with a fixed seed) only when a positive SAMPLE_SIZE is smaller
# than the data set; otherwise every review is kept.
if 0 < SAMPLE_SIZE < len(reviews_df):
    reviews_df = (
        reviews_df
        .sample(n=SAMPLE_SIZE, random_state=42)
        .reset_index(drop=True)
    )
    print(f"Using sample of {len(reviews_df)} reviews")

# Helper function to get the best available device
def get_device():
    """Return the preferred torch device and print which one was chosen.

    CUDA is preferred, then Apple MPS, with the CPU as the fallback.
    """
    if torch.cuda.is_available():
        print(f"Using CUDA GPU: {torch.cuda.get_device_name()}")
        return torch.device('cuda')

    if torch.backends.mps.is_available():
        print("Using Apple Silicon GPU (MPS)")
        return torch.device('mps')

    print("Using CPU")
    return torch.device('cpu')

# Resolve the compute device once; the sentiment-analysis step reads
# `device.type` to decide where the model runs.
device = get_device()
print(f"Using device: {device}")

# Banner printed before the numbered feature-engineering steps.
print("Feature engineering pipeline starting...")
print("-" * 50)

Starting feature engineering...
Using sample of 50000 reviews
Using CUDA GPU: Tesla T4
Using device: cuda
Feature engineering pipeline starting...
--------------------------------------------------


###  [1/9] Sentiment Analysis

In [None]:
"""### [1/9] Sentiment Analysis"""

print("[1/9] Performing Sentiment Analysis...")

try:
    # Text-classification pipeline; inputs longer than 512 tokens are truncated.
    sentiment_pipeline = pipeline(
        "sentiment-analysis",
        model=SENTIMENT_MODEL_NAME,
        device=0 if device.type == 'cuda' else -1,  # GPU index 0 on CUDA, otherwise CPU
        padding=True,
        truncation=True,
        max_length=512
    )

    # Pass the whole list so the pipeline does the batching itself.
    texts_to_analyze = reviews_df['cleaned_text'].tolist()
    batch_results = sentiment_pipeline(texts_to_analyze, batch_size=CHUNK_SIZE)

    # Process results
    sent_labels = [res['label'] for res in batch_results]
    sent_scores = [res['score'] for res in batch_results]

    # Signed polarity per label. Keys are upper-case, so labels are upper-cased
    # before the lookup: checkpoints differ in label casing ('LABEL_2',
    # 'positive', 'Positive', ...), and an exact-case lookup would silently map
    # every prediction to 0.
    score_map = {'LABEL_0': -1, 'LABEL_1': 0, 'LABEL_2': 1, 'NEGATIVE': -1, 'NEUTRAL': 0, 'POSITIVE': 1}
    normalized_labels = [str(label).upper() for label in sent_labels]

    # Make unmapped labels visible instead of quietly scoring them as neutral.
    unknown_labels = sorted(set(normalized_labels) - set(score_map))
    if unknown_labels:
        print(f"Warning: unmapped sentiment labels treated as neutral: {unknown_labels}")

    # sentiment_score = polarity (-1 / 0 / +1) * model confidence
    reviews_df['sentiment_score'] = [
        score_map.get(label, 0) * score
        for label, score in zip(normalized_labels, sent_scores)
    ]

    print("Sentiment analysis complete.")

except Exception as e:
    # Best effort: keep the notebook running with a neutral score for every row.
    print(f"Error in sentiment analysis: {e}")
    reviews_df['sentiment_score'] = 0.0

[1/9] Performing Sentiment Analysis...


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cuda:0


### [2/9] Time-based features

In [None]:
"""### [2/9] Time-based features"""

print("[2/9] Extracting Time-based Features...")

# Fall back to the current timestamp when there is no usable 'time' value at all.
time_is_unusable = 'time' not in reviews_df.columns or reviews_df['time'].isna().all()
if time_is_unusable:
    reviews_df['time'] = pd.Timestamp.now()
    print("Warning: Using default timestamp for missing time data")

# Calendar components of the review timestamp.
review_times = reviews_df["time"].dt
reviews_df["review_day_of_week"] = review_times.dayofweek
reviews_df["review_hour_of_day"] = review_times.hour

print("Time-based features complete.")

### [3/9] Topic Modeling (LDA)

In [14]:
"""### [3/9] Topic Modeling (LDA)"""

print("[3/9] Performing Topic Modeling...")

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def preprocess_text_for_topic_modeling(text):
    """Tokenize `text` and return its lemmatized content words joined by spaces.

    Only alphabetic tokens longer than two characters that are not English
    stop words are kept. Any failure yields an empty string.
    """
    try:
        kept_lemmas = []
        for token in word_tokenize(str(text)):
            lowered = token.lower()
            if token.isalpha() and lowered not in stop_words and len(token) > 2:
                kept_lemmas.append(lemmatizer.lemmatize(lowered))
        return " ".join(kept_lemmas)
    except Exception:
        return ""

reviews_df["processed_text_for_topic"] = reviews_df["cleaned_text"].map(
    preprocess_text_for_topic_modeling
)

# The model is fitted only on reviews that still have text after preprocessing.
non_empty_mask = reviews_df["processed_text_for_topic"].str.len().gt(0)
processed_texts = reviews_df.loc[non_empty_mask, "processed_text_for_topic"]

# Need more documents than topics, and more than 10 overall.
if len(processed_texts) <= max(NUM_TOPICS_LDA, 10):
    reviews_df["dominant_topic"] = 0
    print("Insufficient data for topic modeling, using default topic.")
else:
    try:
        # NOTE(review): LDA is fitted on TF-IDF weights here; scikit-learn's own
        # LDA example uses raw term counts - confirm this choice is intentional.
        vectorizer = TfidfVectorizer(
            max_features=1000,
            stop_words="english",
            min_df=2,
            max_df=0.95,
        )
        dtm = vectorizer.fit_transform(processed_texts)

        # max_iter is kept low to shorten the run time.
        lda = LatentDirichletAllocation(
            n_components=NUM_TOPICS_LDA,
            max_iter=10,
            random_state=42,
        )
        lda.fit(dtm)

        # Score every review (including those with empty processed text) and
        # keep the index of its highest-weighted topic.
        all_dtm = vectorizer.transform(reviews_df["processed_text_for_topic"])
        topic_weights = lda.transform(all_dtm)
        reviews_df["dominant_topic"] = topic_weights.argmax(axis=1)

        print(f"Topic modeling complete with {NUM_TOPICS_LDA} topics.")

    except Exception as e:
        print(f"Error in topic modeling: {e}")
        reviews_df["dominant_topic"] = 0

[3/9] Performing Topic Modeling...
Topic modeling complete with 5 topics.


### [4/9] Keyword Extraction

In [15]:
"""### [4/9] Keyword Extraction"""

print("[4/9] Performing Keyword Extraction...")


def _placeholder_keywords(n_rows, n=NUM_KEYWORDS):
    """Return `n_rows` independent lists of `n` 'no_keywords' placeholders.

    Each row gets its own list object, so editing one row's keywords later
    cannot change the others.
    """
    return [['no_keywords'] * n for _ in range(n_rows)]


def get_top_n_keywords(row, feature_names, n=NUM_KEYWORDS):
    """Return the `n` highest-weighted terms of one dense TF-IDF row.

    Only terms with a positive weight count as keywords. When the row has
    fewer than `n` of them the result is padded with 'no_keywords', so the
    returned list always has length `n`.

    Args:
        row: 1-D array of TF-IDF weights for one document.
        feature_names: Vocabulary terms aligned with the positions of `row`.
        n: Number of keywords to return.
    """
    if np.sum(row) == 0:
        return ['no_keywords'] * n
    top_n_indices = row.argsort()[-n:][::-1]
    # Zero-weight positions are vocabulary terms that do not occur in this
    # document; they must not be reported as its keywords.
    keywords = [feature_names[i] for i in top_n_indices if row[i] > 0]
    return keywords + ['no_keywords'] * (n - len(keywords))


if len(reviews_df) > NUM_KEYWORDS:
    try:
        vectorizer_keywords = TfidfVectorizer(
            stop_words="english",
            max_features=1000,
            min_df=2
        )

        texts_for_keywords = reviews_df["processed_text_for_topic"].fillna("")
        non_empty_texts = texts_for_keywords[texts_for_keywords.str.len() > 0]

        if len(non_empty_texts) > 0:
            # Fit the vocabulary on non-empty documents only.
            tfidf_matrix = vectorizer_keywords.fit_transform(non_empty_texts)
            feature_names = vectorizer_keywords.get_feature_names_out()

            # Transform every row (empty ones become all-zero rows) so the
            # result lines up with reviews_df row by row.
            keywords_sparse = vectorizer_keywords.transform(texts_for_keywords)
            reviews_df["top_keywords"] = [
                get_top_n_keywords(row.toarray().flatten(), feature_names)
                for row in keywords_sparse
            ]

            print("Keyword extraction complete.")
        else:
            reviews_df["top_keywords"] = _placeholder_keywords(len(reviews_df))
            print("No valid text for keyword extraction.")

    except Exception as e:
        print(f"Error in keyword extraction: {e}")
        reviews_df["top_keywords"] = _placeholder_keywords(len(reviews_df))
else:
    reviews_df["top_keywords"] = _placeholder_keywords(len(reviews_df))
    print("Insufficient data for keyword extraction.")

[4/9] Performing Keyword Extraction...
Keyword extraction complete.


### [5/9] Policy Insight Features

In [16]:
"""### [5/9] Policy Insight Features"""

print("[5/9] Adding Policy Insight Features...")

# Ensure text column is string type
reviews_df["cleaned_text"] = reviews_df["cleaned_text"].fillna("").astype(str)


def _combine_patterns(patterns):
    """Join alternative regexes into a single alternation.

    Counting one combined pattern counts each stretch of text once, whereas
    summing per-pattern counts counts it once for every pattern it matches.
    """
    return "|".join(f"(?:{pattern})" for pattern in patterns)


# URL detection. The first alternative consumes the whole scheme/www-prefixed
# token, so "https://www.example.com" is one URL rather than three matches.
URL_PATTERNS = [
    r"(?:https?://|www\.)\S*",     # Standard URLs
    r"\b\w+\.\w{2,4}(?:/\S*)?\b"   # Domain-like patterns
]
reviews_df["num_urls"] = reviews_df["cleaned_text"].str.count(
    _combine_patterns(URL_PATTERNS), flags=re.I
)

# Email detection ("|" inside a character class is a literal pipe, so the
# top-level-domain class is spelled [A-Za-z])
EMAIL_PAT = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
reviews_df["num_emails"] = reviews_df["cleaned_text"].str.count(EMAIL_PAT, flags=re.I)

# Phone number detection. The parenthesised form has no leading \b: "(" is
# not a word character, so \b in front of it would require a letter or digit
# immediately before the parenthesis.
PHONE_PATTERNS = [
    r"\(\d{3}\)\s?\d{3}[-.]?\d{4}\b",   # (123) 456-7890
    r"\b\d{3}[-.]?\d{3}[-.]?\d{4}\b",   # US format
    r"\b\d{10,15}\b"                    # General long numbers
]
reviews_df["num_phone_numbers"] = reviews_df["cleaned_text"].str.count(
    _combine_patterns(PHONE_PATTERNS)
)

# Social media patterns. A mention must not directly follow a word character,
# which keeps the "@domain" part of an email address from counting as one.
MENTION_PAT = r"(?<!\w)@\w+"
reviews_df["num_mentions"] = reviews_df["cleaned_text"].str.count(MENTION_PAT)

HASHTAG_PAT = r"#\w+"
reviews_df["num_hashtags"] = reviews_df["cleaned_text"].str.count(HASHTAG_PAT)

# Irrelevant content indicators (whole-word matches)
irrelevant_words = [
    "iphone", "crypto", "bitcoin", "forex", "makeup", "subscribe",
    "channel", "follow", "like", "comment", "share", "promo", "discount"
]
IRRELEVANT_PAT = r"\b(?:" + "|".join(map(re.escape, irrelevant_words)) + r")\b"
reviews_df["num_irrelevant_words"] = reviews_df["cleaned_text"].str.count(IRRELEVANT_PAT, flags=re.I)

print("Policy insight features complete.")

[5/9] Adding Policy Insight Features...
Policy insight features complete.


### [6/9] Stylistic Features

In [17]:
"""### [6/9] Stylistic Features"""

print("[6/9] Extracting Stylistic Features...")

texts = reviews_df["cleaned_text"].fillna("")
# `cleaned_text` was lower-cased during initial preprocessing, so anything
# that measures capitalization has to read the raw review text instead.
raw_texts = reviews_df["text"].fillna("").astype(str)

# Punctuation and style markers
reviews_df["num_exclamations"] = texts.str.count("!")
reviews_df["num_questions"] = texts.str.count(r"\?")
reviews_df["num_ellipsis"] = texts.str.count(r"\.{3,}")

# Total punctuation count over the ASCII punctuation characters. The
# backslash is written as "\\" so the literal holds no invalid escape.
all_punctuation = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
reviews_df["num_total_punctuation"] = texts.apply(
    lambda s: sum(1 for char in s if char in all_punctuation)
)

# Text length and word statistics
reviews_df["review_length_words"] = texts.str.split().apply(len)
reviews_df["review_length_chars"] = texts.str.len()

# Capitalization patterns (measured on the raw, case-preserving text)
reviews_df["all_caps_word_count"] = raw_texts.str.findall(r"\b[A-Z]{2,}\b").apply(len)
# Words containing a character repeated three or more times in a row ("sooo")
reviews_df["elongated_word_count"] = texts.str.count(r"\b\w*(\w)\1{2,}\w*\b")
# Ratios for normalization
def safe_ratio(numerator, denominator, default=0.0):
    """Divide `numerator` by `denominator`, guarding against bad denominators.

    Returns `default` unless `denominator` is greater than zero. A positive
    denominator below 1 is raised to 1 before dividing.
    """
    if not denominator > 0:
        return default
    divisor = max(1, denominator)
    return numerator / divisor

# `texts` holds the lower-cased cleaned text, where no character is upper
# case, so the share of capital letters is measured on the raw review text.
raw_texts = reviews_df["text"].fillna("").astype(str)
reviews_df["caps_ratio"] = raw_texts.apply(
    lambda t: safe_ratio(sum(ch.isupper() for ch in t), sum(ch.isalpha() for ch in t))
)

# Distinct whitespace-separated tokens / all tokens
reviews_df["unique_word_ratio"] = texts.apply(
    lambda t: safe_ratio(len(set(t.split())), len(t.split()))
)

# Digits / all characters
reviews_df["digit_ratio"] = texts.apply(
    lambda t: safe_ratio(sum(ch.isdigit() for ch in t), len(t))
)

# Characters that are neither alphanumeric nor whitespace / all characters
reviews_df["punctuation_ratio"] = texts.apply(
    lambda t: safe_ratio(
        sum(1 for ch in t if not ch.isalnum() and not ch.isspace()),
        len(t)
    )
)

print("Stylistic features complete.")

[6/9] Extracting Stylistic Features...
Stylistic features complete.


### [7/9] Emotional & Expressive Marker Extractor

In [18]:
"""### [7/9] Emotional & Expressive Marker Extractor"""

print("[7/9] Extracting Emotional Markers...")

texts = reviews_df["cleaned_text"].fillna("")

# Sentiment word lists (expanded). Terms are matched as whole words.
POS_TERMS = [
    "love", "amazing", "awesome", "fantastic", "great", "good", "wonderful",
    "excellent", "happy", "enjoy", "beautiful", "nice", "friendly", "delicious",
    "tasty", "perfect", "outstanding", "brilliant", "superb", "marvelous"
]

# "disappoint" on its own only matches the bare verb because of the word
# boundaries, so its common inflected forms are listed explicitly.
NEG_TERMS = [
    "bad", "worse", "worst", "awful", "terrible", "horrible", "disgusting",
    "hate", "scam", "rude", "dirty", "cold", "bland", "overpriced", "poor",
    "slow", "angry", "sad", "disappoint", "disappointed", "disappointing",
    "disappointment", "nightmare", "disaster", "pathetic"
]

# Count sentiment words
POS_RE = r"\b(?:" + "|".join(map(re.escape, POS_TERMS)) + r")\b"
NEG_RE = r"\b(?:" + "|".join(map(re.escape, NEG_TERMS)) + r")\b"

reviews_df["num_sentiment_words_pos"] = texts.str.count(POS_RE, flags=re.IGNORECASE)
reviews_df["num_sentiment_words_neg"] = texts.str.count(NEG_RE, flags=re.IGNORECASE)

# Emoji detection (simplified pattern). Code point ranges:
#   U+1F600-U+1F64F  emoticons
#   U+1F680-U+1F6FF  transport and map symbols
#   U+2600-U+27BF    miscellaneous symbols and dingbats
#   U+1F300-U+1F5FF  miscellaneous symbols and pictographs (e.g. thumbs up)
#   U+1F900-U+1F9FF  supplemental symbols and pictographs
EMOJI_PATTERN = (
    "["
    "\U0001F600-\U0001F64F"
    "\U0001F680-\U0001F6FF"
    "\u2600-\u27BF"
    "\U0001F300-\U0001F5FF"
    "\U0001F900-\U0001F9FF"
    "]"
)
reviews_df["num_emojis"] = texts.str.count(EMOJI_PATTERN)

# Derived features: positive minus negative lexicon hits
reviews_df["sentiment_polarity_lex"] = (
    reviews_df["num_sentiment_words_pos"] - reviews_df["num_sentiment_words_neg"]
)

# Per 100 words normalization (a zero word count is replaced by 1 to avoid
# dividing by zero)
word_counts = texts.str.split().apply(len).replace(0, 1)
reviews_df["pos_words_per_100w"] = reviews_df["num_sentiment_words_pos"] * 100 / word_counts
reviews_df["neg_words_per_100w"] = reviews_df["num_sentiment_words_neg"] * 100 / word_counts
reviews_df["emojis_per_100w"] = reviews_df["num_emojis"] * 100 / word_counts

print("Emotional markers complete.")

[7/9] Extracting Emotional Markers...
Emotional markers complete.


### [8/9] User Information Extractor

In [None]:
"""### [8/9] User Information Extractor"""

print("[8/9] Adding User-level Features...")

# Ensure user_id exists and is properly formatted
if 'user_id' not in reviews_df.columns:
    reviews_df['user_id'] = 'anonymous_' + reviews_df.index.astype(str)
    print("Warning: No user_id found, created anonymous IDs")

reviews_df["user_id"] = reviews_df["user_id"].fillna("unknown").astype(str)

# Basic user statistics: number of reviews per user
reviews_df["user_review_count"] = reviews_df.groupby("user_id")["user_id"].transform("size")

# Same day reviews (requires valid time column)
if reviews_df['time'].notna().any():
    reviews_df["num_same_day_reviews"] = reviews_df.groupby(
        ["user_id", reviews_df["time"].dt.floor("D")]
    )["user_id"].transform("size")
else:
    reviews_df["num_same_day_reviews"] = 1

# Rating statistics: mean rating given by the user
if 'rating' in reviews_df.columns:
    reviews_df["avg_review_rating"] = reviews_df.groupby("user_id")["rating"].transform("mean")
else:
    reviews_df["avg_review_rating"] = 5.0

# Burst analysis: the largest number of reviews a user posted within any
# 7-day rolling window of calendar days.
try:
    if reviews_df['time'].notna().sum() > 0:
        per_user_burst = (reviews_df.assign(day=reviews_df["time"].dt.floor("D"))
                         .groupby("user_id")
                         # resample(...).size() counts rows per day without
                         # selecting a column, so it does not depend on the
                         # grouping column being passed into the group frame.
                         .apply(lambda g: (g.set_index("day")
                                          .resample("D")
                                          .size()
                                          .rolling(7, min_periods=1)
                                          .sum()
                                          .max()))
                         .rename("user_max_7d_burst")
                         .reset_index())
        # Drop a column left by a previous run of this cell first; otherwise
        # the merge would produce "_x"/"_y" duplicates and the lookup below
        # would fail and fall back to 1 for every user.
        reviews_df = (reviews_df
                      .drop(columns=["user_max_7d_burst"], errors="ignore")
                      .merge(per_user_burst, on="user_id", how="left"))
        reviews_df["user_max_7d_burst"] = reviews_df["user_max_7d_burst"].fillna(1)
    else:
        reviews_df["user_max_7d_burst"] = 1
except Exception as e:
    print(f"Error in burst analysis: {e}")
    reviews_df["user_max_7d_burst"] = 1

# Time span analysis: days between a user's first and last review (inclusive)
if reviews_df['time'].notna().sum() > 0:
    reviews_df["user_review_span_days"] = reviews_df.groupby("user_id")["time"].transform(
        lambda x: (x.max() - x.min()).days + 1 if x.notna().any() else 1
    )
else:
    reviews_df["user_review_span_days"] = 1

# Derived ratios: share of a user's reviews that fall in their busiest week
reviews_df["user_burst_ratio"] = (
    reviews_df["user_max_7d_burst"].astype(float) /
    reviews_df["user_review_count"].replace(0, 1)
)

# Place diversity (requires gmap_id): number of distinct places per user
if 'gmap_id' in reviews_df.columns:
    reviews_df["user_place_diversity"] = reviews_df.groupby("user_id")["gmap_id"].transform("nunique")
else:
    reviews_df["user_place_diversity"] = 1

# Text statistics: mean review length in characters per user
reviews_df["user_avg_length"] = reviews_df.groupby("user_id")["text"].transform(
    lambda s: s.astype(str).str.len().mean()
)

# Share of a user's reviews that have non-blank text
reviews_df["user_text_share"] = (
    (reviews_df["text"].astype(str).str.strip().ne("") & reviews_df["text"].notna())
    .groupby(reviews_df["user_id"])
    .transform("mean")
    .fillna(0.0)
)

# Placeholder for response rate (would need actual response data)
reviews_df["user_response_rate"] = 0.0

print("User-level features complete.")

### [9/9] Photo Extractor

In [20]:
"""### [9/9] Photo Extractor"""

print("[9/9] Adding Photo-based Features...")

def extract_pic_urls(pics):
    """Flatten a 'pics' value into a list of photo URL strings.

    Accepted shapes:
      * None / NaN                 -> []
      * str holding JSON           -> parsed, then handled recursively
      * any other str              -> [pics] if it contains "http", else []
      * list                       -> concatenation of its items' URLs
      * dict                       -> URLs found under its "url" key
      * anything else              -> []
    """
    if pics is None or (isinstance(pics, float) and pd.isna(pics)):
        return []

    if isinstance(pics, str):
        try:
            parsed = json.loads(pics)
        except (ValueError, RecursionError):
            # Not JSON (JSONDecodeError is a ValueError): treat the string
            # itself as a URL if it looks like one. Only parse failures are
            # caught, so KeyboardInterrupt / SystemExit still propagate.
            return [pics] if "http" in pics else []
        return extract_pic_urls(parsed)

    if isinstance(pics, list):
        urls = []
        for item in pics:
            urls.extend(extract_pic_urls(item))
        return urls

    if isinstance(pics, dict):
        return extract_pic_urls(pics.get("url", []))

    return []

# Handle photos column
if 'pics' in reviews_df.columns:
    urls_series = reviews_df["pics"].apply(extract_pic_urls)
else:
    print("Warning: No 'pics' column found, using empty photo data")
    # One independent empty list per row, aligned to reviews_df's index so the
    # column assignments below line up whatever the index looks like.
    urls_series = pd.Series(
        [[] for _ in range(len(reviews_df))],
        index=reviews_df.index,
        dtype=object,
    )

# Basic photo features
reviews_df["n_photos"] = urls_series.apply(len).astype(int)

# URL length statistics (character length of each URL, computed once per row)
url_lengths = urls_series.apply(lambda urls: [len(str(u)) for u in urls])

reviews_df["photo_url_length_mean"] = url_lengths.apply(
    lambda lengths: float(np.mean(lengths)) if lengths else 0.0
)

reviews_df["photo_url_length_max"] = url_lengths.apply(
    lambda lengths: float(max(lengths)) if lengths else 0.0
)

# The spread is only meaningful with at least two URLs
reviews_df["photo_url_length_std"] = url_lengths.apply(
    lambda lengths: float(np.std(lengths)) if len(lengths) > 1 else 0.0
)

# CDN detection: 1 if any URL contains "googleusercontent"
reviews_df["photo_google_cdn_flag"] = urls_series.apply(
    lambda xs: int(any("googleusercontent" in str(u) for u in xs))
)

# Photo-text relationship features
reviews_df["has_photo_and_no_text"] = (
    (reviews_df["n_photos"] > 0) &
    (reviews_df["text"].fillna("").str.strip() == "")
).astype(int)

reviews_df["has_photo_and_short_text"] = (
    (reviews_df["n_photos"] > 0) &
    (reviews_df["text"].fillna("").str.len() < 20)
).astype(int)

# Photo-rating relationships
if 'rating' in reviews_df.columns:
    reviews_df["has_photo_and_low_rating"] = (
        (reviews_df["n_photos"] > 0) &
        (reviews_df["rating"].fillna(0) <= 2)
    ).astype(int)

    reviews_df["has_photo_and_high_rating"] = (
        (reviews_df["n_photos"] > 0) &
        (reviews_df["rating"].fillna(0) >= 4)
    ).astype(int)
else:
    reviews_df["has_photo_and_low_rating"] = 0
    reviews_df["has_photo_and_high_rating"] = 0

# User-level photo statistics (z-score normalization).
# groupby().transform() returns values already aligned to reviews_df, so no
# temporary 'mean'/'std' columns are merged in and later dropped - columns
# that could collide with real data columns of the same name.
photos_by_user = reviews_df.groupby("user_id")["n_photos"]
user_photo_mean = photos_by_user.transform("mean")
# std is NaN for users with a single review; 0 and NaN both become a divisor of 1
user_photo_std = photos_by_user.transform("std").fillna(0).replace(0, 1)

reviews_df["n_photos_user_z"] = (
    (reviews_df["n_photos"] - user_photo_mean) / user_photo_std
).fillna(0.0)

# Initialize remaining photo features that are expected by the second notebook
photo_features_to_initialize = [
    "photo_url_param_count_mean", "photo_host_nunique", "photo_has_multi_hosts",
    "photo_has_thumbnail_param", "photo_url_is_duplicate_any",
    "user_photo_host_diversity", "user_avg_photo_url_len", "user_median_n_photos",
    "place_photo_host_diversity", "place_median_n_photos"
]

for col in photo_features_to_initialize:
    if col not in reviews_df.columns:
        reviews_df[col] = 0.0

print("Photo-based features complete.")

[9/9] Adding Photo-based Features...
Photo-based features complete.


## 5. DATA VALIDATION AND SAVE

In [None]:
"""## 5. DATA VALIDATION AND SAVE"""

print("Validating and saving engineered features...")

# Data validation
print("Performing data validation...")

# Check for infinite or extremely large values
numeric_columns = reviews_df.select_dtypes(include=[np.number]).columns
for col in numeric_columns:
    if np.isinf(reviews_df[col]).any():
        print(f"Warning: Infinite values found in {col}, replacing with 0")
        reviews_df[col] = reviews_df[col].replace([np.inf, -np.inf], 0)

    # NOTE(review): the trigger (1e10) and the clip range (1e6) differ, so a
    # single outlier also clips every value above 1e6 - confirm this is intended.
    if reviews_df[col].abs().max() > 1e10:
        print(f"Warning: Very large values in {col}, clipping to reasonable range")
        reviews_df[col] = reviews_df[col].clip(-1e6, 1e6)

# Fill any remaining NaN values with 0, except in datetime columns: putting
# the integer 0 into a datetime column would leave it as a mix of timestamps
# and ints, so missing timestamps stay NaT.
datetime_columns = reviews_df.select_dtypes(include=["datetime", "datetimetz"]).columns
fill_values = {col: 0 for col in reviews_df.columns if col not in datetime_columns}
reviews_df = reviews_df.fillna(value=fill_values)


# Ensure required columns exist for the next notebook
required_columns = [
    'text', 'cleaned_text', 'rating', 'time', 'user_id',
    'sentiment_score', 'review_day_of_week', 'review_hour_of_day', 'dominant_topic',
    'review_length_words', 'num_urls', 'num_emails', 'num_phone_numbers',
    'user_review_count', 'n_photos'
]

missing_required = [col for col in required_columns if col not in reviews_df.columns]
if missing_required:
    print(f"Warning: Missing required columns: {missing_required}")
    # Missing columns are created with a neutral default ('' for text, 0 otherwise)
    for col in missing_required:
        reviews_df[col] = 0 if col != 'text' else ''

print(f"Final dataset shape: {reviews_df.shape}")
print(f"Memory usage: {reviews_df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

# Save the engineered features
try:
    reviews_df.to_pickle(OUTPUT_FILE)
    print(f"Successfully saved DataFrame with {len(reviews_df)} rows to {OUTPUT_FILE}")
except Exception as e:
    # The failure is reported but not re-raised; check this message before
    # running the next notebook, which reads OUTPUT_FILE.
    print(f"Error saving DataFrame: {e}")

In [22]:
# DataFrame.info() writes its summary to stdout and returns None, so wrapping
# it in print() only adds a stray "None" line after the summary.
reviews_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 70 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   user_id                     50000 non-null  object        
 1   name                        50000 non-null  object        
 2   time                        50000 non-null  datetime64[ns]
 3   rating                      50000 non-null  int64         
 4   text                        50000 non-null  object        
 5   pics                        50000 non-null  object        
 6   resp                        50000 non-null  object        
 7   gmap_id                     50000 non-null  object        
 8   cleaned_text                50000 non-null  object        
 9   sentiment_score             50000 non-null  float64       
 10  review_day_of_week          50000 non-null  int32         
 11  review_hour_of_day          50000 non-null  int32     