# Section 1: Data Loading and Initial Exploration

This section covers the initial steps of our analysis. We'll import the pandas library for data manipulation and then define a function to load our dataset, which is `review_data.csv`. After loading, we'll perform a basic exploratory data analysis (EDA) by displaying the first few rows, a summary of data types and non-null counts, and descriptive statistics to get a quick overview of the data's structure and content.

In [None]:
import pandas as pd
import numpy as np

def load_and_explore_data(file_path: str):
    """Read a CSV file into a DataFrame and print a quick overview of it.

    Three reports are written to stdout: the first five rows, the
    ``DataFrame.info()`` summary, and ``describe(include='all')`` statistics.

    Args:
        file_path: Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns:
        The loaded DataFrame, unmodified.
    """
    head_title = "--- First 5 Rows ---"
    info_title = "\n--- Data Info ---"
    stats_title = "\n--- Descriptive Statistics ---"

    frame = pd.read_csv(file_path)

    print(head_title)
    preview = frame.head()
    print(preview)

    print(info_title)
    frame.info()  # info() prints its own report, so it is not wrapped in print()

    print(stats_title)
    summary = frame.describe(include='all')
    print(summary)

    return frame

# Example usage for this section (will be integrated into main later)
# review_df = load_and_explore_data('review_data.csv')

# Section 2: Data Preprocessing

In this section, we'll focus on preparing the data for feature engineering. This involves:
1.  **Timestamp Conversion**: Converting the `timestamp` column to datetime objects, which is essential for time-based calculations like review age.
2.  **Price Conversion**: Ensuring the `price` column is a numeric type. This might involve removing currency symbols or other non-numeric characters.
3.  **Text Cleaning (Basic)**: Converting `review_text` to lowercase and stripping leading/trailing whitespace as a foundational step for text analysis.
4.  **Handling Missing Values**: For simplicity in this draft, we will fill missing numerical values with the median and missing text/categorical values with a placeholder like "Unknown" or an empty string.

In [None]:
def preprocess_data(df: pd.DataFrame):
    """Return a cleaned copy of ``df`` ready for feature engineering.

    Every step is applied only when the relevant column exists:

    * ``timestamp`` is parsed to datetime; unparseable values become ``NaT``.
    * A non-numeric ``price`` is converted to a number after stripping every
      character that is not a digit or a dot (currency symbols, commas, ...).
    * ``review_text`` is lower-cased and stripped; a missing review becomes
      the empty string.
    * Remaining missing numeric values are filled with the column median and
      missing values in the other object columns with ``'Unknown'``.
    * Rows that still lack a ``timestamp`` or ``price`` are dropped.

    Args:
        df: Raw review data. It is not modified.

    Returns:
        A new DataFrame with the cleaning applied.
    """
    df_processed = df.copy()

    # Convert timestamp
    if 'timestamp' in df_processed.columns:
        df_processed['timestamp'] = pd.to_datetime(df_processed['timestamp'], errors='coerce')

    # Convert price to numeric. Checking "not numeric" (rather than
    # "== object") also covers string-typed columns that are not object dtype.
    if 'price' in df_processed.columns and not pd.api.types.is_numeric_dtype(df_processed['price']):
        stripped_price = df_processed['price'].astype(str).str.replace(r'[^\d\.]', '', regex=True)
        df_processed['price'] = pd.to_numeric(stripped_price, errors='coerce')

    # Basic text cleaning. The missing mask is taken BEFORE astype(str),
    # because astype(str) would otherwise turn a missing review into the
    # literal text 'nan'/'none' and hide it from any later fill.
    if 'review_text' in df_processed.columns:
        raw_text = df_processed['review_text']
        was_missing = raw_text.isna()
        cleaned_text = raw_text.astype(str).str.lower().str.strip()
        df_processed['review_text'] = cleaned_text.mask(was_missing, '')

    # Handle missing values
    for col in df_processed.select_dtypes(include=np.number).columns:
        df_processed[col] = df_processed[col].fillna(df_processed[col].median())
    for col in df_processed.select_dtypes(include='object').columns:
        if col == 'review_text':
            continue  # already filled with '' above
        # A more robust strategy might be needed per column
        df_processed[col] = df_processed[col].fillna('Unknown')

    # Drop rows where critical conversions failed (e.g., timestamp to NaT).
    # Only columns that are actually present are checked, so a frame without
    # one of them no longer raises KeyError.
    critical_cols = [col for col in ('timestamp', 'price') if col in df_processed.columns]
    if critical_cols:
        df_processed = df_processed.dropna(subset=critical_cols)

    return df_processed

# Example usage:
# if 'review_df' in globals():
#     preprocessed_df = preprocess_data(review_df)
#     print("\n--- Preprocessed Data Info ---")
#     preprocessed_df.info()
#     print(preprocessed_df.head())

# Section 3: Feature Engineering

This section is dedicated to creating the features required for our Propensity Score Matching (PSM) analysis:
1.  **Treatment Variable (X)**: Create a binary column `mention_shipping`. It will be 1 if keywords like "shipping," "delivery," or "entrega" are found in `review_text`, and 0 otherwise.
2.  **Review Age**: Calculate `review_age_days` as the number of whole days between each review's `timestamp` and the most recent review date in the dataset.
3.  **Review Length**: Calculate `review_length` as the number of characters in the `review_text`.
4.  **Sentiment Score (Basic Placeholder)**: A very simple rule-based sentiment score (`sentiment_score_basic`) will be generated. *A more sophisticated NLP approach is recommended for the actual project.*
5.  **Product Category Encoding**: The `product_category` column, being categorical, will be one-hot encoded to be used as a covariate in the propensity score model.

The outcome variable `star_rating` is already present.

In [None]:
import re
from datetime import datetime

# Words whose presence in a review marks it as "mentions shipping" (treatment).
SHIPPING_KEYWORDS = [
    'shipping', 'shipment', 'shipped', 'delivery', 'delivered', 'delivering', 'entrega',
    'courier', 'carrier', 'fulfillment', 'dispatch', 'postage', 'freight'
]


def engineer_features(df: pd.DataFrame):
    """Engineers features for PSM.

    Works on a copy of ``df`` and adds:

    * ``mention_shipping`` - 1 when ``review_text`` contains any of
      ``SHIPPING_KEYWORDS`` as a whole word (case-insensitive), else 0.
    * ``review_age_days`` - whole days between each ``timestamp`` and the
      latest ``timestamp`` in the frame; 0 when there is no usable
      ``timestamp`` column.
    * ``review_length`` - number of characters in ``review_text``.
    * ``sentiment_score_basic`` - 1, -1 or 0 depending on whether positive or
      negative keywords dominate the review.
    * ``cat_*`` - one-hot columns replacing ``product_category`` if present.
    * ``price`` - created as 0.0 only when the column is absent.

    A missing ``review_text`` value is treated as an empty review.

    Args:
        df: Preprocessed review data; must contain ``review_text``.

    Returns:
        A new DataFrame with the feature columns added.

    Raises:
        KeyError: If ``review_text`` is not a column of ``df``.
    """
    df_featured = df.copy()

    # Normalise the text once. The missing mask is applied after astype(str)
    # so that a missing review is '' rather than the literal text 'nan'.
    raw_text = df_featured['review_text']
    review_text = raw_text.astype(str).mask(raw_text.isna(), '')

    # 1. Treatment Variable
    # Non-capturing group: a capturing group makes str.contains emit a
    # "pattern has match groups" warning on every call.
    keyword_pattern = r'\b(?:' + '|'.join(re.escape(kw) for kw in SHIPPING_KEYWORDS) + r')\b'
    df_featured['mention_shipping'] = review_text.str.contains(
        keyword_pattern, case=False, regex=True, na=False
    ).astype(int)

    # 2. Review Age
    if 'timestamp' in df_featured.columns and not df_featured['timestamp'].empty:
        # Coercing is a no-op for datetime columns and lets string
        # timestamps work too; the stored column itself is left untouched.
        timestamps = pd.to_datetime(df_featured['timestamp'], errors='coerce')
        most_recent_date = timestamps.max()
        df_featured['review_age_days'] = (most_recent_date - timestamps).dt.days
    else:
        df_featured['review_age_days'] = 0 # Fallback

    # 3. Review Length
    df_featured['review_length'] = review_text.str.len().astype(int)

    # 4. Sentiment Score (Basic Placeholder)
    # This is a very simplistic approach.
    positive_keywords = ['good', 'great', 'excellent', 'love', 'happy', 'satisfied', 'amazing', 'perfect']
    negative_keywords = ['bad', 'poor', 'terrible', 'hate', 'unhappy', 'disappointed', 'worst', 'awful']
    # The leading \b anchors each keyword at the start of a word: "unhappy"
    # no longer also counts as "happy", nor "dissatisfied" as "satisfied".
    # There is deliberately no trailing \b, so inflections such as "loved"
    # or "hated" still count.
    positive_re = re.compile(r'\b(?:' + '|'.join(map(re.escape, positive_keywords)) + ')', re.IGNORECASE)
    negative_re = re.compile(r'\b(?:' + '|'.join(map(re.escape, negative_keywords)) + ')', re.IGNORECASE)

    def basic_sentiment(text):
        score = len(positive_re.findall(text)) - len(negative_re.findall(text))
        if score > 0:
            return 1
        if score < 0:
            return -1
        return 0

    df_featured['sentiment_score_basic'] = review_text.apply(basic_sentiment).astype(int)

    # 5. Product Category Encoding
    if 'product_category' in df_featured.columns:
        df_featured = pd.get_dummies(df_featured, columns=['product_category'], prefix='cat', dummy_na=False)

    # Ensure 'price' exists as it's a key covariate
    if 'price' not in df_featured.columns:
        df_featured['price'] = 0.0 # Fallback, real data should have it

    return df_featured

# Example usage:
# if 'preprocessed_df' in globals():
#     featured_df = engineer_features(preprocessed_df)
#     print("\n--- Featured Data Info ---")
#     featured_df.info()
#     print(featured_df[['review_id', 'star_rating', 'price', 'mention_shipping', 'review_age_days', 'review_length', 'sentiment_score_basic'] + [col for col in featured_df if col.startswith('cat_')]].head())

# Section 4: Data Splitting

Here, we'll split our dataset into training, validation, and test sets. This is crucial for training our propensity score model and then evaluating its performance and the subsequent causal effect estimation on unseen data. We'll use `train_test_split` from `sklearn.model_selection`. Stratification based on the treatment variable (`mention_shipping`) can help ensure similar distributions of treated and control units across the splits, if the data allows. The covariates (`X_psm`) will be features like `price`, `review_age_days`, `review_length`, `sentiment_score_basic`, and the one-hot encoded `product_category` columns. The treatment indicator is `T`, and the outcome is `Y`.

In [None]:
from sklearn.model_selection import train_test_split

def split_data(df: pd.DataFrame, treatment_col: str, outcome_col: str, base_covariates: list, test_size=0.2, val_size=0.125, random_state=42):
    """Splits data into training, validation, and test sets.

    ``test_size`` and ``val_size`` are both fractions of the FULL dataset
    (e.g. 0.2 and 0.1 give a 70/10/20 train/val/test split).

    The covariate matrix is built from the entries of ``base_covariates``
    that exist in ``df`` (``'product_category'`` is skipped, because its
    one-hot columns stand in for it) plus every column whose name starts
    with ``'cat_'``. Requested covariates that are absent are reported with
    a printed warning and skipped.

    Splits are stratified on the treatment column when it has at least two
    classes with at least two rows each; otherwise they are unstratified.

    Args:
        df: Feature-engineered data.
        treatment_col: Name of the treatment indicator column.
        outcome_col: Name of the outcome column.
        base_covariates: Covariate names before one-hot encoding.
        test_size: Fraction of all rows for the test set (>= 0).
        val_size: Fraction of all rows for the validation set (>= 0).
        random_state: Seed passed to ``train_test_split``.

    Returns:
        ``(X_train, T_train, Y_train, X_val, T_val, Y_val, X_test, T_test,
        Y_test)``. A set that was not requested (size 0) is returned empty.

    Raises:
        ValueError: If the treatment or outcome column is missing, no
            covariate column is available, or the sizes are negative or sum
            to 1 or more. ``train_test_split`` may also raise ``ValueError``
            when the data is too small to split.
    """
    # Ensure essential columns are present
    if any(col not in df.columns for col in [treatment_col, outcome_col]):
        raise ValueError(f"Treatment ('{treatment_col}') or outcome ('{outcome_col}') column missing from DataFrame.")

    if test_size < 0 or val_size < 0 or test_size + val_size >= 1:
        raise ValueError(
            f"test_size ({test_size}) and val_size ({val_size}) must be non-negative and sum to less than 1."
        )

    # Base covariates, excluding the original product_category (replaced by
    # its one-hot columns). Missing ones are computed BEFORE filtering so the
    # warning can actually fire.
    requested_covariates = [cov for cov in base_covariates if cov != 'product_category']
    missing_covariates = [cov for cov in requested_covariates if cov not in df.columns]
    if missing_covariates:
        print(f"Warning: covariate columns missing from DataFrame and skipped: {missing_covariates}")

    final_covariate_list = [cov for cov in requested_covariates if cov in df.columns]
    encoded_covariates = [
        c for c in df.columns
        if isinstance(c, str) and c.startswith('cat_') and c not in final_covariate_list
    ]
    final_covariate_list.extend(encoded_covariates)
    if not final_covariate_list:
        raise ValueError("No covariate columns found in DataFrame.")

    X_psm = df[final_covariate_list]
    T = df[treatment_col]
    Y = df[outcome_col]

    def stratify_target(labels):
        # Stratifying needs >= 2 classes and >= 2 rows in the rarest class.
        counts = labels.value_counts()
        return labels if len(counts) > 1 and counts.min() >= 2 else None

    def empty_like(obj):
        # Zero-row slice that keeps columns / dtype of the source object.
        return obj.iloc[0:0]

    holdout_size = test_size + val_size
    if holdout_size == 0:
        # No validation or test data requested: everything is training data.
        return (X_psm, T, Y,
                empty_like(X_psm), empty_like(T), empty_like(Y),
                empty_like(X_psm), empty_like(T), empty_like(Y))

    # First split: training and temporary (validation + test). Both sizes are
    # fractions of the original data, so the hold-out is simply their sum.
    X_train_psm, X_temp_psm, T_train, T_temp, Y_train, Y_temp = train_test_split(
        X_psm, T, Y, test_size=holdout_size,
        random_state=random_state, stratify=stratify_target(T)
    )

    # Second split: validation and test from temporary
    if test_size == 0:
        # Only a validation set was requested.
        X_val_psm, T_val, Y_val = X_temp_psm, T_temp, Y_temp
        X_test_psm, T_test, Y_test = empty_like(X_temp_psm), empty_like(T_temp), empty_like(Y_temp)
    elif val_size == 0 or len(X_temp_psm) < 2:
        # Only a test set was requested, or the hold-out is too small to divide.
        X_val_psm, T_val, Y_val = empty_like(X_temp_psm), empty_like(T_temp), empty_like(Y_temp)
        X_test_psm, T_test, Y_test = X_temp_psm, T_temp, Y_temp
    else:
        X_val_psm, X_test_psm, T_val, T_test, Y_val, Y_test = train_test_split(
            X_temp_psm, T_temp, Y_temp, test_size=test_size / holdout_size, # test share of the hold-out
            random_state=random_state, stratify=stratify_target(T_temp)
        )

    return X_train_psm, T_train, Y_train, X_val_psm, T_val, Y_val, X_test_psm, T_test, Y_test

# Example usage:
# if 'featured_df' in globals():
#     COVARIATES_FOR_PSM = ['price', 'review_age_days', 'review_length', 'sentiment_score_basic', 'product_category'] # Base name
#     TREATMENT = 'mention_shipping'
#     OUTCOME = 'star_rating'
#     splits = split_data(featured_df, TREATMENT, OUTCOME, COVARIATES_FOR_PSM)
#     X_train, T_train, Y_train, X_val, T_val, Y_val, X_test, T_test, Y_test = splits
#     print(f"Train shapes: X-{X_train.shape}, T-{T_train.shape}, Y-{Y_train.shape}")
#     if not X_val.empty: print(f"Val shapes: X-{X_val.shape}, T-{T_val.shape}, Y-{Y_val.shape}")
#     print(f"Test shapes: X-{X_test.shape}, T-{T_test.shape}, Y-{Y_test.shape}")

# Section 5: Propensity Score Model Training

This section details the training of the propensity score model. We will use Logistic Regression to predict the probability of a review mentioning shipping (the treatment), given the set of covariates derived in Section 3. Steps include:
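1.  **Scaling Covariates**: All covariates, including the one-hot encoded category columns, are standardized with a `StandardScaler` that is fitted on the training set only and then reused for the validation and test sets.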
2.  **Training**: A Logistic Regression model is trained on the training set (`X_train_psm`, `T_train`).
3.  **Prediction**: Propensity scores (probabilities) are then predicted for the training, validation, and test sets.

Optionally, we can plot the distribution of these scores for the treated and control groups to visually inspect their overlap, which is critical for PSM.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

def train_propensity_model(X_train: pd.DataFrame, T_train: pd.Series, 
                        X_val: pd.DataFrame = None, X_test: pd.DataFrame = None):
    """Fit a logistic-regression propensity model and score each data split.

    The covariates are standardized with a ``StandardScaler`` fitted on
    ``X_train`` only; the same fitted scaler is applied to the other splits.

    Args:
        X_train: Training covariates.
        T_train: Training treatment indicator.
        X_val: Optional validation covariates.
        X_test: Optional test covariates.

    Returns:
        ``(model, ps_train, ps_val, ps_test, scaler)``. Each ``ps_*`` is
        column 1 of ``predict_proba`` (the second class in
        ``model.classes_`` - the treated class when the treatment is coded
        0/1). ``ps_val`` / ``ps_test`` are ``None`` when the matching
        covariates are ``None`` or empty.
    """
    scaler = StandardScaler()
    train_matrix = scaler.fit_transform(X_train)

    model = LogisticRegression(solver='liblinear', random_state=42, class_weight='balanced')
    model.fit(train_matrix, T_train)

    def score_split(features):
        # Nothing to score for an absent or empty split.
        if features is None or features.empty:
            return None
        return model.predict_proba(scaler.transform(features))[:, 1]

    ps_train = model.predict_proba(train_matrix)[:, 1]
    ps_val = score_split(X_val)
    ps_test = score_split(X_test)

    return model, ps_train, ps_val, ps_test, scaler


def plot_propensity_score_overlap(ps_scores: np.ndarray, treatment_indicator: pd.Series, title: str):
    """Draw per-group propensity-score histograms (with KDE) to inspect overlap.

    Does nothing when either the scores or the treatment indicator is ``None``.

    Args:
        ps_scores: Propensity scores, one per unit.
        treatment_indicator: Treatment flag for the same units; used as the hue.
        title: Title of the figure.
    """
    nothing_to_plot = ps_scores is None or treatment_indicator is None
    if nothing_to_plot:
        return

    plot_frame = pd.DataFrame({'propensity_score': ps_scores, 'treatment': treatment_indicator})

    plt.figure(figsize=(8, 5))
    # common_norm=False: each group's density is normalized on its own, so
    # groups of different sizes can be compared.
    sns.histplot(
        data=plot_frame,
        x='propensity_score',
        hue='treatment',
        kde=True,
        stat="density",
        common_norm=False,
    )
    plt.title(title)
    plt.xlabel('Propensity Score')
    plt.ylabel('Density')
    plt.show()

# Example usage:
# if 'X_train' in globals() and not X_train.empty:
#     ps_model_trained, prop_scores_train, prop_scores_val, prop_scores_test, fitted_scaler = train_propensity_model(X_train, T_train, X_val, X_test)
#     print("Propensity score model trained.")
#     if prop_scores_train is not None:
#         plot_propensity_score_overlap(prop_scores_train, T_train, "Propensity Score Overlap (Training Set)")
#     if prop_scores_val is not None and not T_val.empty:
#         plot_propensity_score_overlap(prop_scores_val, T_val, "Propensity Score Overlap (Validation Set)")

# Section 6: Main Execution and PSM Application (Conceptual Outline)

This final section brings together all the preceding functions into a main pipeline. It will:
1.  Load and preprocess the data.
2.  Engineer the necessary features.
3.  Split the data into training, validation, and test sets.
4.  Train the propensity score model and generate scores.

After these steps, the actual Propensity Score Matching would be performed. This typically involves using the propensity scores to match treated units with control units (e.g., via nearest neighbor matching). Once matched pairs are formed, covariate balance between the groups should be checked. Finally, the causal effect (e.g., the Average Treatment Effect on the Treated, ATT) is estimated by comparing outcomes (`star_rating`) in the matched treated and control groups. For this draft, the matching and effect estimation step is a simplified illustration: greedy 1-to-1 nearest-neighbor matching without replacement within a caliper. The covariate balance check is left as a follow-up.

In [None]:
# (Ensure all previous functions: load_and_explore_data, preprocess_data, engineer_features, 
# split_data, train_propensity_model, plot_propensity_score_overlap are defined above)

def perform_matching_and_estimate_att(propensity_scores: np.ndarray, 
                                    treatment: pd.Series, 
                                    outcome: pd.Series,
                                    # covariates_df: pd.DataFrame, # For balance check
                                    method='nearest', caliper=0.05):
    """Estimate the ATT with simplified 1-to-1 nearest-neighbour matching.

    Treated units (``treatment == 1``) are visited in input order. Each is
    paired with the still-unused control (``treatment == 0``) whose
    propensity score is closest, provided the absolute difference is within
    ``caliper``. Controls are used at most once (no replacement). The ATT is
    the mean outcome of the MATCHED treated units minus the mean outcome of
    their matched controls; treated units without a match are left out of
    both means.

    Progress and the result are printed to stdout.

    Args:
        propensity_scores: Propensity score per unit.
        treatment: Treatment indicator per unit (1 treated, 0 control).
        outcome: Numeric outcome per unit.
        method: Currently unused; only nearest-neighbour matching exists.
        caliper: Largest allowed propensity-score distance within a pair.

    Returns:
        The estimated ATT as a float, or ``None`` when an input is ``None``,
        a group is empty, or no pair could be formed.

    Raises:
        ValueError: If the three inputs differ in length.
    """
    print("\n--- Conceptual Matching and ATT Estimation ---")
    if propensity_scores is None or treatment is None or outcome is None:
        print("Insufficient data for matching.")
        return None

    # Plain arrays: everything below is positional, so index labels of the
    # incoming Series are irrelevant.
    scores = np.asarray(propensity_scores, dtype=float)
    treatment_flags = np.asarray(treatment)
    outcomes = np.asarray(outcome, dtype=float)
    if not (len(scores) == len(treatment_flags) == len(outcomes)):
        raise ValueError("propensity_scores, treatment and outcome must have the same length.")

    is_treated = treatment_flags == 1
    is_control = treatment_flags == 0
    if not is_treated.any() or not is_control.any():
        print("Not enough treated or control units.")
        return None

    # Controls sorted by score; with equal distances the lower-scored
    # control wins because argmin returns the first minimum.
    control_order = np.argsort(scores[is_control], kind='stable')
    control_scores = scores[is_control][control_order]
    control_outcomes = outcomes[is_control][control_order]
    available = np.ones(control_scores.shape[0], dtype=bool)

    # Simplified 1-to-1 nearest neighbor matching with caliper (for illustration)
    matched_treated_outcomes = []
    matched_control_outcomes = []
    for treated_score, treated_outcome in zip(scores[is_treated], outcomes[is_treated]):
        if not available.any():
            break  # every control has been used
        distances = np.abs(control_scores - treated_score)
        # NaN distances compare False and are therefore never eligible.
        eligible = available & (distances <= caliper)
        if not eligible.any():
            continue
        best = int(np.argmin(np.where(eligible, distances, np.inf)))
        matched_treated_outcomes.append(treated_outcome)
        matched_control_outcomes.append(control_outcomes[best])
        available[best] = False  # No replacement

    if not matched_control_outcomes:
        print("No matches found.")
        return None

    # Compare like with like: only treated units that found a match take
    # part, otherwise unmatched treated units would bias the estimate.
    att = float(np.mean(matched_treated_outcomes) - np.mean(matched_control_outcomes))
    print(f"Estimated ATT: {att:.4f} (based on {len(matched_control_outcomes)} matched pairs)")
    # Further steps: covariate balance check on matched sample.
    return att

def main():
    """Main execution pipeline for the PSM experiment.

    Runs, in order: load and explore ``review_data.csv``, preprocess,
    engineer features, split into train/validation/test, train the
    propensity model, plot score overlap for each available split, and
    estimate the ATT on the test split. Every failure that is handled here
    prints a message and ends the run early instead of raising.
    """
    file_path = 'review_data.csv'

    # 1. Load & Explore
    # The loader raises rather than returning None, so file problems are
    # caught here to match the print-and-return handling of the other steps.
    try:
        raw_df = load_and_explore_data(file_path)
    except (FileNotFoundError, pd.errors.EmptyDataError, pd.errors.ParserError) as e:
        print(f"Error loading '{file_path}': {e}")
        return
    if raw_df is None: return

    # 2. Preprocess
    preprocessed_df = preprocess_data(raw_df)
    if preprocessed_df is None or preprocessed_df.empty:
        print("Preprocessing failed or resulted in empty dataframe.")
        return

    # 3. Feature Engineering
    if 'review_text' not in preprocessed_df.columns:
        # engineer_features reads this column unconditionally.
        print("Feature engineering requires a 'review_text' column.")
        return
    featured_df = engineer_features(preprocessed_df)
    if featured_df is None or featured_df.empty:
        print("Feature engineering failed or resulted in empty dataframe.")
        return

    # Define columns for splitting and modeling
    # Base covariates list (original 'product_category' will be handled by split_data if one-hot encoded)
    COVARIATES_BASE = ['price', 'review_age_days', 'review_length', 'sentiment_score_basic', 'product_category']
    TREATMENT_COL = 'mention_shipping'
    OUTCOME_COL = 'star_rating'

    # Ensure essential columns exist before proceeding
    if any(col not in featured_df.columns for col in [TREATMENT_COL, OUTCOME_COL]):
        print(f"Missing treatment or outcome column in featured_df. Needed: '{TREATMENT_COL}', '{OUTCOME_COL}'")
        return

    # 4. Data Splitting
    try:
        X_train, T_train, Y_train, X_val, T_val, Y_val, X_test, T_test, Y_test = split_data(
            featured_df, TREATMENT_COL, OUTCOME_COL, COVARIATES_BASE,
            test_size=0.2, val_size=0.1 # val_size is % of original for this func
        )
    except ValueError as e:
        print(f"Error during data splitting: {e}")
        return

    if X_train.empty:
        print("Training set is empty after split. Aborting.")
        return

    # 5. Propensity Score Model Training
    # Fitting raises ValueError e.g. when the training treatment column has
    # only one class; report it like the splitting errors above.
    try:
        ps_model, ps_train, ps_val, ps_test, _ = train_propensity_model(
            X_train, T_train, X_val if not X_val.empty else None, X_test if not X_test.empty else None
        )
    except ValueError as e:
        print(f"Propensity score model training failed: {e}")
        return

    if ps_model is not None and ps_train is not None:
        print("\nPropensity score model trained.")
        plot_propensity_score_overlap(ps_train, T_train, "Propensity Score Overlap (Training Set)")
        if ps_val is not None and not T_val.empty:
            plot_propensity_score_overlap(ps_val, T_val, "Propensity Score Overlap (Validation Set)")
        if ps_test is not None and not T_test.empty:
            plot_propensity_score_overlap(ps_test, T_test, "Propensity Score Overlap (Test Set)")

        # 6. Conceptual PSM Application (e.g., on test set)
        if ps_test is not None and not T_test.empty and not Y_test.empty:
            perform_matching_and_estimate_att(ps_test, T_test, Y_test) # covariates_df=X_test
        else:
            print("Skipping ATT estimation on test set due to missing propensity scores or data.")
    else:
        print("Propensity score model training failed.")

    print("\n--- Pipeline Finished ---")


if __name__ == '__main__':
    main()