In [2]:
import polars as pl
import os

def create_lag_rolling_features(input_file, output_file):
    print(f"Loading data from {input_file}...")
    # Read the CSV file
    df = pl.read_csv(input_file)
    
    # Convert date column to datetime if it's not already
    if 'date' in df.columns:
        df = df.with_columns(pl.col("date").str.to_datetime())
    
    print(f"Processing {df.shape[0]} rows...")
    
    # Remove any existing lag and rolling window columns
    columns_to_drop = [col for col in df.columns if any(pattern in col for pattern in [
        'price_lag', 'rolling_mean', 'rolling_max', 'rolling_min'
    ])]
    
    if columns_to_drop:
        print(f"Dropping existing columns: {columns_to_drop}")
        df = df.drop(columns_to_drop)
    
    # Create an empty list to store results
    result_dfs = []
    
    # Process each listing independently
    unique_listings = df["listing_id"].unique()
    print(f"Processing {len(unique_listings)} unique listings...")
    
    for listing_id in unique_listings:
        # Filter data for this listing and sort by date
        listing_df = df.filter(pl.col("listing_id") == listing_id).sort("date")
        
        # Create lag and rolling features
        processed_df = listing_df.with_columns([
            # Add lag features
            pl.col("simulated_price").shift(7).alias("price_lag_7d"),
            pl.col("simulated_price").shift(30).alias("price_lag_30d"),
           
            # Add rolling windows
            pl.col("simulated_price").shift(1).rolling_mean(
                window_size=7,
                min_samples=1,
                center=False
            ).alias("rolling_mean_7d"),
           
            pl.col("simulated_price").shift(1).rolling_max(
                window_size=7,
                min_samples=1,
                center=False
            ).alias("rolling_max_7d"),
           
            pl.col("simulated_price").shift(1).rolling_min(
                window_size=7,
                min_samples=1,
                center=False
            ).alias("rolling_min_7d"),
           
            pl.col("simulated_price").shift(1).rolling_mean(
                window_size=14,
                min_samples=1,
                center=False
            ).alias("rolling_mean_14d"),
           
            pl.col("simulated_price").shift(1).rolling_max(
                window_size=14,
                min_samples=1,
                center=False
            ).alias("rolling_max_14d"),
           
            pl.col("simulated_price").shift(1).rolling_min(
                window_size=14,
                min_samples=1,
                center=False
            ).alias("rolling_min_14d"),
           
            pl.col("simulated_price").shift(1).rolling_mean(
                window_size=30,
                min_samples=1,
                center=False
            ).alias("rolling_mean_30d"),
           
            pl.col("simulated_price").shift(1).rolling_max(
                window_size=30,
                min_samples=1,
                center=False
            ).alias("rolling_max_30d"),
           
            pl.col("simulated_price").shift(1).rolling_min(
                window_size=30,
                min_samples=1,
                center=False
            ).alias("rolling_min_30d")
        ])
        
        # Drop rows with null values
        processed_df = processed_df.drop_nulls()
        
        # Append to result
        result_dfs.append(processed_df)
    
    # Combine all processed listings
    if result_dfs:
        final_df = pl.concat(result_dfs)
        
        # Save the result
        print(f"Saving processed data to {output_file}...")
        final_df.write_csv(output_file)
        
        print(f"Processed {len(result_dfs)} listings. Final dataframe has {final_df.shape[0]} rows.")
        return final_df
    else:
        print("No data processed. Check your input file.")
        return None

# Example usage
if __name__ == "__main__":
    input_file = train_path = r"C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\train.csv"
    output_file = r"C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\train2_with_lags.csv"
    
    processed_data = create_lag_rolling_features(input_file, output_file)

Loading data from C:\Users\mvk\Documents\DATA_school\thesis\Subset\top_price_changers_subset\train.csv...
Processing 1640389 rows...
Dropping existing columns: ['price_lag_1d', 'price_lag_7d', 'price_lag_30d', 'rolling_mean_7d', 'rolling_max_7d', 'rolling_min_7d', 'rolling_mean_14d', 'rolling_max_14d', 'rolling_min_14d', 'rolling_mean_30d', 'rolling_max_30d', 'rolling_min_30d']
Processing 7864 unique listings...


ColumnNotFoundError: simulated_price

Resolved plan until failure:

	---> FAILED HERE RESOLVING 'with_columns' <---
DF ["listing_id", "date", "price", "latitude", ...]; PROJECT */34 COLUMNS