In [4]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh.feature_extraction import EfficientFCParameters
from tsfresh.utilities.dataframe_functions import roll_time_series

def load_and_prepare_data(calendar_path, listings_path, n_listings=500):
    """Load and prepare the data with basic cleaning and sampling."""
    # Load listings first to get the sample
    listings_df = pd.read_csv(listings_path)
    
    # Randomly sample n_listings
    sampled_listings = listings_df['id'].sample(n=n_listings, random_state=42)
    
    # Clean and prepare listings data
    listings_cleaned = listings_df[listings_df['id'].isin(sampled_listings)][
        ['id', 'neighbourhood', 'neighbourhood_cleansed', 'latitude', 'longitude']
    ]
    listings_cleaned = listings_cleaned.rename(columns={'id': 'listing_id'})
    
    # Load calendar data
    calendar_df = pd.read_csv(calendar_path)
    
    # Filter calendar data for sampled listings
    calendar_df = calendar_df[calendar_df['listing_id'].isin(sampled_listings)]
    
    # Merge calendar with listings data
    df = pd.merge(calendar_df, listings_cleaned, on='listing_id', how='left')
    df['date'] = pd.to_datetime(df['date'])
    
    print(f"Total listings in sample: {len(df['listing_id'].unique())}")
    print(f"Total records in sample: {len(df)}")
    
    return df

def create_temporal_features(df):
    """Create temporal features from the date column."""
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['day_of_month'] = df['date'].dt.day
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['quarter'] = df['date'].dt.quarter
    df['week_of_year'] = df['date'].dt.isocalendar().week
    
    # Create seasonal features using sine and cosine transforms
    df['day_of_year'] = df['date'].dt.dayofyear
    df['season_sin'] = np.sin(2 * np.pi * df['day_of_year']/365.25)
    df['season_cos'] = np.cos(2 * np.pi * df['day_of_year']/365.25)
    
    return df

def extract_tsfresh_features(df):
    """Extract time series features using tsfresh with efficient parameters."""
    # Prepare data for tsfresh by creating a price column
    df_rolled = roll_time_series(
        df,
        column_id='listing_id',
        column_sort='date',
        max_timeshift=7,
        rolling_direction=1
    )
    
    # Create a value column for tsfresh (using price)
    df_rolled['value'] = df_rolled['price']
    
    # Use EfficientFCParameters for a predefined set of efficient features
    fc_parameters = EfficientFCParameters()
    
    # Extract features
    try:
        features_filtered_direct = extract_features(
            df_rolled,
            column_id='listing_id',
            column_sort='date',
            column_value='value',
            default_fc_parameters=fc_parameters,
            n_jobs=0  # Adjust based on your CPU
        )
        print(f"Successfully extracted {features_filtered_direct.shape[1]} tsfresh features")
        return features_filtered_direct
    except Exception as e:
        print(f"Error extracting tsfresh features: {str(e)}")
        # Return empty DataFrame with same index as input if extraction fails
        return pd.DataFrame(index=df_rolled.index)

def prepare_features(df, tsfresh_features):
    """Prepare final feature set combining all features."""
    # Encode categorical variables
    le = LabelEncoder()
    df['neighbourhood_encoded'] = le.fit_transform(df['neighbourhood'].fillna('Unknown'))
    df['neighbourhood_cleansed_encoded'] = le.fit_transform(df['neighbourhood_cleansed'].fillna('Unknown'))
    
    # Basic features
    basic_features = [
        'day_of_week', 'month', 'day_of_month', 'is_weekend',
        'quarter', 'week_of_year', 'season_sin', 'season_cos',
        'neighbourhood_encoded', 'neighbourhood_cleansed_encoded',
        'latitude', 'longitude'
    ]
    
    X_basic = df[basic_features]
    
    # Combine with tsfresh features if any were extracted
    if not tsfresh_features.empty:
        X_combined = pd.concat([
            X_basic,
            tsfresh_features.reindex(df.index)
        ], axis=1)
    else:
        X_combined = X_basic
    
    return X_combined.fillna(0)

def evaluate_model(y_true, y_pred, model_name):
    """Calculate and print evaluation metrics."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    
    print(f"\n{model_name} Performance Metrics:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.4f}")
    
    return {'rmse': rmse, 'mae': mae, 'r2': r2}

def main():
    # Load and prepare data
    calendar_path = r'C:\Users\mvk\Documents\DATA_school\thesis\data_new\paris\paris_merged_calendar.csv'
    listings_path = r'C:\Users\mvk\Documents\DATA_school\thesis\data_new\paris\2024-06-10\listings.csv'
    
    # Load data with sampling
    print("Loading and preparing data...")
    df = load_and_prepare_data(calendar_path, listings_path, n_listings=500)
    df = create_temporal_features(df)
    
    # Extract tsfresh features
    print("Extracting tsfresh features...")
    tsfresh_features = extract_tsfresh_features(df)
    
    # Prepare final feature set
    print("Preparing final feature set...")
    X = prepare_features(df, tsfresh_features)
    y = df['price']
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Initialize models
    models = {
        'Linear Regression': LinearRegression(),
        'Ridge Regression': Ridge(alpha=1.0),
        'Random Forest': RandomForestRegressor(
            n_estimators=50,
            max_depth=6,
            random_state=42
        ),
        'XGBoost': XGBRegressor(
            n_estimators=50,
            learning_rate=0.1,
            max_depth=4,
            min_child_weight=1,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=42
        )
    }
    
    # Train and evaluate models
    results = {}
    for name, model in models.items():
        print(f"\nTraining {name}...")
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_test_scaled)
        results[name] = evaluate_model(y_test, y_pred, name)
        
        # Print feature importance for tree-based models
        if name in ['Random Forest', 'XGBoost']:
            feature_importance = pd.DataFrame({
                'feature': X.columns,
                'importance': model.feature_importances_
            })
            print(f"\n{name} Top 10 Important Features:")
            print(feature_importance.nlargest(10, 'importance'))
    
    # Compare model performances
    performance_df = pd.DataFrame(results).T
    print("\nModel Performance Comparison:")
    print(performance_df)

if __name__ == "__main__":
    main()

Loading and preparing data...
Total listings in sample: 500
Total records in sample: 339131
Extracting tsfresh features...


Rolling: 100%|██████████| 40/40 [00:49<00:00,  1.25s/it]
Feature Extraction: 100%|██████████| 500/500 [02:36<00:00,  3.19it/s]


Successfully extracted 777 tsfresh features
Preparing final feature set...

Training Linear Regression...

Linear Regression Performance Metrics:
RMSE: 536.56
MAE: 199.07
R2 Score: 0.0216

Training Ridge Regression...

Ridge Regression Performance Metrics:
RMSE: 536.56
MAE: 199.07
R2 Score: 0.0216

Training Random Forest...

Random Forest Performance Metrics:
RMSE: 306.34
MAE: 125.35
R2 Score: 0.6811

Random Forest Top 10 Important Features:
                           feature  importance
10                        latitude    0.704380
11                       longitude    0.209555
5                     week_of_year    0.051033
9   neighbourhood_cleansed_encoded    0.028586
6                       season_sin    0.002703
7                       season_cos    0.002533
1                            month    0.001014
4                          quarter    0.000085
8            neighbourhood_encoded    0.000061
2                     day_of_month    0.000050

Training XGBoost...

XGBoost Perform