In [1]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series

def load_and_prepare_data(calendar_path, listings_path, n_listings=500):
    """Load and prepare the data with basic cleaning and sampling."""
    # Load listings first to get the sample
    listings_df = pd.read_csv(listings_path)
    sampled_listings = listings_df['id'].sample(n=n_listings, random_state=42)
    
    # Clean and prepare listings data
    listings_cleaned = listings_df[listings_df['id'].isin(sampled_listings)][
        ['id', 'neighbourhood_cleansed', 'latitude', 'longitude']
    ]
    listings_cleaned = listings_cleaned.rename(columns={'id': 'listing_id'})
    
    # Load and filter calendar data
    calendar_df = pd.read_csv(calendar_path)
    calendar_df = calendar_df[calendar_df['listing_id'].isin(sampled_listings)]
    calendar_df['date'] = pd.to_datetime(calendar_df['date'])
    
    # Filter out future dates beyond our current cutoff
    current_date = pd.Timestamp('2024-02-11')  # Set to current date
    calendar_df = calendar_df[calendar_df['date'] <= current_date]
    
    # Merge calendar with listings data
    df = pd.merge(calendar_df, listings_cleaned, on='listing_id', how='left')
    
    print(f"Total listings in sample: {len(df['listing_id'].unique())}")
    print(f"Total records in sample: {len(df)}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    
    return df

def create_temporal_features(df):
    """Create temporal features from the date column."""
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['season_sin'] = np.sin(2 * np.pi * df['date'].dt.dayofyear/365.25)
    df['season_cos'] = np.cos(2 * np.pi * df['date'].dt.dayofyear/365.25)
    return df

def extract_tsfresh_features(df):
    """Extract time series features using tsfresh with improved parameters."""
    # Create a proper time series dataframe for tsfresh
    df_tsfresh = df[['listing_id', 'date', 'price']].copy()
    df_tsfresh['price'] = pd.to_numeric(df_tsfresh['price'].replace('[\$,]', '', regex=True), errors='coerce')
    df_tsfresh = df_tsfresh.sort_values(['listing_id', 'date'])
    
    # Create rolling windows
    df_rolled = roll_time_series(
        df_tsfresh,
        column_id='listing_id',
        column_sort='date',
        max_timeshift=7,
        rolling_direction=1
    )
    
    # Define minimal but meaningful feature set
    fc_parameters = {
        "mean": None,
        "median": None,
        "standard_deviation": None,
        "variance": None,
        "maximum": None,
        "minimum": None,
        "mean_change": None,
        "mean_abs_change": None
    }
    
    try:
        features_filtered = extract_features(
            df_rolled,
            column_id='listing_id',
            column_sort='date',
            column_value='price',
            default_fc_parameters=fc_parameters,
            n_jobs=0
        )
        
        # Validate features
        features_filtered = features_filtered.replace([np.inf, -np.inf], np.nan)
        features_filtered = features_filtered.dropna(axis=1, how='all')
        non_constant_cols = features_filtered.columns[features_filtered.nunique() > 1]
        features_filtered = features_filtered[non_constant_cols]
        
        # Ensure index is listing_id
        features_filtered.index = features_filtered.index.astype(int)
        
        print(f"\nExtracted {len(non_constant_cols)} valid tsfresh features")
        return features_filtered
        
    except Exception as e:
        print(f"Error in tsfresh feature extraction: {str(e)}")
        return pd.DataFrame()

def prepare_features(df, tsfresh_features):
    """Prepare final feature set."""
    # Encode categorical variables
    le = LabelEncoder()
    df['neighbourhood_cleansed_encoded'] = le.fit_transform(df['neighbourhood_cleansed'].fillna('Unknown'))
    
    # Basic features
    basic_features = [
        'day_of_week', 'month', 'is_weekend', 'season_sin', 'season_cos',
         'neighbourhood_cleansed_encoded',
        'latitude', 'longitude'
    ]
    
    X_basic = df[basic_features]
    
    # Combine with tsfresh features if any were extracted
    if not tsfresh_features.empty:
        # Reset index of basic features to match with tsfresh features
        X_basic = X_basic.reset_index(drop=True)
        
        # Ensure tsfresh features are properly aligned with the basic features
        # by matching on listing_id
        tsfresh_features = tsfresh_features.reindex(df['listing_id'].unique())
        
        # Merge tsfresh features with basic features
        X_combined = pd.DataFrame()
        for listing_id in df['listing_id'].unique():
            # Get basic features for this listing
            listing_basic = X_basic[df['listing_id'] == listing_id]
            
            # Get tsfresh features for this listing
            listing_tsfresh = tsfresh_features.loc[listing_id:listing_id]
            
            # Repeat tsfresh features for each row of this listing
            listing_tsfresh_repeated = pd.DataFrame(
                np.repeat(listing_tsfresh.values, len(listing_basic), axis=0),
                columns=listing_tsfresh.columns,
                index=listing_basic.index
            )
            
            # Combine basic and tsfresh features for this listing
            listing_combined = pd.concat([listing_basic, listing_tsfresh_repeated], axis=1)
            X_combined = pd.concat([X_combined, listing_combined])
    else:
        X_combined = X_basic
    
    return X_combined.fillna(0)

def inspect_tsfresh_features(tsfresh_features, X_combined):
    """Enhanced inspection of tsfresh features."""
    print("\n=== Tsfresh Feature Inspection ===")
    
    if tsfresh_features.empty:
        print("No tsfresh features were extracted!")
        return []
    
    # Print tsfresh features info
    print(f"\nNumber of tsfresh features extracted: {tsfresh_features.shape[1]}")
    
    # Show sample of features with their statistics
    print("\nSample of tsfresh features with statistics:")
    sample_features = tsfresh_features.sample(min(5, tsfresh_features.shape[1]), axis=1)
    for col in sample_features.columns:
        stats = sample_features[col].describe()
        print(f"\nFeature: {col}")
        print(f"Mean: {stats['mean']:.3f}")
        print(f"Std: {stats['std']:.3f}")
        print(f"Min: {stats['min']:.3f}")
        print(f"Max: {stats['max']:.3f}")
    
    # Check feature variance
    low_variance_features = tsfresh_features.columns[tsfresh_features.std() < 1e-6]
    print(f"\nFeatures with very low variance: {len(low_variance_features)}")
    
    # Identify tsfresh features in final dataset
    tsfresh_cols = [col for col in X_combined.columns if '__' in col]
    
    print(f"\nNumber of tsfresh features in final dataset: {len(tsfresh_cols)}")
    if tsfresh_cols:
        print("\nSample of tsfresh features in final dataset:")
        for col in tsfresh_cols[:5]:
            stats = X_combined[col].describe()
            print(f"\n{col}:")
            print(f"Mean: {stats['mean']:.3f}")
            print(f"Std: {stats['std']:.3f}")
    
    return tsfresh_cols

def main():
    # Load and prepare data
    calendar_path = r'C:\Users\mvk\Documents\DATA_school\thesis\data_new\paris\paris_merged_calendar.csv'
    listings_path = r'C:\Users\mvk\Documents\DATA_school\thesis\data_new\paris\2024-06-10\listings.csv'
    
    print("Loading and preparing data...")
    df = load_and_prepare_data(calendar_path, listings_path, n_listings=500)
    df = create_temporal_features(df)
    


    # Reset index and convert prices
    df = df.reset_index(drop=True)
    df['price_numeric'] = pd.to_numeric(df['price'].replace(r'[\$,]', '', regex=True), errors='coerce')

    # Calculate price cutoff and create mask
    price_cutoff = df['price_numeric'].quantile(0.995)
    mask = df['price_numeric'] <= price_cutoff

    # Get list of valid listing IDs
    valid_listings = df[mask]['listing_id'].unique()

    # Remove listings above cutoff
    df = df[df['listing_id'].isin(valid_listings)].copy()

    # Final cleanup
    df = df.reset_index(drop=True)
    df = df.dropna(subset=['price_numeric'])



    print("Extracting tsfresh features...")
    tsfresh_features = extract_tsfresh_features(df)
    
    print("Preparing final feature set...")
    X = prepare_features(df, tsfresh_features)
    y = pd.to_numeric(df['price'].replace(r'[\$,]', '', regex=True), errors='coerce')    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Initialize and train XGBoost
    print("\nTraining XGBoost model...")
    xgb_model = XGBRegressor(
        n_estimators=50,
        learning_rate=0.1,
        max_depth=4,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42
    )
    
    xgb_model.fit(X_train_scaled, y_train)
    y_pred = xgb_model.predict(X_test_scaled)
    
    # Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print("\nModel Performance Metrics:")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2 Score: {r2:.4f}")
    
    # Feature importance analysis
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': xgb_model.feature_importances_
    })
    feature_importance = feature_importance.sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))

        # Prepare final feature set
    print("Preparing final feature set...")
    X = prepare_features(df, tsfresh_features)
    
    # Inspect tsfresh features
    tsfresh_cols = inspect_tsfresh_features(tsfresh_features, X)
    
    # Print feature composition
    print("\n=== Feature Set Composition ===")
    print(f"Total features: {X.shape[1]}")
    print(f"Basic features: {X.shape[1] - len(tsfresh_cols)}")
    print(f"Tsfresh features: {len(tsfresh_cols)}")
    
    # Optional: Save feature names for reference
    feature_composition = pd.DataFrame({
        'feature_name': X.columns,
        'feature_type': ['tsfresh' if col in tsfresh_cols else 'basic' for col in X.columns]
    })
    print("\nFeature composition summary:")
    print(feature_composition['feature_type'].value_counts())

if __name__ == "__main__":
    main()

  df_tsfresh['price'] = pd.to_numeric(df_tsfresh['price'].replace('[\$,]', '', regex=True), errors='coerce')


Loading and preparing data...
Total listings in sample: 333
Total records in sample: 69760
Date range: 2023-06-07 00:00:00 to 2024-02-11 00:00:00
Extracting tsfresh features...


Rolling: 100%|██████████| 36/36 [00:05<00:00,  6.33it/s]
Feature Extraction: 100%|██████████| 330/330 [00:00<00:00, 2732.34it/s]



Extracted 8 valid tsfresh features
Preparing final feature set...

Training XGBoost model...

Model Performance Metrics:
RMSE: 40.64
MAE: 16.65
R2 Score: 0.9777

Top 10 Most Important Features:
                           feature  importance
8                      price__mean    0.263537
5   neighbourhood_cleansed_encoded    0.261083
3                       season_sin    0.133075
12                  price__maximum    0.117120
1                            month    0.059282
6                         latitude    0.053213
9                    price__median    0.033558
4                       season_cos    0.019039
7                        longitude    0.018045
14              price__mean_change    0.015471
Preparing final feature set...

=== Tsfresh Feature Inspection ===

Number of tsfresh features extracted: 8

Sample of tsfresh features with statistics:

Feature: price__mean_change
Mean: 0.012
Std: 0.264
Min: -0.288
Max: 4.701

Feature: price__standard_deviation
Mean: 30.080
Std: 162.10