# Data Cleaning #


### Reading in the datasets 

In [2]:
import pandas as pd
import geopandas as gpd
import json

# Reading CSV file
def read_csv(file_path):
    """Load a CSV file into a pandas DataFrame.

    A status line is printed in both outcomes. Any exception raised while
    reading is caught and reported, and ``None`` is returned instead of a frame.
    """
    try:
        frame = pd.read_csv(file_path)
        print(f"Data from {file_path} loaded successfully!")
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        frame = None
    return frame

# Reading Excel file
def read_excel(file_path):
    """Load an Excel workbook into a pandas DataFrame using the openpyxl engine.

    A status line is printed in both outcomes. Any exception raised while
    reading is caught and reported, and ``None`` is returned instead of a frame.
    """
    try:
        frame = pd.read_excel(file_path, engine='openpyxl')
        print(f"Data from {file_path} loaded successfully!")
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        frame = None
    return frame

# Reading JSON file
def read_json(file_path):
    """Load a JSON file into a pandas DataFrame.

    A status line is printed in both outcomes. Any exception raised while
    reading is caught and reported, and ``None`` is returned instead of a frame.
    """
    try:
        frame = pd.read_json(file_path)
        print(f"Data from {file_path} loaded successfully!")
    except Exception as err:
        print(f"Error reading {file_path}: {err}")
        frame = None
    return frame

# Reading GeoJSON file
def read_geojson(file_path):
    """Load a GeoJSON file, preferring GeoPandas and falling back to plain JSON.

    GeoPandas is imported inside the function so that a missing installation
    raises ``ImportError`` here, where the fallback can handle it, rather than
    depending on a module-level import having succeeded.

    Args:
        file_path: Path of the GeoJSON file to read.

    Returns:
        A GeoDataFrame when GeoPandas reads the file. Otherwise a pandas
        DataFrame with one row per feature containing the feature's properties
        plus ``geometry_type`` and ``coordinates`` (the coordinates as a
        string); both are ``None`` for a feature without geometry.
        ``None`` when reading fails for any other reason (the error is printed).
    """
    try:
        # First attempt: GeoPandas.
        try:
            import geopandas as gpd
            gdf = gpd.read_file(file_path)
            print(f"Data from {file_path} loaded successfully using GeoPandas!")
            return gdf
        except ImportError:
            # GeoPandas (or something it needs to import) is unavailable:
            # fall through to manual JSON parsing below.
            pass

        with open(file_path, 'r', encoding='utf-8') as f:
            geojson_data = json.load(f)

        features = []
        for feature in geojson_data['features']:
            # GeoJSON allows "properties": null and "geometry": null, so treat
            # both as empty instead of failing on them.
            properties = feature.get('properties') or {}
            geometry = feature.get('geometry') or {}
            coordinates = geometry.get('coordinates')

            features.append({
                **properties,
                'geometry_type': geometry.get('type'),
                # Stringified to avoid a nested list structure in the column.
                'coordinates': None if coordinates is None else str(coordinates),
            })

        df = pd.DataFrame(features)
        print(f"Data from {file_path} loaded successfully using manual parsing!")
        return df

    except Exception as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Reading a dataset from any of these formats
def read_dataset(file_path):
    """Read a dataset, choosing the reader from the file extension.

    The extension is matched case-insensitively, and ``file_path`` may be a
    string or a ``pathlib.Path``; it is passed to the reader unchanged.

    Args:
        file_path: Path ending in .csv, .xlsx, .json or .geojson.

    Returns:
        Whatever the selected reader returns, or ``None`` (after printing
        "Unsupported file type") when the extension is not recognised.
    """
    # Normalise only for the extension test; the reader gets the original value.
    lowered = str(file_path).lower()

    if lowered.endswith('.csv'):
        return read_csv(file_path)
    if lowered.endswith('.xlsx'):
        return read_excel(file_path)
    if lowered.endswith('.json'):
        return read_json(file_path)
    if lowered.endswith('.geojson'):
        return read_geojson(file_path)

    print("Unsupported file type")
    return None

# Helper function to check if geopandas is installed
def is_geopandas_available():
    """Return True when ``import geopandas`` succeeds, False on ImportError."""
    try:
        import geopandas  # noqa: F401 -- imported only to probe availability
    except ImportError:
        return False
    else:
        return True

# Helper function to install geopandas if needed
def install_geopandas():
    """Install GeoPandas with pip into the environment of the running interpreter.

    Returns:
        True when the pip command exits successfully, False when it fails
        (the error is printed).
    """
    try:
        import subprocess
        import sys
        # Invoke pip through the running interpreter so the package is
        # installed for this kernel, not for whichever `pip` is first on PATH.
        subprocess.check_call([sys.executable, "-m", "pip", "install", "geopandas"])
        print("GeoPandas installed successfully!")
        return True
    except Exception as e:
        print(f"Error installing GeoPandas: {e}")
        return False

In [3]:
# Example usage
# Shared folder prefixes so each dataset path is spelled out only once.
THESIS_DIR = r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis"
SCRAPE_DIR = THESIS_DIR + r"\Data_Mor\paris\2024-09-06"

merged_calendar = read_dataset(THESIS_DIR + r"\Merged_Data\paris_merged_calendar.csv")
listings = read_dataset(SCRAPE_DIR + r"\listings.csv")
neighbourhoods = read_dataset(SCRAPE_DIR + r"\neighbourhoods.csv")
neighbourhoods_geojson = read_dataset(SCRAPE_DIR + r"\neighbourhoods.geojson")
reviews = read_dataset(SCRAPE_DIR + r"\reviews.csv")

# Preview whatever loaded; read_dataset returns None on failure, so skip those.
for loaded_frame in (merged_calendar, listings, neighbourhoods, neighbourhoods_geojson, reviews):
    if loaded_frame is not None:
        print(loaded_frame.head())

Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.csv loaded successfully!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\neighbourhoods.geojson loaded successfully using GeoPandas!
Data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\reviews.csv loaded successfully!
   listing_id        date  price
0        3109  2023-09-05  110.0
1        3109  2023-09-06  110.0
2        3109  2023-09-07  110.0
3        3109  2023-09-08  110.0
4        3109  2023-09-09  110.0
       id                          listing_url       scrape_id last_scraped  \
0    3109    https://www.airbnb.com/rooms/3109  20240906025355   2024-09-11   
1    5396

In [4]:
# Preview the first rows of the calendar table (skipped when it is None).
if merged_calendar is not None:
    print(merged_calendar.head())

   listing_id        date  price
0        3109  2023-09-05  110.0
1        3109  2023-09-06  110.0
2        3109  2023-09-07  110.0
3        3109  2023-09-08  110.0
4        3109  2023-09-09  110.0


In [5]:
# Preview the first rows of the listings table (skipped when it is None).
if listings is not None:
    print(listings.head())

       id                          listing_url       scrape_id last_scraped  \
0    3109    https://www.airbnb.com/rooms/3109  20240906025355   2024-09-11   
1    5396    https://www.airbnb.com/rooms/5396  20240906025355   2024-09-13   
2    7397    https://www.airbnb.com/rooms/7397  20240906025355   2024-09-06   
3    7964    https://www.airbnb.com/rooms/7964  20240906025355   2024-09-10   
4  241715  https://www.airbnb.com/rooms/241715  20240906025355   2024-09-11   

            source                                               name  \
0      city scrape                                       zen and calm   
1      city scrape       Your perfect Paris studio on Île Saint-Louis   
2      city scrape                   MARAIS - 2ROOMS APT - 2/4 PEOPLE   
3  previous scrape                       Sunny apartment with balcony   
4      city scrape  Big Cosy Appartement with 100 m2 Terrace in Paris   

                                         description  \
0  Lovely Appartment with one 

In [6]:
# Preview the first rows of the reviews table (skipped when it is None).
if reviews is not None:
    print(reviews.head())

   listing_id         id        date  reviewer_id reviewer_name  \
0        3109  207127433  2017-10-28     51636494      Patricia   
1        3109  208779822  2017-11-03      4142888      Patricia   
2        3109  295840159  2018-07-24      7415343       Laurent   
3        3109  553502638  2019-10-24     21159216     Anastasia   
4        5396       4824  2009-06-30        19995         Sarah   

                                            comments  
0            Tout s'est bien déroulé. Merci bien. PG  
1  Un petit nid fouiller douillet situé dans  app...  
2  Appartement spacieux, propre,clair, et calme à...  
3  Appartement totalement rénové, en parfait état...  
4  Perfect location!! Nasrine was a delight and m...  


In [7]:
# Preview the first 20 rows of the neighbourhood GeoJSON table (skipped when it is None).
if neighbourhoods_geojson is not None:
    print(neighbourhoods_geojson.head(20))

          neighbourhood neighbourhood_group  \
0   Batignolles-Monceau                None   
1        Palais-Bourbon                None   
2       Buttes-Chaumont                None   
3                 Opéra                None   
4              Entrepôt                None   
5              Gobelins                None   
6             Vaugirard                None   
7               Reuilly                None   
8                Louvre                None   
9            Luxembourg                None   
10               Élysée                None   
11               Temple                None   
12         Ménilmontant                None   
13             Panthéon                None   
14                Passy                None   
15         Observatoire                None   
16           Popincourt                None   
17               Bourse                None   
18    Buttes-Montmartre                None   
19       Hôtel-de-Ville                None   

            

In [8]:
# Preview the first rows of the neighbourhoods CSV table (skipped when it is None).
if neighbourhoods is not None:
    print(neighbourhoods.head())

   neighbourhood_group        neighbourhood
0                  NaN  Batignolles-Monceau
1                  NaN               Bourse
2                  NaN      Buttes-Chaumont
3                  NaN    Buttes-Montmartre
4                  NaN               Élysée


In [9]:
import pandas as pd

def clean_data(df):
    """Return a new frame without the 70 listings columns named below.

    With the listings schema shown in this notebook's output, the remaining
    columns are id, neighbourhood_cleansed, latitude, longitude and price.
    pandas raises ``KeyError`` if any listed column is absent from ``df``.
    """
    unused_columns = [
        # scrape / listing description
        'listing_url', 'scrape_id', 'last_scraped', 'source', 'name',
        'neighborhood_overview', 'description', 'picture_url',
        # host details
        'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
        'host_about', 'host_response_time', 'host_response_rate',
        'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
        'host_picture_url', 'host_neighbourhood', 'host_total_listings_count',
        'host_listings_count', 'host_verifications', 'host_has_profile_pic',
        'host_identity_verified',
        # location / property description
        'neighbourhood', 'neighbourhood_group_cleansed', 'property_type',
        'room_type', 'accommodates', 'bathrooms', 'bathrooms_text',
        'amenities', 'beds', 'bedrooms',
        # stay-length rules
        'minimum_nights', 'maximum_nights', 'minimum_minimum_nights',
        'maximum_minimum_nights', 'minimum_maximum_nights',
        'maximum_maximum_nights', 'minimum_nights_avg_ntm',
        'maximum_nights_avg_ntm',
        # availability
        'calendar_updated', 'has_availability', 'availability_30',
        'availability_60', 'availability_90', 'availability_365',
        'calendar_last_scraped',
        # reviews
        'number_of_reviews', 'number_of_reviews_ltm', 'number_of_reviews_l30d',
        'first_review', 'last_review', 'review_scores_rating',
        'review_scores_accuracy', 'review_scores_cleanliness',
        'review_scores_checkin', 'license', 'review_scores_communication',
        'review_scores_location', 'review_scores_value',
        # booking / host listing counts
        'instant_bookable', 'calculated_host_listings_count',
        'calculated_host_listings_count_entire_homes',
        'calculated_host_listings_count_private_rooms',
        'calculated_host_listings_count_shared_rooms', 'reviews_per_month',
    ]
    return df.drop(columns=unused_columns)

# Loaded variable 'df' from URI: c:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv
# NOTE(review): the name `df` is rebound later in this notebook to the merged
# calendar frame, so it only refers to the raw listings around this cell.
df = pd.read_csv(r'c:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv')

# clean_data receives a copy, so `df` itself is not modified here.
cleaned_listings = clean_data(df.copy())
cleaned_listings.head()

Unnamed: 0,id,neighbourhood_cleansed,latitude,longitude,price
0,3109,Observatoire,48.83191,2.3187,$113.00
1,5396,Hôtel-de-Ville,48.85247,2.35835,$95.00
2,7397,Hôtel-de-Ville,48.85909,2.35315,$145.00
3,7964,Opéra,48.87417,2.34245,
4,241715,Buttes-Chaumont,48.893464,2.378341,$450.00


In [10]:
def clean_data(cleaned_listings):
    """Return a renamed copy of the frame in which column 'id' is called 'listing_id'."""
    column_renames = {'id': 'listing_id'}
    return cleaned_listings.rename(columns=column_renames)

# `clean_data` here is the rename-only version defined just above in this cell.
# It is applied to a copy; `cleaned_listings` keeps its original 'id' column.
cleaned_listings_clean = clean_data(cleaned_listings.copy())
cleaned_listings_clean.head()

Unnamed: 0,listing_id,neighbourhood_cleansed,latitude,longitude,price
0,3109,Observatoire,48.83191,2.3187,$113.00
1,5396,Hôtel-de-Ville,48.85247,2.35835,$95.00
2,7397,Hôtel-de-Ville,48.85909,2.35315,$145.00
3,7964,Opéra,48.87417,2.34245,
4,241715,Buttes-Chaumont,48.893464,2.378341,$450.00


In [11]:
# Attach the listing attributes (neighbourhood, coordinates, listing price) to
# every calendar row. Only listing_ids present in both tables are kept.
# validate='many_to_one' makes pandas raise MergeError if a listing_id occurs
# more than once in the listings table, instead of silently duplicating
# calendar rows.
merged_df = pd.merge(merged_calendar, cleaned_listings_clean,
                    on='listing_id',
                    how='inner',
                    suffixes=('_1', '_2'),
                    validate='many_to_one')

# Print info about the merge ("DataFrame 1" is the calendar, "DataFrame 2" the listings)
print("Original shapes:")
print(f"DataFrame 1: {merged_calendar.shape}")
print(f"DataFrame 2: {cleaned_listings_clean.shape}")
print(f"\nMerged shape: {merged_df.shape}")
print(f"Number of unique listings: {merged_df['listing_id'].nunique()}")

# Columns present in both inputs receive the '_1' (calendar) / '_2' (listings) suffix
duplicate_cols = [col for col in merged_df.columns if col.endswith('_1') or col.endswith('_2')]
if duplicate_cols:
    print("\nColumns that were duplicated during merge:")
    print(duplicate_cols)

# Show first few rows
print("\nFirst few rows of merged dataset:")
print(merged_df.head())

Original shapes:
DataFrame 1: (79418789, 3)
DataFrame 2: (95461, 5)

Merged shape: (64012312, 7)
Number of unique listings: 95461

Columns that were duplicated during merge:
['price_1', 'price_2']

First few rows of merged dataset:
   listing_id        date  price_1 neighbourhood_cleansed  latitude  \
0        3109  2023-09-05    110.0           Observatoire  48.83191   
1        3109  2023-09-06    110.0           Observatoire  48.83191   
2        3109  2023-09-07    110.0           Observatoire  48.83191   
3        3109  2023-09-08    110.0           Observatoire  48.83191   
4        3109  2023-09-09    110.0           Observatoire  48.83191   

   longitude  price_2  
0     2.3187  $113.00  
1     2.3187  $113.00  
2     2.3187  $113.00  
3     2.3187  $113.00  
4     2.3187  $113.00  


In [12]:
# Keep the calendar price ('price_1') and discard the listings-table price ('price_2').
# NOTE(review): not re-runnable on its own - a second run raises KeyError because
# 'price_2' is already gone.
merged_df = merged_df.drop(columns=['price_2'])


In [13]:
merged_df

Unnamed: 0,listing_id,date,price_1,neighbourhood_cleansed,latitude,longitude
0,3109,2023-09-05,110.0,Observatoire,48.831910,2.318700
1,3109,2023-09-06,110.0,Observatoire,48.831910,2.318700
2,3109,2023-09-07,110.0,Observatoire,48.831910,2.318700
3,3109,2023-09-08,110.0,Observatoire,48.831910,2.318700
4,3109,2023-09-09,110.0,Observatoire,48.831910,2.318700
...,...,...,...,...,...,...
64012307,1239136641042729451,2025-09-06,110.0,Panthéon,48.839583,2.346317
64012308,1239136641042729451,2025-09-07,110.0,Panthéon,48.839583,2.346317
64012309,1239136641042729451,2025-09-08,110.0,Panthéon,48.839583,2.346317
64012310,1239136641042729451,2025-09-09,110.0,Panthéon,48.839583,2.346317


In [14]:
def clean_data(merged_df):
    """Return a renamed copy of the frame in which column 'price_1' is called 'price'."""
    column_renames = {'price_1': 'price'}
    return merged_df.rename(columns=column_renames)

# `clean_data` here is the price_1 -> price rename defined just above in this cell.
# It is applied to a copy, so `merged_df` keeps its 'price_1' column.
merged_df_clean = clean_data(merged_df.copy())
merged_df_clean.head()

Unnamed: 0,listing_id,date,price,neighbourhood_cleansed,latitude,longitude
0,3109,2023-09-05,110.0,Observatoire,48.83191,2.3187
1,3109,2023-09-06,110.0,Observatoire,48.83191,2.3187
2,3109,2023-09-07,110.0,Observatoire,48.83191,2.3187
3,3109,2023-09-08,110.0,Observatoire,48.83191,2.3187
4,3109,2023-09-09,110.0,Observatoire,48.83191,2.3187


In [15]:
merged_df_clean

Unnamed: 0,listing_id,date,price,neighbourhood_cleansed,latitude,longitude
0,3109,2023-09-05,110.0,Observatoire,48.831910,2.318700
1,3109,2023-09-06,110.0,Observatoire,48.831910,2.318700
2,3109,2023-09-07,110.0,Observatoire,48.831910,2.318700
3,3109,2023-09-08,110.0,Observatoire,48.831910,2.318700
4,3109,2023-09-09,110.0,Observatoire,48.831910,2.318700
...,...,...,...,...,...,...
64012307,1239136641042729451,2025-09-06,110.0,Panthéon,48.839583,2.346317
64012308,1239136641042729451,2025-09-07,110.0,Panthéon,48.839583,2.346317
64012309,1239136641042729451,2025-09-08,110.0,Panthéon,48.839583,2.346317
64012310,1239136641042729451,2025-09-09,110.0,Panthéon,48.839583,2.346317


In [17]:
# Forward-fill missing prices within each listing only. A plain column-wide
# ffill would fill a gap at the start of one listing's calendar with the last
# price of the previous listing in the frame.
# Prices missing at the very start of a listing stay NaN.
merged_df_clean['price'] = merged_df_clean.groupby('listing_id')['price'].ffill()


In [18]:
merged_df_clean

Unnamed: 0,listing_id,date,price,neighbourhood_cleansed,latitude,longitude
0,3109,2023-09-05,110.0,Observatoire,48.831910,2.318700
1,3109,2023-09-06,110.0,Observatoire,48.831910,2.318700
2,3109,2023-09-07,110.0,Observatoire,48.831910,2.318700
3,3109,2023-09-08,110.0,Observatoire,48.831910,2.318700
4,3109,2023-09-09,110.0,Observatoire,48.831910,2.318700
...,...,...,...,...,...,...
64012307,1239136641042729451,2025-09-06,110.0,Panthéon,48.839583,2.346317
64012308,1239136641042729451,2025-09-07,110.0,Panthéon,48.839583,2.346317
64012309,1239136641042729451,2025-09-08,110.0,Panthéon,48.839583,2.346317
64012310,1239136641042729451,2025-09-09,110.0,Panthéon,48.839583,2.346317


In [20]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
 
# Encode categorical variables
# LabelEncoder maps each distinct neighbourhood name to an integer code.
# NOTE(review): the codes are labels, not a meaningful ordering - confirm the
# downstream model treats this column as categorical.
le = LabelEncoder()
merged_df_clean['neighbourhood_cleansed_encoded'] = le.fit_transform(merged_df_clean['neighbourhood_cleansed'])

In [22]:
# Remove the text neighbourhood column; 'neighbourhood_cleansed_encoded' carries it from here on.
merged_df_clean = merged_df_clean.drop(columns=['neighbourhood_cleansed'])

In [23]:
merged_df_clean

Unnamed: 0,listing_id,date,price,latitude,longitude,neighbourhood_cleansed_encoded
0,3109,2023-09-05,110.0,48.831910,2.318700,10
1,3109,2023-09-06,110.0,48.831910,2.318700,10
2,3109,2023-09-07,110.0,48.831910,2.318700,10
3,3109,2023-09-08,110.0,48.831910,2.318700,10
4,3109,2023-09-09,110.0,48.831910,2.318700,10
...,...,...,...,...,...,...
64012307,1239136641042729451,2025-09-06,110.0,48.839583,2.346317,13
64012308,1239136641042729451,2025-09-07,110.0,48.839583,2.346317,13
64012309,1239136641042729451,2025-09-08,110.0,48.839583,2.346317,13
64012310,1239136641042729451,2025-09-09,110.0,48.839583,2.346317,13


In [24]:
# Continue on an independent copy so the following cells do not modify merged_df_clean.
df = merged_df_clean.copy()

In [28]:
# Convert 'date' to pandas datetimes so the `.dt` accessors used below are available.
df['date'] = pd.to_datetime(df['date'])


In [30]:
import numpy as np

# Calendar features derived from each row's date.
date_parts = df['date'].dt
df['DTF_day_of_week'] = date_parts.dayofweek
df['DTF_month'] = date_parts.month
# dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday.
df['DTF_is_weekend'] = df['DTF_day_of_week'].isin([5, 6]).astype(int)

# Position within the year as an angle, encoded as sine/cosine so the end of
# December sits next to the start of January.
year_angle = 2 * np.pi * date_parts.dayofyear/365.25
df['DTF_season_sin'] = np.sin(year_angle)
df['DTF_season_cos'] = np.cos(year_angle)

# Release the temporaries; they are as long as df itself.
del date_parts, year_angle

In [None]:
# Price cap used to flag outliers, computed over every row currently in df.
threshold = df['price'].quantile(0.99)  # 99th percentile

# Find all listings that have at least one price above the cap
outlier_listings = df[df['price'] > threshold]['listing_id'].unique()
df = df[~df['listing_id'].isin(outlier_listings)]  # Remove every row of those listings


In [36]:
df

Unnamed: 0,listing_id,date,price,latitude,longitude,neighbourhood_cleansed_encoded,DTF_day_of_week,DTF_month,DTF_is_weekend,DTF_season_sin,DTF_season_cos
0,3109,2023-09-05,110.0,48.831910,2.318700,10,1,9,0,-0.902099,-0.431530
1,3109,2023-09-06,110.0,48.831910,2.318700,10,2,9,0,-0.909388,-0.415948
2,3109,2023-09-07,110.0,48.831910,2.318700,10,3,9,0,-0.916409,-0.400244
3,3109,2023-09-08,110.0,48.831910,2.318700,10,4,9,0,-0.923158,-0.384421
4,3109,2023-09-09,110.0,48.831910,2.318700,10,5,9,1,-0.929634,-0.368484
...,...,...,...,...,...,...,...,...,...,...,...
64012307,1239136641042729451,2025-09-06,110.0,48.839583,2.346317,13,5,9,1,-0.909388,-0.415948
64012308,1239136641042729451,2025-09-07,110.0,48.839583,2.346317,13,6,9,1,-0.916409,-0.400244
64012309,1239136641042729451,2025-09-08,110.0,48.839583,2.346317,13,0,9,0,-0.923158,-0.384421
64012310,1239136641042729451,2025-09-09,110.0,48.839583,2.346317,13,1,9,0,-0.929634,-0.368484


In [42]:
df

Unnamed: 0,listing_id,date,price,latitude,longitude,neighbourhood_cleansed_encoded,DTF_day_of_week,DTF_month,DTF_is_weekend,DTF_season_sin,DTF_season_cos
0,3109,2023-09-05,110.0,48.831910,2.318700,10,1,9,0,-0.902099,-0.431530
1,3109,2023-09-06,110.0,48.831910,2.318700,10,2,9,0,-0.909388,-0.415948
2,3109,2023-09-07,110.0,48.831910,2.318700,10,3,9,0,-0.916409,-0.400244
3,3109,2023-09-08,110.0,48.831910,2.318700,10,4,9,0,-0.923158,-0.384421
4,3109,2023-09-09,110.0,48.831910,2.318700,10,5,9,1,-0.929634,-0.368484
...,...,...,...,...,...,...,...,...,...,...,...
64012307,1239136641042729451,2025-09-06,110.0,48.839583,2.346317,13,5,9,1,-0.909388,-0.415948
64012308,1239136641042729451,2025-09-07,110.0,48.839583,2.346317,13,6,9,1,-0.916409,-0.400244
64012309,1239136641042729451,2025-09-08,110.0,48.839583,2.346317,13,0,9,0,-0.923158,-0.384421
64012310,1239136641042729451,2025-09-09,110.0,48.839583,2.346317,13,1,9,0,-0.929634,-0.368484


In [58]:
def _add_listing_features(listing_data, forecast_horizon, lag_periods, windows):
    """Add the lag and rolling-window price columns to one listing's rows and return the frame."""
    for lag in lag_periods:
        listing_data[f'price_lag_{lag}d'] = listing_data['price'].shift(lag)

    # Shift once so every rolling statistic only sees prices that lie at least
    # `forecast_horizon` rows before the row being described.
    shifted_series = listing_data['price'].shift(forecast_horizon)

    for window in windows:
        # closed='left' additionally leaves the current row of the shifted
        # series out of its own window.
        roller = shifted_series.rolling(window=window, min_periods=1, closed='left', center=False)
        listing_data[f'rolling_mean_{window}d'] = roller.mean()
        listing_data[f'rolling_std_{window}d'] = roller.std()
        listing_data[f'rolling_max_{window}d'] = roller.max()
        listing_data[f'rolling_min_{window}d'] = roller.min()

    return listing_data


def create_non_leaking_features(df, forecast_horizon=60, max_listings=20000,
                                lag_periods=(90, 120, 150, 180), windows=(30, 60, 90)):
    """
    Create lag and rolling window features that don't leak future information,
    properly shifted by the forecast horizon.

    Lags and windows are counted in rows of each listing after sorting by
    date, so they equal days only when a listing has exactly one row per
    consecutive day (assumed, not checked here).

    Args:
        df: DataFrame with 'listing_id', 'date', and 'price' columns
        forecast_horizon: Number of days ahead to predict (default 60)
        max_listings: Maximum number of listings to process (default 20000);
            the first ones in ascending listing_id order are kept. None keeps all.
        lag_periods: Row offsets for the 'price_lag_{n}d' columns. Offsets
            smaller than forecast_horizon trigger a warning because such a
            lag would not be known yet at prediction time.
        windows: Window sizes for the 'rolling_{mean,std,max,min}_{n}d' columns.

    Returns:
        A new DataFrame sorted by listing_id and date with the feature columns
        added and every row that contains a NaN in any column removed.
    """
    leaking_lags = [lag for lag in lag_periods if lag < forecast_horizon]
    if leaking_lags:
        import warnings
        warnings.warn(
            f"Lag periods {leaking_lags} are shorter than forecast_horizon="
            f"{forecast_horizon}; these lag features use information that is "
            "not available at prediction time."
        )

    # Ensure data is sorted
    df = df.sort_values(['listing_id', 'date'])

    # Get first max_listings unique listing IDs
    unique_listings = df['listing_id'].unique()[:max_listings]
    df = df[df['listing_id'].isin(unique_listings)]

    total_listings = len(unique_listings)
    print(f"Starting feature creation for {total_listings} listings...")

    processed_data = []
    # A single groupby pass replaces one full-frame boolean scan per listing.
    # sort=False keeps the groups in the order of the already-sorted frame.
    for idx, (listing_id, listing_rows) in enumerate(df.groupby('listing_id', sort=False), 1):
        if idx % 100 == 0 or idx == 1 or idx == total_listings:  # Print every 100 listings
            print(f"Processing listing {idx}/{total_listings} ({(idx/total_listings*100):.1f}%)")
        if idx % 1000 == 0:
            print(f"  Creating lag features for listing {listing_id}...")
            print(f"  Calculating rolling windows for listing {listing_id}...")

        processed_data.append(
            _add_listing_features(listing_rows.copy(), forecast_horizon, lag_periods, windows)
        )

    print("Concatenating processed data...")
    if processed_data:
        processed_df = pd.concat(processed_data)
    else:
        # No listings at all: still return a frame that has the feature columns.
        processed_df = _add_listing_features(df.copy(), forecast_horizon, lag_periods, windows)

    # Remove NaN values from feature creation
    initial_rows = len(processed_df)
    processed_df = processed_df.dropna()
    final_rows = len(processed_df)

    rows_removed = initial_rows - final_rows
    # Guard the percentage against an empty input frame.
    removed_pct = rows_removed / initial_rows * 100 if initial_rows else 0.0

    print(f"\nFeature creation complete!")
    print(f"Initial rows: {initial_rows}")
    print(f"Final rows after removing NaN: {final_rows}")
    print(f"Rows removed: {rows_removed} ({removed_pct:.1f}%)")

    return processed_df

In [59]:
# Build the features with the function's default arguments.
# NOTE(review): `df` is overwritten, so re-running this cell feeds the
# already-processed frame back into the function.
df = create_non_leaking_features(df)

Starting feature creation for 20000 listings...
Processing listing 1/20000 (0.0%)
Processing listing 100/20000 (0.5%)
Processing listing 200/20000 (1.0%)
Processing listing 300/20000 (1.5%)
Processing listing 400/20000 (2.0%)
Processing listing 500/20000 (2.5%)
Processing listing 600/20000 (3.0%)
Processing listing 700/20000 (3.5%)
Processing listing 800/20000 (4.0%)
Processing listing 900/20000 (4.5%)
Processing listing 1000/20000 (5.0%)
  Creating lag features for listing 938858...
  Calculating rolling windows for listing 938858...
Processing listing 1100/20000 (5.5%)
Processing listing 1200/20000 (6.0%)
Processing listing 1300/20000 (6.5%)
Processing listing 1400/20000 (7.0%)
Processing listing 1500/20000 (7.5%)
Processing listing 1600/20000 (8.0%)
Processing listing 1700/20000 (8.5%)
Processing listing 1800/20000 (9.0%)
Processing listing 1900/20000 (9.5%)
Processing listing 2000/20000 (10.0%)
  Creating lag features for listing 2020964...
  Calculating rolling windows for listing

In [60]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

def split_and_standardize(df, train_start='2024-04-01', train_end='2025-03-31',
                         test_start='2025-04-01', test_end='2025-05-31',
                         train_path='train_data_2024.csv', test_path='test_data_2025.csv'):
    """
    Split data into train and test sets based on dates and standardize features.

    Every column whose lower-cased name contains 'price', 'lag' or 'rolling'
    is standardized - this includes the 'price' column itself. The scaler is
    fitted on the training rows only and then applied to the test rows.

    Args:
        df: DataFrame with features and target
        train_start: Start date for training set (inclusive)
        train_end: End date for training set (inclusive)
        test_start: Start date for test set (inclusive)
        test_end: End date for test set (inclusive)
        train_path: CSV file the training set is written to; None skips writing
        test_path: CSV file the test set is written to; None skips writing

    Returns:
        (train_df, test_df, scaler): the two standardized copies and the
        fitted StandardScaler, which is needed to map values back to the
        original scale.

    Raises:
        ValueError: if either date range selects no rows.
    """
    dates = df['date']

    # Split data (copies, so the caller's frame is not modified)
    train_df = df[(dates >= train_start) & (dates <= train_end)].copy()
    test_df = df[(dates >= test_start) & (dates <= test_end)].copy()

    # Fail early with a readable message instead of deep inside the scaler.
    if train_df.empty:
        raise ValueError(f"No rows between {train_start} and {train_end} for the training set")
    if test_df.empty:
        raise ValueError(f"No rows between {test_start} and {test_end} for the test set")

    # Identify columns to standardize
    # NOTE(review): this is a substring match, so any other column whose name
    # happens to contain one of these fragments would be scaled as well.
    price_columns = [col for col in df.columns if any(x in col.lower() for x in
                    ['price', 'lag', 'rolling'])]

    # Fit scaler on train data and transform both sets
    scaler = StandardScaler()
    train_df[price_columns] = scaler.fit_transform(train_df[price_columns])
    test_df[price_columns] = scaler.transform(test_df[price_columns])

    # Print info about the split
    print(f"Training set: {len(train_df)} rows, {train_df['date'].min()} to {train_df['date'].max()}")
    print(f"Test set: {len(test_df)} rows, {test_df['date'].min()} to {test_df['date'].max()}")

    # Save to CSV
    if train_path is not None:
        train_df.to_csv(train_path, index=False)
    if test_path is not None:
        test_df.to_csv(test_path, index=False)

    return train_df, test_df, scaler

# Example usage:
# Assuming df is your preprocessed DataFrame with all features
# With the default arguments this also writes train_data_2024.csv and
# test_data_2025.csv into the current working directory.
train_df, test_df, scaler = split_and_standardize(df)

Training set: 6931044 rows, 2024-04-01 00:00:00 to 2025-03-31 00:00:00
Test set: 1220000 rows, 2025-04-01 00:00:00 to 2025-05-31 00:00:00


In [1]:
import pandas as pd

# Load the data
# Read back the train/test CSV files produced by split_and_standardize
# (their price-related columns are already standardized).
train_data = pd.read_csv(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Github Repository\Airbnb-spatiotemporal\train_data_2024.csv")
test_data = pd.read_csv(r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Github Repository\Airbnb-spatiotemporal\test_data_2025.csv")

# Get first 200 unique listing IDs from train data (in order of first appearance)
first_200_listings = train_data['listing_id'].unique()[:200]

# Filter both dataframes to those listings; the IDs are taken from the train
# set only, so the test subset contains just the ones that also occur there
train_filtered = train_data[train_data['listing_id'].isin(first_200_listings)]
test_filtered = test_data[test_data['listing_id'].isin(first_200_listings)]

# Save filtered data (relative paths: written to the current working directory)
train_filtered.to_csv('train_filtered_200.csv', index=False)
test_filtered.to_csv('test_filtered_200.csv', index=False)

# Print info about the filtered datasets
print("Original datasets:")
print(f"Train: {len(train_data)} rows, {len(train_data['listing_id'].unique())} unique listings")
print(f"Test: {len(test_data)} rows, {len(test_data['listing_id'].unique())} unique listings")

print("\nFiltered datasets:")
print(f"Train: {len(train_filtered)} rows, {len(train_filtered['listing_id'].unique())} unique listings")
print(f"Test: {len(test_filtered)} rows, {len(test_filtered['listing_id'].unique())} unique listings")

Original datasets:
Train: 6931044 rows, 20000 unique listings
Test: 1220000 rows, 20000 unique listings

Filtered datasets:
Train: 66131 rows, 200 unique listings
Test: 12200 rows, 200 unique listings


In [2]:
import os
import glob

# Print current working directory
# Diagnostic cell: shows where relative paths such as 'train_filtered_200.csv' resolve to.
print("Current working directory:", os.getcwd())

# List all files in current directory
# ("*" matches directories too, and skips names starting with a dot)
print("\nFiles in current directory:")
for file in glob.glob("*"):
    print(file)

# List all csv files specifically
print("\nCSV files in current directory:")
for file in glob.glob("*.csv"):
    print(file)

Current working directory: c:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Github Repository\Airbnb-spatiotemporal

Files in current directory:
Baseline
Calender merge.ipynb
coord_scaler.save
Data_Cleaned.ipynb
EDA.ipynb
neural network
price_scaler.save
test_data_2025.csv
test_filtered_200.csv
train_data_2024.csv
train_filtered_200.csv

CSV files in current directory:
test_data_2025.csv
test_filtered_200.csv
train_data_2024.csv
train_filtered_200.csv
