In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import os

# Enable inline plotting
%matplotlib inline

# Set style for all plots
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")
import os


base_folder = 'C:/Users/mvk/Documents/DATA_school/thesis/data_new'

def load_city_data(base_folder, city, date):
    """Load and preprocess data for a specific city and date."""
    # Use os.path.join to handle path separators correctly
    base_path = os.path.join(base_folder, city, date)
    
    # Print the paths to verify they're correct
    listings_path = os.path.join(base_path, 'listings.csv')
    calendar_path = os.path.join(base_path, 'calendar.csv')
    neighborhoods_path = os.path.join(base_path, 'neighbourhoods.csv')
    
    print(f"Attempting to load from:\n{listings_path}")
    
    # Load data files
    listings = pd.read_csv(listings_path, low_memory=False)
    calendar = pd.read_csv(calendar_path, low_memory=False)
    neighborhoods = pd.read_csv(neighborhoods_path)
    
    return listings, calendar, neighborhoods

city = "paris"
date = "2023-09-04"
try:
    listings, calendar, neighborhoods = load_city_data(base_folder, city, date)
    print("Data loaded successfully!")
except Exception as e:
    print(f"Error loading data: {str(e)}")
    print(f"Current working directory: {os.getcwd()}")

Attempting to load from:
C:/Users/mvk/Documents/DATA_school/thesis/data_new\paris\2023-09-04\listings.csv
Data loaded successfully!


In [7]:
# Import required libraries
import pandas as pd
from pathlib import Path

# Function to merge calendar data
def merge_calendar_data(base_path, city):
    """
    Merge calendar data from different quarterly snapshots into one comprehensive dataset.
    Keep only listing_id, date, and price columns.
    
    Parameters:
    base_path (str): Path to the root directory containing city folders
    city (str): Name of the city to process (e.g., 'paris', 'amsterdam')
    
    Returns:
    pd.DataFrame: Merged calendar data with selected columns
    """
    # Initialize empty list to store dataframes
    dfs = []
    
    # Get all date folders for the specified city
    city_path = Path(base_path) / city
    date_folders = sorted([d for d in city_path.iterdir() if d.is_dir()])
    
    print(f"Processing {len(date_folders)} folders for {city}...")
    
    for date_folder in date_folders:
        calendar_file = date_folder / 'calendar.csv'
        
        if calendar_file.exists():
            # Read the calendar file
            df = pd.read_csv(calendar_file)
            print(f"Reading data from {date_folder.name}: {df.shape[0]} rows")
            
            # Keep only required columns
            df = df[['listing_id', 'date', 'price']]
            
            # Convert date column to datetime
            df['date'] = pd.to_datetime(df['date'])
            
            # Clean price column
            df['price'] = df['price'].str.replace('$', '').str.replace(',', '').astype(float)
            
            # Add source date (when the data was scraped)
            df['source_date'] = pd.to_datetime(date_folder.name)
            
            dfs.append(df)
    
    print("\nMerging dataframes...")
    # Concatenate all dataframes
    merged_df = pd.concat(dfs, ignore_index=True)
    
    # Sort by listing_id and date
    merged_df = merged_df.sort_values(['listing_id', 'date', 'source_date'])
    
    # For each listing_id and date combination, keep the row with the latest source_date
    merged_df = merged_df.drop_duplicates(subset=['listing_id', 'date'], keep='last')
    
    # Drop the source_date column as it's no longer needed
    merged_df = merged_df.drop('source_date', axis=1)
    
    # Sort the final dataset
    merged_df = merged_df.sort_values(['listing_id', 'date'])
    
    return merged_df

# Set the base path and city
city = "paris"   # Can be changed to any city in the dataset

# Merge the calendar data
merged_calendar = merge_calendar_data(base_folder, city)

# Basic information about the merged dataset
print("\nMerged Dataset Information:")
print("-" * 30)
print(f"Total rows: {merged_calendar.shape[0]}")
print(f"Total columns: {merged_calendar.shape[1]}")
print(f"Date range: {merged_calendar['date'].min()} to {merged_calendar['date'].max()}")
print(f"Number of unique listings: {merged_calendar['listing_id'].nunique()}")

# Display the first few rows of the merged dataset
print("\nFirst few rows of the merged dataset:")
display(merged_calendar.head())

# Save the merged data
output_filename = f"{city}_merged_calendar.csv"
merged_calendar.to_csv(output_filename, index=False)
print(f"\nMerged calendar data saved to {output_filename}")

Processing 6 folders for paris...
Reading data from 2023-06-06: 22521650 rows
Reading data from 2023-09-04: 24798862 rows


  df = pd.read_csv(calendar_file)


Reading data from 2023-12-12: 27134477 rows


  df = pd.read_csv(calendar_file)


Reading data from 2024-03-16: 30804889 rows


  df = pd.read_csv(calendar_file)


Reading data from 2024-06-10: 34997121 rows


  df = pd.read_csv(calendar_file)


Reading data from 2024-09-06: 34842829 rows

Merging dataframes...

Merged Dataset Information:
------------------------------
Total rows: 79418789
Total columns: 3
Date range: 2023-06-07 00:00:00 to 2025-09-12 00:00:00
Number of unique listings: 128063

First few rows of the merged dataset:


Unnamed: 0,listing_id,date,price
22521650,3109,2023-09-05,110.0
22521651,3109,2023-09-06,110.0
22521652,3109,2023-09-07,110.0
22521653,3109,2023-09-08,110.0
22521654,3109,2023-09-09,110.0



Merged calendar data saved to paris_merged_calendar.csv
