In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split

def _write_id_file(path, ids):
    """Write one ID per line to ``path``, replacing any existing file."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{listing_id}\n" for listing_id in ids)


def create_listing_id_split_files(data_path, output_dir='.', test_size=0.2, random_state=42):
    """
    Split the unique listing IDs of a CSV file into train and test sets and
    write each set to a text file (one ID per line).

    The split is done on unique IDs, so every row belonging to a given
    listing ends up on the same side of the split.

    Parameters:
    -----------
    data_path : str
        Path to the CSV file containing listing data. The IDs are taken from
        the 'listing_id' column, or from 'id' when 'listing_id' is absent.
    output_dir : str
        Directory where train_ids.txt and test_ids.txt will be saved. It is
        created if it does not exist; an empty string means the current
        directory.
    test_size : float
        Proportion of unique listings to be used as test set
    random_state : int
        Random seed for reproducibility

    Returns:
    --------
    tuple or None
        ``(train_listings, test_listings)`` as returned by
        ``train_test_split``, or ``None`` (after printing an error message)
        when the file is missing, cannot be parsed, or has no ID column.
    """
    # Check if the file exists
    if not os.path.exists(data_path):
        print(f"Error: File {data_path} does not exist!")
        return None

    # Read the data. Only the read itself is guarded: pandas parse errors
    # (ParserError, EmptyDataError) and decoding errors are ValueError
    # subclasses, and filesystem problems are OSError.
    print(f"Reading data from {data_path}...")
    try:
        df = pd.read_csv(data_path)
    except (OSError, ValueError) as exc:
        print(f"Error reading {data_path}. Please check the file format. ({exc})")
        return None

    # Prefer 'listing_id' and fall back to 'id'
    if 'listing_id' in df.columns:
        id_column = 'listing_id'
    elif 'id' in df.columns:
        id_column = 'id'
    else:
        print("Error: Neither 'listing_id' nor 'id' column found in the dataset!")
        return None

    # Get unique listing IDs; a missing ID does not identify a listing, so
    # NaN is dropped instead of being written to the files as "nan".
    unique_listings = df[id_column].dropna().unique()
    print(f"Found {len(unique_listings)} unique listings.")

    # Split the unique listing IDs into train and test sets.
    # round() rather than int(): 0.29 * 100 is 28.999... and would print 28.
    test_pct = round(test_size * 100)
    train_pct = 100 - test_pct
    print(f"Creating {train_pct}-{test_pct} train-test split...")
    train_listings, test_listings = train_test_split(
        unique_listings,
        test_size=test_size,
        random_state=random_state
    )

    # Create output directory if it doesn't exist (exist_ok avoids the
    # check-then-create race; '' would make os.makedirs fail).
    output_dir = output_dir or '.'
    os.makedirs(output_dir, exist_ok=True)

    train_path = os.path.join(output_dir, 'train_ids.txt')
    test_path = os.path.join(output_dir, 'test_ids.txt')

    # Save train listing IDs
    print(f"Saving {len(train_listings)} training listing IDs to {train_path}...")
    _write_id_file(train_path, train_listings)

    # Save test listing IDs
    print(f"Saving {len(test_listings)} test listing IDs to {test_path}...")
    _write_id_file(test_path, test_listings)

    print("Done!")
    return train_listings, test_listings

# Example usage
if __name__ == "__main__":
    # You can use either calendar.csv or listings.csv to get unique listing IDs
    data_file = r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\De Ultieme Data\top_price_changers_subset\train.csv"  # Update with your specific path
    output_directory = "."  # Current directory

    split_result = create_listing_id_split_files(
        data_path=data_file,
        output_dir=output_directory,
        test_size=0.2,  # 80-20 split
        random_state=42  # For reproducibility
    )

    # The function returns None (after printing the reason) when the input
    # could not be used; unpacking that directly would raise a TypeError.
    if split_result is None:
        print("No split files were created.")
    else:
        train_listings, test_listings = split_result
        total_listings = len(train_listings) + len(test_listings)

        # Print summary statistics
        print("\nSummary:")
        print(f"Training set: {len(train_listings)} unique listings ({len(train_listings) / total_listings * 100:.1f}%)")
        print(f"Test set: {len(test_listings)} unique listings ({len(test_listings) / total_listings * 100:.1f}%)")

Reading data from C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\De Ultieme Data\top_price_changers_subset\train.csv...
Found 7864 unique listings.
Creating 80-20 train-test split...
Saving 6291 training listing IDs to .\train_ids.txt...
Saving 1573 test listing IDs to .\test_ids.txt...
Done!

Summary:
Training set: 6291 unique listings (80.0%)
Test set: 1573 unique listings (20.0%)
