In [2]:
import pandas as pd
import sys
import os

# Folder containing the text files
data_folder = "Fold5/"
train_file = "train.txt"
output_csv = "Letor_train_fold5.csv"  # Output CSV file

# LETOR datasets typically have 136 features + 2 extra columns (label, query ID)
num_features = 136
column_names = ["relevance", "query_id"] + [f"feature_{i}" for i in range(1, num_features + 1)]

# Define chunk size and maximum rows to process
chunk_size = 50000   
nrows = 3_000_000  # Stop processing after nrows (Put > 2,263,154M if you want to read entire training set)
processed_rows = 0

# Open CSV file to write in chunks
with open(output_csv, "w") as output_file:
    # Write header
    output_file.write(",".join(column_names) + "\n")

    # Read the train file in chunks
    print("\nStarting to process in chunks...")
    for chunk in pd.read_csv(
        os.path.join(data_folder, train_file),
        names=column_names,
        delim_whitespace=True,
        dtype=str,  # Read everything as string initially
        low_memory=False,
        chunksize=chunk_size,  # Read in chunks
    ):
        # Remove "qid:" prefix from `query_id`
        chunk["query_id"] = chunk["query_id"].str.replace("qid:", "", regex=True).astype(int)

        # Remove feature index prefixes (e.g., "7:0.5" → "0.5")
        for col in column_names[2:]:  # Skip relevance & query_id
            chunk[col] = chunk[col].str.split(":").str[1].astype(float)

        # Append chunk to CSV
        chunk.to_csv(output_file, mode="a", index=False, header=False)

        # Update processed rows count
        processed_rows += len(chunk)

        # Print real-time progress
        sys.stdout.write(f"\rProcessed {processed_rows:,} rows out of 2,263,154...")
        sys.stdout.flush()

        # Stop processing if we exceed max_rows
        if processed_rows >= nrows:
            print("\nReached nrows limit. Stopping early.")
            break

# Final check
print("\nProcessing complete.")
print(f"CSV saved as: {output_csv}")



Starting to process in chunks...


  for chunk in pd.read_csv(


Processed 2,263,154 rows out of 2,263,154...
Processing complete.
CSV saved as: Letor_train_fold5.csv


In [3]:
import pandas as pd
import sys
import os

# Folder containing the text files
data_folder = "Fold5/"
vali_file = "vali.txt"
output_csv = "Letor_vali_fold5.csv"  # Output CSV file

# LETOR datasets typically have 136 features + 2 extra columns (label, query ID)
num_features = 136
column_names = ["relevance", "query_id"] + [f"feature_{i}" for i in range(1, num_features + 1)]

# Define chunk size and maximum rows to process
chunk_size = 50000   
nrows = 1_000_000  # Stop processing after nrows (Set > 760,752 to read the entire validation set)
processed_rows = 0

# Open CSV file to write in chunks
with open(output_csv, "w") as output_file:
    # Write header
    output_file.write(",".join(column_names) + "\n")

    # Read the validation file in chunks
    print("\nStarting to process validation data in chunks...")
    for chunk in pd.read_csv(
        os.path.join(data_folder, vali_file),
        names=column_names,
        delim_whitespace=True,
        dtype=str,  # Read everything as string initially
        low_memory=False,
        chunksize=chunk_size,  # Read in chunks
    ):
        # Remove "qid:" prefix from `query_id`
        chunk["query_id"] = chunk["query_id"].str.replace("qid:", "", regex=True).astype(int)

        # Remove feature index prefixes (e.g., "7:0.5" → "0.5")
        for col in column_names[2:]:  # Skip relevance & query_id
            chunk[col] = chunk[col].str.split(":").str[1].astype(float)

        # Append chunk to CSV
        chunk.to_csv(output_file, mode="a", index=False, header=False)

        # Update processed rows count
        processed_rows += len(chunk)

        # Print real-time progress
        sys.stdout.write(f"\rProcessed {processed_rows:,} rows out of 760,752...")
        sys.stdout.flush()

        # Stop processing if we exceed max_rows
        if processed_rows >= nrows:
            print("\nReached nrows limit. Stopping early.")
            break

# Final check
print("\nProcessing complete.")
print(f"Validation CSV saved as: {output_csv}")



Starting to process validation data in chunks...


  for chunk in pd.read_csv(


Processed 760,753 rows out of 760,752...
Processing complete.
Validation CSV saved as: Letor_vali_fold5.csv


In [1]:
import pandas as pd
import sys
import os

# Folder containing the text files
data_folder = "Fold5/"
test_file = "test.txt"
output_csv = "Letor_test_fold5.csv"  # Output CSV file

# LETOR datasets typically have 136 features + 2 extra columns (label, query ID)
num_features = 136
column_names = ["relevance", "query_id"] + [f"feature_{i}" for i in range(1, num_features + 1)]

# Define chunk size and maximum rows to process
chunk_size = 50000   
nrows = 1_000_000  # Stop processing after nrows (Set > 747,217 to read the entire test set)
processed_rows = 0

# Open CSV file to write in chunks
with open(output_csv, "w") as output_file:
    # Write header
    output_file.write(",".join(column_names) + "\n")

    # Read the test file in chunks
    print("\nStarting to process test data in chunks...")
    for chunk in pd.read_csv(
        os.path.join(data_folder, test_file),
        names=column_names,
        delim_whitespace=True,
        dtype=str,  # Read everything as string initially
        low_memory=False,
        chunksize=chunk_size,  # Read in chunks
    ):
        # Remove "qid:" prefix from `query_id`
        chunk["query_id"] = chunk["query_id"].str.replace("qid:", "", regex=True).astype(int)

        # Remove feature index prefixes (e.g., "7:0.5" → "0.5")
        for col in column_names[2:]:  # Skip relevance & query_id
            chunk[col] = chunk[col].str.split(":").str[1].astype(float)

        # Append chunk to CSV
        chunk.to_csv(output_file, mode="a", index=False, header=False)

        # Update processed rows count
        processed_rows += len(chunk)

        # Print real-time progress
        sys.stdout.write(f"\rProcessed {processed_rows:,} rows out of 747,217...")
        sys.stdout.flush()

        # Stop processing if we exceed max_rows
        if processed_rows >= nrows:
            print("\nReached nrows limit. Stopping early.")
            break

# Final check
print("\nProcessing complete.")
print(f"Test CSV saved as: {output_csv}")



Starting to process test data in chunks...


  for chunk in pd.read_csv(


Processed 747,218 rows out of 747,217...
Processing complete.
Test CSV saved as: Letor_test_fold5.csv
