In [1]:
import os
import glob
import pandas as pd
import re # For more robust number extraction

In [2]:
def combine_log_files(data_folder_path: str) -> pd.DataFrame:
    """
    Combine every LOG_<number>.TXT file of a folder into a single DataFrame.

    Regular files whose names start with "LOG_" and end with ".TXT" (both
    compared case-insensitively) are collected, ordered by the integer in
    their name (LOG_2.TXT before LOG_10.TXT), read with ``pd.read_csv`` and
    concatenated under a fresh 0..n-1 index. Progress, warnings and errors
    are reported with ``print``; unreadable or empty files are skipped.

    Args:
        data_folder_path (str): The path to the folder containing the .txt
            log files. The path is taken literally, so characters that are
            special to ``glob`` (``[``, ``]``, ``*``, ``?``) are allowed.

    Returns:
        pd.DataFrame: A DataFrame containing all combined data, or an empty
                      DataFrame if the folder does not exist, no suitable
                      files are found, or none of them could be read.
    """
    if not os.path.isdir(data_folder_path):
        print(f"Error: Folder '{data_folder_path}' not found.")
        return pd.DataFrame()

    # Escape the folder part so that only the trailing "*" acts as a wildcard;
    # otherwise a folder such as "logs[1]" would silently match nothing.
    all_entries_in_folder = glob.glob(
        os.path.join(glob.escape(data_folder_path), "*")
    )

    # The extension check is done on the upper-cased name because glob's own
    # case sensitivity differs between operating systems. Directories that
    # merely look like log files are ignored.
    txt_files = []
    for f_path in all_entries_in_folder:
        upper_name = os.path.basename(f_path).upper()
        if (
            upper_name.startswith("LOG_")
            and upper_name.endswith(".TXT")
            and os.path.isfile(f_path)
        ):
            txt_files.append(f_path)

    if not txt_files:
        print(f"No LOG_*.TXT files found in '{data_folder_path}'.")
        return pd.DataFrame()

    # Sort files based on the number in their name (e.g., LOG_1.TXT, LOG_2.TXT, LOG_10.TXT)
    def sort_key(filepath):
        filename = os.path.basename(filepath)
        # The whole name must be LOG_<digits>.TXT; anchoring avoids picking a
        # number out of names such as "LOG_1.TXT.TXT".
        match = re.fullmatch(r'LOG_(\d+)\.TXT', filename, re.IGNORECASE)
        if match:
            # The filename is the tie-breaker (e.g. LOG_1.TXT vs LOG_01.TXT).
            return (0, int(match.group(1)), filename)
        # Names without a number go last, in alphabetical order, so the
        # result never depends on the order glob happened to return.
        return (1, 0, filename)

    sorted_files = sorted(txt_files, key=sort_key)

    print("Files to be processed in order:")
    for f in sorted_files:
        print(f"  - {os.path.basename(f)}")

    all_dataframes = []
    for file_path in sorted_files:
        try:
            # Assuming the first line is always the header
            df = pd.read_csv(file_path)
            all_dataframes.append(df)
            print(f"Successfully read and processed: {os.path.basename(file_path)}")
        except pd.errors.EmptyDataError:
            print(f"Warning: File {os.path.basename(file_path)} is empty and will be skipped.")
        except Exception as e:
            # Best effort: one bad file must not abort the whole import.
            print(f"Error reading file {os.path.basename(file_path)}: {e}")

    if not all_dataframes:
        print("No data could be read from the files.")
        return pd.DataFrame()

    # ignore_index=True gives the combined frame a new unique index; columns
    # stored inside the files (such as a per-file 'Log_Index' counter) are
    # left exactly as they were read.
    combined_df = pd.concat(all_dataframes, ignore_index=True)

    return combined_df

In [3]:
    # Folder (relative to the notebook's working directory) holding the logs.
    input_folder = "Data_In"

    combined_data = combine_log_files(input_folder)

    # Handle the "nothing loaded" case first, then summarise the result.
    if combined_data.empty:
        print("No data was combined.")
    else:
        print("\n--- Combined DataFrame ---")
        print("Shape:", combined_data.shape)

        # First and last rows of the combined frame.
        print("\nHead:")
        print(combined_data.head())
        print("\nTail:")
        print(combined_data.tail())

        # DataFrame.info() writes its report to stdout itself.
        print("\nInfo:")
        combined_data.info()

        # Optional: persist the combined data as a new CSV file.
        # output_csv_path = "combined_logs.csv"
        # combined_data.to_csv(output_csv_path, index=False)
        # print(f"\nCombined data saved to {output_csv_path}")

Files to be processed in order:
  - LOG_18.TXT
  - LOG_19.TXT
Successfully read and processed: LOG_18.TXT
Successfully read and processed: LOG_19.TXT

--- Combined DataFrame ---
Shape: (37434, 23)

Head:
   Log_Index  CPU_TempC  BMP_TempC  BMP_PressPa  CO2_ppm  SCD30_TempC  \
0          0      34.42      25.24     99251.11        0        28.27   
1          1      34.42      25.24     99251.11        0        28.27   
2          2      35.10      25.24     99251.11      637        28.23   
3          3      34.42      25.24     99251.11      637        28.23   
4          4      34.42      25.24     99251.12      724        28.20   

   SCD30_Hum_%  SHT_TempC  SHT_Hum_%  GPS_Lat  ...  GPS_Month  GPS_Day  \
0        33.64      25.29      28.25      0.0  ...          0        0   
1        33.64      25.36      28.37      0.0  ...          0        0   
2        33.36      25.43      28.47      0.0  ...          0        0   
3        33.36      25.45      28.50      0.0  ...          0