In [11]:
import pandas as pd
import glob
import os

def process_cmapps_file(file_path, fault_mode, condition):
    """
    Process a raw CMAPSS file and return a DataFrame with proper column names and metadata.
    
    Parameters:
      file_path (str): Path to the raw text file.
      fault_mode (str): Fault mode description.
      condition (str): Operating condition description.
      
    Returns:
      pd.DataFrame: Processed DataFrame.
    """
    # Read the file (assumed whitespace-separated with no header)
    df = pd.read_csv(file_path, sep='\s+', header=None)
    
    # Check if file has 26 columns
    if df.shape[1] != 26:
        raise ValueError(f"Expected 26 columns, but found {df.shape[1]} in {file_path}.")
    
    # Define column names based on dataset documentation
    col_names = ['unit', 'cycle'] + ['op_set1', 'op_set2', 'op_set3'] + \
                [f'sensor{i}' for i in range(1, 22)]
    df.columns = col_names
    
    # Add metadata columns
    df['fault_mode'] = fault_mode
    df['condition'] = condition
    # Also add the source file name for traceability
    df['source_file'] = os.path.basename(file_path)
    
    return df

# Define mapping for each dataset file (customize these as needed)
# For demonstration, assume files are named FD001.txt, FD002.txt, etc.
file_info = {
    'test_FD001.txt': {'fault_mode': 'HPC Degradation', 'condition': 'Sea Level'},
    'test_FD002.txt': {'fault_mode': 'HPC Degradation', 'condition': 'SIX'},
    'test_FD003.txt': {'fault_mode': 'HPC & Fan Degradation', 'condition': 'Sea Level'},
    'test_FD004.txt': {'fault_mode': 'HPC & Fan Degradation', 'condition': 'SIX'},
}

# Use glob to find all relevant files in a folder
file_list = glob.glob('*.txt')  # Adjust the pattern or path as needed

# List to hold processed DataFrames
dfs = []

for file in file_list:
    # Use mapping if available; otherwise, set defaults
    info = file_info.get(os.path.basename(file), {'fault_mode': 'Unknown', 'condition': 'Unknown'})
    try:
        df_temp = process_cmapps_file(file, fault_mode=info['fault_mode'], condition=info['condition'])
        dfs.append(df_temp)
    except ValueError as e:
        print(e)

# Concatenate all DataFrames into one
if dfs:
    final_df = pd.concat(dfs, ignore_index=True)
    # Save to CSV
    final_df.to_csv('combined_cmapps_dataset.csv', index=False)
    print("Combined dataset saved as 'combined_cmapps_dataset.csv'.")
else:
    print("No valid files found.")


'utf-8' codec can't decode byte 0x96 in position 707: invalid start byte
Expected 26 columns, but found 1 in RUL_FD001.txt.
Expected 26 columns, but found 1 in RUL_FD002.txt.
Expected 26 columns, but found 1 in RUL_FD003.txt.
Expected 26 columns, but found 1 in RUL_FD004.txt.
Expected 26 columns, but found 1 in x.txt.
Combined dataset saved as 'combined_cmapps_dataset.csv'.


In [12]:
df=pd.read_csv('combined_cmapps_dataset.csv')

In [13]:
df.columns

Index(['unit', 'cycle', 'op_set1', 'op_set2', 'op_set3', 'sensor1', 'sensor2',
       'sensor3', 'sensor4', 'sensor5', 'sensor6', 'sensor7', 'sensor8',
       'sensor9', 'sensor10', 'sensor11', 'sensor12', 'sensor13', 'sensor14',
       'sensor15', 'sensor16', 'sensor17', 'sensor18', 'sensor19', 'sensor20',
       'sensor21', 'fault_mode', 'condition', 'source_file'],
      dtype='object')

In [10]:
import pandas as pd
import glob
import os

def process_cmapps_file(file_path, fault_mode, condition):
    """
    Process a raw CMAPSS file and return a DataFrame with proper column names and metadata.
    
    Parameters:
      file_path (str): Path to the raw text file.
      fault_mode (str): Fault mode description.
      condition (str): Operating condition description.
      
    Returns:
      pd.DataFrame: Processed DataFrame.
    """
    # Read the file (assumed whitespace-separated with no header)
    df = pd.read_csv(file_path, sep='\s+', header=None)
    
    # Check if file has 26 columns
    if df.shape[1] != 26:
        raise ValueError(f"Expected 26 columns, but found {df.shape[1]} in {file_path}.")
    
    # Define column names based on dataset documentation
    col_names = ['unit', 'cycle'] + ['op_set1', 'op_set2', 'op_set3'] + \
                [f'sensor{i}' for i in range(1, 22)]
    df.columns = col_names
    
    # Add metadata columns
    df['fault_mode'] = fault_mode
    df['condition'] = condition
    # Also add the source file name for traceability
    df['source_file'] = os.path.basename(file_path)
    
    return df

# Define mapping for each dataset file (customize these as needed)
# For demonstration, assume files are named FD001.txt, FD002.txt, etc.
file_info = {
    'train_FD001.txt': {'fault_mode': 'HPC Degradation', 'condition': 'Sea Level'},
    'train_FD002.txt': {'fault_mode': 'HPC Degradation', 'condition': 'SIX'},
    'train_FD003.txt': {'fault_mode': 'HPC & Fan Degradation', 'condition': 'Sea Level'},
    'train_FD004.txt': {'fault_mode': 'HPC & Fan Degradation', 'condition': 'SIX'},
}

# Use glob to find all relevant files in a folder
file_list = glob.glob('*.txt')  # Adjust the pattern or path as needed

# List to hold processed DataFrames
dfs = []

for file in file_list:
    # Use mapping if available; otherwise, set defaults
    info = file_info.get(os.path.basename(file), {'fault_mode': 'Unknown', 'condition': 'Unknown'})
    try:
        df_temp = process_cmapps_file(file, fault_mode=info['fault_mode'], condition=info['condition'])
        dfs.append(df_temp)
    except ValueError as e:
        print(e)

# Concatenate all DataFrames into one
if dfs:
    final_df = pd.concat(dfs, ignore_index=True)
    # Save to CSV
    final_df.to_csv('combined_cmapps_training.csv', index=False)
    print("Combined dataset saved as 'combined_cmapps_training.csv'.")
else:
    print("No valid files found.")


'utf-8' codec can't decode byte 0x96 in position 707: invalid start byte
Expected 26 columns, but found 1 in RUL_FD001.txt.
Expected 26 columns, but found 1 in RUL_FD002.txt.
Expected 26 columns, but found 1 in RUL_FD003.txt.
Expected 26 columns, but found 1 in RUL_FD004.txt.
Expected 26 columns, but found 1 in x.txt.
Combined dataset saved as 'combined_cmapps_training.csv'.
