In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Configure plot appearance for the rest of the notebook.
# seaborn's 'whitegrid' style is applied first, then matplotlib's
# "fivethirtyeight" style sheet is applied afterwards.
# NOTE(review): because the matplotlib style sheet is applied last, it
# presumably wins wherever both styles set the same option -- confirm that
# the resulting look is the intended one.
sns.set_style('whitegrid')
plt.style.use("fivethirtyeight")
# IPython magic (not plain Python): show matplotlib figures inline in the
# notebook output.
%matplotlib inline

In [3]:
# Column schema for the raw file: the unit/cycle columns, three operational
# settings, then sensors 1 through 21 (26 names in total).
column_names = [
    'unit_number',
    'time_in_cycles',
    *(f'op_setting_{idx}' for idx in range(1, 4)),
    *(f'sensor_{idx}' for idx in range(1, 22)),
]

# Read the FD001 training file. It is parsed without a header row, and fields
# are split on runs of whitespace (raw-string regex avoids a SyntaxWarning).
df_train = pd.read_csv(
    '../data/raw/C-MAPSS/train_FD001.txt',
    names=column_names,
    header=None,
    sep=r'\s+',
)

# Preview the first five rows
df_train.head()

Unnamed: 0,unit_number,time_in_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,...,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


In [4]:
# Structural overview: column dtypes and non-null counts
print("DataFrame Info:")
df_train.info()

# Blank lines to separate the two reports in the cell output
print("\n\n")

# Summary statistics, flipped so that each dataframe column becomes a row
print("Descriptive Statistics:")
df_train.describe().T

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   unit_number     20631 non-null  int64  
 1   time_in_cycles  20631 non-null  int64  
 2   op_setting_1    20631 non-null  float64
 3   op_setting_2    20631 non-null  float64
 4   op_setting_3    20631 non-null  float64
 5   sensor_1        20631 non-null  float64
 6   sensor_2        20631 non-null  float64
 7   sensor_3        20631 non-null  float64
 8   sensor_4        20631 non-null  float64
 9   sensor_5        20631 non-null  float64
 10  sensor_6        20631 non-null  float64
 11  sensor_7        20631 non-null  float64
 12  sensor_8        20631 non-null  float64
 13  sensor_9        20631 non-null  float64
 14  sensor_10       20631 non-null  float64
 15  sensor_11       20631 non-null  float64
 16  sensor_12       20631 non-null  float64
 17  sensor_13      

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_number,20631.0,51.506568,29.22763,1.0,26.0,52.0,77.0,100.0
time_in_cycles,20631.0,108.807862,68.88099,1.0,52.0,104.0,156.0,362.0
op_setting_1,20631.0,-9e-06,0.002187313,-0.0087,-0.0015,0.0,0.0015,0.0087
op_setting_2,20631.0,2e-06,0.0002930621,-0.0006,-0.0002,0.0,0.0003,0.0006
op_setting_3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
sensor_1,20631.0,518.67,6.537152e-11,518.67,518.67,518.67,518.67,518.67
sensor_2,20631.0,642.680934,0.5000533,641.21,642.325,642.64,643.0,644.53
sensor_3,20631.0,1590.523119,6.13115,1571.04,1586.26,1590.1,1594.38,1616.91
sensor_4,20631.0,1408.933782,9.000605,1382.25,1402.36,1408.04,1414.555,1441.49
sensor_5,20631.0,14.62,3.3947e-12,14.62,14.62,14.62,14.62,14.62


In [None]:
# Add an 'RUL' column (presumably "Remaining Useful Life"): for every row,
# the largest time_in_cycles seen for that unit minus the row's own
# time_in_cycles, so each unit's final recorded cycle gets RUL == 0.

# Per-unit summary table (one row per unit) of the largest cycle count.
# Not needed for the RUL computation below; kept because it is a handy
# summary and other cells may refer to it.
max_cycles = (
    df_train.groupby('unit_number')['time_in_cycles']
    .max()
    .rename('max_cycles')
    .reset_index()
)

# transform('max') returns the per-unit maximum aligned to every original row,
# so no merge, temporary column, or in-place drop is required. assign() builds
# a new frame, which keeps this cell safe to re-run (an existing 'RUL' column
# is simply recomputed).
df_train = df_train.assign(
    RUL=lambda frame: (
        frame.groupby('unit_number')['time_in_cycles'].transform('max')
        - frame['time_in_cycles']
    )
)

# Show the last 5 rows of unit 1 to verify that RUL counts down to 0
print("Data for Unit #1, showing the newly calculated RUL column:")
df_train.loc[df_train['unit_number'] == 1].tail()