In [1]:
# This notebook is used for preprocessing the data

### Importing Libraries
This cell imports the Python libraries used for data analysis and visualization. `pandas` and `NumPy` are used for data manipulation, `h5py` for reading HDF5 files, and `matplotlib` and `seaborn` for plotting. `scipy.stats` provides the statistical tests used during data exploration.

In [1]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats



### Loading and Reading the Data
The Python script efficiently loads multiple datasets from an HDF5 file, crucial for machine learning in predictive maintenance. It handles development and test sets, including operational settings, sensor readings, and RUL values. This structured data loading approach is vital for analysis and model training in aerospace prognostics.

In [15]:


# Location of the N-CMAPSS HDF5 file. NOTE(review): this is an absolute,
# machine-specific path; adjust it to wherever the dataset lives locally.
file_name = '/mnt/e/PdM/DataX/NASA/N-CMAPSS_DS01-005.h5'


def _read_dataset(hdf, key):
    """Return the HDF5 dataset stored under ``key`` as a NumPy array.

    ``hdf[key]`` raises ``KeyError`` when the dataset is missing. The previous
    ``hdf.get(key)`` returned ``None`` in that case, and ``np.array(None)``
    silently produced a 0-d object array that only failed much later.
    """
    return np.array(hdf[key])


def _read_names(hdf, key):
    """Return the variable names stored under ``key`` as a list of strings.

    The raw name array is converted to a unicode array (at most 20 characters
    per name) and then to a plain list, ready to be used as column labels.
    """
    return list(np.array(_read_dataset(hdf, key), dtype='U20'))


# Load data
with h5py.File(file_name, 'r') as hdf:
    # Development set
    W_dev = _read_dataset(hdf, 'W_dev')        # W
    X_s_dev = _read_dataset(hdf, 'X_s_dev')    # X_s
    X_v_dev = _read_dataset(hdf, 'X_v_dev')    # X_v
    T_dev = _read_dataset(hdf, 'T_dev')        # T
    Y_dev = _read_dataset(hdf, 'Y_dev')        # RUL
    A_dev = _read_dataset(hdf, 'A_dev')        # Auxiliary

    # Test set
    W_test = _read_dataset(hdf, 'W_test')      # W
    X_s_test = _read_dataset(hdf, 'X_s_test')  # X_s
    X_v_test = _read_dataset(hdf, 'X_v_test')  # X_v
    T_test = _read_dataset(hdf, 'T_test')      # T
    Y_test = _read_dataset(hdf, 'Y_test')      # RUL
    A_test = _read_dataset(hdf, 'A_test')      # Auxiliary

    # Variable names, converted from NumPy arrays to lists of strings
    W_var = _read_names(hdf, 'W_var')
    X_s_var = _read_names(hdf, 'X_s_var')
    X_v_var = _read_names(hdf, 'X_v_var')
    T_var = _read_names(hdf, 'T_var')
    A_var = _read_names(hdf, 'A_var')


### Building Dataframes
This script converts numpy arrays (`W_dev`, `X_s_dev`, `X_v_dev`, `T_dev`, `Y_dev`, `A_dev`, and their test counterparts) into pandas DataFrames, assigning appropriate column names for each. It's an essential step in organizing and preparing the data for further analysis and machine learning modeling in predictive maintenance.

In [23]:
# Build one DataFrame per variable group. The development and test arrays of a
# group share the same column names, so each pair is created together.
W_dev_df, W_test_df = (pd.DataFrame(values, columns=W_var) for values in (W_dev, W_test))
X_s_dev_df, X_s_test_df = (pd.DataFrame(values, columns=X_s_var) for values in (X_s_dev, X_s_test))
X_v_dev_df, X_v_test_df = (pd.DataFrame(values, columns=X_v_var) for values in (X_v_dev, X_v_test))
T_dev_df, T_test_df = (pd.DataFrame(values, columns=T_var) for values in (T_dev, T_test))
A_dev_df, A_test_df = (pd.DataFrame(values, columns=A_var) for values in (A_dev, A_test))

# The target arrays carry no stored names; label their single column 'RUL'.
Y_dev_df, Y_test_df = (pd.DataFrame(values, columns=['RUL']) for values in (Y_dev, Y_test))


### Creating Development and Test Dataframes
This cell concatenates the individual feature DataFrames column-wise into `train_df` (development set) and `test_df` (test set), which are used for developing and evaluating machine learning models in predictive maintenance.

In [25]:
# Concatenate the development datasets to create the training DataFrame
# Training frame: the development-set groups placed side by side, target last.
train_df = pd.concat(
    (W_dev_df, X_s_dev_df, X_v_dev_df, T_dev_df, A_dev_df, Y_dev_df),
    axis="columns",
)

# Test frame: same group order as the training frame.
test_df = pd.concat(
    (W_test_df, X_s_test_df, X_v_test_df, T_test_df, A_test_df, Y_test_df),
    axis="columns",
)


Shape and Size of DataFrames

In [30]:
# Shape (rows, columns) of the train and test DataFrames
for label, frame in (
    ("Train DataFrame Shape:", train_df),
    ("Test DataFrame Shape:", test_df),
):
    print(label, frame.shape)


Train DataFrame Shape: (4906636, 47)
Test DataFrame Shape: (2735232, 47)


Data Types in Each Column

In [31]:
# Data type of every column, for both frames
for label, frame in (
    ("Data Types in Train DataFrame:\n", train_df),
    ("Data Types in Test DataFrame:\n", test_df),
):
    print(label, frame.dtypes)


Data Types in Train DataFrame:
 alt             float64
Mach            float64
TRA             float64
T2              float64
T24             float64
T30             float64
T48             float64
T50             float64
P15             float64
P2              float64
P21             float64
P24             float64
Ps30            float64
P40             float64
P50             float64
Nf              float64
Nc              float64
Wf              float64
T40             float64
P30             float64
P45             float64
W21             float64
W22             float64
W25             float64
W31             float64
W32             float64
W48             float64
W50             float64
SmFan           float64
SmLPC           float64
SmHPC           float64
phi             float64
fan_eff_mod     float64
fan_flow_mod    float64
LPC_eff_mod     float64
LPC_flow_mod    float64
HPC_eff_mod     float64
HPC_flow_mod    float64
HPT_eff_mod     float64
HPT_flow_mod    float64
LPT_eff_

Descriptive Statistics

In [32]:
# Descriptive statistics (count, mean, std, min, quartiles, max) per column
for label, frame in (
    ("Train DataFrame Statistics:\n", train_df),
    ("Test DataFrame Statistics:\n", test_df),
):
    print(label, frame.describe())


Train DataFrame Statistics:
                 alt          Mach           TRA            T2           T24  \
count  4.906636e+06  4.906636e+06  4.906636e+06  4.906636e+06  4.906636e+06   
mean   1.568371e+04  5.384156e-01  6.059576e+01  4.902042e+02  5.696602e+02   
std    8.007308e+03  1.194006e-01  1.840391e+01  1.960585e+01  2.085329e+01   
min    3.001000e+03  3.150000e-04  2.355452e+01  4.213779e+02  4.841972e+02   
25%    9.206000e+03  4.440240e-01  4.684537e+01  4.744894e+02  5.548824e+02   
50%    1.465400e+04  5.481000e-01  6.539016e+01  4.940213e+02  5.672901e+02   
75%    2.235100e+04  6.383160e-01  7.707953e+01  5.063441e+02  5.831270e+02   
max    3.503300e+04  7.492590e-01  8.876890e+01  5.343834e+02  6.340001e+02   

                T30           T48           T50           P15            P2  \
count  4.906636e+06  4.906636e+06  4.906636e+06  4.906636e+06  4.906636e+06   
mean   1.330678e+03  1.640812e+03  1.129691e+03  1.293220e+01  1.008869e+01   
std    6.813023e+01  1

Checking for Null Values

In [33]:
# Number of missing values per column, for both frames
for label, frame in (
    ("Null Values in Train DataFrame:\n", train_df),
    ("Null Values in Test DataFrame:\n", test_df),
):
    print(label, frame.isna().sum())


Null Values in Train DataFrame:
 alt             0
Mach            0
TRA             0
T2              0
T24             0
T30             0
T48             0
T50             0
P15             0
P2              0
P21             0
P24             0
Ps30            0
P40             0
P50             0
Nf              0
Nc              0
Wf              0
T40             0
P30             0
P45             0
W21             0
W22             0
W25             0
W31             0
W32             0
W48             0
W50             0
SmFan           0
SmLPC           0
SmHPC           0
phi             0
fan_eff_mod     0
fan_flow_mod    0
LPC_eff_mod     0
LPC_flow_mod    0
HPC_eff_mod     0
HPC_flow_mod    0
HPT_eff_mod     0
HPT_flow_mod    0
LPT_eff_mod     0
LPT_flow_mod    0
unit            0
cycle           0
Fc              0
hs              0
RUL             0
dtype: int64
Null Values in Test DataFrame:
 alt             0
Mach            0
TRA             0
T2              0
T24 

In [None]:
# `df` was never defined in this notebook, so the original cell raised
# NameError on a fresh kernel. Summarise the training frame instead; the
# transpose gives every column its own row, which is easier to scan.
train_df.describe().T


In [None]:
# Shapiro-Wilk normality test on one column of the training frame.
# `df` / 'sensor_column' were unresolved placeholders; 'T24' is used as an
# example column -- swap in any other column of train_df.
SHAPIRO_COLUMN = 'T24'
# SciPy documents that the p-value may be inaccurate for N > 5000, so the test
# runs on a reproducible random sample rather than on all rows.
SHAPIRO_MAX_N = 5000
shapiro_sample = train_df[SHAPIRO_COLUMN].sample(
    n=min(SHAPIRO_MAX_N, len(train_df)),
    random_state=0,
)
stats.shapiro(shapiro_sample)


In [None]:
# Pairwise correlation between all columns of the training frame.
# (`df` was an undefined placeholder; train_df is the frame built above.)
correlation_matrix = train_df.corr()

# With this many columns, per-cell annotations are unreadable, so the heatmap
# relies on colour alone; the colour scale is fixed to the full [-1, 1] range.
fig, ax = plt.subplots(figsize=(16, 13))
sns.heatmap(correlation_matrix, cmap='coolwarm', vmin=-1, vmax=1, center=0, ax=ax)
ax.set_title('Correlation matrix of train_df columns')
plt.show()


In [None]:
# Distribution of one column of the training frame.
# `df` / 'sensor_column' were unresolved placeholders; 'T24' is used as an
# example column -- swap in any other column of train_df.
HIST_COLUMN = 'T24'
# A reproducible subsample keeps the KDE fit fast on a frame with millions of rows.
HIST_MAX_N = 100_000
hist_sample = train_df[HIST_COLUMN].sample(
    n=min(HIST_MAX_N, len(train_df)),
    random_state=0,
)

fig, ax = plt.subplots()
sns.histplot(hist_sample, kde=True, ax=ax)
ax.set(title=f'Distribution of {HIST_COLUMN} (random subsample)', xlabel=HIST_COLUMN)
plt.show()


In [None]:
# Trend of one column over time for a single unit.
# `df`, 'time_column' and 'sensor_column' were unresolved placeholders; replace
# the names below with the columns of interest.
# NOTE(review): assumes 'unit' identifies one engine and 'cycle' orders its
# observations in time -- confirm against the dataset documentation.
TREND_COLUMN = 'T24'
trend_unit = train_df['unit'].iloc[0]
trend = (
    train_df.loc[train_df['unit'] == trend_unit]
    .groupby('cycle')[TREND_COLUMN]
    .mean()  # many rows share one cycle value; plot their mean per cycle
)

fig, ax = plt.subplots()
ax.plot(trend.index, trend.to_numpy())
ax.set(
    title=f'Mean {TREND_COLUMN} per cycle (unit {trend_unit})',
    xlabel='cycle',
    ylabel=TREND_COLUMN,
)
plt.show()


In [None]:
# Box plot of one column of the training frame, to eyeball spread and outliers.
# `df` / 'sensor_column' were unresolved placeholders; 'T24' is used as an
# example column -- swap in any other column of train_df.
BOX_COLUMN = 'T24'

fig, ax = plt.subplots()
sns.boxplot(x=train_df[BOX_COLUMN], ax=ax)
ax.set(title=f'Box plot of {BOX_COLUMN}', xlabel=BOX_COLUMN)
plt.show()


In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Autocorrelation and partial autocorrelation of one column for a single unit.
# `df` / 'sensor_column' were unresolved placeholders; 'T24' is used as an
# example column -- swap in any other column of train_df.
# NOTE(review): assumes 'unit' identifies one engine and 'cycle' orders its
# observations in time -- confirm against the dataset documentation.
ACF_COLUMN = 'T24'
acf_unit = train_df['unit'].iloc[0]
acf_series = (
    train_df.loc[train_df['unit'] == acf_unit]
    .groupby('cycle')[ACF_COLUMN]
    .mean()  # one value per cycle, so the series has a single time axis
)

# The number of lags is kept below half the series length for the PACF plot.
acf_lags = max(1, min(20, len(acf_series) // 2 - 1))

plot_acf(acf_series, lags=acf_lags)
plot_pacf(acf_series, lags=acf_lags)
plt.show()
