In [1]:
# This notebook is used for preprocessing the data

In [4]:
import pandas as pd
import numpy as np
import h5py
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats



The next cell opens the N-CMAPSS HDF5 file, loads every dataset it contains into a pandas DataFrame (stored in a dictionary keyed by dataset name), and prints each frame's shape, column dtypes, and first few rows. It is a first-look loading and inspection step for predictive-maintenance machine learning on the N-CMAPSS dataset. Note that every dataset is read fully into memory, so the cell needs enough RAM to hold the whole file.

In [13]:
import h5py
import pandas as pd

import os

# Path to the N-CMAPSS HDF5 file. Set the NCMAPSS_H5_PATH environment variable
# to point at a different copy; otherwise the original local path is used.
file_name = os.environ.get(
    'NCMAPSS_H5_PATH',
    '/mnt/e/PdM/DataX/NASA/N-CMAPSS_DS01-005.h5',
)

# Helper: read one named dataset in full and wrap it in a DataFrame
def load_dataset(file, name):
    """Return the complete contents of ``file[name]`` as a pandas DataFrame.

    Parameters
    ----------
    file : mapping-like
        Object indexable by dataset name (here an open ``h5py.File``).
    name : str
        Key of the dataset to read.

    Returns
    -------
    pandas.DataFrame
        Frame built from ``file[name][:]``; column labels are the default
        integer range.
    """
    dataset = file[name]
    values = dataset[:]  # slice with [:] to pull the whole dataset
    return pd.DataFrame(values)

# Open the HDF5 file read-only and load every top-level entry into a DataFrame.
# All datasets are read eagerly, so the whole file ends up in memory.
with h5py.File(file_name, 'r') as file:
    # keys() returns a view backed by the open file; copying it into a list
    # keeps `datasets` usable after the `with` block has closed the file.
    datasets = list(file.keys())
    dataframes = {name: load_dataset(file, name) for name in datasets}

# Report name, shape, dtypes and a preview for every loaded frame.
# The loop variables `name` and `df` stay bound after the loop ends.
separator = "-" * 50  # Separator line
for name, df in dataframes.items():
    print(
        f"DataFrame: {name}",
        f"Shape: {df.shape}",
        f"Data Types:\n{df.dtypes}",
        f"First few rows:\n{df.head()}",
        separator,
        sep="\n",
    )


DataFrame: A_dev
Shape: (4906636, 4)
Data Types:
0    float64
1    float64
2    float64
3    float64
dtype: object
First few rows:
     0    1    2    3
0  1.0  1.0  1.0  1.0
1  1.0  1.0  1.0  1.0
2  1.0  1.0  1.0  1.0
3  1.0  1.0  1.0  1.0
4  1.0  1.0  1.0  1.0
--------------------------------------------------
DataFrame: A_test
Shape: (2735232, 4)
Data Types:
0    float64
1    float64
2    float64
3    float64
dtype: object
First few rows:
     0    1    2    3
0  7.0  1.0  1.0  1.0
1  7.0  1.0  1.0  1.0
2  7.0  1.0  1.0  1.0
3  7.0  1.0  1.0  1.0
4  7.0  1.0  1.0  1.0
--------------------------------------------------
DataFrame: A_var
Shape: (4, 1)
Data Types:
0    |S5
dtype: object
First few rows:
          0
0   b'unit'
1  b'cycle'
2     b'Fc'
3     b'hs'
--------------------------------------------------
DataFrame: T_dev
Shape: (4906636, 10)
Data Types:
0    float64
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8    float64
9    float6

In [None]:
# Summary statistics for `df`, shown as the cell's rich output instead of
# plain printed text.
# NOTE(review): in the cells shown, `df` is only bound as the loop variable of
# the loading cell's print loop, so this describes whichever frame was
# iterated last - confirm that is the intended frame.
df.describe()


In [None]:
# Shapiro-Wilk normality test on one sensor column.
# NOTE(review): 'sensor_column' is a placeholder; the frames printed by the
# loading cell have integer column labels - replace with a real label.
# SciPy documents that the p-value may be inaccurate for N > 5000, so larger
# columns are tested on a reproducible random subsample.
SHAPIRO_MAX_N = 5000
sensor_values = df['sensor_column']
if len(sensor_values) > SHAPIRO_MAX_N:
    sensor_values = sensor_values.sample(n=SHAPIRO_MAX_N, random_state=0)
stats.shapiro(sensor_values)


In [None]:
# Pairwise correlation between the columns of `df`, drawn as an annotated heatmap.
# Only numeric columns are kept so corr() is never handed non-numeric data
# (some frames loaded from the file hold byte strings rather than numbers).
correlation_matrix = df.select_dtypes(include='number').corr()

fig, ax = plt.subplots()
sns.heatmap(correlation_matrix, annot=True, ax=ax)
ax.set_title('Correlation matrix')
plt.show()


In [None]:
# Distribution of the sensor readings: histogram with a KDE curve overlaid.
sns.histplot(x=df['sensor_column'], kde=True)
plt.show()


In [None]:
# Line plot of one sensor signal against time.
# Replace 'time_column' and 'sensor_column' with the names of the columns to plot.
time_column = 'time_column'
sensor_column = 'sensor_column'

fig, ax = plt.subplots()
ax.plot(df[time_column], df[sensor_column])
ax.set_xlabel(time_column)
ax.set_ylabel(sensor_column)
plt.show()


In [None]:
# Box plot of the sensor readings.
sensor_series = df['sensor_column']
sns.boxplot(data=sensor_series)
plt.show()


In [None]:
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

# Draw the autocorrelation plot first, then the partial autocorrelation plot,
# both for the same sensor column.
for draw_correlogram in (plot_acf, plot_pacf):
    draw_correlogram(df['sensor_column'])
plt.show()
