In [1]:
import sys

In [2]:
# Make the project's Parquet helpers importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
PARQUET_SRC = '../src/parquet'
if PARQUET_SRC not in sys.path:
    sys.path.append(PARQUET_SRC)

In [3]:
from load_parquet import load_parquet_data

In [4]:
# Print the loader's docstring to check its arguments, return type and the errors it can raise.
help(load_parquet_data)

Help on function load_parquet_data in module load_parquet:

load_parquet_data(folder_path: str, file_name: str) -> pandas.core.frame.DataFrame
    Load aggregated data from a Parquet file and return it as a DataFrame.

    Args:
        folder_path (Path | str): Path to the folder containing the file.
        file_name (str): Name of the Parquet file containing aggregated data.

    Returns:
        pd.DataFrame: DataFrame containing the training data.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the file is empty, contains null values, or has invalid data.
        PermissionError: If the file is not readable.



In [5]:
# Name and (relative) folder of the processed training set.
file_name = 'training_data.parquet'
folder = '../data/processed/'

# Read the Parquet file into a DataFrame; the loader logs the path it reads
# and the shape of the loaded data.
df = load_parquet_data(folder_path=folder, file_name=file_name)

2025-07-27 20:02:26,409 - INFO - load_parquet -31 - Loading data from ../data/processed/training_data.parquet
2025-07-27 20:02:26,411 - INFO - load_parquet -50 - File ../data/processed/training_data.parquet exists and is readable. Proceeding with loading data.
2025-07-27 20:02:26,556 - INFO - load_parquet -64 - Data loaded for training from ../data/processed/training_data.parquet with shape (82, 10)


In [6]:
# Preview the first five rows of the loaded training data.
# NOTE(review): in every row shown below sensor1_min is larger than sensor1_max —
# confirm the upstream aggregation / column naming for sensor1.
df.head()

Unnamed: 0,time,TO,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
0,1743532199,10540337,65.83,65.27,10.0,98.0,55.59,12.0,99.0,54.11
1,1743618599,10540337,76.09,54.77,10.0,98.0,57.7,10.0,99.0,51.68
2,1743704999,10540337,67.66,66.12,10.0,99.0,50.29,10.0,98.0,53.25
3,1743791399,10540337,83.22,80.61,10.0,99.0,58.97,11.0,98.0,49.92
4,1743877799,10540337,78.29,66.14,10.0,99.0,59.6,10.0,99.0,61.02


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for every column of df.
df.describe()

Unnamed: 0,time,TO,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
count,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0
mean,1745394000.0,10397780.0,64.194756,62.583293,10.470366,98.295244,55.406951,10.028902,98.610488,54.18939
std,914265.3,229726.0,18.550563,18.324626,0.6327,0.778831,2.146464,1.851964,0.357929,2.550052
min,1743532000.0,10032100.0,20.0,20.0,9.72,96.0,50.29,5.0,98.0,48.93
25%,1744844000.0,10032100.0,52.15,48.3525,10.0,97.9775,53.715,9.4775,98.335,52.6125
50%,1745044000.0,10540340.0,66.0,65.635,10.175,98.46,55.585,10.03,98.625,53.91
75%,1746275000.0,10540340.0,79.61,78.2675,10.79,99.0,57.1625,11.1425,99.0,55.6775
max,1746815000.0,10540340.0,98.0,98.0,12.0,99.18,59.6,13.0,99.12,61.02


In [8]:
# Capture the summary statistics in a variable so later cells can reuse them.
stats = df.describe()

In [9]:
# Display the stored summary table (still includes the 'time' and 'TO' columns).
stats

Unnamed: 0,time,TO,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
count,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0
mean,1745394000.0,10397780.0,64.194756,62.583293,10.470366,98.295244,55.406951,10.028902,98.610488,54.18939
std,914265.3,229726.0,18.550563,18.324626,0.6327,0.778831,2.146464,1.851964,0.357929,2.550052
min,1743532000.0,10032100.0,20.0,20.0,9.72,96.0,50.29,5.0,98.0,48.93
25%,1744844000.0,10032100.0,52.15,48.3525,10.0,97.9775,53.715,9.4775,98.335,52.6125
50%,1745044000.0,10540340.0,66.0,65.635,10.175,98.46,55.585,10.03,98.625,53.91
75%,1746275000.0,10540340.0,79.61,78.2675,10.79,99.0,57.1625,11.1425,99.0,55.6775
max,1746815000.0,10540340.0,98.0,98.0,12.0,99.18,59.6,13.0,99.12,61.02


In [10]:
# Check the container type of the summary: describe() returned a pandas DataFrame.
type(stats)

pandas.core.frame.DataFrame

In [11]:
# Keep only the sensor columns in the summary: 'time' and 'TO' are removed.
# The summary is rebuilt from df with a non-mutating drop so this cell is
# idempotent — the previous in-place drop raised KeyError when the cell was
# executed a second time, because the columns were already gone.
stats = df.describe().drop(columns=['time', 'TO'])

In [12]:
# Display the trimmed summary: only the sensor columns remain.
stats

Unnamed: 0,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
count,82.0,82.0,82.0,82.0,82.0,82.0,82.0,82.0
mean,64.194756,62.583293,10.470366,98.295244,55.406951,10.028902,98.610488,54.18939
std,18.550563,18.324626,0.6327,0.778831,2.146464,1.851964,0.357929,2.550052
min,20.0,20.0,9.72,96.0,50.29,5.0,98.0,48.93
25%,52.15,48.3525,10.0,97.9775,53.715,9.4775,98.335,52.6125
50%,66.0,65.635,10.175,98.46,55.585,10.03,98.625,53.91
75%,79.61,78.2675,10.79,99.0,57.1625,11.1425,99.0,55.6775
max,98.0,98.0,12.0,99.18,59.6,13.0,99.12,61.02


In [13]:
# Slice the 'mean' row out of the summary table. Selecting with a list of
# labels keeps the result a one-row DataFrame (index 'mean', one column per
# sensor statistic); it is kept for developing the service.
service1_mean = stats.loc[['mean']]

In [14]:
# Display the one-row frame holding the mean of each sensor column.
service1_mean

Unnamed: 0,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
mean,64.194756,62.583293,10.470366,98.295244,55.406951,10.028902,98.610488,54.18939


In [15]:
# from stats only take min values for all the columns to develop a service
# A list of labels in .loc keeps the 'min' row as a one-row DataFrame
# (index 'min', same sensor columns as the summary table).
service1_min = stats.loc[['min']]

In [16]:
# Display the one-row frame holding the minimum of each sensor column.
service1_min

Unnamed: 0,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
min,20.0,20.0,9.72,96.0,50.29,5.0,98.0,48.93


In [17]:
# from stats only take max values for all the columns to develop a service
# A list of labels in .loc keeps the 'max' row as a one-row DataFrame
# (index 'max', same sensor columns as the summary table).
service1_max = stats.loc[['max']]

In [18]:
# Display the one-row frame holding the maximum of each sensor column.
service1_max

Unnamed: 0,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
max,98.0,98.0,12.0,99.18,59.6,13.0,99.12,61.02


In [19]:
# from stats only take 50% values for all the columns to develop a service
# describe() labels the median row '50%'; a list of labels in .loc keeps it
# as a one-row DataFrame (index '50%', same sensor columns as the summary table).
service1_median = stats.loc[['50%']]

In [20]:
# Display the one-row frame holding the median (the '50%' row) of each sensor column.
service1_median

Unnamed: 0,sensor1_min,sensor1_max,sensor2_min,sensor2_max,sensor2_mean,sensor3_min,sensor3_max,sensor3_mean
50%,66.0,65.635,10.175,98.46,55.585,10.03,98.625,53.91
