In [1]:
import sys
import os
import pandas as pd
from pathlib import Path

# Add project root to path to allow imports from src
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
from src.data_preprocessing import load_parquet_to_df

## Obtaining the Dataframes ##

The dataframes used are: 
- Consum Total Agregat (CTA)
- Detecció de Consums Anòmals (DCA)

In [9]:
# Define the path to your raw data file
file_names = ["consum_total_agregat.parquet", "repte_consums_anomals.parquet", "fuites_experiencia_client.parquet","incidencies_comptadors_intelligents.parquet"]
files = ["../data/" + file for file in file_names]
sample_files = ["../data/sample_" + file for file in file_names]

# Run just with samples until being sure for time optimization 
df_CTA = load_parquet_to_df(sample_files[0])
df_RCA = load_parquet_to_df(sample_files[1])
df_FEC = load_parquet_to_df(sample_files[2])
df_ICI = load_parquet_to_df(sample_files[3])

df_list = [df_CTA,df_RCA, df_FEC, df_ICI]

['poliza_suministro', 'fecha', 'consumo_real', 'seccio_censal', 'us_aigua_gest', 'num_mun_sgab', 'num_dte_muni', 'num_complet', 'data_inst_comp', 'marca_comp', 'codi_model', 'diam_comp']
['polissa_subm', 'codi_anomalia', 'start_date', 'end_date', 'us_aigua_subm', 'seccio_censal', 'numeroseriecontador', 'consumo_real', 'fecha_hora']
['polissa_subm', 'data_ini_fact', 'data_fin_fact', 'created_mensaje', 'codigo_mensaje', 'tipo_mensaje', 'us_aigua_subm', 'seccio_censal', 'numeroseriecontador', 'consumo_real', 'fecha_hora']
['poliza_suministro', 'fecha', 'consumo_real', 'seccio_censal', 'us_aigua_gest', 'num_mun_sgab', 'num_dte_muni', 'num_complet', 'data_inst_comp', 'marca_comp', 'codi_model', 'diam_comp']


#### Descriptions of Dataframes

In [30]:
for i, df in enumerate(df_list):
    print("-"*10, file_names[i], "-" *10)
    df_list[i].info(show_counts=True)


---------- consum_total_agregat.parquet ----------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1711271 entries, 0 to 1711270
Data columns (total 12 columns):
 #   Column             Non-Null Count    Dtype  
---  ------             --------------    -----  
 0   poliza_suministro  1711271 non-null  object 
 1   fecha              1711271 non-null  object 
 2   consumo_real       1711271 non-null  int64  
 3   seccio_censal      579792 non-null   float64
 4   us_aigua_gest      579792 non-null   object 
 5   num_mun_sgab       579792 non-null   float64
 6   num_dte_muni       579792 non-null   float64
 7   num_complet        579792 non-null   object 
 8   data_inst_comp     579792 non-null   object 
 9   marca_comp         579792 non-null   object 
 10  codi_model         579792 non-null   float64
 11  diam_comp          579792 non-null   float64
dtypes: float64(5), int64(1), object(6)
memory usage: 156.7+ MB
---------- repte_consums_anomals.parquet ----------
<class 'pandas.core.

### Cleaninf df

In [32]:
#Detecció de consums anòmals dataframe cleaned 
df_RCA.fillna(0.0, inplace=True)

#Consum Total Agregat cleaning
df_CTA.dropna()     #As they indicate the secció censal and the districte, without them we cannot 
                    #identify where data belongs
df_CTA = df_CTA[(df_CTA['districte'].notna()) & (df_CTA['districte'] != '>')]
df_CTA['districte'] = df_CTA['districte'].replace('06', '6')
df_CTA['districte'] = df_CTA['districte'].replace('03', '3')


KeyError: 'districte'

In [31]:
for df in df_list: 
    display(df.head(5))

Unnamed: 0,poliza_suministro,fecha,consumo_real,seccio_censal,us_aigua_gest,num_mun_sgab,num_dte_muni,num_complet,data_inst_comp,marca_comp,codi_model,diam_comp
0,T2KIU6UKQLJ543UR,2022-03-30,596,,,,,,,,,
1,BZFRIYRZUANEY4C6,2021-06-29,180,801906022.0,D,0.0,6.0,YE5TR4KNBPIOXQDO,2016-02-11,OKV4SQYX72EBODPA,27.0,15.0
2,5RRET3OP33BY2CH6,2021-06-20,251,810101013.0,D,10.0,1.0,PLDGCWYHKWNJJNCF,2016-06-17,R7GO7PZAU5F6DHFH,2.0,15.0
3,VDV56S3TA3K3WZJV,2024-10-25,12,810106025.0,D,10.0,6.0,LYXK53WJW75Q4VU4,2016-05-13,5557SZ47QZAZ56EQ,31.0,15.0
4,5QGKV4XIPZHDGLG5,2023-12-31,524,,,,,,,,,


Unnamed: 0,polissa_subm,codi_anomalia,start_date,end_date,us_aigua_subm,seccio_censal,numeroseriecontador,consumo_real,fecha_hora
0,7YFBITDLZC2X2EBK,163840,2023-02-01,2023-04-05,COMERCIAL,801901030,H23VA237979H,0.0,2024-08-10 20:01:44
1,EFBQF7ZISVODMVAK,163840,2024-03-28,2024-05-28,DOMÈSTIC,801908037,J23OA015349X,0.0,2024-05-02 18:57:13
2,CXXA5WXQT6BTFOJ3,32768,2023-08-02,2023-10-03,DOMÈSTIC,810101009,P16VA115198P,0.0,2024-03-11 21:33:36
3,75W3DOTAAGXZXCX5,32768,2023-12-11,2024-02-07,COMERCIAL,801909004,P17VA110405S,0.0,2024-12-09 05:36:19
4,SC72IJ2VIEG22ZHP,163840,2023-06-27,2023-08-28,DOMÈSTIC,801903015,P18VA123792Q,0.0,2024-11-08 03:45:33


Unnamed: 0,polissa_subm,data_ini_fact,data_fin_fact,created_mensaje,codigo_mensaje,tipo_mensaje,us_aigua_subm,seccio_censal,numeroseriecontador,consumo_real,fecha_hora
0,WIRRWCISPIMZCYYH,2024-02-09 00:00:00,2024-04-11 00:00:00,2025-05-27 15:39:33,FUITA,SMS,COMERCIAL,801910043,GAX7AKU2DMRUGKN5,72.0,2024-06-05 20:00:55
1,HMTGNNT2KQKODNEE,2024-05-14 00:00:00,2024-07-15 00:00:00,2024-07-10 16:05:50,REITERACIÓ DE FUITA,Mail,DOMÈSTIC,801905026,HBCQFRILYI3MVR77,5.0,2024-04-14 15:58:22
2,LHTC5PPWW2V77FWS,2023-07-13 00:00:00,2023-09-13 00:00:00,2023-04-14 15:26:32,REITERACIÓ DE FUITA,SMS,INDUSTRIAL,801905003,QJNI354KK4XHZ4LB,20.0,2024-05-17 13:21:02
3,3TBFYGVTVTN6G7F2,2023-08-23 00:00:00,2023-09-22 00:00:00,2023-10-07 15:26:52,FUITA,SMS,INDUSTRIAL,801905027,6OKU5U54KSPKIW2M,118.0,2024-08-13 18:28:28
4,LNSJCKGG3WFA56BL,2023-09-13 00:00:00,2023-11-16 00:00:00,2023-10-11 15:25:08,REITERACIÓ DE FUITA,Mail,COMUNITARI,801905097,GBG7ZJVMXHM7XJK4,0.0,2024-02-03 08:00:07


Unnamed: 0,poliza_suministro,fecha,consumo_real,seccio_censal,us_aigua_gest,num_mun_sgab,num_dte_muni,num_complet,data_inst_comp,marca_comp,codi_model,diam_comp
0,T2KIU6UKQLJ543UR,2022-03-30,596,,,,,,,,,
1,BZFRIYRZUANEY4C6,2021-06-29,180,801906022.0,D,0.0,6.0,YE5TR4KNBPIOXQDO,2016-02-11,OKV4SQYX72EBODPA,27.0,15.0
2,5RRET3OP33BY2CH6,2021-06-20,251,810101013.0,D,10.0,1.0,PLDGCWYHKWNJJNCF,2016-06-17,R7GO7PZAU5F6DHFH,2.0,15.0
3,VDV56S3TA3K3WZJV,2024-10-25,12,810106025.0,D,10.0,6.0,LYXK53WJW75Q4VU4,2016-05-13,5557SZ47QZAZ56EQ,31.0,15.0
4,5QGKV4XIPZHDGLG5,2023-12-31,524,,,,,,,,,
