# Data Cleaning for AlertaRio Table

### Import Modules

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

### Load Data

In [2]:
# Read the raw AlertaRio precipitation CSV and preview its first rows.
raw_csv_path = '../../../../../data/raw/taxa_precipitacao_alertario-full.csv'
df = pd.read_csv(raw_csv_path)

display(df.head())

Unnamed: 0.1,Unnamed: 0,primary_key,id_estacao,acumulado_chuva_15_min,acumulado_chuva_1_h,acumulado_chuva_4_h,acumulado_chuva_24_h,acumulado_chuva_96_h,horario,data_particao
0,0,1_2004-06-30 21:48:20,1,0.0,0.0,0.0,0.0,0.0,21:48:20,2004-06-30
1,1,1_2005-05-30 16:18:20,1,0.0,0.0,0.0,0.0,0.0,16:18:20,2005-05-30
2,2,1_2009-01-29 07:18:20,1,0.0,0.0,0.0,2.8,7.6,07:18:20,2009-01-29
3,3,1_2012-09-26 12:00:00,1,0.4,1.0,7.8,79.4,79.4,12:00:00,2012-09-26
4,4,1_1999-03-29 07:48:20,1,0.0,0.0,0.0,0.0,0.0,07:48:20,1999-03-29


### Drop columns

In [3]:
# Remove the 'Unnamed: 0' column when the loaded frame has one.
leftover_index_col = 'Unnamed: 0'
if leftover_index_col in df.columns:
    df = df.drop(columns=[leftover_index_col])

### Extract fields from primary_key by splitting it

In [4]:
# Split primary_key once, on the first underscore only: the left part is the
# station id and the right part is the timestamp text.
key_parts = df["primary_key"].str.split("_", n=1, expand=True)
df[["id_estacao_extracted", "datetime_extracted"]] = key_parts

### Convert data types

#### Convert extracted datetime into datetime format

In [5]:
# Parse the timestamp text taken from primary_key into datetime values.
extracted_timestamps = pd.to_datetime(df["datetime_extracted"], format='ISO8601')
df["datetime_extracted"] = extracted_timestamps

#### Convert datetime columns 

In [6]:
# data_particao becomes a datetime and horario a timedelta, so the two can be
# added together to rebuild a full timestamp.
df = df.assign(
    data_particao=pd.to_datetime(df["data_particao"]),
    horario=pd.to_timedelta(df["horario"]),
)

#### Convert numeric columns

In [7]:
# Station identifiers: the one parsed from primary_key and the original column.
for id_col in ("id_estacao_extracted", "id_estacao"):
    df[id_col] = pd.to_numeric(df[id_col])

# Accumulated-rainfall columns, one per accumulation window.
num_cols = [
    'acumulado_chuva_15_min',
    'acumulado_chuva_1_h',
    'acumulado_chuva_4_h',
    'acumulado_chuva_24_h',
    'acumulado_chuva_96_h'
]

for rain_col in num_cols:
    df[rain_col] = pd.to_numeric(df[rain_col])

### Handle missing values

#### Percentage of Missing Values per column

In [8]:
# Share of missing values in each column, expressed as a percentage.
# Scale to percent before rounding so the result keeps exactly two decimals
# instead of picking up floating-point noise from a later multiplication.
missing_pct = df.isna().mean().mul(100).round(2).to_frame('Missing (%)')

# The statistic is computed per column, so the label says "column".
print('Percentage missing in each column:')
display(missing_pct)

Percentual missing in each row:


Unnamed: 0,Missing (%)
primary_key,0.0
id_estacao,0.0
acumulado_chuva_15_min,0.38
acumulado_chuva_1_h,0.39
acumulado_chuva_4_h,0.43
acumulado_chuva_24_h,0.49
acumulado_chuva_96_h,0.62
horario,0.02
data_particao,0.0
id_estacao_extracted,0.0


#### Drop rows missing `primary_key`

In [9]:
# Count the rows without a primary_key, report the count, then remove them.
# The column name was previously misspelled as "rimary_key" in the lookup,
# which raises KeyError on a fresh run.
n_missing_keys = df["primary_key"].isna().sum()
print(f'Dropping missing primary_key: {n_missing_keys} rows ...')
df = df.dropna(subset=["primary_key"])

Dropping missing primary_key: 0 rows ...


#### Fill missing values in the `horario` column with a zero timedelta (`pd.to_timedelta(0.0)`)

In [10]:
# A missing time of day is replaced with a zero offset.
zero_offset = pd.to_timedelta(0.0)
df['horario'] = df['horario'].fillna(zero_offset)

#### Fill missing values in numeric columns with 0.0

In [11]:
# Accumulated-rainfall columns, built from their window suffixes.
num_cols = [
    f'acumulado_chuva_{window}'
    for window in ('15_min', '1_h', '4_h', '24_h', '96_h')
]

# Replace missing rainfall readings with 0.0, one column at a time.
for rain_col in num_cols:
    df[rain_col] = df[rain_col].fillna(0.0)

### Validate that extracted components match original columns

In [12]:
# Rebuild the full timestamp from the partition date plus the time of day
# (a missing time counts as a zero offset).
time_of_day = df["horario"].fillna(pd.to_timedelta(0))
df["datetime_combined"] = df["data_particao"] + time_of_day

# A row is consistent when both the station id and the timestamp parsed from
# primary_key agree with the original columns.
station_matches = df["id_estacao"] == df["id_estacao_extracted"]
timestamp_matches = df["datetime_extracted"] == df["datetime_combined"]
df["is_consistent"] = station_matches & timestamp_matches

inconsistent_mask = ~df['is_consistent']
if inconsistent_mask.any():
    print('Inconsistent rows:\n')
    display(df[inconsistent_mask])
else:
    print('No inconsistent rows found.')

INCONSISTENT ROWS:



Unnamed: 0,primary_key,id_estacao,acumulado_chuva_15_min,acumulado_chuva_1_h,acumulado_chuva_4_h,acumulado_chuva_24_h,acumulado_chuva_96_h,horario,data_particao,id_estacao_extracted,datetime_extracted,datetime_combined,is_consistent


### Drop inconsistent rows

In [13]:
# Keep only the rows flagged as consistent; leave df untouched when every row passes.
has_inconsistent_rows = (~df['is_consistent']).any()
if has_inconsistent_rows:
    df = df.loc[df["is_consistent"]]

### Drop temporary columns

In [14]:
# Helper columns that were only needed for the consistency check.
temporary_cols = ["id_estacao_extracted", "datetime_combined", "is_consistent"]
df = df.drop(columns=temporary_cols)

### Rename timestamp column

In [15]:
# The timestamp parsed from primary_key becomes the frame's `timestamp` column.
column_renames = {'datetime_extracted': 'timestamp'}
df = df.rename(columns=column_renames)

### Remove duplicates

In [16]:
# Number of distinct primary_key values that occur more than once.
key_counts = df["primary_key"].value_counts()
n_repeated_keys = (key_counts != 1).sum()
print(f'Duplicates of primary_key: {n_repeated_keys}')

# Keep the first row seen for each primary_key.
df = df.drop_duplicates(subset=["primary_key"])

Duplicates of primary_key: 33


### Sort data

In [17]:
# Order the rows chronologically.
df = df.sort_values(by='timestamp')

### Save cleaned data

In [18]:
# Create the directory the CSV is actually written to. The earlier shell call
# (`mkdir -p ../data/clean`) targeted a different path than the one used by
# to_csv, so the write would fail whenever the real target directory was missing.
clean_csv_path = Path("../../../../../data/clean/alertario.csv")
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)

# Write the cleaned frame as CSV without the index column.
df.to_csv(clean_csv_path, index=False)

print("Data cleaning complete. Cleaned dataset saved.")

display(df.head())

Data cleaning complete. Cleaned dataset saved.


Unnamed: 0,primary_key,id_estacao,acumulado_chuva_15_min,acumulado_chuva_1_h,acumulado_chuva_4_h,acumulado_chuva_24_h,acumulado_chuva_96_h,horario,data_particao,timestamp
18345354,5_1997-01-01 01:00:40,5,0.0,0.0,0.0,0.0,0.0,0 days 01:00:40,1997-01-01,1997-01-01 01:00:40
3121428,13_1997-01-01 01:01:00,13,0.0,0.0,0.0,0.0,0.0,0 days 01:01:00,1997-01-01,1997-01-01 01:01:00
9667966,15_1997-01-01 01:01:20,15,0.0,0.0,0.0,0.0,0.0,0 days 01:01:20,1997-01-01,1997-01-01 01:01:20
28719005,7_1997-01-01 01:01:40,7,0.0,0.0,0.0,0.0,0.0,0 days 01:01:40,1997-01-01,1997-01-01 01:01:40
15135364,23_1997-01-01 01:02:00,23,0.0,0.0,0.0,0.0,0.0,0 days 01:02:00,1997-01-01,1997-01-01 01:02:00
