### Install Modules

In [4]:
# !pip install langchain
# !pip install langchain-experimental
# !pip install langchain_google_genai

### Import Modules

In [5]:
import os
import json
from IPython.display import Markdown
import pandas as pd
import matplotlib.pyplot as plt
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_experimental.agents import create_pandas_dataframe_agent

### Utility Functions

In [6]:
def parse_response(response):
    steps = []
    for step in response['intermediate_steps']:
        if len(step) > 2:
            print('MULTI-STEP IDENTIFIED... PLEASE UPDATE THIS FUNCTION...')
        if step[0].tool == 'python_repl_ast':
            data = {
                'code': step[0].tool_input,
                'output': step[1],
                'reasoning': step[0].log
            }
            steps.append(data)
    return steps

def write_script(steps, filepath):
    """
    Writes the data cleaning steps to a Python file.

    Args:
        steps: A list of dictionaries, where each dictionary represents a data cleaning step
               and contains 'reasoning' and 'code' keys.
        filepath: The path to the file where the code should be written.
    """

    # Create the directory if it doesn't exist
    directory = os.path.dirname(filepath)
    if not os.path.exists(directory):
        os.makedirs(directory)

    with open(filepath, 'w') as f:
        for step in steps:
            f.write("\n")
            f.write("'''\n")
            f.write(step['reasoning'].split('Action')[0] + "\n")  # Split to handle potential "Action:" prefixes
            f.write("'''\n")
            f.write("\n")
            code = step['code'].replace('```python\n', '').replace('```', '').strip('`')
            f.write(code + "\n")
            # f.write("\n")  #Removed Extra Empty line after code

    display(Markdown(f'Script saved to: [{filepath}]({filepath})'))

# Example usage (assuming 'response' and 'parse_response' are defined elsewhere):
# steps = parse_response(response)
# write_estacoes_alertario_script(steps)

### List of datasets and tables

In [7]:
datasets = {
    "adm_cor_comando": ["ocorrencias"],
    "clima_estacao_meteorologica": [
        "estacoes_alertario", "estacoes_alertario_met", "estacoes_inmet", "estacoes_redemet", 
        "intensidade_vento", "meteorologia_alertario", "meteorologia_inmet", "meteorologia_redemet"
    ],
    "clima_fluviometro": ["estacoes_inea", "lamina_agua_inea"],
    "clima_pluviometro": [
        "estacoes_alertario", "estacoes_cemaden", "estacoes_inea", "estacoes_websirene",
        "taxa_precipitacao_alertario", "taxa_precipitacao_alertario_5min", "taxa_precipitacao_cemaden",
        "taxa_precipitacao_inea", "taxa_precipitacao_websirene"
    ],
    "clima_radar": ["taxa_precipitacao_guaratiba"],
    "dados_mestres": ["bairro"],
    "rionowcast": ["grid_points", "radar", "rain_gauge"]
}

### Define Data Directory

In [30]:
input_data_directory = '../../../../data/meteorologia/raw'
output_data_directory = '../../../../data/meteorologia/clean'

### Define Gemini Credential

In [9]:
GENAI_API_KEY = "AIzaSyC_UvSH-abkbWWgtWzhex58GqjEwfjq2ok"

### Initialize LLM Model

In [10]:
# Initialize the Gemini model

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    # model="gemini-2.0-flash",
    # model='gemini-2.0-pro-exp-02-05',
    # model='gemini-2.0-flash-thinking-exp-01-21',
    temperature=0.0,
    api_key=GENAI_API_KEY
)

---

## Clean the Datasets

## clima_pluviometro.estacoes_alertario

In [13]:
dataset = 'clima_pluviometro'
table_name = 'estacoes_alertario'

#### Initialize Pandas Dataframe Agent

In [7]:
df = pd.read_csv(f"{input_data_directory}/{dataset}/{table_name}.csv")

# Create the agent
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    return_intermediate_steps=True,
    allow_dangerous_code=True,
    max_iterations=25,
)

display(df.head())

Unnamed: 0,id_estacao,estacao,latitude,longitude,cota,x,y,endereco,situacao,data_inicio_operacao,data_fim_operacao,data_atualizacao
0,1,Vidigal,-22.9925,-43.23306,85.0,681138532.0,7456241000.0,"Hotel Sheraton - Av. Niemeyer,121",Operante,,,
1,2,Urca,-22.95583,-43.16667,90.0,688004213.0,7460236000.0,"Instituto Militar de Engenharia - Av. Pasteur,35",Operante,,,
2,3,Rocinha,-22.98583,-43.245,160.0,679831802.0,7457041000.0,"Região Administrativa - Estrada da Gávea,242",Operante,,,
3,4,Tijuca,-22.93194,-43.22167,340.0,682358108.0,7462941000.0,Centro de Estudos do Sumaré - Estrada do Sumar...,Operante,,,
4,5,Santa teresa,-22.93167,-43.19639,170.0,684951792.0,7462972000.0,Escola Estadual Monteiro de Carvalho - Rua Alm...,Operante,,,


#### Run Agent

In [8]:
prompt = f'''
You are an expert data analyst tasked with cleaning newly downloaded datasets.
Your goal is to prepare each dataframe for further analysis by performing the required cleaning steps.
Follow an iterative process of carefully understanding the data, applying a cleaning step, and then carefully re-evaluating the data.
`pandas` is already imported as `pd`.
The dataframe is already loaded in the `df` variable.
Be brief in your reasoning.

Steps required:
- Drop completely empty columns.
- Missing values should have proper notation: NaN, NaT.
- Using the 'unique' method, ensure there aren't any unusual representations of missing values that pandas didn't automatically recognize as NaN or NaT.

Once you are done. save the clean data using:

```python
import os
os.makedirs('{output_data_directory}/{dataset}', exist_ok=True)
df.to_csv('{output_data_directory}/{dataset}/{table_name}.csv', index=False)
```

'''

response = agent.invoke(prompt)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: First, I need to inspect the dataframe to identify completely empty columns.
Action: python_repl_ast
Action Input: `df.isna().all()`[0m[36;1m[1;3mid_estacao              False
estacao                 False
latitude                False
longitude               False
cota                    False
x                       False
y                       False
endereco                False
situacao                False
data_inicio_operacao     True
data_fim_operacao        True
data_atualizacao         True
[32;1m[1;3mThe columns 'data_inicio_operacao', 'data_fim_operacao', and 'data_atualizacao' are completely empty. I will drop them.
Action: python_repl_ast
[32;1m[1;3mNow I will check for unusual representations of missing values in each column using the `unique` method.[36;1m[1;3m[0m
Action: python_repl_ast
Action Input: `for col in df.columns:
    print(f"Column {col}: {df[col].unique()}")`[0m[36;1m[1;3mColu

#### Get Full Generated Code

In [14]:
steps = parse_response(response)

write_script(steps, filepath=f'data-cleaning-scripts/{dataset}/{table_name}.py')

---

## clima_estacao_meteorologica.estacoes_inmet

In [7]:
dataset = 'clima_estacao_meteorologica'
table_name = 'estacoes_inmet'

#### Initialize Pandas Dataframe Agent

In [8]:
df = pd.read_csv(f"{input_data_directory}/{dataset}/{table_name}.csv")

# Create the agent
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    return_intermediate_steps=True,
    allow_dangerous_code=True,
    max_iterations=25,
)

display(df.head())

Unnamed: 0,id_estacao,estacao,latitude,longitude,situacao,tipo_estacao,entidade_responsavel,data_inicio_operacao,data_fim_operacao,data_atualizacao
0,A602,Marambaia,-23.050278,-43.595556,Operante,Automatica,INMET,2002-11-08,,2023-05-01
1,A621,Vila militar,-22.861389,-43.411389,Operante,Automatica,INMET,2007-04-13,,2023-05-01
2,A636,Jacarepagua,-22.94,-43.402778,Operante,Automatica,INMET,2017-08-10,,2023-05-01
3,A651,Escola naval,-22.914722,-43.159444,,,,,,2023-05-01
4,A652,Forte de copacabana,-22.988333,-43.190556,Operante,Automatica,INMET,2007-05-18,,2023-05-01


#### Run Agent

In [9]:
prompt = f'''
You are an expert data analyst tasked with cleaning newly downloaded datasets.
Your goal is to prepare each dataframe for further analysis by performing the required cleaning steps.
Follow an iterative process of carefully understanding the data, applying a cleaning step, and then carefully re-evaluating the data.
`pandas` is already imported as `pd`.
The dataframe is already loaded in the `df` variable.
Be brief in your reasoning.

Steps required:
- Drop completely empty columns.
- Missing values should have proper notation: NaN, NaT.
- Using the 'unique' method, ensure there aren't any unusual representations of missing values that pandas didn't automatically recognize as NaN or NaT.
- Convert to proper data types

Once you are done. save the clean data using:

```python
import os
os.makedirs('{output_data_directory}/{dataset}', exist_ok=True)
df.to_csv('{output_data_directory}/{dataset}/{table_name}.csv', index=False)
```

'''

response = agent.invoke(prompt)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: First, I'll inspect the dataframe's info and head to understand its structure and identify potential issues.
Action: python_repl_ast
Action Input:
```python
print(df.info())
print(df.head())
```[0m<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   id_estacao            9 non-null      object 
 1   estacao               9 non-null      object 
 2   latitude              9 non-null      float64
 3   longitude             9 non-null      float64
 4   situacao              4 non-null      object 
 5   tipo_estacao          4 non-null      object 
 6   entidade_responsavel  4 non-null      object 
 7   data_inicio_operacao  4 non-null      object 
 8   data_fim_operacao     0 non-null      float64
 9   data_atualizacao      9 non-null      object 
dtypes: float64(3),

#### Get Full Generated Code

In [10]:
steps = parse_response(response)

write_script(steps, filepath=f'data-cleaning-scripts/{dataset}/{table_name}.py')

---

## adm_cor_comando.ocorrencias

In [6]:
dataset = 'adm_cor_comando'
table_name = 'ocorrencias'

#### Initialize Pandas Dataframe Agent

In [17]:
df = pd.read_csv(f"{input_data_directory}/{dataset}/{table_name}.csv")

# Create the agent
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    return_intermediate_steps=True,
    allow_dangerous_code=True,
    max_iterations=25,
    handle_parsing_errors=True
)

display(df.head())



Unnamed: 0,id_pop,id_evento,bairro,data_inicio,data_fim,prazo,descricao,gravidade,latitude,longitude,status,tipo,data_particao
0,1.0,66561,,2021-10-01 17:18:15+00:00,2021-10-01 17:21:59+00:00,Curto,"Rod. washington luiz, alt. reduc - santo antôn...",Sem_classificacao,-22.636843,-43.284458,Fechado,Primario,2021-10-01
1,1.0,66656,,2021-10-04 17:26:09+00:00,2021-10-04 17:41:03+00:00,Curto,"R. pinheiro machado, 151 - laranjeiras sent. ...",Baixo,-22.936863,-43.183634,Fechado,Primario,2021-10-04
2,1.0,66641,Caju,2021-10-04 08:45:51+00:00,2021-10-04 09:59:37+00:00,Curto,Av. brasil - caju - sent. centro,Baixo,-22.887603,-43.224062,Fechado,Primario,2021-10-04
3,1.0,66979,Caju,2021-10-16 19:40:22+00:00,2021-10-16 20:09:09+00:00,Curto,"Linha vermelha, alt. km 3 - caju - sent. baixada",Baixo,-22.876475,-43.232307,Fechado,Primario,2021-10-16
4,1.0,66576,Caju,2021-10-02 09:29:24+00:00,2021-10-02 10:25:34+00:00,Curto,"Linha vermelha, alt. km 02 - sent. centro",Baixo,-22.876112,-43.231962,Fechado,Primario,2021-10-02


#### Run Agent

In [18]:
prompt = f'''
Your goal is to prepare the dataframe for further analysis by performing the cleaning steps.
Follow an iterative process of understanding the data, applying a cleaning step, and then carefully re-evaluating the data.
After each step, re-evaluate the dataframe to ensure the step was applied correctly.
Never execute the next cleaning step before re-evaluating the previous step.
Never repeat code already executed, as changes to the dataframe from previous steps persist in subsequent steps.
The dataframe is already loaded in the `df` variable.
Be brief in your reasoning.

- Obs: The 'prazo' column has mixed data types (string and float). It seems that some values represent durations in seconds, while others are categorical ('Curto', 'Medio', 'Longo').

Required:
- 'data_inicio' and 'data_fim' are strings, so they must be converted to datetime objects. Fill the missing 'data_fim' with 'data_inicio' + 1 hour.
- Fill missing 'descricao', 'gravidade', 'tipo', 'bairro' with appropriate placeholders or values.
- Fill invalid coordinates with NaN. Valid coordinates belong to the proper range (e.i. belong to Rio de Janeiro city).
- Create a new column 'prazo_num' with the numerical values from 'prazo' as float. Use NaN instead of the categorical values.
- In 'prazo' column, replace the non-categorical and NaN values with 'Desconhecido'
- After processing 'prazo' re-check its unique values, to make sure only categorical values exist.
- Ensure data_particao is datetime and pop_id is integer with missing values -1.
- 

 Once you are done. save the clean data using:
import os
os.makedirs('{output_data_directory}/{dataset}', exist_ok=True)
df.to_csv('{output_data_directory}/{dataset}/{table_name}.csv', index=False)

'''

response = agent.invoke(prompt)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: Convert 'data_inicio' and 'data_fim' to datetime objects and fill missing 'data_fim'.

Action: python_repl_ast
Action Input:
```python
import pandas as pd
df['data_inicio'] = pd.to_datetime(df['data_inicio'])
df['data_fim'] = pd.to_datetime(df['data_fim'])
df['data_fim'] = df['data_fim'].fillna(df['data_inicio'] + pd.Timedelta(hours=1))
print(df[['data_inicio', 'data_fim']].head())

[36;1m[1;3m                data_inicio                  data_fim
0 2021-10-01 17:18:15+00:00 2021-10-01 17:21:59+00:00
1 2021-10-04 17:26:09+00:00 2021-10-04 17:41:03+00:00
2 2021-10-04 08:45:51+00:00 2021-10-04 09:59:37+00:00
3 2021-10-16 19:40:22+00:00 2021-10-16 20:09:09+00:00
4 2021-10-02 09:29:24+00:00 2021-10-02 10:25:34+00:00
[32;1m[1;3mFill missing values in 'descricao', 'gravidade', 'tipo', and 'bairro'.

Action: python_repl_ast
Action Input:
```python
df['descricao'] = df['descricao'].fillna('Sem descrição')
df['gravidade'] 

#### Get Full Generated Code

In [20]:
steps = parse_response(response)

write_script(steps, filepath=f'data-cleaning-scripts/{dataset}/{table_name}.py')

---

## clima_pluviometro.taxa_precipitacao_alertario

In [2]:
dataset = 'clima_pluviometro'
table_name = 'taxa_precipitacao_alertario'

#### Initialize Pandas Dataframe Agent

In [26]:
df = pd.read_csv(f"{input_data_directory}/{dataset}/{table_name}.csv", nrows=10000, index_col=0)

# Create the agent
agent = create_pandas_dataframe_agent(
    llm,
    df,
    verbose=True,
    return_intermediate_steps=True,
    allow_dangerous_code=True,
    max_iterations=25,
    handle_parsing_errors=True
)

display(df.head())



Unnamed: 0_level_0,acumulado_chuva_15_min,acumulado_chuva_1_h,acumulado_chuva_4_h,acumulado_chuva_24_h,acumulado_chuva_96_h,data_hora
id_estacao,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0.0,0.0,0.0,0.0,0.0,2004-06-30 21:48:20
1,0.0,0.0,0.0,0.0,0.0,2005-05-30 16:18:20
1,0.0,0.0,0.0,2.8,7.6,2009-01-29 07:18:20
1,0.4,1.0,7.8,79.4,79.4,2012-09-26 12:00:00
1,0.0,0.0,0.0,0.0,0.0,1999-03-29 07:48:20


#### Run Agent

In [24]:
prompt = f'''
Your goal is to prepare the dataframe for further analysis by performing the cleaning steps.
Follow an iterative process of understanding the data, applying a cleaning step, and then carefully re-evaluating the data.
After each step, re-evaluate the dataframe to ensure the step was applied correctly.
Never execute the next cleaning step before re-evaluating the previous step.
Changes to the dataframe from previous steps persist in subsequent steps.
The dataframe is already loaded in the `df` variable.
Be brief in your reasoning.

Required:
- This table represents rain precipitation time series for multiple rain gauge stations.
- Ensure the dataframe follows correct standards and patterns to allow data exploration and visualization.
- Missing or invalid values must be NaN.
- Numeric values should be in the proper expected range.
- Datetime values should be in the proper expected range.

Once you are done. save the clean data using:
import os
os.makedirs('{output_data_directory}/{dataset}', exist_ok=True)
df.to_csv('{output_data_directory}/{dataset}/{table_name}.csv', index=False)

'''

response = agent.invoke(prompt)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought:
First, I'll check the data types of each column.

Action:
python_repl_ast
Action Input:
```python
print(df.dtypes)
```[0m[36;1m[1;3mid_estacao                  int64
acumulado_chuva_15_min    float64
acumulado_chuva_1_h       float64
acumulado_chuva_4_h       float64
acumulado_chuva_24_h      float64
acumulado_chuva_96_h      float64
data_hora                  object
dtype: object
[32;1m[1;3mThe `data_hora` column should be datetime. Let's convert it.

Action:
python_repl_ast
Action Input:
```python
df['data_hora'] = pd.to_datetime(df['data_hora'])
print(df.dtypes)
[32;1m[1;3mThought:ameError: name 'pd' is not defined[0m
I need to import the pandas library.

Action:
python_repl_ast
Action Input:
```python
import pandas as pd
df['data_hora'] = pd.to_datetime(df['data_hora'])
print(df.dtypes)
```[0m[36;1m[1;3mid_estacao                         int64
acumulado_chuva_15_min           float64
acumulado_chuva_1_

#### Get Full Generated Code

In [None]:
steps = parse_response(response)

write_script(steps, filepath=f'data-cleaning-scripts/{dataset}/{table_name}.py')