## Overview

### Data preparation

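* Read the raw data: the yearly CSV exports (2016–2018) and the older Excel dataset
* Take the daily maximum of every measurement column in the CSV exports
* Merge the daily maxima with the older dataset on the date and save the result as Parquet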

### Imports

In [1]:
import pandas as pd

### Load Data

In [2]:
# Old dataset: a single Excel workbook, loaded as-is
df_old = pd.read_excel(io="../data/raw/CUIABA_CARDIO_AVC0310.xls")

In [3]:
# New dataset: one CSV export per year.
# Columns removed from every yearly export right after loading.
drop_list = ["mun_geocod", "mun_nome", "mun_lat", "mun_lon", "mun_uf_nome"]

raw_paths = [
    "../data/raw/export-2016.csv",
    "../data/raw/export-2017.csv",
    "../data/raw/export-2018.csv",
]

# Load all years the same way, then expose each year under its own name too.
df_list = [pd.read_csv(path).drop(columns=drop_list) for path in raw_paths]
df_2016, df_2017, df_2018 = df_list

In [4]:
# Preview the 2016 export after the drop_list columns were removed
df_2016

Unnamed: 0,datahora,co_ppb,no2_ppb,o3_ppb,pm25_ugm3,so2_ugm3
0,2016-01-01 00:00,183.4,1.1,9.4,16.9,0.9
1,2016-01-01 06:00,191.6,1.8,7.0,15.8,1.4
2,2016-01-01 12:00,192.8,1.0,7.0,20.8,0.6
3,2016-01-01 18:00,184.9,0.1,12.7,16.0,0.3
4,2016-01-02 00:00,201.0,1.4,7.0,16.8,1.1
...,...,...,...,...,...,...
1459,2016-12-30 18:00,109.2,0.1,18.9,2.1,0.3
1460,2016-12-31 00:00,144.0,2.3,5.6,8.3,1.6
1461,2016-12-31 06:00,137.3,1.7,10.3,6.9,1.3
1462,2016-12-31 12:00,119.2,0.3,14.0,1.2,0.4


In [5]:
# Check the column dtypes of the freshly loaded 2016 export
df_2016.dtypes

datahora      object
co_ppb       float64
no2_ppb      float64
o3_ppb       float64
pm25_ugm3    float64
so2_ugm3     float64
dtype: object

In [6]:
# Parse the timestamp column so the .dt accessor can be used on it
df_2016 = df_2016.astype(dtype={"datahora": "datetime64[ns]"})

In [7]:
# Derive the hour of day from each timestamp
df_2016["hour"] = df_2016["datahora"].dt.hour

In [8]:
# Put the derived hour right after the timestamp, then preview the frame
column_order = [
    "datahora",
    "hour",
    "co_ppb",
    "no2_ppb",
    "o3_ppb",
    "pm25_ugm3",
    "so2_ugm3",
]
df_2016 = df_2016[column_order]
df_2016

Unnamed: 0,datahora,hour,co_ppb,no2_ppb,o3_ppb,pm25_ugm3,so2_ugm3
0,2016-01-01 00:00:00,0,183.4,1.1,9.4,16.9,0.9
1,2016-01-01 06:00:00,6,191.6,1.8,7.0,15.8,1.4
2,2016-01-01 12:00:00,12,192.8,1.0,7.0,20.8,0.6
3,2016-01-01 18:00:00,18,184.9,0.1,12.7,16.0,0.3
4,2016-01-02 00:00:00,0,201.0,1.4,7.0,16.8,1.1
...,...,...,...,...,...,...,...
1459,2016-12-30 18:00:00,18,109.2,0.1,18.9,2.1,0.3
1460,2016-12-31 00:00:00,0,144.0,2.3,5.6,8.3,1.6
1461,2016-12-31 06:00:00,6,137.3,1.7,10.3,6.9,1.3
1462,2016-12-31 12:00:00,12,119.2,0.3,14.0,1.2,0.4


In [9]:
def max_values(df_to_max: pd.DataFrame, index: str) -> pd.DataFrame:
    """Return the per-day maximum of every column in ``df_to_max``.

    Parameters
    ----------
    df_to_max : pd.DataFrame
        Frame with a timestamp column named ``index`` plus the columns to
        aggregate. The input frame is not modified.
    index : str
        Name of the timestamp column. It is converted to ``datetime64[ns]``
        and used as the index for resampling.

    Returns
    -------
    pd.DataFrame
        One row per daily ('D') resampling bin, indexed by ``index``, where
        each remaining column holds the maximum observed in that bin.
    """
    # Convert the column named by `index`; hard-coding "datahora" here made
    # the function fail for any other timestamp column name.
    indexed = df_to_max.astype({index: "datetime64[ns]"}).set_index(index)

    return indexed.resample("D").max()

# Compute the daily maxima for every yearly frame, then concatenate once.
# Growing a frame with pd.concat inside a loop re-copies all accumulated rows
# on every iteration and starts from an empty frame, which is unnecessary.
# ignore_index=True replaces the per-day DatetimeIndex with a 0..n-1 range,
# exactly as the original loop did.
df = pd.concat(
    [max_values(yearly, "datahora") for yearly in df_list],
    ignore_index=True,
)

In [10]:
# The concatenated frame carries no dates (its index was reset), so rebuild
# them positionally. NOTE(review): this assumes exactly one row per calendar
# day from 2016-01-01 through 2018-12-31, in order — confirm against the data.
daily_dates = pd.date_range(start="2016-01-01", end="2018-12-31", freq="D")
df["data"] = daily_dates

In [11]:
# Keep the date first, followed by the measurement columns
daily_columns = ["data", "co_ppb", "no2_ppb", "o3_ppb", "pm25_ugm3", "so2_ugm3"]
df = df[daily_columns]

### Merging the datasets

Join the daily maxima from the CSV exports with the older dataset on the date column.

In [12]:
# Inspect the old dataset's columns, non-null counts and dtypes
df_old.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 35 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   DATA       1096 non-null   datetime64[ns]
 1   co_ppb     1096 non-null   float64       
 2   pm25_ugm3  1089 non-null   float64       
 3   so2_ugm3   1096 non-null   float64       
 4   NO2_NOVO   1096 non-null   float64       
 5   OZ_MICRO   1096 non-null   float64       
 6   TMAX       1096 non-null   float64       
 7   T03        1067 non-null   float64       
 8   T05        1066 non-null   float64       
 9   T07        0 non-null      float64       
 10  TMIN       1082 non-null   float64       
 11  UR         1096 non-null   float64       
 12  PRESS      1096 non-null   float64       
 13  DS         1096 non-null   int64         
 14  LD         1096 non-null   int64         
 15  LAG0       1096 non-null   int64         
 16  LAG1       1095 non-null   float64       


In [13]:
# Columns of the old dataset that are removed before merging.
# NOTE(review): "Ç3AVC" looks like a misspelling of "L3AVC", but it is kept
# verbatim because it must match the column name in the source file — confirm.
columns_to_drop = [
    "co_ppb", "pm25_ugm3", "so2_ugm3", "NO2_NOVO", "OZ_MICRO",
    "T03", "T05", "T07", "TMIN", "DIFERTEMP",
    "VAR00001", "LD",
    "LAG1", "LAG2", "LAG3", "LAG4", "LAG5", "LAG6", "LAG7",
    "L1AVC", "L2AVC", "Ç3AVC", "L4AVC", "L5AVC", "L6AVC", "L7AVC",
    "TAPMIN", "TAPMAX",
]

In [14]:
# Remove the unused columns and rename the date column so it matches the
# "data" key of the daily-maxima frame.
df_old = (
    df_old
    .drop(columns=columns_to_drop)
    .rename(columns={"DATA": "data"})
)

In [15]:
# Inner join (the pandas default) of the daily maxima with the old dataset
df_new = pd.merge(df, df_old, on="data")

In [16]:
# Preview the merged frame
df_new

Unnamed: 0,data,co_ppb,no2_ppb,o3_ppb,pm25_ugm3,so2_ugm3,TMAX,UR,PRESS,DS,LAG0,LOAVC
0,2016-01-01,192.8,1.8,12.7,20.8,1.4,31.6,87.00,983.029167,0,2,0
1,2016-01-02,222.7,2.6,12.4,19.6,1.7,33.9,75.75,983.408333,1,4,0
2,2016-01-03,214.2,2.4,14.5,19.1,1.5,28.1,88.50,983.854167,1,2,1
3,2016-01-04,236.6,2.8,17.9,24.9,1.8,34.8,75.75,982.791667,0,8,1
4,2016-01-05,218.3,2.6,14.0,18.5,1.8,31.1,83.50,984.125000,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1091,2018-12-27,176.2,3.7,13.7,23.0,2.5,33.5,73.00,981.450000,0,5,2
1092,2018-12-28,133.3,1.1,10.0,13.0,1.0,31.7,82.50,982.395833,0,7,0
1093,2018-12-29,136.1,1.0,11.5,13.8,0.9,31.2,80.50,982.372727,0,3,1
1094,2018-12-30,127.7,0.8,13.5,12.9,0.8,30.5,70.25,984.550000,1,2,0


In [17]:
# Check non-null counts and dtypes of the merged frame
df_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1096 entries, 0 to 1095
Data columns (total 12 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   data       1096 non-null   datetime64[ns]
 1   co_ppb     1096 non-null   float64       
 2   no2_ppb    1096 non-null   float64       
 3   o3_ppb     1096 non-null   float64       
 4   pm25_ugm3  1096 non-null   float64       
 5   so2_ugm3   1096 non-null   float64       
 6   TMAX       1096 non-null   float64       
 7   UR         1096 non-null   float64       
 8   PRESS      1096 non-null   float64       
 9   DS         1096 non-null   int64         
 10  LAG0       1096 non-null   int64         
 11  LOAVC      1096 non-null   int64         
dtypes: datetime64[ns](1), float64(8), int64(3)
memory usage: 111.3 KB


In [18]:
# Final look at the merged frame before it is written to disk
df_new

Unnamed: 0,data,co_ppb,no2_ppb,o3_ppb,pm25_ugm3,so2_ugm3,TMAX,UR,PRESS,DS,LAG0,LOAVC
0,2016-01-01,192.8,1.8,12.7,20.8,1.4,31.6,87.00,983.029167,0,2,0
1,2016-01-02,222.7,2.6,12.4,19.6,1.7,33.9,75.75,983.408333,1,4,0
2,2016-01-03,214.2,2.4,14.5,19.1,1.5,28.1,88.50,983.854167,1,2,1
3,2016-01-04,236.6,2.8,17.9,24.9,1.8,34.8,75.75,982.791667,0,8,1
4,2016-01-05,218.3,2.6,14.0,18.5,1.8,31.1,83.50,984.125000,0,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...
1091,2018-12-27,176.2,3.7,13.7,23.0,2.5,33.5,73.00,981.450000,0,5,2
1092,2018-12-28,133.3,1.1,10.0,13.0,1.0,31.7,82.50,982.395833,0,7,0
1093,2018-12-29,136.1,1.0,11.5,13.8,0.9,31.2,80.50,982.372727,0,3,1
1094,2018-12-30,127.7,0.8,13.5,12.9,0.8,30.5,70.25,984.550000,1,2,0


In [19]:
# Persist the merged dataset to the processed-data folder
df_new.to_parquet(path="../data/processed/avc-cuiaba.parquet")