### Unifying Datasets


In [2]:
import pandas as pd
import numpy as np
from glob import glob
from math import floor

In [14]:
files = glob('../inmet_datasets/*.csv')

In [4]:
# replaces every ';' with ',' and ' ' with '_', and creates a new file

def rewrite_file(file_name):
    with open('../pre-processed_datasets/' + file_name.split('datasets/')[1], 'w') as new_file:
        with open(file_name, 'r+') as file:            
            for line in file:
                line = line.replace(';', ',')
                line = line.replace(' ', '_')
                line = line[: -2]
                
                new_file.write(line + '\n')
        
            file.close()
        
        new_file.close()
        

In [5]:
def merge_rows (dataset):
    """"
    Function responsible for normalizing the dataset
    
    It receives a pandas dataframe, and, for each row where the column 'Hora' is 1200,
    it takes the 'Precipitacao' and 'TempMinima' columns and puts its values on the
    row above, for it does not have such values (NaN value).
    """
    for index, row in dataset.iterrows():
        if row.loc['Hora'] == 1200:
            precipitacao = row.loc['Precipitacao']
            temp_minima = row.loc['TempMinima']

            dataset.at[index - 1, 'Precipitacao'] = precipitacao
            dataset.at[index - 1, 'TempMinima'] = temp_minima

In [6]:
def precipitation_rate(dataset, min_len):
    r0 = len(dataset[dataset['Precipitacao'] == 0])
    r1 = len(dataset[dataset['Precipitacao'] == 1])
    
    r0 = r0/len(dataset)
    r1 = r1/len(dataset)
    
    n0 = int(floor(r0 * min_len))
    n1 = int(floor(r1 * min_len))
    
    df0 = dataset[dataset['Precipitacao'] == 0].sample(n=n0)
    df1 = dataset[dataset['Precipitacao'] == 1].sample(n=n1)
    
    return df0.append(df1)

In [16]:
dataset_min_len = []

for file in files:
    rewrite_file(file)
    
    csv_file = pd.read_csv('../pre-processed_datasets/' + file.split('datasets/')[1])
    
    dates = csv_file['Data'].unique()
    
    # deleting unique dates
    for date in dates:
        if (len(csv_file[csv_file['Data'] == str(date)]) < 2):
            csv_file = csv_file[csv_file['Data'] != str(date)]
    
    # enconding 'Precipitacao' into 0 or 1
    csv_file['Precipitacao'] = csv_file['Precipitacao'].apply(lambda x: 1 if x > 0 else 0)
    
    merge_rows(csv_file)
    
    # maybe change this
    csv_file.dropna(how='any', inplace=True)
    
    if len(csv_file) != 0:
        dataset_min_len.append(len(csv_file))
        # saving file
        csv_file.to_csv('../pre-processed_datasets/' + file.split('datasets/')[1], index=False)
    else:
        files.remove(file)
        
    print(file.split('datasets/')[1])

IRATI.csv
CATALAO.csv
LENCOIS.csv
FEIRA_DE_SANTANA.csv
agua_branca.csv
PORTO ALEGRE.csv
FORMOSA.csv
CAMPOS.csv
PALMAS.csv
JOAO_PESSOA.csv
BRASILIA.csv
PARACATU.csv
IAUARETE.csv
CAXIAS_DO_SUL.csv
MACHADO.csv
CAMPOS_NOVOS.csv
MONTEIRO.csv
ITUMBIARA.csv
PELOTAS.csv
JANAUBA.csv
MONTE_ALEGRE.csv
RONDONOPOLIS.csv
FRANCA.csv
PARINTINS.csv
LAGOA_VERMELHA.csv
GLEBA_CELESTE.csv
JURAMENTO.csv
BAGE.csv
RESENDE.csv
LAGES.csv
CRATEUS.csv
CUIABA.csv
ALTO_PARNAIBA.csv
BARBACENA.csv
CALDEIRAO.csv
ENCRUZILHADA_DO_SUL.csv
PETROLINA.csv
DIAMANTINO.csv
PEIXE.csv
BOM_JESUS_DO_PIAUI.csv
JUIZ DE FORA.csv
FLORIANOPOLIS.csv
MOCAMBINHO.csv
BOM_JESUS_DA_LAPA.csv
BOM_DESPACHO.csv
PAULO_AFONSO.csv
BELO_HORIZONTE.csv
CARACOL.csv
PATOS_DE_MINAS.csv
IRAI.csv
FLORIANO.csv
RECIFE.csv
MORRO_DO_CHAPEU.csv
BARCELOS.csv
BARREIRAS.csv
CORDEIRO.csv
TAUBATE.csv
FLORANIA.csv
BARRA.csv
ALAGOINHAS.csv
MARABA.csv
LUZILANDIA_LAG_DO_PIAUI.csv
COLINAS.csv
PATOS.csv
CARACARAI.csv
IMPERATRIZ.csv
ITUIUTABA.csv
PARANAIBA.csv
CABROBO.csv


In [8]:
print(dataset_min_len)

[12685, 15473, 10111, 3116, 9407, 18756, 13466, 10692, 7405, 11975, 19769, 10982, 10061, 13845, 17794, 14196, 11506, 4630, 8906, 9245, 13669, 570, 15591, 11131, 6032, 8164, 8356, 16575, 14080, 15001, 12622, 11667, 10889, 15826, 6319, 15489, 11285, 4982, 14476, 10660, 17045, 15552, 6989, 12013, 6071, 11256, 18766, 7534, 17411, 14989, 10105, 18019, 10108, 9554, 9439, 7371, 9229, 11222, 5722, 6163, 8644, 5234, 7208, 9323, 5311, 10201, 6563, 9952, 11691, 9558, 15185, 15106, 10194, 11115, 14258, 11643, 4712, 10256, 7981, 11647, 9460, 9279, 4710, 10303, 11305, 2177, 12944, 8163, 15810, 10097, 8615, 15025, 12379, 13147, 12816, 10782, 15175, 15276, 7725, 15096, 12853, 14629, 9891, 11019, 10734, 10755, 7476, 7263, 6980, 7634, 6760, 12411, 13550, 15061, 8464, 6237, 13441, 7910, 9417, 15642, 6448, 18122, 17981, 14518, 5660, 11042, 9820, 3750, 11578, 16993, 17895, 9216, 8649, 11022, 12142, 13088, 5856, 10992, 8148, 9853, 9465, 12033, 13606, 7437, 15994, 9276, 11278, 14157, 7678, 8384, 11336, 11072

In [9]:
unified = pd.DataFrame(columns=['Estacao', 'Data', 'Hora', 'Precipitacao', 'TempMaxima', 'TempMinima', 'Insolacao', 'Evaporacao_Piche', 'Temp_Comp_Media', 'Umidade_Relativa_Media', 'Velocidade_do_Vento_Media'])

In [10]:
min_len = np.amin(dataset_min_len)

In [11]:
print(min_len)

570


In [12]:
for file in files:
    csv_file = pd.read_csv('../pre-processed_datasets/' + file.split('datasets/')[1])
    if len(csv_file) > 0:
        print(file.split('datasets/')[1])
        dataframe = precipitation_rate(csv_file, min_len)
        unified = unified.append(dataframe)

IRATI.csv
CATALAO.csv
LENCOIS.csv
FEIRA_DE_SANTANA.csv
agua_branca.csv
PORTO ALEGRE.csv
FORMOSA.csv
CAMPOS.csv
PALMAS.csv
JOAO_PESSOA.csv
BRASILIA.csv
PARACATU.csv
IAUARETE.csv
CAXIAS_DO_SUL.csv
MACHADO.csv
CAMPOS_NOVOS.csv
MONTEIRO.csv
ITUMBIARA.csv
PELOTAS.csv
JANAUBA.csv
MONTE_ALEGRE.csv
RONDONOPOLIS.csv
FRANCA.csv
PARINTINS.csv
LAGOA_VERMELHA.csv
GLEBA_CELESTE.csv
JURAMENTO.csv
BAGE.csv
RESENDE.csv
LAGES.csv
CRATEUS.csv
CUIABA.csv
ALTO_PARNAIBA.csv
BARBACENA.csv
CALDEIRAO.csv
ENCRUZILHADA_DO_SUL.csv
PETROLINA.csv
DIAMANTINO.csv
PEIXE.csv
BOM_JESUS_DO_PIAUI.csv
JUIZ DE FORA.csv
FLORIANOPOLIS.csv
MOCAMBINHO.csv
BOM_JESUS_DA_LAPA.csv
BOM_DESPACHO.csv
PAULO_AFONSO.csv
BELO_HORIZONTE.csv
CARACOL.csv
PATOS_DE_MINAS.csv
IRAI.csv
FLORIANO.csv
RECIFE.csv
MORRO_DO_CHAPEU.csv
BARCELOS.csv
BARREIRAS.csv
CORDEIRO.csv
TAUBATE.csv
FLORANIA.csv
BARRA.csv
ALAGOINHAS.csv
MARABA.csv
LUZILANDIA_LAG_DO_PIAUI.csv
COLINAS.csv
PATOS.csv
CARACARAI.csv
IMPERATRIZ.csv
ITUIUTABA.csv
PARANAIBA.csv
CABROBO.csv


FileNotFoundError: [Errno 2] File b'../pre-processed_datasets/MANAUS.csv' does not exist: b'../pre-processed_datasets/MANAUS.csv'

In [None]:
unified.to_csv('./unified.csv', index=False)