In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from glob import glob
import seaborn as sns
import geopandas as gpd
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
import wget
import requests
import threading
import asyncio


# Stations dataframe

In [None]:
# Build the stations table from the ANA station inventory workbook.
df = pd.read_excel(r'Data_stations\ANA_DATA\vwEstacoes.xlsx')

# Keep only rows whose TipoEstacao equals 2 and only the descriptive columns.
# The selection is done in a single .loc call so the result is a fresh frame
# (no in-place edits on a slice of `df`). No 'Unnamed: 0' column can exist
# after an explicit column selection, so there is nothing further to drop.
stations = (
    df.loc[df['TipoEstacao'] == 2, ['Codigo', 'Nome', 'Latitude', 'Longitude', 'Altitude']]
    .rename(columns={'Codigo': 'Code', 'Nome': 'Name'})
)
stations.to_csv(r'Data_stations\stations.csv')

In [None]:
# Turn the station coordinates into point geometries (EPSG:4326) and export
# the resulting GeoDataFrame as GeoJSON.
station_points = gpd.points_from_xy(
    stations.Longitude,
    stations.Latitude,
    crs='epsg:4326',
)
gdf = gpd.GeoDataFrame(stations, geometry=station_points)
gdf.to_file(r"Data\stations.geojson", driver='GeoJSON')

# download and unzip

## download

In [None]:
# download all the ANA data
def background(f):
    """Decorator that runs ``f`` in the current event loop's default executor.

    Calling the decorated function does not run ``f`` inline: the call is
    handed to ``loop.run_in_executor(None, ...)`` and the resulting future is
    returned to the caller.

    Args:
        f: The callable to run in the executor.

    Returns:
        A wrapper that accepts the same positional and keyword arguments as
        ``f`` and returns the future produced by ``run_in_executor``.
    """
    def wrapped(*args, **kwargs):
        loop = asyncio.get_event_loop()
        # run_in_executor() only forwards positional arguments; binding the
        # call in a closure keeps keyword arguments working as well.
        return loop.run_in_executor(None, lambda: f(*args, **kwargs))
    return wrapped

@background
def download(stat):
    """Download the archive for one station into the ZIP folder.

    Args:
        stat: Station identifier; it is converted with ``str()`` and appended
            to both the request URL and the output file name.

    The function is wrapped by ``@background``, so a call returns whatever
    that decorator returns rather than the result of ``wget.download``.
    """
    station_id = str(stat)
    source = 'https://www.snirh.gov.br/hidroweb/rest/api/documento/convencionais?tipo=3&documentos=' + station_id
    target = r'Data_stations\ANA_DATA\ZIP\all' + station_id + '.zip'
    wget.download(source, out=target)



# Iterating a DataFrame directly yields its column labels, so loop over the
# 'Code' column to request one archive per station.
for stat in stations['Code']:
    download(stat)


## unzip

In [None]:
# Write a trimmed copy of each file, named after the station code found in
# its first data row.
# NOTE(review): `files` is not assigned in any cell above this one; it is
# first assigned in a later cell, so this cell relies on execution order --
# confirm which file list is intended here.
for file in files:
    # Skip the first 12 lines, read ';'-separated values with ',' as decimal
    # mark, keep columns 0, 1, 2 and 5 and parse column 2 as a day-first date.
    df = pd.read_csv(file, encoding='latin-1', skiprows=12, delimiter=';', index_col=False, usecols=[0,1,2,5], decimal=',', parse_dates=[2], dayfirst=True)
    if df.empty:
        # Without a data row there is no station code to name the output
        # after; skip instead of aborting the whole loop with a KeyError.
        print('skipping file with no data rows:', file)
        continue
    idx = df['EstacaoCodigo'].iloc[0]
    df.to_csv(r'Data_stations\data\clean\\' + str(idx) + '.csv', index=False)


In [None]:
# List every .zip file found directly inside the ZIP download folder.
path = r'Data_stations\ANA_DATA\ZIP'
zip_pattern = os.path.join(path, '*.zip')
files = glob(zip_pattern)

In [None]:
# Check every archive. An archive with a member that fails ZipFile.testzip()
# is recorded as corrupt; an archive that cannot be opened or extracted at
# all is recorded as a bad file (e.g. downloaded incorrectly).
corrupt = []
badfile = []
for file in files:
    try:
        with ZipFile(file) as file1:
            if file1.testzip() is not None:
                print('ruim')
                corrupt.append(file)
            else:
                file1.extractall(path+'aaa')
    except Exception as exc:
        # Catch Exception rather than using a bare `except:` so that
        # KeyboardInterrupt / SystemExit can still stop the loop, and report
        # which file failed and why instead of hiding the reason.
        badfile.append(file)
        print('bad', file, repr(exc))

In [None]:
# Report how many archives fell into each category. Both counts are printed
# explicitly: a bare expression in the middle of a cell is not displayed.
print(len(badfile))
print(len(corrupt))

# Persist both lists as CSV files; the re-download cell reads them back by
# column name.
badfiles_df = pd.DataFrame({'badfiles': badfile})
badfiles_df.to_csv(r'Data_stations\badiles.csv')
corrupt_df = pd.DataFrame({'corrupt': corrupt})
corrupt_df.to_csv(r'Data_stations\corrupt.csv')

## corrupt data


In [None]:
def _station_code(zip_path):
    """Return the station code: the run of digits right before the extension.

    Independent of folder names and of any filename prefix, e.g. both
    'all123.zip' and 'data123.zip' give 123.

    Raises:
        ValueError: If the name does not end in digits before the extension.
    """
    stem = zip_path.rsplit('.', 1)[0]
    digits = stem[len(stem.rstrip('0123456789')):]
    if not digits:
        raise ValueError(f'no station code found in {zip_path!r}')
    return int(digits)


def _redownload(station):
    """Request one station archive again and save it under data\\corrupts.

    Failed requests are reported and skipped so that an error response is
    never written to disk as if it were a zip archive.
    """
    url = 'https://www.snirh.gov.br/hidroweb/rest/api/documento/convencionais?tipo=3&documentos=' + str(station)
    try:
        # A timeout keeps one unresponsive request from hanging the whole loop.
        r = requests.get(url, timeout=60)
    except requests.RequestException as exc:
        print('download failed', station, repr(exc))
        return
    if not r.ok:
        print('download failed', station, r.status_code)
        return
    with open(r'data\corrupts\\' + str(station) + '.zip', "wb") as f:
        f.write(r.content)


# Recover the station codes from the archive paths saved by the check cell.
corr = pd.read_csv(r'Data_stations\corrupt.csv')['corrupt'].to_list()
list_corrupt = [_station_code(i) for i in corr]
badd = pd.read_csv(r'Data_stations\badiles.csv')['badfiles'].to_list()
list_badd = [_station_code(i) for i in badd]

# Corrupt archives first, then bad files.
for station in list_corrupt + list_badd:
    _redownload(station)


In [None]:
# List every .csv file found directly inside the corrupts\csv folder.
path = r'Data_stations\corrupts\csv'
csv_pattern = os.path.join(path, '*.csv')
files = glob(csv_pattern)

# cleaning data and creating a dataframe

In [None]:
# Read every raw station CSV, write a trimmed per-station copy, and stack
# all of them into one long table that is saved as a pickle.
path = r'Data_stations\ANA_DATA\CSV\csv_raw'
files = glob(os.path.join(path, '*.csv'))
li = []
for file in files:
    # Skip the first 12 lines, read ';'-separated values with ',' as decimal
    # mark, keep columns 0, 1, 2 and 5 and parse column 2 as a day-first date.
    df = pd.read_csv(file, encoding='latin-1', skiprows=12, delimiter=';', index_col=False, usecols=[0,1,2,5], decimal=',', parse_dates=[2], dayfirst=True)
    if df.empty:
        # Without a data row there is no station code to name the output
        # after; skip instead of aborting the whole loop with a KeyError.
        print('skipping file with no data rows:', file)
        continue
    idx = df['EstacaoCodigo'].iloc[0]
    li.append(df)
    df.to_csv(r'Data_stations\clean\\' + str(idx) + '.csv', index=False)

# Fail with a clear message when nothing was loaded, rather than with the
# generic error pd.concat raises for an empty list.
if not li:
    raise ValueError(f'no station data found under {path!r}')

frame = pd.concat(li, axis=0, ignore_index=True)
frame = frame.rename(columns={'EstacaoCodigo':'Code','NivelConsistencia':'Consistency', 'Data': 'Date'})
frame.to_pickle(r'Data\ANA_clean.pkl')
   