## Meteorological data extraction using the IPMA API

Data import

In [2]:
import os

import numpy as np
import pandas as pd
import wget
# Working directory of the project. Every other path in this notebook (input
# CSVs, the "ipma" download folder, the final export) is built from it.
# NOTE(review): absolute, machine-specific Windows path - adjust before running elsewhere.
ROOT_FOLDER = "C:\\Users\\liamo\\Documents\\BIOINF\\SIB\\tp\\trab\\"

# Locations of the two area-code tables.
distritos_path, concelhos_path = (
    ROOT_FOLDER+csv_name for csv_name in ("distritos.csv", "concelhos.csv")
)

# Both files are read with their first row as the header.
distritos, concelhos = (
    pd.read_csv(csv_path, header=0) for csv_path in (distritos_path, concelhos_path)
)

Formatting

In [3]:
# Character substitutions applied to the lower-cased names: accented letters
# are mapped to their ASCII base letter and spaces become hyphens.
rep_dict = {"ç":"c","â":"a","ã":"a","á":"a","à":"a","é":"e","ê":"e","í":"i","ó":"o","ô":"o","ú":"u"," ":"-"}

# Convert names to the IPMA format.
# The result is assigned back to the column instead of calling
# `df[col].replace(..., inplace=True)`: that chained in-place call acts on a
# temporary Series, which triggers a warning on recent pandas and leaves the
# frame unchanged when Copy-on-Write is enabled.
# (Simply stripping every non-ASCII character with r'[^\x00-\x7F]+' is not an
# alternative: it deletes the letter instead of transliterating it.)
distritos["nome_distrito"] = distritos["nome_distrito"].str.lower().replace(rep_dict, regex=True)
concelhos["nome_concelho"] = concelhos["nome_concelho"].str.lower().replace(rep_dict, regex=True)

# Filter districts: keep district codes up to 18 only
# (presumably the mainland districts - confirm against distritos.csv).
# `.copy()` makes the filtered frames independent objects, so adding columns to
# them later does not raise SettingWithCopyWarning.
distritos = distritos[distritos["cod_distrito"] <= 18].copy()
concelhos = concelhos[concelhos["cod_distrito"] <= 18].copy()

print(concelhos.head(),"\n",distritos.head())
print(distritos["nome_distrito"].unique())

   cod_distrito  cod_concelho         nome_concelho
0             1             6      castelo-de-paiva
1             1             7               espinho
2             1             8             estarreja
3             1             9  santa-maria-da-feira
4             1            10                ilhavo 
    cod_distrito   nome_distrito
0             1          aveiro
1             2            beja
2             3           braga
3             4        braganca
4             5  castelo-branco
['aveiro' 'beja' 'braga' 'braganca' 'castelo-branco' 'coimbra' 'evora'
 'faro' 'guarda' 'leiria' 'lisboa' 'portalegre' 'porto' 'santarem'
 'setubal' 'viana-do-castelo' 'vila-real' 'viseu']


In [4]:
# Lookup table: district code -> IPMA-formatted district name.
# Columns are selected by name rather than by position.
cod_to_nome_distrito = dict(zip(distritos["cod_distrito"], distritos["nome_distrito"]))

# Fail loudly (as a plain dict lookup would) if a concelho references a
# district code that is not in the lookup table.
unknown_codes = set(concelhos["cod_distrito"]) - set(cod_to_nome_distrito)
if unknown_codes:
    raise KeyError("district codes without a name: {}".format(sorted(unknown_codes)))

# `assign` returns a new frame, so nothing is written into a possible slice of
# the original table.
concelhos = concelhos.assign(
    # add column with correct format
    nome_distrito=concelhos["cod_distrito"].map(cod_to_nome_distrito),
    # convert ids to ipma format: two-character, zero-padded strings
    cod_concelho=concelhos["cod_concelho"].astype(str).str.zfill(2),
    cod_distrito=concelhos["cod_distrito"].astype(str).str.zfill(2),
)

# add full id column: district id followed by concelho id
concelhos = concelhos.assign(cod_distconc=concelhos["cod_distrito"] + concelhos["cod_concelho"])

# Sort by the full id; reset_index() keeps the previous row labels in an
# "index" column and gives the frame a fresh 0..n-1 index.
concelhos = concelhos.sort_values("cod_distconc",axis=0).reset_index()
concelhos.head()

Unnamed: 0,index,cod_distrito,cod_concelho,nome_concelho,nome_distrito,cod_distconc
0,14,1,1,agueda,aveiro,101
1,15,1,2,albergaria-a-velha,aveiro,102
2,16,1,3,anadia,aveiro,103
3,17,1,4,arouca,aveiro,104
4,18,1,5,aveiro,aveiro,105


Download

In [8]:
# Download one CSV per (variable, concelho) pair from the IPMA open-data API.
# Keys are the variable names used in the URL path, values are the file-name
# prefixes used by the API.
variavel = {"temperature-min":"mtnmn","temperature-max":"mtxmx","precipitation-total":"mrrto"}
base_url = "https://api.ipma.pt/open-data/observation/climate/"
# URL pattern, e.g. for the maximum temperature:
# https://api.ipma.pt/open-data/observation/climate/temperature-max/{distrito}/mtxmx-{DICO}-{concelho}.csv

# Make sure the target folder exists before the first download.
download_dir = ROOT_FOLDER+"ipma\\"
os.makedirs(download_dir, exist_ok=True)

# (district name, full id, concelho name) for every concelho, in table order.
targets = list(zip(concelhos["nome_distrito"], concelhos["cod_distconc"], concelhos["nome_concelho"]))

failed_downloads = []
for v, prefix in variavel.items():
    for nome_distrito, cod_distconc, nome_concelho in targets:
        url = base_url+v+"/"+nome_distrito+"/"+prefix+"-"+cod_distconc+"-"+nome_concelho+".csv"
        filename = download_dir+v+"_"+nome_distrito+"_"+nome_concelho+".csv"
        # Skip files that are already present: wget.download does not overwrite
        # an existing target but saves under a numbered " (1)"-style name, and
        # such duplicates would later be picked up by the "*.csv" glob.
        if os.path.exists(filename):
            continue
        try:
            wget.download(url, filename)
        except Exception as err:
            # Best effort: a missing or unreachable file must not stop the other
            # downloads, but it is recorded instead of being silently ignored.
            # KeyboardInterrupt is not caught, so the loop can still be aborted.
            failed_downloads.append((url, err))

if failed_downloads:
    print("\n{} download(s) failed:".format(len(failed_downloads)))
    for failed_url, err in failed_downloads:
        print(" ", failed_url, "->", err)


100% [................................................................................] 6590 / 6590

File management

In [9]:
import os
import glob
# The following cells read the downloaded files by bare file name, so the
# download folder becomes the working directory.
os.chdir(ROOT_FOLDER+"ipma")
extension = 'csv'
# glob returns names in arbitrary order; list them once and sort them so that
# the header lists below and every later use of `all_filenames` share one
# deterministic order.
all_filenames = sorted(glob.glob('*.{}'.format(extension)))

# Number of header entries generated per file.
# Assumes every IPMA CSV has 6 columns - TODO confirm against a downloaded file.
COLUMNS_PER_FILE = 6

# headers: for each variable, the concelho name repeated once per column of
# its file. File names look like "<variable>_<distrito>_<concelho>.csv".
precip = []
temp_min = []
temp_max = []
for i in all_filenames:
    concelho = i.split("_")[-1][:-4]  # last "_" field without ".csv"
    if "precipitation" in i: precip.extend([concelho]*COLUMNS_PER_FILE)
    elif "temperature-min" in i: temp_min.extend([concelho]*COLUMNS_PER_FILE)
    # Explicit test instead of a catch-all `else`: an unrelated CSV in the
    # folder must not add entries to the temperature-max headers.
    elif "temperature-max" in i: temp_max.extend([concelho]*COLUMNS_PER_FILE)


In [10]:
# combine files per variable
def combine_variable(filenames, keyword):
    """Read every file whose name contains `keyword` and join them side by side.

    The returned frame has two-level columns: (concelho name, original CSV
    column). The concelho name is the last "_"-separated field of the file
    name without its ".csv" suffix. Labels are built from the frames that
    were actually read, so they cannot drift out of step with the data,
    whatever the number of columns per file.

    Raises ValueError (from pd.concat) if no file name contains `keyword`.
    """
    selected = [f for f in filenames if keyword in f]
    frames = [pd.read_csv(f) for f in selected]
    labels = [
        (f.split("_")[-1][:-4], column)
        for f, frame in zip(selected, frames)
        for column in frame.columns
    ]
    combined = pd.concat(frames, axis=1)
    combined.columns = pd.MultiIndex.from_tuples(labels)
    return combined

comb_precip = combine_variable(all_filenames, "precipitation")
comb_mintemp = combine_variable(all_filenames, "temperature-min")
comb_maxtemp = combine_variable(all_filenames, "temperature-max")

print(comb_precip.shape,comb_mintemp.shape,comb_maxtemp.shape)

(86, 1662) (86, 1662) (87, 1662)


In [11]:
# Pull the "date" sub-column of every concelho out of each combined table.
date_precip = comb_precip.xs('date', axis = 1, level = 1)
date_mintemp = comb_mintemp.xs('date', axis = 1, level = 1)
date_maxtemp = comb_maxtemp.xs('date', axis = 1, level = 1)

# dates are consistent in each df: number of rows holding a single distinct date
for date_table in (date_precip, date_mintemp, date_maxtemp):
    print((date_table.nunique(axis=1) == 1).sum())

# The first concelho's dates stand in for the whole table.
first_precip = date_precip.iloc[:,0]
first_mintemp = date_mintemp.iloc[:,0]
first_maxtemp = date_maxtemp.iloc[:,0]

# common dates: a row is kept only if its date also occurs in both other tables
keep_precip = first_precip.isin(first_mintemp) & first_precip.isin(first_maxtemp)
keep_mintemp = first_mintemp.isin(first_precip) & first_mintemp.isin(first_maxtemp)
keep_maxtemp = first_maxtemp.isin(first_precip) & first_maxtemp.isin(first_mintemp)

# date filter
comb_precip = comb_precip.loc[keep_precip,:]
comb_mintemp = comb_mintemp.loc[keep_mintemp,:]
comb_maxtemp = comb_maxtemp.loc[keep_maxtemp,:]
print(comb_precip.shape,comb_mintemp.shape,comb_maxtemp.shape)

# TODO: filter common rows with low variability?

86
86
87
(85, 1662) (85, 1662) (85, 1662)


In [12]:
# Sub-columns removed from every concelho block before reshaping.
dropped_columns = ["date",'minimum','maximum','range','std']

def _values_by_concelho(combined):
    """Return one column per concelho with the statistic left after the drop.

    The `dropped_columns` sub-columns are removed, the remaining inner column
    level is stacked into the rows, and the two index columns produced by
    reset_index() are discarded. The result has a fresh 0..n-1 index.
    """
    kept = combined.drop(dropped_columns,level=1,axis=1)
    return kept.stack().reset_index().iloc[:,2:]

def _first_dates(combined):
    """Return the "date" sub-column of the first concelho of `combined`."""
    return combined.xs("date", axis=1, level=1).iloc[:,0]

# Dates of the final table. `rename` is used without inplace=True: the in-place
# form is documented to return None, so its result must not be assigned.
dates_final = _first_dates(comb_precip).rename("Date")

# The three tables are joined by row position below, so their dates have to
# agree row by row; stop instead of silently misaligning the data.
for table_name, table in (("MinTemp", comb_mintemp), ("MaxTemp", comb_maxtemp)):
    if list(_first_dates(table)) != list(dates_final):
        raise ValueError("dates of {} do not match the precipitation dates".format(table_name))

comb_precip2 = _values_by_concelho(comb_precip)
comb_mintemp2 = _values_by_concelho(comb_mintemp)
comb_maxtemp2 = _values_by_concelho(comb_maxtemp)
print(comb_precip2.shape,comb_mintemp2.shape,comb_maxtemp2.shape)

#final df: one top-level column group per variable, indexed by date
d = {'Precipitation' : comb_precip2, 'MinTemp' : comb_mintemp2,"MaxTemp":comb_maxtemp2}
ipma = pd.concat(list(d.values()), axis=1, keys=list(d.keys()))
ipma.index = dates_final
print(ipma.head())

#export final file to csv
ipma.to_csv(ROOT_FOLDER+"ipma_clean.csv", index=True, encoding='utf-8-sig')

(85, 277) (85, 277) (85, 277)
           Precipitation                                     \
                abrantes   agueda aguiar-da-beira alandroal   
Date                                                          
2020-09-04      0.000000  0.00000             0.0       0.0   
2020-09-05      0.006481  0.00371             0.0       0.0   
2020-09-06      0.000110  0.00000             0.0       0.0   
2020-09-07      0.000000  0.00000             0.0       0.0   
2020-09-08      0.000000  0.00000             0.0       0.0   

                                                                            \
           albergaria-a-velha albufeira alcacer-do-sal  alcanena  alcobaca   
Date                                                                         
2020-09-04           0.000000  0.000000       0.000000  0.000000  0.000000   
2020-09-05           0.005914  0.000000       0.000000  0.002494  0.001766   
2020-09-06           0.000000  0.000000       0.000000  0.006322  0.035467 

### References
[1] Meteorological data: http://api.ipma.pt/#services<br>
[2] Area code data (concelhos/distritos): https://github.com/centraldedados/codigos_postais<br>