In [3]:
import pandas as pd
import requests
from urllib.parse import quote

# Percent-encoded base URL of the VRA folder on the ANAC open-data server.
# Year, month folder and file name are appended to it by the download loop.
BASE_URL = "".join(
    [
        "https://sistemas.anac.gov.br/dadosabertos/",
        "Voos%20e%20opera%C3%A7%C3%B5es%20a%C3%A9reas/",
        "Voo%20Regular%20Ativo%20%28VRA%29/",
    ]
)

# Month number (1-12) -> Portuguese month name, as used in the remote folder names.
MONTHS = dict(
    enumerate(
        [
            "Janeiro",
            "Fevereiro",
            "Março",
            "Abril",
            "Maio",
            "Junho",
            "Julho",
            "Agosto",
            "Setembro",
            "Outubro",
            "Novembro",
            "Dezembro",
        ],
        start=1,
    )
)

# Years to download, 2018 through 2024 inclusive.
YEARS = list(range(2018, 2025))

# Accumulator for the monthly DataFrames; reset every time this cell runs.
dfs = []

def file_exists(url):
    """Return True when ``url`` answers an HTTP HEAD request with status 200.

    Redirects are followed before the status is checked. ``requests.head``
    does not follow them by default, so without ``allow_redirects=True`` a
    file served through a 3xx redirect would be reported as missing.

    Parameters
    ----------
    url : str
        Fully built (already percent-encoded) URL to probe.

    Returns
    -------
    bool
        True if the final response status is 200. False for any other status
        or when the request raises ``requests.RequestException``; the request
        uses a 10-second timeout. A False result therefore does not
        distinguish "file not published" from "network problem".
    """
    try:
        response = requests.head(url, timeout=10, allow_redirects=True)
    except requests.RequestException:
        # Best-effort probe: a failed request is treated as "not available".
        return False
    return response.status_code == 200

# Visit every (year, month) combination, download the monthly CSV when the
# server reports it as available, and collect the resulting frames in `dfs`.
for year in YEARS:
    for month_num, month_name in MONTHS.items():
        # The folder uses a zero-padded month ("01 - Janeiro"); the file name does not.
        folder = f"{month_num:02d} - {month_name}"
        filename = f"VRA_{year}{month_num}.csv"
        url = f"{BASE_URL}{year}/{quote(folder)}/{filename}"

        if file_exists(url):
            print(f"✔️ Downloading {year}-{month_num:02d}")

            # skiprows=1 discards the first line of each file
            # (presumably a non-header preamble — TODO confirm).
            df = pd.read_csv(
                url,
                sep=";",
                encoding="utf-8",
                skiprows=1,
                low_memory=False,
            ).assign(year=year, month=month_num)  # tag rows with their source period

            dfs.append(df)
        else:
            print(f"❌ Missing: {year}-{month_num:02d}")

# =========================
# FINAL MERGE
# =========================

# pd.concat raises an opaque "No objects to concatenate" ValueError on an
# empty list. That happens when every month was reported missing (for example
# when the network is down, since failed probes count as "missing"), so fail
# with an actionable message instead.
if not dfs:
    raise ValueError(
        "No VRA files were downloaded; check network access, BASE_URL and YEARS."
    )

# Stack the monthly frames into one frame with a fresh 0..n-1 index.
df_final = pd.concat(dfs, ignore_index=True)
print("✅ All available VRA data merged")
print(df_final.shape)


✔️ Downloading 2018-01
✔️ Downloading 2018-02
✔️ Downloading 2018-03
✔️ Downloading 2018-04
✔️ Downloading 2018-05
✔️ Downloading 2018-06
✔️ Downloading 2018-07
✔️ Downloading 2018-08
✔️ Downloading 2018-09
✔️ Downloading 2018-10
✔️ Downloading 2018-11
✔️ Downloading 2018-12
✔️ Downloading 2019-01
✔️ Downloading 2019-02
✔️ Downloading 2019-03
✔️ Downloading 2019-04
✔️ Downloading 2019-05
✔️ Downloading 2019-06
✔️ Downloading 2019-07
✔️ Downloading 2019-08
✔️ Downloading 2019-09
✔️ Downloading 2019-10
✔️ Downloading 2019-11
✔️ Downloading 2019-12
✔️ Downloading 2020-01
✔️ Downloading 2020-02
✔️ Downloading 2020-03
✔️ Downloading 2020-04
✔️ Downloading 2020-05
✔️ Downloading 2020-06
✔️ Downloading 2020-07
✔️ Downloading 2020-08
✔️ Downloading 2020-09
✔️ Downloading 2020-10
✔️ Downloading 2020-11
✔️ Downloading 2020-12
✔️ Downloading 2021-01
✔️ Downloading 2021-02
✔️ Downloading 2021-03
✔️ Downloading 2021-04
✔️ Downloading 2021-05
✔️ Downloading 2021-06
✔️ Downloading 2021-07
✔️ Download

In [2]:
# Display the merged frame; the rendered table elides the middle rows.
# NOTE(review): this cell's execution count (2) is lower than the download
# cell's (3) and the saved output ends at 2018-12, so the output looks stale —
# re-run after the download cell to confirm.
df_final

Unnamed: 0,ICAO Empresa Aérea,Número Voo,Código Autorização (DI),Código Tipo Linha,ICAO Aeródromo Origem,ICAO Aeródromo Destino,Partida Prevista,Partida Real,Chegada Prevista,Chegada Real,Situação Voo,Código Justificativa,year,month
0,AFR,454,0,I,LFPG,SBGR,01/01/2018 20:20,,02/01/2018 08:20,,REALIZADO,,2018,1
1,ARG,1248,0,I,SABE,SBGR,01/01/2018 19:30,,01/01/2018 23:15,,REALIZADO,,2018,1
2,ARG,1251,1,I,SBGL,SABE,,01/01/2018 19:53,,01/01/2018 22:10,REALIZADO,,2018,1
3,ARG,1254,1,I,SABE,SBGL,,01/01/2018 11:02,,01/01/2018 14:29,REALIZADO,,2018,1
4,ARG,1278,0,I,SABE,SBFL,01/01/2018 17:55,,01/01/2018 21:05,,REALIZADO,,2018,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1035164,AZU,4040,0,N,SBCF,SBBR,31/12/2018 07:45,31/12/2018 08:07,31/12/2018 09:10,31/12/2018 09:26,REALIZADO,,2018,12
1035165,AZU,4034,0,N,SBVT,SBKP,31/12/2018 20:20,31/12/2018 20:20,31/12/2018 22:00,31/12/2018 22:00,REALIZADO,,2018,12
1035166,AZU,4033,0,N,SBRJ,SBKP,31/12/2018 16:35,31/12/2018 16:29,31/12/2018 17:45,31/12/2018 17:28,REALIZADO,,2018,12
1035167,AZU,4029,0,N,SBRJ,SBKP,31/12/2018 07:55,31/12/2018 07:46,31/12/2018 09:05,31/12/2018 08:49,REALIZADO,,2018,12


In [5]:
# Sanity check: list the distinct values of the `year` column added by the download loop.
df_final['year'].unique()

array([2018, 2019, 2020, 2021, 2022, 2023, 2024])

In [21]:
# Work on a copy so that changes made to anac_df do not modify df_final.
anac_df = df_final.copy()

In [22]:
# Summarize row count, column dtypes and memory usage of the working copy.
anac_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6411860 entries, 0 to 6411859
Data columns (total 14 columns):
 #   Column                   Dtype 
---  ------                   ----- 
 0   ICAO Empresa Aérea       object
 1   Número Voo               object
 2   Código Autorização (DI)  object
 3   Código Tipo Linha        object
 4   ICAO Aeródromo Origem    object
 5   ICAO Aeródromo Destino   object
 6   Partida Prevista         object
 7   Partida Real             object
 8   Chegada Prevista         object
 9   Chegada Real             object
 10  Situação Voo             object
 11  Código Justificativa     object
 12  year                     int64 
 13  month                    int64 
dtypes: int64(2), object(12)
memory usage: 684.9+ MB


In [23]:
# Count missing values per column before any cleaning.
anac_df.isna().sum()

Unnamed: 0,0
ICAO Empresa Aérea,0
Número Voo,0
Código Autorização (DI),832
Código Tipo Linha,604
ICAO Aeródromo Origem,0
ICAO Aeródromo Destino,0
Partida Prevista,1116291
Partida Real,1304472
Chegada Prevista,132897
Chegada Real,339208


In [24]:
# The four timestamp columns (scheduled/actual departure and arrival,
# going by their Portuguese names).
cols_time = [
    "Partida Prevista", "Partida Real",
    "Chegada Prevista", "Chegada Real",
]

# Keep only the rows in which every one of those timestamps is present.
has_all_times = anac_df[cols_time].notna().all(axis=1)
anac_clean = anac_df.loc[has_all_times]


In [25]:
# Drop the three code columns from the cleaned frame.
# This cell reassigns `anac_clean` from itself, so running it a second time
# used to raise KeyError (the columns were already gone). errors="ignore"
# makes the cell idempotent: a re-run is a no-op.
anac_clean = anac_clean.drop(
    columns=[
        "Código Autorização (DI)",
        "Código Tipo Linha",
        "Código Justificativa",
    ],
    errors="ignore",
)


In [26]:
# Re-count missing values per column after the row filter and column drop.
anac_clean.isna().sum()

Unnamed: 0,0
ICAO Empresa Aérea,0
Número Voo,0
ICAO Aeródromo Origem,0
ICAO Aeródromo Destino,0
Partida Prevista,0
Partida Real,0
Chegada Prevista,0
Chegada Real,0
Situação Voo,0
year,0


In [28]:
# Display the cleaned frame; the original row labels are kept (not reset).
# NOTE(review): the saved output shows two timestamp layouts
# (`01/01/2018 01:04` in the first rows, `2024-12-23 07:10:00` in the last);
# any later datetime conversion presumably has to handle both — confirm.
anac_clean

Unnamed: 0,ICAO Empresa Aérea,Número Voo,ICAO Aeródromo Origem,ICAO Aeródromo Destino,Partida Prevista,Partida Real,Chegada Prevista,Chegada Real,Situação Voo,year,month
7,AVA,85,SKBO,SBGR,01/01/2018 01:04,01/01/2018 01:06,01/01/2018 07:15,01/01/2018 07:01,REALIZADO,2018,1
11,AZU,2404,SBRF,SBMO,01/01/2018 13:50,01/01/2018 13:44,01/01/2018 14:45,01/01/2018 14:35,REALIZADO,2018,1
13,AZU,2428,SBCF,SBBR,01/01/2018 17:30,01/01/2018 17:25,01/01/2018 19:00,01/01/2018 18:45,REALIZADO,2018,1
17,AZU,2537,SBCF,SBPA,01/01/2018 21:35,01/01/2018 21:52,02/01/2018 00:01,02/01/2018 00:27,REALIZADO,2018,1
19,AZU,2596,SBCY,SBPV,01/01/2018 12:15,01/01/2018 11:59,01/01/2018 14:10,01/01/2018 13:50,REALIZADO,2018,1
...,...,...,...,...,...,...,...,...,...,...,...
6411800,TAM,3962,SBGL,SBPA,2024-12-23 07:10:00,2024-12-23 06:58:00,2024-12-23 09:15:00,2024-12-23 09:03:00,REALIZADO,2024,12
6411801,TAM,3962,SBGL,SBPA,2024-12-24 07:10:00,2024-12-24 07:02:00,2024-12-24 09:15:00,2024-12-24 09:09:00,REALIZADO,2024,12
6411802,TAM,3962,SBGL,SBPA,2024-12-26 07:10:00,2024-12-26 07:07:00,2024-12-26 09:15:00,2024-12-26 09:02:00,REALIZADO,2024,12
6411803,TAM,3962,SBGL,SBPA,2024-12-27 07:10:00,2024-12-27 07:05:00,2024-12-27 09:15:00,2024-12-27 09:00:00,REALIZADO,2024,12


Mudar nomes das colunas e tipos