In [1]:
import os.path
import pandas as pd
from utils import Wrangling
import warnings
from unidecode import unidecode

warnings.filterwarnings('ignore')

In [2]:
# all_data.csv is a cache: when it is missing, run the Wrangling helper that is
# expected to create it from the PDFs, then load it in either case.
if not os.path.exists("all_data.csv"):
    Wrangling.save_tables_from_pdfs()
data = pd.read_csv("all_data.csv", dtype=str)

# Cast file_name to str so the `.str.contains("pdf")` filters used in the
# following cells get a plain string on every row.
data["file_name"] = data["file_name"].astype(str)

Some tables pack information from multiple ministérios (ministries) into a single row — see `20220218_135419.pdf` and the `data_to_fix` dataframe generated below. These problematic rows are currently excluded from the analyses.

In [3]:
# Rows whose file_name does not contain "pdf" are the problematic ones; keep them
# in their own dataframe so they can be inspected.
data_to_fix = data.loc[
    ~data["file_name"].str.contains("pdf")
]
# Idea for later: some of these rows could potentially be kept by casting
# previsao_de_passageiros to str and dropping only the rows where that column
# contains "pdf".

Removing problematic lines from the main dataframe for analysis

In [4]:
# Remove problematic data: keep only the rows whose file_name contains "pdf".
# The result gets its own name, and the inputs `data` / `data_to_fix` are no longer
# overwritten or deleted, so this cell (and the data_to_fix cell) can be re-run
# without re-loading the CSV, and data_to_fix stays available for inspection.
is_pdf_row = data["file_name"].astype(str).str.contains("pdf")
data_pdf = data[is_pdf_row]

# First round of grooming the dataframe - select specific columns, remove accents, extra spaces
cols = ["autoridades_apoiadas","origem","decolagem_h_local","destino","pouso_h_local","motivo","previsao_de_passageiros","file_name"]
data_clean = Wrangling.clean_flights(data_pdf, cols)  # TODO: combine records removed by this grooming with data_to_fix

# For every city, count the distinct file_name values it appears in, once as origin
# ("origem") and once as destination ("destino"). Only file_name is aggregated; the
# other columns were previously counted and then thrown away.
# NOTE: this counts distinct files, not individual rows of the flight dataframe.
city_counts = [
    data_clean.groupby(col)["file_name"]
    .nunique()
    .rename("count")
    .rename_axis("city")
    .reset_index()
    for col in ("origem", "destino")
]

# Bind origem and destino, then group by city again and sum the two counts
unique_cities = pd.concat(city_counts).groupby("city").sum().reset_index()

del cols, is_pdf_row, data_pdf, city_counts  # clean env (temporaries only)


Now import the airport names and locations from the whole world using the data published by OpenFlights

In [5]:
# Load the airport table and give its 14 columns explicit names.
# NOTE(review): read_csv uses the first row of the file as a header here; if
# airports.csv has no header row, the first airport is silently lost - confirm.
airports = pd.read_csv("airports.csv")
airports.columns = [
    "airportid", "name", "city", "country", "iata", "icao", "latitude",
    "longitude", "altitude", "timezone", "dst", "tz_db", "type", "source",
]

# Lower-case the city names and keep only the columns of interest
airports = airports.assign(city=airports["city"].str.lower())[
    ["city", "country", "latitude", "longitude"]
]

Merge unique cities sourced from FAB flights with airport locations

In [6]:
# Left join on the city name: every flight city is kept, and the ones without a
# matching airport get missing values in country / latitude / longitude.
merge_cities_airports = pd.merge(unique_cities, airports, how="left", on="city")


Start with fixing
1- check cities that did not have any matches with the airports from OpenFlights

In [8]:
# Save the cities that did not have any match in the merge above (country is missing)
merge_cities_airports_check = merge_cities_airports.loc[merge_cities_airports["country"].isna()]

# Only the cities recorded more than 10 times were reviewed by hand
merge_cities_airports_check = merge_cities_airports_check[merge_cities_airports_check["count"] > 10]

# Specific grooming of city names: name found in the flight data -> name to look up
# in the airports table.
city_name_fixes = {
    "guarulhos": "sao paulo",
    "lisboa": "lisbon",
    "ascension island": "wide awake",
    "port of spain": "port-of-spain",
    "madri": "madrid",
    "gran canaria island": "gran canaria",
    "montevideu": "montevideo",
}

# Apply the fixes to the city column only, in the order listed above. The pattern is
# anchored on word boundaries: an unanchored "madri" -> "madrid" substring
# replacement would turn an already correct "madrid" into "madridd".
fixed_city = merge_cities_airports_check["city"]
for wrong_name, airport_city in city_name_fixes.items():
    fixed_city = fixed_city.str.replace(rf"\b{wrong_name}\b", airport_city, regex=True)
merge_cities_airports_check = fixed_city.to_frame()  # single column: "city"
del city_name_fixes, fixed_city, wrong_name, airport_city

# TODO: combine again with airports and then with the Brazilian cities dataframe
merge_cities_airports_check = merge_cities_airports_check.merge(airports, left_on="city", right_on="city", how="left")
# keep the first airport row found for each city
merge_cities_airports_check = merge_cities_airports_check.groupby(["city"]).head(1)

br_cities = pd.read_csv("brazilian_cities.csv")
br_cities["nome"] = br_cities["nome"].str.lower()
br_cities["nome"] = br_cities["nome"].apply(unidecode)  # replace accented letters with their unaccented form

# TODO: only merge records without lat/lon at this point, then group by city and select one
merge_cities_airports_check = merge_cities_airports_check.merge(br_cities, left_on="city", right_on="nome", how="left")


2 - find cities that matched with only one airport  

In [66]:
# From here on keep only the cities that matched at least one airport
# (country is filled in by the merge).
merge_cities_airports = merge_cities_airports[merge_cities_airports["country"].notna()]

# Per city, number of distinct values in each remaining column
merge_n_combinations = merge_cities_airports.groupby("city").nunique().reset_index()

# Cities with exactly one distinct latitude, i.e. a single airport location
merge_one_combination = merge_n_combinations.query("latitude == 1")

3 - find cities that matched with more than one airport and select just one combination (in case airport appeared more than 10 times in the data dataframe)
    - if one of the airports is in Brazil, keep it and remove the others (see vitoria)

In [67]:
# Names of the cities whose merge produced more than one distinct latitude,
# i.e. cities that matched with more than one airport
sel = merge_n_combinations.loc[merge_n_combinations["latitude"].gt(1), "city"].tolist()

# All airport rows belonging to those cities ...
merge_multiple_combinations = merge_cities_airports.loc[merge_cities_airports["city"].isin(sel)]

# ... and, among them, the rows whose airport is in Brazil.
merge_multiple_combinations_br = merge_multiple_combinations.loc[
    merge_multiple_combinations["country"].eq("Brazil")
]

 - if more than one airport is in a city in Brazil (see sao paulo), take the first airport location
 - if none of the airports is in Brazil (use the column from the merge above to remove cities in Brazil), group by country + city, keep the country with the most airports for that city, merge again with the locations and take the first row (`head`)

 https://stackoverflow.com/questions/53842287/select-rows-with-highest-value-from-groupby