In [1]:
import os.path
import pandas as pd
from utils import Wrangling
import warnings
from unidecode import unidecode
import geopandas as gpd
from shapely.geometry import LineString
from mpl_toolkits.basemap import Basemap
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.io as pio
# Render plotly figures with the JupyterLab renderer.
pio.renderers.default = 'jupyterlab'

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas warnings about chained assignment — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Build all_data.csv the first time the notebook runs, then load it.
# Every column is read as text (dtype=str).
if not os.path.exists("all_data.csv"):
    Wrangling.save_tables_from_pdata_cleans()
data = pd.read_csv("all_data.csv", dtype=str)

# Force file_name to str so the .str filters used later always get text.
data["file_name"] = data["file_name"].astype(str)

Some tables report several ministries (ministérios) in a single row, which I could not parse reliably — see 20220218_135419.pdata_clean and the data_to_fix dataframe generated below. These problematic rows are currently excluded from the analyses.

In [3]:
# Rows whose file_name does not mention "pdata_clean" are set aside for inspection.
mentions_pdata_clean = data["file_name"].str.contains("pdata_clean")
data_to_fix = data[~mentions_pdata_clean]
del mentions_pdata_clean
# we could potentially keep these:
# data_problem["previsao_de_passageiros"] = data_problem["previsao_de_passageiros"].astype(str)
# data_problem = data_problem[~data_problem["previsao_de_passageiros"].str.contains("pdata_clean")]

Removing problematic lines from the main dataframe for analysis

In [4]:
# Remove problematic data: keep only rows whose file_name mentions "pdf".
data["file_name"] = data["file_name"].astype(str)
data = data[data["file_name"].str.contains("pdf")]

# First round of grooming - select specific columns, remove accents, extra spaces.
cols = ["autoridades_apoiadas", "origem", "decolagem_h_local", "destino", "pouso_h_local",
        "motivo", "previsao_de_passageiros", "file_name"]
data_clean = Wrangling.clean_flights(data, cols)  # to do: combine records removed on this grooming with the data_to_fix

# City-name fixes, applied in this order as regex substitutions over the whole
# frame (every column, not only origem/destino). Patterns without ^...$ anchors
# match anywhere inside a cell.
city_name_fixes = [
    ("guarulhos", "sao paulo"),
    ("lisboa", "lisbon"),
    ("ascension island", "wide awake"),
    ("port of spain", "port-of-spain"),
    # "madri" is matched only when it is not already followed by "d";
    # otherwise an existing "madrid" would be rewritten to "madridd".
    (r"madri(?!d)", "madrid"),
    ("gran canaria island", "gran canaria"),
    ("montevideu", "montevideo"),
    ("^carajas$", "parauapebas"),
    ("^londres$", "london"),
    ("^praia$", "praia, santiago island"),
    ("^ilha do sal$", "amilcar cabral"),
]
for pattern, replacement in city_name_fixes:
    data_clean = data_clean.replace(pattern, replacement, regex=True)
del city_name_fixes, pattern, replacement

# Per city, count the number of distinct file_name values in which it appears
# as origin and as destination.
# NOTE(review): this is a count of distinct files, not of flight rows — confirm
# that this is the intended meaning of "number of times a city appeared".
unique_origem = data_clean.groupby('origem')['file_name'].nunique().reset_index()
unique_destino = data_clean.groupby('destino')['file_name'].nunique().reset_index()
# rename cols
unique_origem.columns = ['city', 'count']
unique_destino.columns = ['city', 'count']

unique_cities = pd.concat([unique_origem, unique_destino])  # bind origem and destino
del unique_destino, unique_origem, cols, data_to_fix, data  # clean env

# group again by unique cities and sum count values
unique_cities = unique_cities.groupby('city').sum().reset_index()


Now import the airport names and locations from the whole world using the data published by OpenFlights

In [5]:
# Airport table; its 14 columns are renamed by position.
# NOTE(review): read_csv treats the first line of airports.csv as a header row.
# If the file is a raw dump without a header, the first airport is lost — confirm.
airports = pd.read_csv("airports.csv")
airports.columns = [
    "airportid", "name", "city", "country", "iata", "icao", "latitude",
    "longitude", "altitude", "timezone", "dst", "tz_db", "type", "source",
]

# Lower-case the city names and keep only the columns of interest.
airports = airports.assign(city=airports["city"].str.lower())[
    ["city", "country", "latitude", "longitude"]
]

Merge unique cities sourced from FAB flights with airport locations

In [6]:
# Left join on the city name: every flight city is kept. A city that matches
# several airports yields several rows; a city with no match gets NaN in the
# airport columns.
unique_cities_merge = pd.merge(unique_cities, airports, on="city", how="left")
del unique_cities

Find cities that matched with only one airport  

In [7]:
# unique_cities_merge goes through several grooming passes. Cities that pass a
# pass are accumulated in unique_cities_clean; this first pass creates it.

# Rows where the airport merge found something.
with_airport = unique_cities_merge[unique_cities_merge['country'].notna()]
# Number of distinct values per column for each city.
distinct_per_city = with_airport.groupby('city').nunique().reset_index()

# Cities with exactly one distinct airport latitude.
single_match = distinct_per_city.loc[distinct_per_city['latitude'] == 1, ['city']]

# Fetch country, latitude and longitude for those cities from the airport table.
unique_cities_clean = single_match.merge(airports, on='city', how='left')
del with_airport, distinct_per_city, single_match


Start fixing the cities that were not resolved above:
1 - Check cities that did not have any match among the OpenFlights airports

In [8]:
# Cities for which the airport merge found nothing (country is NaN).
no_airport = unique_cities_merge[unique_cities_merge['country'].isna()]

# I reviewed all cities that were recorded more than 10 times.
frequent_cities = no_airport.loc[no_airport['count'] > 10, ['city']]

## to do: combine again with airports and then with brazilian airports dataframe
df = frequent_cities.merge(airports, on='city', how='left')
# if a city appears in more than one country, take the first option
df = df.groupby('city').head(1)

# Cities that now have coordinates go to the clean table; the remaining ones
# stay in df for the next step.
has_coords = df['latitude'].notna()
unique_cities_clean = pd.concat([unique_cities_clean, df[has_coords]])
df = df[~has_coords]
del no_airport, frequent_cities, has_coords

In [9]:
br_cities = pd.read_csv("brazilian_cities.csv")
# Lower-case the names and replace accented letters with their unaccented form.
br_cities["nome"] = br_cities["nome"].str.lower().map(unidecode)

# fix names:

# to do: only merge records without lat lon at this point and then group by and select one
br_coords = br_cities[["nome", "latitude", "longitude"]]
df = df[["city", "country"]].merge(br_coords, left_on='city', right_on='nome', how="left")
df = df[["city", "country", "latitude", "longitude"]]

# Every row coming out of this lookup is labelled as Brazil.
df = df.assign(country="Brazil")
# if a city appears in more than one state, take the first option
df = df.groupby("city").head(1)

# only two cities were not found: praia and ilha do sal, these will be removed from the analysis
found = df[df['latitude'].notna()]
unique_cities_clean = pd.concat([unique_cities_clean, found])
del df, br_coords, found

3 - Find cities that matched more than one airport and keep just one match per city (in case the airport appeared more than 10 times in the data dataframe)
    - if one of the airports is in Brazil, keep it and remove the others (see vitoria)

In [10]:
# Rows where the airport merge found something.
with_airport = unique_cities_merge[unique_cities_merge['country'].notna()]
distinct_per_city = with_airport.groupby('city').nunique().reset_index()  # count unique

# Cities with more than one distinct airport latitude.
multi_match = distinct_per_city.loc[distinct_per_city['latitude'] > 1, ['city']]

# All airport rows for those cities.
candidates = multi_match.merge(airports, on='city', how='left')

# Where one of the candidates is in Brazil, take the first Brazilian one.
brazil_pick = candidates[candidates['country'] == "Brazil"].groupby('city').head(1)

# For the cities not resolved that way, take the first candidate.
resolved_cities = set(brazil_pick['city'])
other_pick = candidates[~candidates['city'].isin(resolved_cities)].groupby('city').head(1)

# Save to the clean table and remove exact duplicate rows, if any.
unique_cities_clean = pd.concat([unique_cities_clean, other_pick, brazil_pick]).drop_duplicates()
del with_airport, distinct_per_city, multi_match, candidates
del brazil_pick, resolved_cities, other_pick


In [11]:
# Unique city names before and after the clean-up passes.
city_all = unique_cities_merge["city"].unique()
city_clean = unique_cities_clean["city"].unique()

# Rows of the merged table whose city did not make it into the clean table.
df = unique_cities_merge.loc[~unique_cities_merge["city"].isin(city_clean)]
# 165 out of 320 without match, but these cities appeared less than 10 times in the timeseries.

In [12]:
# Clean env: drop the inspection results and the lookup tables that are no longer needed.
del df, city_all, city_clean
del br_cities, airports, unique_cities_merge

In [13]:
# Attach airport coordinates to each flight twice: once for the origin city
# and once for the destination city. Both joins are outer joins; rows left
# without coordinates on either side are dropped further down.
origem_lookup = unique_cities_clean.add_suffix('_origem')
destino_lookup = unique_cities_clean.add_suffix('_destino')
df = data_clean.merge(origem_lookup, left_on='origem', right_on='city_origem', how="outer")
df = df.merge(destino_lookup, left_on='destino', right_on='city_destino', how="outer")

# select specific cols
cols = ["autoridades_apoiadas", "origem", "decolagem_h_local", "destino", "pouso_h_local", "motivo",
        "previsao_de_passageiros", "file_name", 'country_origem', 'latitude_origem', 'longitude_origem',
        'country_destino', 'latitude_destino', 'longitude_destino']
coord_cols = ['latitude_origem', 'longitude_origem', 'latitude_destino', 'longitude_destino']

# Remove rows that do not have coordinates for origem or destino. Selecting and
# dropping in one chain builds a new frame instead of modifying a selection of
# df in place.
data = df[cols].dropna(subset=coord_cols)
del df, cols, coord_cols, origem_lookup, destino_lookup

In [14]:
# Generate spatial data: one two-point LineString (origin -> destination) per
# flight, built by walking the four coordinate columns in parallel rather than
# looking each row up by position.
geometry = [
    LineString([[lon_from, lat_from], [lon_to, lat_to]])
    for lon_from, lat_from, lon_to, lat_to in zip(
        data['longitude_origem'], data['latitude_origem'],
        data['longitude_destino'], data['latitude_destino'],
    )
]
data = gpd.GeoDataFrame(data, geometry=geometry, crs='EPSG:4326')
data.to_file("data_sf.gpkg")
print(data.columns)

Index(['autoridades_apoiadas', 'origem', 'decolagem_h_local', 'destino',
       'pouso_h_local', 'motivo', 'previsao_de_passageiros', 'file_name',
       'country_origem', 'latitude_origem', 'longitude_origem',
       'country_destino', 'latitude_destino', 'longitude_destino', 'geometry'],
      dtype='object')


In [15]:
# Give the flights a consecutive 0..n-1 index named "row_num", so that label
# lookups with a running counter address rows in order. Assigning the index
# directly replaces the earlier reset_index() call, whose result was discarded,
# and keeps this cell safe to re-run.
data.index = pd.RangeIndex(len(data), name="row_num")

In [47]:
import plotly.graph_objects as go
import pandas as pd


fig = go.Figure()

# One small red marker per resolved city; hovering shows the city name.
fig.add_trace(go.Scattergeo(
    lon = unique_cities_clean['longitude'],
    lat = unique_cities_clean['latitude'],
    hoverinfo = 'text',
    text = unique_cities_clean['city'],
    mode = 'markers',
    marker = dict(
        size = 2,
        color = 'rgb(255, 0, 0)',
        line = dict(
            width = 2,
            color = 'rgba(68, 68, 68, 0)'
        )
    )))

# One faint line per flight, each as its own trace. The coordinate columns are
# walked in parallel, so the loop does not depend on how `data` is indexed.
flight_endpoints = zip(
    data['longitude_origem'], data['longitude_destino'],
    data['latitude_origem'], data['latitude_destino'],
)
for lon_from, lon_to, lat_from, lat_to in flight_endpoints:
    fig.add_trace(
        go.Scattergeo(
            lon = [lon_from, lon_to],
            lat = [lat_from, lat_to],
            mode = 'lines',
            line = dict(width = 1, color = 'fuchsia'),
            opacity = 0.1,
        )
    )


fig.update_layout(
    title_text = 'Jul. 2022 Brazilian minister flights with FAB airplanes between 2013 and 2022 <br>(Hover for airport names)',
    showlegend = False,
    geo = dict(
        landcolor = 'black',
        showcoastlines=True, coastlinecolor="darkgrey",
        showocean=True, oceancolor="black",  showlakes=False),
)

fig.show()


In [20]:
import plotly.express as px
import shapely
# Flatten every flight geometry into parallel lat / lon / name sequences.
# Each line is followed by a None coordinate, which breaks the drawn line so
# consecutive flights are not joined together.
lats = []
lons = []
names = []
for feature, name in zip(data.geometry, data.autoridades_apoiadas):
    if isinstance(feature, shapely.geometry.linestring.LineString):
        linestrings = [feature]
    elif isinstance(feature, shapely.geometry.multilinestring.MultiLineString):
        linestrings = feature.geoms
    else:
        continue
    for linestring in linestrings:
        x, y = linestring.xy
        lats.extend(y)
        lons.extend(x)
        lats.append(None)
        lons.append(None)
        # The separator row carries the same name as its flight. With
        # color=names the points are split into one trace per name, so a
        # separator labelled None would leave the trace and the flights of one
        # authority would be connected end to start.
        names.extend([name] * (len(y) + 1))

# Object arrays keep the None separators as None.
lats = np.array(lats, dtype=object)
lons = np.array(lons, dtype=object)
names = np.array(names, dtype=object)

# One colour (and one trace) per supported authority.
fig = px.line_geo(lat=lats, lon=lons, color=names)
                  #projection="orthographic")

fig.update_traces(line=dict(width=1), opacity=.1)

fig.show()
# https://towardsdatascience.com/how-to-create-interactive-map-plots-with-plotly-7b57e889239a
# https://medium.com/@ozgunhaznedar/my-first-streamlit-app-using-choropleth-map-5e4e337a7d66
