# Preprocessing Data from MongoDB

In [1]:
# imports
from pymongo import MongoClient
from pprint import pprint
import pandas as pd
import numpy as np
from datetime import datetime
import os 
import glob

In [24]:
# Connect to MongoDB; MongoClient() without arguments connects to "localhost".
client = MongoClient()

# Select the database to work with (this does not create a new client).
db = client['TravelDashboard']

# Collect the paths of all .csv files in the "Preprocessed_data_24h_2mins"
# folder, resolved relative to the current working directory.
all_files = glob.glob(os.path.join(os.path.abspath(""), "Preprocessed_data_24h_2mins", "*.csv"))

# Load every CSV file and insert its rows as documents into the travel_data collection.
# NOTE: the documents carry no "_id" of their own, so re-running this cell inserts
# the same rows a second time.
for file in all_files:
    records = pd.read_csv(file, index_col=0).to_dict(orient='records')
    # insert_many() rejects an empty document list, so skip CSV files without data rows
    # instead of aborting the whole import.
    if records:
        db.travel_data.insert_many(records)

In [20]:
# Destructive: removes the entire travel_data collection, including every
# document that was inserted into it.
db["travel_data"].drop()

In [25]:
# MongoDB filters selecting low-flying aircraft (geo_altitude below 3000) that are
# clearly climbing or descending. Units are not stated here — presumably metres and
# metres per second; confirm against the data source.

# Starting aircraft: low altitude and a vertical rate above +4.
query_starting = {
    "geo_altitude": {"$lt": 3000},
    "vertical_rate": {"$gt": 4},
}

# Landing aircraft: low altitude and a vertical rate below -4.
query_landing = {
    "geo_altitude": {"$lt": 3000},
    "vertical_rate": {"$lt": -4},
}

In [26]:
def get_data(collection, query_starting, query_landing):
    """Fetch the starting and landing records from a MongoDB collection.

    Args:
        collection: Object exposing ``find(query)`` that yields documents
            (dicts), e.g. a pymongo collection.
        query_starting: Filter passed to ``find`` to select starting aircraft.
        query_landing: Filter passed to ``find`` to select landing aircraft.

    Returns:
        Tuple ``(df_starting, df_landing)`` of DataFrames holding the matched
        documents without the MongoDB ``_id`` column. A query without matches
        yields an empty DataFrame.
    """
    def _fetch(query):
        # Materialise the cursor into one DataFrame row per document.
        frame = pd.DataFrame(list(collection.find(query)))
        # errors="ignore": a query without matches produces a column-less frame,
        # for which a plain drop(columns="_id") would raise KeyError.
        return frame.drop(columns="_id", errors="ignore")

    df_starting = _fetch(query_starting)
    df_landing = _fetch(query_landing)

    return df_starting, df_landing


In [27]:
# Run both queries against the travel_data collection and keep the resulting
# DataFrames for the starting and the landing aircraft.
starting, landing = get_data(
    collection=db["travel_data"],
    query_starting=query_starting,
    query_landing=query_landing,
)

In [30]:
def _seats_per_country(df, start, end):
    """Sum ``avg_no_seats`` per ``country_cc`` for rows inside a daily time window."""
    # Derive the time of day in a separate Series instead of adding a column,
    # so the caller's DataFrame is left unmodified.
    time_of_day = pd.to_datetime(df['time'], unit='s').dt.time
    if start <= end:
        in_window = time_of_day.between(start, end)
    else:
        # Window wraps past midnight (e.g. 22:00:00 -> 02:00:00): keep rows
        # after the start OR before the end.
        in_window = (time_of_day >= start) | (time_of_day <= end)
    df = df[in_window]

    # Only keep one row per callsign (the one with the lowest altitude,
    # because it is closest to the airport).
    # NOTE: GroupBy.first() takes the first non-null value of each column, so
    # missing values in the lowest row are filled from the next rows of that callsign.
    per_flight = df.sort_values(by=['geo_altitude'], ascending=True).groupby('callsign', as_index=False).first()

    return per_flight.groupby('country_cc')['avg_no_seats'].sum()


def get_total_per_country(df_starting = starting, df_landing = landing, from_time = '00:00:00', to_time = '23:59:59'):
    """Compute the net number of seats per country within a daily time window.

    For both frames the ``time`` column is read as epoch seconds, rows whose
    time of day lies in ``[from_time, to_time]`` are kept, each callsign is
    reduced to its lowest-altitude row and ``avg_no_seats`` is summed per
    ``country_cc``. No timezone conversion is applied to the epoch values.

    Args:
        df_starting: DataFrame of starting aircraft with the columns ``time``,
            ``geo_altitude``, ``callsign``, ``country_cc`` and ``avg_no_seats``.
            Defaults to the ``starting`` frame that exists when the function
            is defined.
        df_landing: DataFrame of landing aircraft with the same columns.
            Defaults to the ``landing`` frame that exists when the function
            is defined.
        from_time: Inclusive start of the window as ``'HH:MM:SS'``.
        to_time: Inclusive end of the window as ``'HH:MM:SS'``. If it is
            earlier than ``from_time`` the window wraps past midnight.

    Returns:
        Series indexed by ``country_cc`` holding landing seats minus starting
        seats; a country present on only one side counts as 0 on the other.

    Raises:
        ValueError: If ``from_time`` or ``to_time`` is not in ``'HH:MM:SS'`` format.
    """
    # Define the period to look at.
    start = datetime.strptime(from_time, '%H:%M:%S').time()
    end = datetime.strptime(to_time, '%H:%M:%S').time()

    seats_starting = _seats_per_country(df_starting, start, end)
    seats_landing = _seats_per_country(df_landing, start, end)

    # fill_value=0 keeps countries that appear in only one of the two sums.
    sum_pass = seats_landing.sub(seats_starting, fill_value = 0)

    return sum_pass