## Load the preprocessed flight data from our Python script into the MongoDB

### Imports

In [1]:
# imports
from pymongo import MongoClient
from pprint import pprint
import pymongo
import pandas as pd
import os
import glob
from datetime import datetime

In [2]:
# Connect to the local MongoDB server; these are the values MongoClient()
# falls back to when called without arguments.
client = MongoClient(host="localhost", port=27017)

In [3]:
# Get a handle on the "TravelDashboard" database of the connected client
db = client.get_database("TravelDashboard")

In [111]:
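# Optional reset: uncomment the next line to drop the travel_data collection
# before reloading (this deletes all documents stored in it)
#db.travel_data.drop()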

Check number of documents in the travel_data collection

In [112]:
# An empty filter matches every document, so this returns the collection size
db["travel_data"].count_documents({})

0

## Load, preprocess and dump files into MongoDB

In [116]:
# os.path.abspath("") resolves to the current working directory, i.e. the
# folder the notebook runs in (a plain .py script would use __file__ instead).
notebook_dir = os.path.abspath("")

# The CSV snapshots live in <parent of notebook folder>/data/csv_data
csv_dir = os.path.join(os.path.dirname(notebook_dir), "data", "csv_data")

# glob returns matches in arbitrary order; sort them so files are always
# loaded in the same order on every run.
all_files = sorted(glob.glob(os.path.join(csv_dir, "*.csv")))


In [117]:
# Put all the documents in the Mongo collection travel_data.
# Each document has the shape {"time": <datetime>, "flight_data": [<one dict per aircraft>]}.

# Columns stored for every aircraft inside a document's "flight_data" list
cols = [
    "callsign", "geo_altitude", "country_cc", "avg_no_seats", "vertical_rate"
]

for file in all_files:
    df = pd.read_csv(file, index_col=0)

    # Keep only rows that are relevant to identify starting and landing planes:
    # low altitude, a vertical rate outside [-4, 4] (between() is inclusive),
    # and a known callsign and country.
    # NOTE(review): units are presumably metres and m/s -- confirm against the preprocessing script.
    is_relevant = (
        (df.geo_altitude <= 3000)
        & ~df.vertical_rate.between(-4, 4)
        & df.callsign.notna()
        & df.country_cc.notna()
    )
    df_clean = df.loc[is_relevant, cols + ["time"]].reset_index(drop=True)

    # Convert epoch seconds to datetime
    df_clean["time"] = pd.to_datetime(df_clean["time"], unit='s')

    # Build one document per timestamp. The previous version kept only the
    # first timestamp group of each file and silently dropped the rest.
    documents = [
        {"time": time, "flight_data": group[cols].to_dict("records")}
        for time, group in df_clean.groupby("time")
    ]

    # A file can end up with no rows after filtering; insert_many rejects an
    # empty list, so skip such files instead of failing the whole load.
    if not documents:
        continue
    db.travel_data.insert_many(documents)

### Create index

In [119]:
# Check the number of documents in the collection after loading
db["travel_data"].count_documents({})

2000

To speed up reads from MongoDB, we create several indexes that should improve the performance of our queries.

In [120]:
# Single-field index on the document timestamp, newest first.
# NOTE(review): every compound index created later in this notebook also
# starts with "time" descending, so this index may be redundant -- confirm
# against the actual query patterns.
time_keys = [("time", pymongo.DESCENDING)]
db.travel_data.create_index(time_keys)

'time_-1'

In [121]:
# Compound index: timestamp (newest first), then altitude and vertical rate
# of the records embedded in "flight_data"
time_altitude_rate_keys = [
    ("time", pymongo.DESCENDING),
    ("flight_data.geo_altitude", pymongo.ASCENDING),
    ("flight_data.vertical_rate", pymongo.ASCENDING),
]
db.travel_data.create_index(time_altitude_rate_keys)

'time_-1_flight_data.geo_altitude_1_flight_data.vertical_rate_1'

In [122]:
# Compound index: timestamp (newest first), then vertical rate of the
# records embedded in "flight_data"
time_rate_keys = [
    ("time", pymongo.DESCENDING),
    ("flight_data.vertical_rate", pymongo.ASCENDING),
]
db.travel_data.create_index(time_rate_keys)

'time_-1_flight_data.vertical_rate_1'

In [123]:
# Compound index: timestamp (newest first), then altitude of the records
# embedded in "flight_data".
# NOTE(review): this key pattern is a prefix of the three-field
# (time, geo_altitude, vertical_rate) index created earlier in this notebook,
# so it may be redundant -- confirm before keeping both.
time_altitude_keys = [
    ("time", pymongo.DESCENDING),
    ("flight_data.geo_altitude", pymongo.ASCENDING),
]
db.travel_data.create_index(time_altitude_keys)

'time_-1_flight_data.geo_altitude_1'

### Check min and max time in the DB

Let's find the minimum and maximum time in our collection.

In [124]:
# Collect the earliest and the latest timestamp stored in the collection.
# Sorting ascending yields the minimum first, then descending yields the maximum;
# the list stays empty when the collection holds no documents.
min_max_time = []
for direction in (pymongo.ASCENDING, pymongo.DESCENDING):
    # Project only "time": the embedded flight_data arrays are not needed
    # here, so there is no reason to transfer them from the server.
    cursor = (
        db.travel_data
        .find({}, {"time": 1, "_id": 0})
        .sort("time", direction)
        .limit(1)
    )
    min_max_time.extend(doc["time"] for doc in cursor)

In [125]:
# Display the collected timestamps: earliest first, latest second
min_max_time

[datetime.datetime(2022, 6, 27, 4, 47, 58),
 datetime.datetime(2022, 6, 30, 10, 56, 1)]