# Assignment 3

Import libraries and define common helper functions

In [1]:
import os
import sys
import gzip
import json
from pathlib import Path
import csv

import pandas as pd
import s3fs
import pyarrow as pa
from pyarrow.json import read_json
import pyarrow.parquet as pq
import fastavro
import pygeohash
import snappy
import jsonschema
from jsonschema.exceptions import ValidationError

import shutil
import math


# Remote storage endpoint URL; none of the cells visible below reference it.
endpoint_url = 'https://storage.budsc.midwest-datascience.com'

# Every path is resolved against the directory the kernel is running in.
current_dir = Path.cwd()
schema_dir = current_dir / 'schemas'
results_dir = current_dir / 'results'
# Create the output directory up front so later cells can write into it.
results_dir.mkdir(parents=True, exist_ok=True)


def read_jsonl_data(src_data_path='data/processed/openflights/routes.jsonl.gz'):
    """
    Load a gzip-compressed JSON Lines file and return its decoded records.

    Parameters
    ----------
    src_data_path : str or path-like, optional
        Location of the ``.jsonl.gz`` file. Defaults to the OpenFlights routes
        file, relative to the current working directory.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``) or is not
        valid gzip data.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # gzip.open accepts a path directly, so no separate open() is needed.
    with gzip.open(src_data_path, 'rb') as f:
        # Stream line by line instead of readlines(), and skip blank lines
        # (such as a trailing newline) that would make json.loads fail.
        return [json.loads(line) for line in f if line.strip()]

In [2]:
# Load every route record into memory; the cells below reuse this list.
records = read_jsonl_data()

# My go-to flight for international travel (record index 20022), pretty-printed as JSON
print(json.dumps(records[20022], sort_keys=False, indent=4))

{
    "airline": {
        "airline_id": 2009,
        "name": "Delta Air Lines",
        "alias": "CSA Czech Airlines",
        "iata": "DL",
        "icao": "DAL",
        "callsign": "DELTA",
        "country": "United States",
        "active": true
    },
    "src_airport": {
        "airport_id": 3864,
        "name": "Albany International Airport",
        "city": "Albany",
        "country": "United States",
        "iata": "ALB",
        "icao": "KALB",
        "latitude": 42.74829864501953,
        "longitude": -73.80169677734375,
        "altitude": 285,
        "timezone": -5.0,
        "dst": "A",
        "tz_id": "America/New_York",
        "type": "airport",
        "source": "OurAirports"
    },
    "dst_airport": {
        "airport_id": 3682,
        "name": "Hartsfield Jackson Atlanta International Airport",
        "city": "Atlanta",
        "country": "United States",
        "iata": "ATL",
        "icao": "KATL",
        "latitude": 33.6367,
        "longitude": -8

In [26]:
# Put the route records into a DataFrame (one row per record).
df = pd.DataFrame(records)

In [27]:

def make_key(row):
    """
    Build a route key by concatenating three IATA codes.

    The key is ``<source airport><destination airport><airline>``,
    e.g. ``'ALBATLDL'``.

    Parameters
    ----------
    row : object
        Anything exposing ``airline``, ``src_airport`` and ``dst_airport``
        attributes that can be subscripted with ``'iata'`` (for example a row
        handed over by ``DataFrame.apply(..., axis=1)``).

    Returns
    -------
    str
        The composed key, or the placeholder ``'ZZZZZZZZ'`` when any of the
        three lookups fails.
    """
    try:
        airline = row.airline['iata']
        src_airport = row.src_airport['iata']
        dst_airport = row.dst_airport['iata']
    except (AttributeError, LookupError, TypeError):
        # Missing attribute, missing 'iata' entry, or a non-subscriptable value
        # (None / NaN). Deliberately not a bare `except`, so KeyboardInterrupt
        # and SystemExit still propagate.
        return 'ZZZZZZZZ'
    return f'{src_airport}{dst_airport}{airline}'

In [28]:
# Compute the route key for every row and store it as a new column (mutates `df` in place).
df['key'] = df.apply(make_key, axis=1)

In [29]:
# Display the frame, including the new `key` column.
df

Unnamed: 0,airline,src_airport,dst_airport,codeshare,equipment,key
0,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2965, 'name': 'Sochi Internatio...","{'airport_id': 2990, 'name': 'Kazan Internatio...",False,[CR2],AERKZN2B
1,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2966, 'name': 'Astrakhan Airpor...","{'airport_id': 2990, 'name': 'Kazan Internatio...",False,[CR2],ASFKZN2B
2,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2966, 'name': 'Astrakhan Airpor...","{'airport_id': 2962, 'name': 'Mineralnyye Vody...",False,[CR2],ASFMRV2B
3,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2968, 'name': 'Chelyabinsk Bala...","{'airport_id': 2990, 'name': 'Kazan Internatio...",False,[CR2],CEKKZN2B
4,"{'airline_id': 410, 'name': 'Aerocondor', 'ali...","{'airport_id': 2968, 'name': 'Chelyabinsk Bala...","{'airport_id': 4078, 'name': 'Tolmachevo Airpo...",False,[CR2],CEKOVB2B
...,...,...,...,...,...,...
67658,"{'airline_id': 4178, 'name': 'Regional Express...","{'airport_id': 6334, 'name': 'Whyalla Airport'...","{'airport_id': 3341, 'name': 'Adelaide Interna...",False,[SF3],WYAADLZL
67659,"{'airline_id': 19016, 'name': 'Apache Air', 'a...","{'airport_id': 4029, 'name': 'Domodedovo Inter...","{'airport_id': 2912, 'name': 'Manas Internatio...",False,[734],DMEFRUZM
67660,"{'airline_id': 19016, 'name': 'Apache Air', 'a...","{'airport_id': 2912, 'name': 'Manas Internatio...","{'airport_id': 4029, 'name': 'Domodedovo Inter...",False,[734],FRUDMEZM
67661,"{'airline_id': 19016, 'name': 'Apache Air', 'a...","{'airport_id': 2912, 'name': 'Manas Internatio...","{'airport_id': 2913, 'name': 'Osh Airport', 'c...",False,[734],FRUOSSZM


## 3.1

### 3.1.a JSON Schema

In [3]:
def validate_jsonl_data(records):
    """
    Validate each record against the routes JSON Schema and log the outcome to CSV.

    Reads ``routes-schema.json`` from ``schema_dir`` and writes
    ``validation-results.csv`` into ``results_dir`` with one row per record:

    * ``row_num``  - 0-based position of the record in ``records``
    * ``is_valid`` - whether the record conforms to the schema
    * ``msg``      - the record itself when valid, otherwise the validation
      error message explaining why it was rejected

    Parameters
    ----------
    records : iterable
        Decoded JSON records to check.

    Raises
    ------
    jsonschema.exceptions.SchemaError
        If the schema file is not itself a valid JSON Schema.
    """
    schema_path = schema_dir.joinpath('routes-schema.json')
    validation_csv_path = results_dir.joinpath('validation-results.csv')

    with open(schema_path, encoding='utf-8') as f:
        schema = json.load(f)

    # Build the validator once. jsonschema.validate() re-checks the schema and
    # constructs a fresh validator on every call, which is wasted work when
    # repeated for every record.
    validator_cls = jsonschema.validators.validator_for(schema)
    validator_cls.check_schema(schema)
    validator = validator_cls(schema)

    # newline='' hands line-ending control to the csv writer.
    with open(validation_csv_path, 'w', encoding='utf-8', newline='') as f:
        fieldnames = ['row_num', 'is_valid', 'msg']
        csv_writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator='\n')
        csv_writer.writeheader()

        for i, record in enumerate(records):
            try:
                validator.validate(record)
            except ValidationError as e:
                # Record the reason for the rejection rather than repeating the record.
                result = dict(row_num=i, is_valid=False, msg=e.message)
            else:
                result = dict(row_num=i, is_valid=True, msg=record)

            # Written after the try block rather than in `finally`, so an
            # unexpected exception (e.g. KeyboardInterrupt) cannot write a
            # stale or undefined row.
            csv_writer.writerow(result)
                
# Validate every loaded record; the outcome is written to validation-results.csv in results_dir.
validate_jsonl_data(records)

KeyboardInterrupt: 

In [5]:
# Count how many records passed vs. failed schema validation.
pd.read_csv("results/validation-results.csv")['is_valid'].value_counts()

True     60910
False      790
Name: is_valid, dtype: int64

### 3.1.b Avro

In [6]:
def create_avro_dataset(records):
    """
    Serialise the route records to an Avro file.

    The Avro schema is loaded from ``routes.avsc`` in ``schema_dir`` and the
    output is written to ``routes.avro`` in ``results_dir``.

    Parameters
    ----------
    records : iterable
        Records to write; passed straight to ``fastavro.writer``.
    """
    avro_schema_file = schema_dir.joinpath('routes.avsc')
    avro_output_file = results_dir.joinpath('routes.avro')

    # The schema is read before the output is opened, so a missing or invalid
    # schema file does not leave an empty .avro file behind.
    with open(avro_schema_file) as schema_handle:
        avro_schema = json.load(schema_handle)

    with open(avro_output_file, 'wb') as avro_handle:
        fastavro.writer(avro_handle, avro_schema, records)
    
        
# Write the Avro version of the dataset (routes.avro in results_dir).
create_avro_dataset(records)

### 3.1.c Parquet

In [7]:
def create_parquet_dataset():
    """
    Convert the gzip-compressed routes JSON Lines file to Parquet.

    Reads ``data/processed/openflights/routes.jsonl.gz`` (relative to the
    working directory) with pyarrow's JSON reader and writes the resulting
    table to ``routes.parquet`` in ``results_dir``.
    """
    routes_gz_path = 'data/processed/openflights/routes.jsonl.gz'
    parquet_target = results_dir.joinpath('routes.parquet')

    # Decompress on the fly and let pyarrow parse the stream into a table.
    with open(routes_gz_path, 'rb') as compressed, gzip.open(compressed, 'rb') as jsonl_stream:
        routes_table = read_json(jsonl_stream)
        pq.write_table(routes_table, parquet_target)

# Write the Parquet version of the dataset (routes.parquet in results_dir).
create_parquet_dataset()

### 3.1.d Protocol Buffers

In [None]:
# Put the local `routes_pb2` path at the front of sys.path so the import below can find it.
sys.path.insert(0, os.path.abspath('routes_pb2'))

# NOTE(review): presumably the protoc-generated module for the routes messages — confirm.
import routes_pb2

def _airport_to_proto_obj(airport):
    """
    Convert an airport dictionary into a ``routes_pb2.Airport`` message.

    Parameters
    ----------
    airport : dict or None
        Airport fields keyed by name (``airport_id``, ``name``, ``city``, ...).

    Returns
    -------
    routes_pb2.Airport or None
        The populated message, or ``None`` when ``airport`` is ``None`` or has
        no ``airport_id``.
    """
    message = routes_pb2.Airport()
    if airport is None or airport.get('airport_id') is None:
        return None

    message.airport_id = airport.get('airport_id')

    # Optional fields are copied only when the source value is truthy.
    optional_fields = (
        'name', 'city', 'iata', 'icao', 'altitude', 'timezone',
        'dst', 'tz_id', 'type', 'source',
    )
    for field in optional_fields:
        value = airport.get(field)
        if value:
            setattr(message, field, value)

    # Coordinates are assigned unconditionally.
    for field in ('latitude', 'longitude'):
        setattr(message, field, airport.get(field))

    return message


def _airline_to_proto_obj(airline):
    """
    Convert an airline dictionary into a ``routes_pb2.Airline`` message.

    Parameters
    ----------
    airline : dict or None
        Airline fields keyed by name (``airline_id``, ``name``, ``alias``, ...).

    Returns
    -------
    routes_pb2.Airline
        The populated message. When ``airline`` is ``None`` or has no
        ``airline_id`` an empty message is returned instead of ``None``.
    """
    message = routes_pb2.Airline()
    if airline is None or airline.get('airline_id') is None:
        # An empty message (not None) is the deliberate "no airline" result.
        return message

    message.airline_id = airline.get('airline_id')

    # Optional text fields are copied only when the source value is truthy.
    for field in ('name', 'alias', 'iata', 'icao', 'callsign', 'country'):
        value = airline.get(field)
        if value:
            setattr(message, field, value)

    # `active` is assigned unconditionally.
    message.active = airline.get('active')

    return message


def create_protobuf_dataset(records):
    """
    Serialise the route records with Protocol Buffers.

    Builds one ``routes_pb2.Route`` per record, collects them in a
    ``routes_pb2.Routes`` message and writes the serialised bytes twice into
    ``results_dir``: uncompressed as ``routes.pb`` and Snappy-compressed as
    ``routes.pb.snappy``.

    Parameters
    ----------
    records : iterable of dict
        Route records with optional ``src_airport``, ``dst_airport``,
        ``airline``, ``codeshare`` and ``equipment`` entries.
    """
    routes = routes_pb2.Routes()
    for record in records:
        route = routes_pb2.Route()

        # Source and destination airports get identical treatment.
        for field_name in ('src_airport', 'dst_airport'):
            airport = _airport_to_proto_obj(record.get(field_name))
            target = getattr(route, field_name)
            if airport is not None:
                target.CopyFrom(airport)
            else:
                # No usable airport: fill the key fields with zeros.
                target.airport_id = 0
                target.latitude = 0
                target.longitude = 0

        airline = _airline_to_proto_obj(record.get('airline'))
        if airline is not None:
            route.airline.CopyFrom(airline)
        else:
            # Assigning None to a protobuf scalar field raises TypeError, so
            # the placeholder name is an empty string.
            route.airline.airline_id = 0
            route.airline.name = ''
            route.airline.active = False

        # Leave the field at its default when the record has no codeshare value.
        codeshare = record.get('codeshare')
        if codeshare is not None:
            route.codeshare = codeshare

        # A missing/None equipment list is treated as empty.
        route.equipment.extend(record.get('equipment') or [])

        routes.route.append(route)

    # Serialise once and reuse the bytes for both outputs.
    serialized = routes.SerializeToString()

    # Uncompressed
    data_path = results_dir.joinpath('routes.pb')
    with open(data_path, 'wb') as f:
        f.write(serialized)

    # Snappy compression
    compressed_path = results_dir.joinpath('routes.pb.snappy')
    with open(compressed_path, 'wb') as f:
        f.write(snappy.compress(serialized))
        
# Write the Protocol Buffers versions of the dataset (routes.pb and routes.pb.snappy in results_dir).
create_protobuf_dataset(records)

### 3.1.e Output Sizes

In [None]:
# Compare the on-disk size of every output file written directly into results_dir.
comparison_csv_path = results_dir.joinpath('comparison.csv')

# newline='' hands line-ending control to the csv writer.
with open(comparison_csv_path, 'w', encoding="utf-8", newline='') as f:
    fieldnames = ['file', 'size_MB']

    csv_writer = csv.DictWriter(f, fieldnames=fieldnames, lineterminator='\n')
    csv_writer.writeheader()

    # Only top-level files are compared. Walking recursively and joining the
    # bare file name onto results_dir looked for nested geoindex files in the
    # wrong directory; the resulting FileNotFoundError had to be swallowed.
    for path in sorted(results_dir.iterdir()):
        # Skip sub-directories and the report that is being written right now.
        if not path.is_file() or path.name == comparison_csv_path.name:
            continue

        size_mb = path.stat().st_size / 1024 / 1024

        # Right-aligned name plus size, for readable notebook output.
        print(f"{path.name:>25}: {size_mb:.2f}MB")

        csv_writer.writerow(dict(file=path.name, size_MB=size_mb))

## 3.2

### 3.2.a Simple Geohash Index

In [None]:
def create_hash_dirs(records):
    """
    Build a geohash-partitioned, gzip-compressed index of the routes on disk.

    Each record is filed under the geohash of its source airport. With
    ``abc`` being the first three geohash characters, the record is stored as
    one JSON line in ``<results_dir>/geoindex/a/ab/abc.jsonl.gz``.

    Records without a source airport or without source coordinates are
    skipped. Every index file touched by this call is rewritten from scratch,
    so running the function again does not duplicate lines.

    Parameters
    ----------
    records : iterable of dict
        Route records; ``record['src_airport']`` is expected to hold
        ``latitude`` and ``longitude``.
    """
    geoindex_dir = results_dir.joinpath('geoindex')
    geoindex_dir.mkdir(exist_ok=True, parents=True)

    # Group the serialised records by their 3-character geohash prefix so each
    # index file is opened exactly once.
    buckets = {}
    for record in records:
        src_airport = record.get('src_airport')
        if not isinstance(src_airport, dict):
            # No source airport -> nothing to index this record under.
            continue

        src_lat = src_airport.get('latitude')
        src_long = src_airport.get('longitude')
        if src_lat is None or src_long is None:
            continue

        prefix = pygeohash.encode(src_lat, src_long)[0:3]
        # One flat JSON document per line, which is what the search function reads.
        buckets.setdefault(prefix, []).append(json.dumps(record))

    # Two directory levels deep, file named after the 3-character prefix.
    for prefix, lines in buckets.items():
        bucket_dir = geoindex_dir.joinpath(prefix[0], prefix[0:2])
        bucket_dir.mkdir(parents=True, exist_ok=True)

        # Write the compressed file directly; no temporary .jsonl is needed.
        bucket_path = bucket_dir.joinpath(prefix + '.jsonl.gz')
        with gzip.open(bucket_path, 'wt', encoding='utf-8', newline='\n') as f:
            f.writelines(line + '\n' for line in lines)
        
    
# Build the on-disk geohash index (under results_dir/geoindex) from all loaded records.
create_hash_dirs(records)

### 3.2.b Simple Search Feature

In [None]:
def airport_search(latitude, longitude):
    """
    Print the indexed source airport closest to the given coordinates.

    Looks up the geohash index file whose 3-character prefix matches the
    search point (``<results_dir>/geoindex/a/ab/abc.jsonl.gz``), compares the
    search geohash with the geohash of every source airport in that file and
    prints the nearest airport as indented JSON.

    Parameters
    ----------
    latitude : float
    longitude : float

    Raises
    ------
    FileNotFoundError
        If there is no index file for the geohash cell of the search point.
    """
    search_hash = pygeohash.encode(latitude, longitude)
    search_path = results_dir.joinpath(
        'geoindex', search_hash[0], search_hash[0:2], search_hash[0:3] + '.jsonl.gz'
    )

    closest_airport = None
    closest_airport_distance = math.inf

    with gzip.open(search_path, 'rt', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            airport = json.loads(line)['src_airport']

            # NOTE(review): geohash_approximate_distance looks prefix-based and
            # therefore coarse — confirm whether an exact great-circle distance
            # is wanted here. Ties keep the first airport seen.
            dist = pygeohash.geohash_approximate_distance(
                search_hash,
                pygeohash.encode(airport['latitude'], airport['longitude']),
            )

            if dist < closest_airport_distance:
                # Remember the new best distance as well as the airport;
                # without this update the last line of the file always won.
                closest_airport_distance = dist
                closest_airport = airport

    if closest_airport is None:
        # The index file exists but holds no records.
        print(f'No airports found near ({latitude}, {longitude})')
        return

    print(json.dumps(closest_airport, indent=4))
            

# Example lookup: print the indexed airport nearest to (41.1499988, -95.91779).
airport_search(41.1499988, -95.91779)

In [None]:
# Per-level latitude step, keyed by geohash prefix length (1-7).
# geo_search keeps its own local copy of this table.
lat_error = {
    1: 23,
    2: 2.8,
    3: 0.7,
    4: 0.087,
    5: 0.022,
    6: 0.0027,
    7: 0.00068,
}

# Per-level longitude step, keyed by geohash prefix length (1-7).
long_error = {
    1: 23,
    2: 5.6,
    3: 0.7,
    4: 0.18,
    5: 0.022,
    6: 0.0055,
    7: 0.00068,
}

In [None]:
import numpy as np

In [None]:
def geo_search(lat, long, level = 3, error = 1):
    """
    Return the geohash prefixes of the cells surrounding a coordinate.

    The point is shifted by -1, 0 and +1 steps in latitude and longitude, for
    every step multiplier from 1 up to ``error``; each shifted point is
    geohashed and truncated to ``level`` characters.

    lat: float
    long: float
    level: int - geohash prefix length, must be a key of the step tables (1-7) - default: 3
    error: int - largest multiple of the per-level step to move by - default: 1

    Returns a sorted list of the unique prefixes, always including the prefix
    of the point itself.
    """
    # Per-level steps in degrees (errors pulled from Wikipedia).
    lat_step = {1 : 23,
                2 : 2.8,
                3 : 0.7,
                4 : 0.087,
                5 : 0.022,
                6 : 0.0027,
                7 : 0.00068}

    long_step = {1 : 23,
                 2 : 5.6,
                 3 : 0.7,
                 4 : 0.18,
                 5 : 0.022,
                 6 : 0.0055,
                 7 : 0.00068}

    # Start with the cell that contains the original position.
    origin_hash = pygeohash.encode(lat, long)
    cells = {origin_hash[0:level]}

    multipliers = np.linspace(1, error, error)
    directions = np.arange(-1, 2)

    # A set removes duplicates as the shifted cells are collected.
    for lat_mult in multipliers:
        for long_mult in multipliers:
            cells.update(
                pygeohash.encode(
                    lat + (dx * lat_mult * lat_step[level]),
                    long + (dy * long_mult * long_step[level]),
                )[0:level]
                for dx in directions
                for dy in directions
            )

    return sorted(cells)
    
    

In [None]:
# Neighbouring cells at prefix length 7 around the example point.
geo_search(41.1499988, -95.91779, level = 7)

In [None]:
# Prefix length 3 with step multipliers 1-4, for a point close to longitude 0.
geo_search(45.01413176216629, 0.010470670286730765, level = 3, error = 4)

In [None]:
# Scratch check of the multipliers geo_search generates for error = 5.
np.linspace(1,5,5)