In [1]:
import math
import numbers
import os
from pathlib import Path

import geopy.distance
import pandas as pd

# Absolute path of the input Parquet file (the relative path is resolved
# against the current working directory)
df_path = os.path.abspath('../../assignments/assignment03/results/routes.parquet')
# Working directory at the time this cell runs
current_dir = Path.cwd()

# 7.a: Partition by the first letter of the key

In [2]:
# Load the Parquet file at df_path into a DataFrame and preview the first five rows
df = pd.read_parquet(df_path)
df.head()

Unnamed: 0,airline,src_airport,dst_airport,codeshare,equipment
0,"{'active': True, 'airline_id': 410, 'alias': '...","{'airport_id': 2965.0, 'altitude': 89.0, 'city...","{'airport_id': 2990.0, 'altitude': 411.0, 'cit...",False,[CR2]
1,"{'active': True, 'airline_id': 410, 'alias': '...","{'airport_id': 2966.0, 'altitude': -65.0, 'cit...","{'airport_id': 2990.0, 'altitude': 411.0, 'cit...",False,[CR2]
2,"{'active': True, 'airline_id': 410, 'alias': '...","{'airport_id': 2966.0, 'altitude': -65.0, 'cit...","{'airport_id': 2962.0, 'altitude': 1054.0, 'ci...",False,[CR2]
3,"{'active': True, 'airline_id': 410, 'alias': '...","{'airport_id': 2968.0, 'altitude': 769.0, 'cit...","{'airport_id': 2990.0, 'altitude': 411.0, 'cit...",False,[CR2]
4,"{'active': True, 'airline_id': 410, 'alias': '...","{'airport_id': 2968.0, 'altitude': 769.0, 'cit...","{'airport_id': 4078.0, 'altitude': 365.0, 'cit...",False,[CR2]


In [3]:
# Expand each nested (dict-valued) column into flat columns. Every expanded
# frame gets its own prefix so identically named fields (e.g. 'iata', 'name')
# from the airline, source airport and destination airport do not collide.
airline_df = pd.json_normalize(df['airline']).add_prefix('airline.')
src_df = pd.json_normalize(df['src_airport']).add_prefix('src.')
dst_df = pd.json_normalize(df['dst_airport']).add_prefix('dst.')

# Replace the nested columns with their flattened counterparts, keeping the
# remaining original columns at the end
nested_columns = ['airline', 'src_airport', 'dst_airport']
remaining_df = df.drop(columns=nested_columns)
df = pd.concat([airline_df, src_df, dst_df, remaining_df], axis=1)

df.sample(5)

Unnamed: 0,airline.active,airline.airline_id,airline.alias,airline.callsign,airline.country,airline.iata,airline.icao,airline.name,src.airport_id,src.altitude,...,dst.icao,dst.latitude,dst.longitude,dst.name,dst.source,dst.timezone,dst.type,dst.tz_id,codeshare,equipment
43365,True,4687,Swiss European,SPIRIT WINGS,United States,NK,NKS,Spirit Airlines,3670.0,607.0,...,KATL,33.6367,-84.428101,Hartsfield Jackson Atlanta International Airport,OurAirports,-5.0,airport,America/New_York,False,[320]
8155,True,330,ANA All Nippon Airways,AIR CANADA,Canada,AC,ACA,Air Canada,111.0,244.0,...,CYUL,45.4706,-73.740799,Montreal / Pierre Elliott Trudeau Internationa...,OurAirports,-5.0,airport,America/Toronto,True,"[DH4, DH1, DH3]"
36976,True,3090,,KLM,Netherlands,KL,KLM,KLM Royal Dutch Airlines,178.0,3557.0,...,EHAM,52.308601,4.76389,Amsterdam Airport Schiphol,OurAirports,1.0,airport,Europe/Amsterdam,False,[330]
34244,True,4867,,TAM,Brazil,JJ,TAM,TAM Brazilian Airlines,2555.0,16.0,...,SBGR,-23.435556,-46.473057,Guarulhos - Governador André Franco Montoro In...,OurAirports,-3.0,airport,America/Sao_Paulo,False,[320]
63367,True,5399,Varig,WEB-BRASIL,Brazil,WJ,WEB,WebJet Linhas A,94.0,39.0,...,CYHR,50.468899,-59.6367,Chevery Airport,OurAirports,-4.0,airport,America/Blanc-Sablon,False,[BE1]


In [4]:
# Build the composite key: source IATA + destination IATA + airline IATA.
# The concatenation is done column-wise instead of with a row-wise
# df.apply(axis=1), which is far slower and does not return a Series on an
# empty frame.
# Every code is passed through str(), so a missing code ends up as the literal
# text 'nan' inside the key (for example 'ABAIKTnan').
df['key'] = (
    df['src.iata'].map(str)
    + df['dst.iata'].map(str)
    + df['airline.iata'].map(str)
)
df[['src.iata', 'dst.iata', 'airline.iata', 'key']].sample(5)

Unnamed: 0,src.iata,dst.iata,airline.iata,key
38006,BCN,FRA,LH,BCNFRALH
19909,RHO,DUS,DE,RHODUSDE
12317,BOG,CUR,AV,BOGCURAV
40936,CMB,IXM,MJ,CMBIXMMJ
44497,TXL,PRG,OK,TXLPRGOK


In [5]:
# Inclusive (first, last) letter ranges; each range is one partition
partitions = (
    ('A', 'A'), ('B', 'B'), ('C', 'D'), ('E', 'F'),
    ('G', 'H'), ('I', 'J'), ('K', 'L'), ('M', 'M'),
    ('N', 'N'), ('O', 'P'), ('Q', 'R'), ('S', 'T'),
    ('U', 'U'), ('V', 'V'), ('W', 'X'), ('Y', 'Z'),
)


def get_partition(key):
    """Return the partition label for ``key`` based on its first character.

    The first character is compared case-insensitively against the letter
    ranges in ``partitions``.

    Parameters
    ----------
    key : str
        Key to classify.

    Returns
    -------
    str or None
        ``'A'`` style label for a single-letter range, ``'C-D'`` style label
        for a multi-letter range, or ``None`` when the key is empty or its
        first character falls outside every range (digits, punctuation, ...).
    """
    # An empty key has no first character to look at
    if not key:
        return None

    # Upper-case so that keys such as 'nanJFKAA' still land in a partition
    key_letter = str(key[0]).upper()

    for first, last in partitions:
        if first <= key_letter <= last:
            return first if first == last else f'{first}-{last}'

    return None

# Label every row with the partition its key falls into, then spot-check a few rows
df['kv_key'] = df['key'].map(get_partition)
df[['key', 'kv_key']].sample(5)

Unnamed: 0,key,kv_key
65641,JEDRUHXY,I-J
56504,GLAEWRUA,G-H
48336,PBDBOMS2,O-P
24560,SOFATHFB,S-T
45869,LHELHRPK,K-L


In [39]:
# Write the frame as a Parquet dataset under results/kv/, partitioned on kv_key
df.to_parquet(path='results/kv/', partition_cols=['kv_key'])

# 7.b: Partition by the first character of the hashed key

In [6]:
import hashlib

def hash_key(key):
    """Return the SHA-256 hex digest of ``str(key)`` encoded as UTF-8."""
    encoded = str(key).encode('utf-8')
    return hashlib.sha256(encoded).hexdigest()

# Store the SHA-256 digest of every key in a new column and preview the first five rows
df['hashed'] = df['key'].map(hash_key)
df[['key', 'hashed']].head()

Unnamed: 0,key,hashed
0,AERKZN2B,652cdec02010381f175efe499e070c8cbaac1522bac59a...
1,ASFKZN2B,9eea5dd88177f8d835b2bb9cb27fb01268122b635b241a...
2,ASFMRV2B,161143856af25bd4475f62c80c19f68936a139f653c1d3...
3,CEKKZN2B,39aa99e6ae2757341bede9584473906ef1089e30820c90...
4,CEKOVB2B,143b3389bce68eea3a13ac26a9c76c1fa583ec2bd26ea8...


In [7]:
# Partition label: the first character of the hex digest, upper-cased
df['hash_key'] = df['hashed'].str[0].str.upper()
df[['key', 'hashed', 'hash_key']].sample(5)

Unnamed: 0,key,hashed,hash_key
18578,HKGSWACZ,f535ae5d545e0df90eac2c7bff02f652ee91f44106af39...,F
1507,LPACGN4U,4cb11868d0be5e75fcae508f5cc7084a252ef3b6e42211...,4
8352,YYCYQLAC,26183c562f540daee3b4705d5a0226ded258eb9551bb47...,2
19192,TNALHWCZ,67d13026145f04bcbfaab0c96c6a1fe5cdb9089b0c0e76...,6
41997,NKGNNGMU,2b94ced0186d1d4bd276404c90d521160efcb8034fd9e2...,2


In [56]:
# Write the frame as a Parquet dataset under results/hash/, partitioned on hash_key
df.to_parquet(path='results/hash/', partition_cols=['hash_key'])

# 7.c: Partition by the data center closest to the source airport

In [8]:
# Data center name -> (latitude, longitude)
data_center_dict = {
    'west': (45.5945645, -121.1786823),
    'central': (41.1544433, -96.0422378),
    'east': (39.08344, -77.6497145)
}


def closest_center(lat, lon, center_dict=data_center_dict):
    """Return the name of the data center closest to the point ``(lat, lon)``.

    Distances are geodesic distances in kilometres computed with
    ``geopy.distance.geodesic`` (not the Haversine formula).

    Parameters
    ----------
    lat, lon : real number
        Coordinates of the point. Any ``numbers.Real`` value is accepted, so
        NumPy integer and floating scalars work as well as ``int``/``float``.
    center_dict : mapping of str -> (lat, lon), optional
        Candidate centers. Defaults to ``data_center_dict``.

    Returns
    -------
    str or None
        Key of the nearest center (the first one wins on a tie), or ``None``
        when either coordinate is not a finite real number or when
        ``center_dict`` is empty.
    """
    # Reject missing / non-numeric / NaN / infinite coordinates up front
    for value in (lat, lon):
        if not isinstance(value, numbers.Real) or not math.isfinite(value):
            return None

    origin = (float(lat), float(lon))

    def distance_km(center):
        return geopy.distance.geodesic(origin, center_dict[center]).km

    # min() keeps the first center on ties, matching a strict '<' scan
    return min(center_dict, key=distance_km, default=None)

# Assign every row the data center nearest to its source airport coordinates
df['location'] = [
    closest_center(lat, lon)
    for lat, lon in zip(df['src.latitude'], df['src.longitude'])
]
df[['src.latitude', 'src.longitude', 'location']].sample(5)

Unnamed: 0,src.latitude,src.longitude,location
21293,42.932598,-71.435699,east
41524,40.161098,94.809196,west
60789,8.945147,-64.151085,east
58221,15.3808,73.831398,west
18831,30.79545,106.1626,west


In [121]:
# Write the frame as a Parquet dataset under results/geo/, partitioned on location
df.to_parquet(path='results/geo/', partition_cols=['location'])

# 7.d: Balanced, ordered key-range partitions

In [9]:
def balance_partitions(keys, num_partitions):
    """Split ``keys`` into ``num_partitions`` contiguous, sorted partitions.

    The keys are sorted once and cut into consecutive slices, so each
    partition is ordered and covers a contiguous range of the sorted keys.
    Every key is placed in exactly one partition: when the number of keys is
    not a multiple of ``num_partitions`` the leftover keys are spread over the
    first partitions, so partition sizes differ by at most one.

    Parameters
    ----------
    keys : iterable
        Mutually comparable keys.
    num_partitions : int
        Number of partitions to produce; must be at least 1.

    Returns
    -------
    list of list
        Exactly ``num_partitions`` lists. Trailing lists are empty when there
        are fewer keys than partitions.

    Raises
    ------
    ValueError
        If ``num_partitions`` is smaller than 1.
    """
    if num_partitions < 1:
        raise ValueError('num_partitions must be at least 1')

    ordered_keys = sorted(keys)

    # base_size keys per partition, plus one extra for the first `extra` partitions
    base_size, extra = divmod(len(ordered_keys), num_partitions)

    partitions = []
    start_index = 0
    for index in range(num_partitions):
        end_index = start_index + base_size + (1 if index < extra else 0)
        # Slices of a sorted list are already sorted, no re-sort needed
        partitions.append(ordered_keys[start_index:end_index])
        start_index = end_index

    return partitions

# Show the keys assigned to the first of 500 partitions
print(balance_partitions(df['key'].tolist(), 500)[0])

['AAEALGAH', 'AAECDGAH', 'AAEISLAH', 'AAELYSAH', 'AAEMRSAH', 'AAEMRSZI', 'AAEORNAH', 'AAEORYAH', 'AAEORYZI', 'AALAARBA', 'AALAGPDY', 'AALALCDY', 'AALAMSAZ', 'AALAMSKL', 'AALARNSK', 'AALBCNIB', 'AALBCNVY', 'AALBLLDX', 'AALBLLSK', 'AALBLLTK', 'AALCPHDY', 'AALCPHSK', 'AALISLTK', 'AALLGWDY', 'AALOSLBA', 'AALOSLM3', 'AALOSLSK', 'AALPMIDY', 'AALSVGDX', 'AANCCJIX', 'AANPEWNL', 'AAQDMES7', 'AAQLEDSU', 'AAQSVOSU', 'AARAALBA', 'AARAGPFR', 'AARBMABA', 'AARCPHSK', 'AARGOTBA', 'AAROSLBA', 'AARPMIFR', 'AARSTNFR', 'AATURCCZ', 'AATURCGS', 'AAXPOJAD', 'AAYSAHFO', 'ABADMES7', 'ABAIKTnan', 'ABANSKY7', 'ABASVOSU', 'ABDMHDB9', 'ABDMHDIR', 'ABDSYZEP', 'ABDTHRB9', 'ABDTHREP', 'ABDTHRIR', 'ABEATLAF', 'ABEATLDL', 'ABEATLKL', 'ABECLTAA', 'ABECLTUS', 'ABEDTWDL', 'ABEMYRG4', 'ABEORDUA', 'ABEPGDG4', 'ABEPHLAA', 'ABEPHLUS', 'ABEPIEG4', 'ABESFBG4', 'ABIDFWAA', 'ABIDFWUS', 'ABJACCE2', 'ABJACCEK', 'ABJACCHF', 'ABJACCME', 'ABJACCMS', 'ABJACCS9', 'ABJACCSA', 'ABJALGAH', 'ABJBKOHF', 'ABJBKOL6', 'ABJBOY2J', 'ABJBRUAC', 'A