# EDA Script to find patterns in weather delays

### Business Question: How can we enhance operational efficiency by identifying weather patterns, in order to minimise flight delays that result from changing weather conditions?

In [15]:
from pyspark.sql.functions import udf, col, lit, to_date, concat_ws, split
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, MapType
from pyspark.sql import SparkSession
import os
import sys
import json
import requests

In [None]:
# Point both the Spark workers (PYSPARK_PYTHON) and the driver
# (PYSPARK_DRIVER_PYTHON) at the interpreter that is running this script.
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Reuse the active Spark session if one exists; otherwise create a new one.
spark = SparkSession.builder \
                    .appName("weather-script") \
                    .getOrCreate()

# Directory and file name of the raw airline CSV; they are concatenated when
# the file is read.
raw_data_dir = r"s3://operational-raw-data-bucket/raw_data/operational_database/operational_csv/"
csv_file = 'airline.csv'

In [None]:
# Load the airline CSV, using the first row as column names and letting Spark
# infer the column types.
airline_df = spark.read.csv(raw_data_dir + csv_file, header=True, inferSchema=True)
# show() prints the preview itself and returns None, so it is called directly
# rather than wrapped in print().
airline_df.show()

In [None]:
# NOTE(security): this API key is committed in source. It should be rotated
# and supplied only through the API_NINJAS_KEY environment variable; the
# literal is kept as a fallback so existing runs keep working.
header = {
    "X-Api-Key": os.environ.get(
        "API_NINJAS_KEY", "lf/2O0WcstK5VeRxvvASdA==tinCbr385akwLFVh"
    )
}

# iata code -> {'latitude', 'longitude', 'region'} for every origin airport.
airport_coord = {}
# iata codes for which the lookup failed or returned no usable record.
list_failed = []

# A Spark Column has no pandas-style unique(); deduplicate on the cluster and
# bring only the distinct origin codes back to the driver.
origin_rows = airline_df.select("Origin").distinct().collect()

for row in origin_rows:
    iata = row["Origin"]
    if iata is None:
        # A missing origin cannot be looked up.
        continue

    url = f'https://api.api-ninjas.com/v1/airports?iata={iata}'

    try:
        # The request is inside the try so a network error for one airport is
        # recorded as a failure instead of aborting the whole loop.
        data = requests.get(url, headers=header, timeout=30).json()
        airport = data[0]
        airport_coord[iata] = {
            'latitude': airport['latitude'],
            'longitude': airport['longitude'],
            'region': airport['region'],
        }
    except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
        # Network failure, non-JSON body, empty result list, or a response
        # that lacks the expected fields.
        print(f"iata : {iata}")
        list_failed.append(iata)

In [None]:
# Write the airport coordinate lookup to disk as JSON. json.dump serialises
# straight into the open file; json.dumps would only return a string.
with open('airport_coord.json', 'w') as f:
    json.dump(airport_coord, f)

In [None]:
import time

# (seconds to wait, message to print) before the second and third attempts
# at the same request.
_RETRY_PLAN = (
    (65, "\n\nAPI limit hit...sleeping for 1 min...\n\n"),
    (3600, "\n\nAPI limit hit...sleeping for 1 hour...\n\n"),
)


def _request_daily(url):
    """Make one request and return the response's 'daily' section, or None on failure."""
    try:
        return requests.get(url, timeout=60).json()['daily']
    except (requests.RequestException, ValueError, KeyError, TypeError):
        # Network error, non-JSON body, or a response without 'daily'
        # (for example an error payload).
        return None


def _fetch_daily(url):
    """Return the 'daily' section for *url*, retrying per _RETRY_PLAN; None if all attempts fail."""
    daily = _request_daily(url)
    for delay, message in _RETRY_PLAN:
        if daily is not None:
            break
        print(message)
        time.sleep(delay)
        # Every retry is validated again, so a response that still lacks
        # 'daily' cannot slip through to the parsing step.
        daily = _request_daily(url)
    return daily


# airport -> date -> daily weather measurements.
airport_weather_dict = {}
# Airports for which no weather data could be retrieved after all retries.
airports_without_weather = []

for airport in airport_coord:
    airport_weather_dict[airport] = {}

    lat = airport_coord[airport]['latitude']
    long = airport_coord[airport]['longitude']
    print(f"Getting data for {airport} at {lat}, {long}...")
    url = f'https://archive-api.open-meteo.com/v1/archive?start_date=2003-01-01&end_date=2009-01-01&latitude={lat}&longitude={long}&daily=weather_code,precipitation_hours,snowfall_sum,wind_speed_10m_max'

    daily = _fetch_daily(url)
    if daily is None:
        # Leave this airport's entry empty and carry on with the rest rather
        # than aborting the whole run.
        print(f"No weather data retrieved for {airport}; skipping.")
        airports_without_weather.append(airport)
        continue

    # The response holds one list per variable, index-aligned with 'time'.
    daily_rows = zip(
        daily['time'],
        daily['weather_code'],
        daily['precipitation_hours'],
        daily['snowfall_sum'],
        daily['wind_speed_10m_max'],
    )
    for date, weather_code, precipitation_hours, snowfall_sum, wind_speed in daily_rows:
        airport_weather_dict[airport][date] = {
            'weather_code': weather_code,
            'precipitation_hours': precipitation_hours,
            'snowfall_sum': snowfall_sum,
            'wind_speed': wind_speed,
        }

In [None]:
import json

# Serialise the per-airport, per-date weather mapping and store it on disk.
with open('airport_weather_final.json', 'w') as weather_file:
    weather_json = json.dumps(airport_weather_dict)
    weather_file.write(weather_json)