In [6]:
import matplotlib
import scipy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import sklearn
# Setting this option to None turns off pandas' SettingWithCopyWarning for the whole session.
# NOTE(review): this also hides warnings about writes to DataFrame slices in every cell below —
# confirm it is still needed rather than fixing the offending assignments.
pd.options.mode.chained_assignment = None


In [2]:
def parse_data(given_path, file_name='metars_cache_1.csv'):
    """Load a cached METAR CSV file from a directory into a DataFrame.

    Args:
        given_path: Directory that contains the CSV file.
        file_name: Name of the CSV file inside ``given_path``. Defaults to
            ``'metars_cache_1.csv'``, the name that used to be hard-coded,
            so existing single-argument calls behave exactly as before.

    Returns:
        pandas.DataFrame holding the file contents as parsed by ``pd.read_csv``.

    Raises:
        FileNotFoundError: Propagated from ``pd.read_csv`` when the file is missing.
    """
    metars_path = os.path.join(given_path, file_name)
    return pd.read_csv(metars_path)

In [3]:
# Build the absolute path of the `data` directory that sits one level above the current working directory.
save_path = os.path.abspath(os.path.join(os.pardir, 'data'))
# Read the CSV found in that directory into a DataFrame.
given_data = parse_data(save_path)

# Render the loaded frame as this cell's output.
display(given_data)


Unnamed: 0,raw_text,station_id,observation_time,latitude,longitude,temp_c,dewpoint_c,wind_dir_degrees,wind_speed_kt,wind_gust_kt,...,maxT24hr_c,minT24hr_c,precip_in,pcp3hr_in,pcp6hr_in,pcp24hr_in,snow_in,vert_vis_ft,metar_type,elevation_m
0,KBCT 220253Z 35007KT 10SM VCTS BKN050 22/17 A3...,KBCT,2022-10-22T02:53:00Z,26.38,-80.10,22.0,17.0,350.0,7.0,,...,,,,,,,,,METAR,3.0
1,YSNF 220252Z AUTO 09007KT 9999 -SHRA SCT020 SC...,YSNF,2022-10-22T02:52:00Z,-29.03,167.93,18.0,16.0,90.0,7.0,,...,,,,,,,,,SPECI,113.0
2,YCOM 220252Z AUTO 01013KT 5000 // SCT013 SCT02...,YCOM,2022-10-22T02:52:00Z,-36.30,148.97,,,10.0,13.0,,...,,,,,,,,,SPECI,930.0
3,PAFM 220252Z AUTO 12003KT 10SM FEW004 SCT015 O...,PAFM,2022-10-22T02:52:00Z,67.10,-157.85,1.0,0.0,120.0,3.0,,...,,,0.005,,,,,,SPECI,79.0
4,PABV 220252Z AUTO 05003KT 10SM BKN004 02/02 A2...,PABV,2022-10-22T02:52:00Z,61.42,-149.52,2.0,2.0,50.0,3.0,,...,,,,,,,,,SPECI,21.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4680,MMIA 220140Z 00000KT 6SM BKN100 23/22 A3000 RM...,MMIA,2022-10-22T01:40:00Z,19.27,-103.57,23.0,22.0,0.0,0.0,,...,,,,,,,,,METAR,744.0
4681,MMHO 220140Z 24004KT 10SM SKC 29/17 A2984 RMK HZY,MMHO,2022-10-22T01:40:00Z,29.08,-111.05,29.0,17.0,240.0,4.0,,...,,,,,,,,,METAR,186.0
4682,MMGM 220140Z 20006KT 10SM SKC 30/24 A2976,MMGM,2022-10-22T01:40:00Z,27.95,-110.92,30.0,24.0,200.0,6.0,,...,,,,,,,,,METAR,13.0
4683,MMEP 220140Z 00000KT 8SM BKN020 23/20 A3000 RM...,MMEP,2022-10-22T01:40:00Z,21.42,-104.85,23.0,20.0,0.0,0.0,,...,,,,,,,,,METAR,921.0


In [4]:
# Latitude and longitude are the minimum we need for every observation:
# drop any row missing either one, then renumber the surviving rows from 0.
cleaned_data = (
    given_data
    .dropna(axis=0, subset=['latitude', 'longitude'])
    .reset_index(drop=True)
)

In [7]:
def filter_usa(
    df: pd.DataFrame,
    lat_bounds: tuple = (24, 49),
    lon_bounds: tuple = (-125, -67),
) -> pd.DataFrame:
    """Crudely restrict world data to a lat/long box around the lower 48 states.

    The default box uses latitude bounds of [24, 49] and longitude bounds of
    [-125, -67], based on
    https://www.findlatitudeandlongitude.com/l/Lower+48/4315442/
    A rectangular box is only an approximation: any row whose coordinates fall
    inside the box is kept, whether or not it is actually in the US.

    Args:
        df: Frame with numeric ``latitude`` and ``longitude`` columns.
        lat_bounds: ``(min, max)`` latitude, both ends inclusive.
        lon_bounds: ``(min, max)`` longitude, both ends inclusive.

    Returns:
        A new DataFrame (an independent copy, original index labels preserved)
        containing only the rows inside the box. Rows with NaN coordinates are
        dropped because NaN never compares as inside a range.

    Raises:
        KeyError: If ``df`` lacks a ``latitude`` or ``longitude`` column.
    """
    lat_min, lat_max = lat_bounds
    lon_min, lon_max = lon_bounds

    # Series.between is inclusive on both ends by default, matching the >= / <= checks it replaces.
    inside_box = (
        df['latitude'].between(lat_min, lat_max)
        & df['longitude'].between(lon_min, lon_max)
    )

    # Return an explicit copy so callers can add columns to the result without
    # pandas treating the write as an assignment to a slice of `df`.
    return df.loc[inside_box].copy()

# Get a rough estimate of the data corresponding to the US (minus Alaska and Hawaii).
# Take an explicit copy so that adding columns below modifies our own frame
# rather than writing to a filtered slice of cleaned_data.
usa_data = filter_usa(cleaned_data).copy()

# Convert the Celsius readings to Fahrenheit. Plain Series arithmetic is vectorized,
# gives the same values as a per-row lambda, and leaves missing readings as NaN.
usa_data['temp_f'] = usa_data['temp_c'] * (9.0 / 5.0) + 32
usa_data['dewpoint_f'] = usa_data['dewpoint_c'] * (9.0 / 5.0) + 32

display(usa_data)

Unnamed: 0,raw_text,station_id,observation_time,latitude,longitude,temp_c,dewpoint_c,wind_dir_degrees,wind_speed_kt,wind_gust_kt,...,precip_in,pcp3hr_in,pcp6hr_in,pcp24hr_in,snow_in,vert_vis_ft,metar_type,elevation_m,temp_f,dewpoint_f
0,KBCT 220253Z 35007KT 10SM VCTS BKN050 22/17 A3...,KBCT,2022-10-22T02:53:00Z,26.38,-80.10,22.0,17.0,350.0,7.0,,...,,,,,,,METAR,3.0,71.60,62.60
6,KREO 220252Z AUTO 23010G20KT 19/02 A2962 RMK A...,KREO,2022-10-22T02:52:00Z,42.58,-117.87,19.4,1.7,230.0,10.0,20.0,...,,,,,,,METAR,1237.0,66.92,35.06
7,KNRB 220252Z AUTO 02010KT 10SM CLR A3012 RMK A...,KNRB,2022-10-22T02:52:00Z,30.38,-81.42,,,20.0,10.0,,...,,,,,,,METAR,4.0,,
8,KNFW 220252Z 17019G28KT 10SM CLR 26/10 A2982 R...,KNFW,2022-10-22T02:52:00Z,32.78,-97.43,26.1,10.0,170.0,19.0,28.0,...,,,,,,,METAR,188.0,78.98,50.00
9,KBLH 220252Z AUTO 22006KT 10SM CLR 28/10 A2975...,KBLH,2022-10-22T02:52:00Z,33.62,-114.72,28.3,10.0,220.0,6.0,,...,,,,,,,METAR,119.0,82.94,50.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4635,MMML 220140Z 09004KT 10SM SKC 28/07 A2975,MMML,2022-10-22T01:40:00Z,32.62,-115.23,28.0,7.0,90.0,4.0,,...,,,,,,,METAR,21.0,82.40,44.60
4636,MMLP 220140Z 20008KT 10SM SCT200 28/17 A2985 R...,MMLP,2022-10-22T01:40:00Z,24.07,-110.37,28.0,17.0,200.0,8.0,,...,,,,,,,METAR,16.0,82.40,62.60
4639,MMHO 220140Z 24004KT 10SM SKC 29/17 A2984 RMK HZY,MMHO,2022-10-22T01:40:00Z,29.08,-111.05,29.0,17.0,240.0,4.0,,...,,,,,,,METAR,186.0,84.20,62.60
4640,MMGM 220140Z 20006KT 10SM SKC 30/24 A2976,MMGM,2022-10-22T01:40:00Z,27.95,-110.92,30.0,24.0,200.0,6.0,,...,,,,,,,METAR,13.0,86.00,75.20


In [8]:
from sklearn.model_selection import train_test_split