In [None]:
# Station codes whose readings are averaged to represent each county.
# A code may be listed under more than one county: the last six entries
# (Eastland through Bosque) consist only of codes that also appear earlier.
county_stations = {
    'Tarrant': ['DFW', 'GKY', 'FTW', 'AFW'],
    'McLennan': ['ACT', 'PWG'],
    'Montague': ['0F2'],
    'Stephens': ['BKD'],
    'Wise': ['XBP', 'LUD'],
    'Johnson': ['CPT'],
    'Comanche': ['MKN'],
    'Navarro': ['CRS'],
    'Dallas': ['DAL', 'RBD'],
    'Denton': ['DTO'],
    'Cooke': ['GLE'],
    'Coryell': ['GOP'],
    'Young': ['RPH'],
    'Hood': ['GDJ'],
    'Hunt': ['GVT'],
    'Hamilton': ['MNZ'],
    'Hill': ['INJ'],
    'Collin': ['TKI'],
    'Palo Pinto': ['MWL'],
    'Grayson': ['GYI'],
    'Erath': ['SEP'],
    'Kaufman': ['TRL'],
    'Ellis': ['JWY'],
    'Eastland': ['BKD', 'MKN'],
    'Jack': ['RPH', 'XBP', 'LUD'],
    'Parker': ['MWL', 'GDJ', 'DFW', 'GKY', 'FTW', 'AFW', 'DTO'],
    'Rockwall': ['DAL', 'RBD', 'TKI', 'GVT', 'TRL'],
    'Somervell': ['SEP', 'GDJ', 'CPT'],
    'Bosque': ['MNZ', 'INJ', 'ACT', 'PWG', 'GOP']
}

# Reverse lookup: station code -> list of counties that use it, in the order
# the counties are declared in county_stations.
station_counties = {}
for county_name, station_codes in county_stations.items():
    for code in station_codes:
        station_counties.setdefault(code, []).append(county_name)
station_counties

In [None]:
import pandas as pd
import os

# One cleaned DataFrame per station file, with columns prefixed by the station code.
selected_dfs = []

# Kept so pandas' global options are unchanged for any code that runs after this
# cell; the loop below works on an explicit copy and no longer depends on it.
pd.options.mode.chained_assignment = None

# Built with os.path.join so the notebook also runs where "\" is not a path
# separator (on Windows this still resolves to "..\data").
stations_dir = os.path.join("..", "data")

# Raw CSV header -> suffix of the output column (prefixed with the station code).
value_columns = {
    'Avg Temp': 'avg_temp',
    'Max Temp': 'max_temp',
    'Min Temp': 'min_temp',
    'Total Precip': 'precip',
    'Total Snow': 'snow',
    'Avg Wind Speed': 'avg_wind',
    'Max Wind Speed': 'max_wind',
}

# Rows whose max wind speed is at or above this value are discarded as bad readings.
max_wind_cutoff = 100

# os.listdir returns entries in arbitrary order; sorting makes the order of
# selected_dfs (and therefore the column order of the merged frame) reproducible.
for file in sorted(os.listdir(stations_dir)):
    # The first three characters of the file name are taken as the station code.
    station = file[:3]

    if station not in station_counties:
        continue

    print(station)

    path = os.path.join(stations_dir, file)
    station_df = pd.read_csv(path)

    # Copy the slice so the conversions below modify an independent frame rather
    # than a possible view of station_df (the cause of SettingWithCopyWarning).
    selected = station_df[['Date', *value_columns]].copy()

    # Non-numeric entries become NaN instead of raising.
    selected_cols = list(value_columns)
    selected[selected_cols] = selected[selected_cols].apply(pd.to_numeric, errors='coerce')
    selected['Date'] = pd.to_datetime(selected['Date'])

    selected = (
        # correct for some error where the max wind speed is swapped with the max wind direction
        # NOTE(review): NaN < cutoff is False, so rows with a missing max wind
        # speed are dropped here as well -- confirm that is intended.
        selected[selected['Max Wind Speed'] < max_wind_cutoff]
        # rows where these are NaN are likely missing all values anyway
        .dropna(subset=['Avg Temp', 'Max Temp', 'Min Temp'])
        # any remaining gaps (precip, snow, avg wind) are treated as zero
        .fillna(0)
        .rename(columns={raw: f'{station}_{suffix}' for raw, suffix in value_columns.items()})
    )

    selected_dfs.append(selected)

# Every date observed at one or more stations, de-duplicated and in chronological order.
unique_dates = sorted({date for df in selected_dfs for date in df['Date'].values})

total = pd.DataFrame({'Date': unique_dates})

# Left-join each station onto the full list of dates; dates a station has no
# row for become NaN in that station's columns.
for df in selected_dfs:
    total = pd.merge(total, df, on='Date', how='left')

# Fill each gap with the most recent earlier value in the same column. This is a
# crude imputation, but it lets the rest of the pipeline run on complete rows.
# ffill() replaces interpolate(method='pad'), which is deprecated in recent pandas.
total = total.ffill()

#total.to_csv('all_variables2.csv', index=False)
# Rows that precede a column's first value have nothing to fill from and are dropped.
total = total.dropna()#.to_csv('all_variables_full2.csv', index=False)

In [None]:
#total = pd.read_csv('all_variables_full2.csv')
# Suffixes shared by the per-station columns of `total` (e.g. 'DFW_avg_temp')
# and the per-county output columns (e.g. 'Tarrant_avg_temp').
variable_suffixes = ['avg_temp', 'max_temp', 'min_temp', 'precip', 'snow', 'avg_wind', 'max_wind']

# Collect every output column first and build the frame in a single step.
# Inserting the 29 x 7 county columns one at a time fragments the frame, which
# pandas reports with a PerformanceWarning.
county_columns = {'Date': total['Date']}

for county, stations in county_stations.items():
    for suffix in variable_suffixes:
        # Row-wise mean of this variable across all stations assigned to the county.
        columns_to_average = [f'{station}_{suffix}' for station in stations]
        county_columns[f'{county}_{suffix}'] = total[columns_to_average].mean(axis=1).round(3)

county_total = pd.DataFrame(county_columns)

county_total.to_csv('county_variables.csv', index=False)