In [10]:
county_stations = {
    'Tarrant': ['DFW', 'GKY', 'FTW', 'AFW'],
    'McLennan': ['ACT', 'PWG'],
    'Montague': ['0F2'],
    'Stephens': ['BKD'],
    'Wise': ['XBP', 'LUD'],
    'Johnson': ['CPT'],
    'Comanche': ['MKN'],
    'Navarro': ['CRS'],
    'Dallas': ['DAL', 'RBD'],
    'Denton': ['DTO'],
    'Cooke': ['GLE'],
    'Coryell': ['GOP'],
    'Young': ['RPH'],
    'Hood': ['GDJ'],
    'Hunt': ['GVT'],
    'Hamilton': ['MNZ'],
    'Hill': ['INJ'],
    'Collin': ['TKI'],
    'Palo Pinto': ['MWL'],
    'Grayson': ['GYI'],
    'Erath': ['SEP'],
    'Kaufman': ['TRL'],
    'Ellis': ['JWY'],
    'Eastland': ['BKD', 'MKN'],
    'Jack': ['RPH', 'XBP', 'LUD'],
    'Parker': ['MWL', 'GDJ', 'DFW', 'GKY', 'FTW', 'AFW', 'DTO'],
    'Rockwall': ['DAL', 'RBD', 'TKI', 'GVT', 'TRL'],
    'Somervell': ['SEP', 'GDJ', 'CPT'],
    'Bosque': ['MNZ', 'INJ', 'ACT', 'PWG', 'GOP']
}

station_counties = {}
for county, stations in county_stations.items():
    for station in stations:
        if station not in station_counties:
            station_counties[station] = []
        station_counties[station].append(county)
station_counties

{'DFW': ['Tarrant', 'Parker'],
 'GKY': ['Tarrant', 'Parker'],
 'FTW': ['Tarrant', 'Parker'],
 'AFW': ['Tarrant', 'Parker'],
 'ACT': ['McLennan', 'Bosque'],
 'PWG': ['McLennan', 'Bosque'],
 '0F2': ['Montague'],
 'BKD': ['Stephens', 'Eastland'],
 'XBP': ['Wise', 'Jack'],
 'LUD': ['Wise', 'Jack'],
 'CPT': ['Johnson', 'Somervell'],
 'MKN': ['Comanche', 'Eastland'],
 'CRS': ['Navarro'],
 'DAL': ['Dallas', 'Rockwall'],
 'RBD': ['Dallas', 'Rockwall'],
 'DTO': ['Denton', 'Parker'],
 'GLE': ['Cooke'],
 'GOP': ['Coryell', 'Bosque'],
 'RPH': ['Young', 'Jack'],
 'GDJ': ['Hood', 'Parker', 'Somervell'],
 'GVT': ['Hunt', 'Rockwall'],
 'MNZ': ['Hamilton', 'Bosque'],
 'INJ': ['Hill', 'Bosque'],
 'TKI': ['Collin', 'Rockwall'],
 'MWL': ['Palo Pinto', 'Parker'],
 'GYI': ['Grayson'],
 'SEP': ['Erath', 'Somervell'],
 'TRL': ['Kaufman', 'Rockwall'],
 'JWY': ['Ellis']}

In [14]:
import pandas as pd
import os

selected_dfs = []

pd.options.mode.chained_assignment = None

stations_dir = r"..\daily expansion\data"
for file in os.listdir(stations_dir):
    station = file[:3]

    if station not in station_counties:
        continue

    print(station)

    path = os.path.join(stations_dir, file)
    station_df = pd.read_csv(path)

    #print(station_df.head(3))

    selected = station_df[['Date', 'Hour', 'Temperature', 'Precipitation', 'Snow', 'Wind Speed']]

    selected_cols = selected.columns.drop('Date').drop('Hour')
    selected[selected_cols] = selected[selected_cols].apply(pd.to_numeric, errors='coerce')
    selected['Date'] = pd.to_datetime(selected['Date'])

    #selected = selected[selected['Max Wind Speed'] < 100] # correct for some error where the max wind speed is swapped with the max wind direction
    #selected.dropna(subset=['Avg Temp', 'Max Temp', 'Min Temp'], inplace=True) # rows where these are NaN are likely missing all values anyway
    #selected.fillna(0, inplace=True)

    selected.rename({'Temperature': f'{station}_temperature', 
                     'Precipitation': f'{station}_precipitation', 
                     'Snow': f'{station}_snow', 
                     'Wind Speed': f'{station}_wind_speed'}, axis=1, inplace=True)
    
    selected_dfs.append(selected)

valid_dates = []

for df in selected_dfs:
    valid_dates.extend(df['Date'].values)

unique_dates = sorted(list(set(valid_dates)))

dates = []
hours = []

for date in unique_dates:
    dates.append(date)
    hours.append(6)

    dates.append(date)
    hours.append(12)

    dates.append(date)
    hours.append(18)

    dates.append(date)
    hours.append(24)

total = pd.DataFrame({'Date': dates, 'Hour': hours})

for df in selected_dfs:
    total = pd.merge(total, df, on=['Date', 'Hour'], how='left')

#total.interpolate(method='pad', inplace=True) # kind of shit method but does allow for the rest of the process to happen

#total.to_csv('all_variables2.csv', index=False)
total = total.dropna()#.to_csv('all_variables_full2.csv', index=False)

0F2
ACT
AFW
BKD
CPT
CRS
DAL
DFW
DTO
FTW
GDJ
GKY
GLE
GOP
GVT
GYI
INJ
JWY
LUD
MKN
MNZ
MWL
PWG
RBD
RPH
SEP
TKI
TRL
XBP


In [24]:
#total = pd.read_csv('all_variables_full2.csv')
county_total = pd.DataFrame()
county_total['Date'] = total['Date']
county_total['Hour'] = total['Hour']

for county, stations in county_stations.items():
    county_avg_columns = [f'{county}_temperature', f'{county}_precipitation', f'{county}_snow', 
                          f'{county}_wind_speed']
    station_columns = lambda station: [f'{station}_temperature', f'{station}_precipitation', f'{station}_snow', 
                          f'{station}_wind_speed']
    columns_to_average = {}
    for station in stations:
        cols = station_columns(station)
        for i in range(len(county_avg_columns)):
            if county_avg_columns[i] not in columns_to_average:
                columns_to_average[county_avg_columns[i]] = []
            columns_to_average[county_avg_columns[i]].append(cols[i])
    
    for col in county_avg_columns:
        # take averages for wind and temp
        if 'temp' in col or 'wind' in col:
            county_total[col] = total[columns_to_average[col]].mean(axis=1).round(1)
        # sum over the precip and snow since they accumulate
        elif 'precip' in col or 'snow' in col:
            county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)

county_total.to_csv('county_variables.csv', index=False)

  county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)
  county_total[col] = total[columns_to_average[col]].mean(axis=1).round(1)
  county_total[col] = total[columns_to_average[col]].mean(axis=1).round(1)
  county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)
  county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)
  county_total[col] = total[columns_to_average[col]].mean(axis=1).round(1)
  county_total[col] = total[columns_to_average[col]].mean(axis=1).round(1)
  county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)
  county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)
  county_total[col] = total[columns_to_average[col]].mean(axis=1).round(1)
  county_total[col] = total[columns_to_average[col]].mean(axis=1).round(1)
  county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)
  county_total[col] = total[columns_to_average[col]].sum(axis=1).round(3)
  county_total[col] = total[colu