In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

In [2]:
# valley fever data
vf = pd.read_excel('../../data/original_datasets/ValleyFeverDashboard_Data.xlsx')
# weather data
wf = pd.read_csv('../../data/original_datasets/weather_data_original.csv')

print(f'The original shape of vf: {vf.shape}, The original shape of wf: {wf.shape}')

The original shape of vf: (1410, 4), The original shape of wf: (19241397, 28)


In [3]:
# rename columns
vf = vf.rename(columns={
    'Valley Fever Cases and Incidence Rates by Local Health Jurisdiction, California, 2001-2022 ' : 'County',
    'Unnamed: 1' : 'Year',
    'Unnamed: 2' : 'Cases', 
    'Unnamed: 3' : 'Rate'
})

# remove unuseful info
vf.drop(index=vf.index[0],axis=0,inplace=True)
vf.drop(index=vf.index[-1],axis=0,inplace=True)
vf.drop('Rate', axis=1, inplace=True) 

#change dtypes

vf['Year'] = pd.to_datetime(vf['Year'], format='%Y')
vf['Year'] = vf['Year'].dt.year
vf['Cases'] = pd.to_numeric(vf['Cases'])

#remove counties from vf where there are less than 60 cases or it is a total of multiple counies 
counties_to_drop = ["ALPINE", "DEL NORTE", "GLENN", "INYO", "LAKE", "MARIPOSA", "MODOC", "MONO", "PLUMAS", "SHASTA", "SIERRA", "TRINITY", "LOS ANGELES", 'ALAMEDA COUNTY TOTAL', 'CALIFORNIA TOTAL', 'LOS ANGELES COUNTY TOTAL']
vf = vf.drop(vf[vf['County'].isin(counties_to_drop)].index, axis=0)
print(vf.shape)
print(len(vf["County"].unique()))

# relabel all of the counties so it is in a better format
vf['County'] = vf['County'].replace({
    'ALAMEDA': 'Alameda',
    'AMADOR': 'Amador',
    'BERKELEY': 'Berkeley',
    'BUTTE': 'Butte',
    'CALAVERAS': 'Calaveras',
    'COLUSA': 'Colusa',
    'CONTRA COSTA': 'ContraCosta',
    'EL DORADO': 'ElDorado',
    'FRESNO': 'Fresno',
    'HUMBOLDT': 'Humboldt',
    'IMPERIAL': 'Imperial',
    'KERN': 'Kern',
    'LASSEN': 'Lassen',
    'LONG BEACH': 'LongBeach',
    'MADERA': 'Madera',
    'MARIN': 'Marin',
    'MENDOCINO': 'Mendocino',
    'MERCED': 'Merced',
    'MONTEREY': 'Monterey',
    'NAPA': 'Napa',
    'NEVADA': 'Nevada',
    'ORANGE': 'Orange',
    'PASADENA': 'Pasadena',
    'PLACER': 'Placer',
    'RIVERSIDE': 'Riverside',
    'SACRAMENTO': 'Sacramento',
    'SAN BENITO': 'SanBenito',
    'SAN BERNARDINO': 'SanBernardino',
    'SAN DIEGO': 'SanDiego',
    'SAN FRANCISCO': 'SanFrancisco',
    'SAN JOAQUIN': 'SanJoaquin',
    'SAN LUIS OBISPO': 'SanLuisObispo',
    'SAN MATEO': 'SanMateo',
    'SANTA BARBARA': 'SantaBarbara',
    'SANTA CLARA': 'SantaClara',
    'SANTA CRUZ': 'SantaCruz',
    'SISKIYOU': 'Siskiyou',
    'SOLANO': 'Solano',
    'SONOMA': 'Sonoma',
    'STANISLAUS': 'Stanislaus',
    'SUTTER': 'Sutter',
    'TEHAMA': 'Tehama',
    'TULARE': 'Tulare',
    'TUOLUMNE': 'Tuolumne',
    'VENTURA': 'Ventura',
    'YOLO': 'Yolo',
    'YUBA': 'Yuba',
    'KINGS': 'Kings'
})
print(vf['County'].unique())

(1056, 3)
48
['Alameda' 'Amador' 'Berkeley' 'Butte' 'Calaveras' 'Colusa' 'ContraCosta'
 'ElDorado' 'Fresno' 'Humboldt' 'Imperial' 'Kern' 'Kings' 'Lassen'
 'LongBeach' 'Madera' 'Marin' 'Mendocino' 'Merced' 'Monterey' 'Napa'
 'Nevada' 'Orange' 'Pasadena' 'Placer' 'Riverside' 'Sacramento'
 'SanBenito' 'SanBernardino' 'SanDiego' 'SanFrancisco' 'SanJoaquin'
 'SanLuisObispo' 'SanMateo' 'SantaBarbara' 'SantaClara' 'SantaCruz'
 'Siskiyou' 'Solano' 'Sonoma' 'Stanislaus' 'Sutter' 'Tehama' 'Tulare'
 'Tuolumne' 'Ventura' 'Yolo' 'Yuba']


# Valley Fever df is done needing to be cleaned, now it is time to clean the weather df

In [4]:
# int this version i desited to keep more of the values except the ones below
col_to_drop = ['sea_level', 'grnd_level', 'dt', 'weather_icon', 'timezone', 'lat', 'lon']
wf = wf.drop(col_to_drop, axis=1)

# create a datetime colummn for mthe dt_iso
wf['dt_iso'] = wf['dt_iso'].str.replace(' UTC', '')
wf['date'] = pd.to_datetime(wf['dt_iso'], format='%Y-%m-%d %H:%M:%S %z', utc=True)
wf = wf.drop('dt_iso', axis=1)

#relabell all the columns for better appearance and readability
wf = wf.rename(columns={
    'city_name': 'County',
    'temp': 'Temp',
    'visibility': 'Vis',
    'dew_point': 'Dewpt',
    'feels_like': 'Feels',
    'temp_min': 'Tmin',
    'temp_max': 'Tmax',
    'pressure': 'Pres',
    'humidity': 'Humid',
    'wind_speed': 'Windsp',
    'wind_deg': 'Windir',
    'clouds_all': 'Clouds',
    'weather_id': 'Wthrid',
    'weather_main': 'Wthrmain',
    'weather_description': 'Wthrdesc',
    'rain_1h' : 'Rain1H',
    'rain_3h' : 'Rain3H',
    'snow_1h' : 'Snow1h',
    'snow_3h' : 'Snow3N',
    'wind_gust' : 'WindGust',
    'date' : 'Date',
})


# Relabel all the couties 
wf['County'] = wf['County'].replace({
    'Alameda County': 'Alameda',
    'Amador County': 'Amador',
    'Berkeley': 'Berkeley',
    'Butte Valley': 'Butte',
    'Calaveras County': 'Calaveras',
    'Colusa County': 'Colusa',
    'Contra Costa County': 'ContraCosta',
    'El Dorado County': 'ElDorado',
    'Fresno County': 'Fresno',
    'Humboldt County': 'Humboldt',
    'Imperial County': 'Imperial',
    'Kern County': 'Kern',
    'Lassen County': 'Lassen',
    'Long Beach': 'LongBeach',
    'Madera County': 'Madera',
    'Marin County': 'Marin',
    'Mendocino County': 'Mendocino',
    'Merced County': 'Merced',
    'Monterey County': 'Monterey',
    'Napa County': 'Napa',
    'Nevada County': 'Nevada',
    'Orange County': 'Orange',
    'Pasadena': 'Pasadena',
    'Placer County': 'Placer',
    'Riverside County': 'Riverside',
    'Sacramento County': 'Sacramento',
    'San Benito County': 'SanBenito',
    'San Bernardino County': 'SanBernardino',
    'San Diego County': 'SanDiego',
    'San Francisco County': 'SanFrancisco',
    'San Joaquin County': 'SanJoaquin',
    'San Luis Obispo County': 'SanLuisObispo',
    'San Mateo County': 'SanMateo',
    'Santa Barbara County': 'SantaBarbara',
    'Santa Clara County': 'SantaClara',
    'Santa Cruz County': 'SantaCruz',
    'Siskiyou County': 'Siskiyou',
    'Solano County': 'Solano',
    'Sonoma County': 'Sonoma',
    'Stanislaus County': 'Stanislaus',
    'Sutter County': 'Sutter',
    'Tehama County': 'Tehama',
    'Tulare County': 'Tulare',
    'Tuolumne County': 'Tuolumne',
    'Ventura County': 'Ventura',
    'Yolo County': 'Yolo',
    'Yuba County': 'Yuba',
    'Kings County': 'Kings'
})
print(wf['County'].unique())

#encode the 2 columnt that contrain string values
encoder = LabelEncoder()

wf['Wthrmain'] = encoder.fit_transform(wf['Wthrmain'])
wf['Wthrdesc'] = encoder.fit_transform(wf['Wthrdesc'])

# i made the weather start in 2000 because the output for 2001 of the rates will contain data from september 2000 to august 2001
wf = wf.loc[wf['Date'].dt.year >= 2000]
wf = wf.loc[wf['Date'].dt.year <= 2022]

#fill nan values
wf.fillna(0,inplace=True)

['Alameda' 'Amador' 'Berkeley' 'Butte' 'Calaveras' 'Colusa' 'ContraCosta'
 'ElDorado' 'Fresno' 'Humboldt' 'Imperial' 'Kern' 'Lassen' 'LongBeach'
 'Madera' 'Marin' 'Mendocino' 'Merced' 'Monterey' 'Napa' 'Nevada' 'Orange'
 'Pasadena' 'Placer' 'Riverside' 'Sacramento' 'SanBenito' 'SanBernardino'
 'SanDiego' 'SanFrancisco' 'SanJoaquin' 'SanLuisObispo' 'SanMateo'
 'SantaBarbara' 'SantaClara' 'SantaCruz' 'Siskiyou' 'Solano' 'Sonoma'
 'Stanislaus' 'Sutter' 'Tehama' 'Tulare' 'Tuolumne' 'Ventura' 'Yolo'
 'Yuba' 'Kings']


In [24]:
wf['Date'] = wf['Date'].dt.date.astype(str) + ' ' + wf['Date'].dt.hour.astype(str)
wf['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d %H')



In [30]:
wf.head()

Unnamed: 0,County,Temp,Vis,Dewpt,Feels,Tmin,Tmax,Pres,Humid,Windsp,...,WindGust,Rain1H,Rain3H,Snow1h,Snow3N,Clouds,Wthrid,Wthrmain,Wthrdesc,Date
184260,Alameda,12.42,0.0,5.35,11.34,11.87,13.31,1016,62,2.71,...,0.0,0.0,0.0,0.0,0.0,43,802,1,27,2000-01-01 0
184261,Alameda,10.87,10000.0,5.83,9.87,9.3,12.15,1017,71,4.6,...,0.0,0.0,0.0,0.0,0.0,100,804,1,19,2000-01-01 1
184262,Alameda,10.18,10000.0,5.95,9.21,8.3,11.66,1017,75,5.7,...,0.0,0.0,0.0,0.0,0.0,100,804,1,19,2000-01-01 2
184263,Alameda,7.26,10000.0,3.13,4.58,5.4,8.98,1018,75,4.1,...,0.0,0.0,0.0,0.0,0.0,100,804,1,19,2000-01-01 3
184264,Alameda,6.84,10000.0,2.72,4.65,5.2,9.01,1018,75,3.1,...,0.0,0.0,0.0,0.0,0.0,100,804,1,19,2000-01-01 4


# Now that both datasets are clean we will create a single combined dataset from them

In [29]:
#these will be temporarya for creating the data into the shape that will go into the tensors
x_list = []
y_list = []

hours_per_year = 24 * 365

#used for data normalization
scaler = MinMaxScaler()

#outer loop ranges over the years to create a date for filtering through the wf
for year in range(2000, 2022):
    #start if on sept 1 and end is on aug 31 the following year
    start_date = pd.Timestamp(year=year, month=9, day=1,)
    end_date = pd.Timestamp(year=year + 1, month=8, day=31,)


    #wf for that timeframe
    yearly_data = wf[(wf['Date'] >= start_date) & (wf['Date'] <= end_date)]

    #loops through each county for that year and pairs it up with the cases for that county 
    for county, group in yearly_data.groupby('County'):

        #the rate for the counnty currently on
        rate = vf[(vf['County'] == county) & (vf['Year'] == end_date.year)]['Cases']
        #checks for missing values
        if not rate.empty:
            # both are objects and would also not add to learing the weather parameters so we drop them from thoe dataset
            group = group.drop(['County', 'Date'], axis=1)  

            #ensures that there is the correct amont of instances in the group            
            if group.shape[0] >= hours_per_year:
                #slices the data so there is no extra
                group = group.iloc[:hours_per_year]
                #normalizes the data
                scaled = scaler.fit_transform(group)
                group = pd.DataFrame(scaled, columns=group.columns)

                #middlestep for converting to torch tensors
                x_arr = group.values
                y_arr = rate.values[0]  
                #convert the wf values to a torch tensor then psh it to the list
                x_list.append(torch.from_numpy(x_arr.astype(np.float32)))
                y_list.append(torch.tensor(y_arr, dtype=torch.float32))

#converts the list into a torch tensor
x_tensor = torch.stack(x_list, dim=0)
y_tensor = torch.stack(y_list, dim=0) 


print(f'X Shape: {x_tensor.shape}')
print(f'Y Shape: {y_tensor.shape}')

TypeError: '>=' not supported between instances of 'str' and 'Timestamp'

In [8]:
def add_input_noise(X, noise_level=0.01):
    noise = torch.randn(X.size()) * noise_level
    return X + noise

x_augmented = add_input_noise(x_tensor)
x_combined = torch.cat([x_tensor, x_augmented], dim=0)
y_combined = torch.cat([y_tensor, y_tensor], dim=0)

x_augmented_second = add_input_noise(x_combined, 0.001)
x_combined_final = torch.cat([x_combined, x_augmented_second], dim=0)
y_combined_final = torch.cat([y_combined, y_combined], dim=0)

In [None]:
##aggredate the dataset so they will be smaller sequence lengths
x = x_combined_final.view(4224,365,24,13)

x = x.mean(dim=2)
print(x.shape)
print(y_combined_final.shape)