In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import torch
import sys
import numpy as np

# This notebook takes the weather data and matches it up with the valley fever rates data so that the datasets used for training are created.

In [2]:
# Load the one-hot encoded weather records and the valley fever case data.
weather_csv = '../../data/weather_datasets/one_hot_weather_06_02_24.csv'
rates_csv = '../../data/valley_fever_rates/valley_fever_cases_05_31_24.csv'
weather, rates = pd.read_csv(weather_csv), pd.read_csv(rates_csv)

In [3]:
# Report the size of each freshly loaded frame.
for label, frame in (('Weather', weather), ('Rates', rates)):
    print(f'{label} shape: {frame.shape}')

Weather shape: (19241397, 75)
Rates shape: (1056, 4)


In [4]:
# 'Wthrmain' and 'Wthrdesc' are already one-hot encoded elsewhere in the frame,
# so the redundant originals are removed.
redundant_cols = ['Wthrmain', 'Wthrdesc']
weather = weather.drop(columns=redundant_cols)

In [5]:
# 'Unnamed: 0' is an index that was accidentally saved with the CSV.
rates = rates.drop(columns=['Unnamed: 0'])

In [6]:
# Keep 2000-2022 (inclusive). Weather starts in 2000 because the rates output
# for 2001 covers data from September 2000 through August 2001.
in_year_range = weather['Year'].between(2000, 2022)
weather = weather.loc[in_year_range]
weather.shape

(9750922, 73)

In [7]:
# Show the county spellings used by each frame so they can be compared.
for frame in (weather, rates):
    print(frame['County'].unique())

['Alameda' 'Amador' 'Berkeley' 'Butte' 'Calaveras' 'Colusa' 'ContraCosta'
 'ElDorado' 'Fresno' 'Humboldt' 'Imperial' 'Kern' 'Lassen' 'LongBeach'
 'Madera' 'Marin' 'Mendocino' 'Merced' 'Monterey' 'Napa' 'Nevada' 'Orange'
 'Pasadena' 'Placer' 'Riverside' 'Sacramento' 'SanBenito' 'SanBernardino'
 'SanDiego' 'SanFrancisco' 'SanJoaquin' 'SanLuisObispo' 'SanMateo'
 'SantaBarbara' 'SantaClara' 'SantaCruz' 'Siskiyou' 'Solano' 'Sonoma'
 'Stanislaus' 'Sutter' 'Tehama' 'Tulare' 'Tuolumne' 'Ventura' 'Yolo'
 'Yuba' 'Kings']
['ALAMEDA' 'AMADOR' 'BERKELEY' 'BUTTE' 'CALAVERAS' 'COLUSA' 'CONTRA COSTA'
 'EL DORADO' 'FRESNO' 'HUMBOLDT' 'IMPERIAL' 'KERN' 'KINGS' 'LASSEN'
 'LONG BEACH' 'MADERA' 'MARIN' 'MENDOCINO' 'MERCED' 'MONTEREY' 'NAPA'
 'NEVADA' 'ORANGE' 'PASADENA' 'PLACER' 'RIVERSIDE' 'SACRAMENTO'
 'SAN BENITO' 'SAN BERNARDINO' 'SAN DIEGO' 'SAN FRANCISCO' 'SAN JOAQUIN'
 'SAN LUIS OBISPO' 'SAN MATEO' 'SANTA BARBARA' 'SANTA CLARA' 'SANTA CRUZ'
 'SISKIYOU' 'SOLANO' 'SONOMA' 'STANISLAUS' 'SUTTER' 'TEHAMA

In [8]:
# Relabel the upper-case, space-separated county names in `rates` to the
# CamelCase spelling without spaces (e.g. 'SAN LUIS OBISPO' -> 'SanLuisObispo').
upper_county_names = [
    'ALAMEDA', 'AMADOR', 'BERKELEY', 'BUTTE', 'CALAVERAS', 'COLUSA',
    'CONTRA COSTA', 'EL DORADO', 'FRESNO', 'HUMBOLDT', 'IMPERIAL', 'KERN',
    'LASSEN', 'LONG BEACH', 'MADERA', 'MARIN', 'MENDOCINO', 'MERCED',
    'MONTEREY', 'NAPA', 'NEVADA', 'ORANGE', 'PASADENA', 'PLACER',
    'RIVERSIDE', 'SACRAMENTO', 'SAN BENITO', 'SAN BERNARDINO', 'SAN DIEGO',
    'SAN FRANCISCO', 'SAN JOAQUIN', 'SAN LUIS OBISPO', 'SAN MATEO',
    'SANTA BARBARA', 'SANTA CLARA', 'SANTA CRUZ', 'SISKIYOU', 'SOLANO',
    'SONOMA', 'STANISLAUS', 'SUTTER', 'TEHAMA', 'TULARE', 'TUOLUMNE',
    'VENTURA', 'YOLO', 'YUBA', 'KINGS',
]
# Title-case every word and strip the spaces to get the target spelling.
county_relabels = {
    upper_name: upper_name.title().replace(' ', '')
    for upper_name in upper_county_names
}
# `.replace` leaves any name that is not in the mapping untouched.
rates['County'] = rates['County'].replace(county_relabels)
print(rates['County'].unique())

['Alameda' 'Amador' 'Berkeley' 'Butte' 'Calaveras' 'Colusa' 'ContraCosta'
 'ElDorado' 'Fresno' 'Humboldt' 'Imperial' 'Kern' 'Kings' 'Lassen'
 'LongBeach' 'Madera' 'Marin' 'Mendocino' 'Merced' 'Monterey' 'Napa'
 'Nevada' 'Orange' 'Pasadena' 'Placer' 'Riverside' 'Sacramento'
 'SanBenito' 'SanBernardino' 'SanDiego' 'SanFrancisco' 'SanJoaquin'
 'SanLuisObispo' 'SanMateo' 'SantaBarbara' 'SantaClara' 'SantaCruz'
 'Siskiyou' 'Solano' 'Sonoma' 'Stanislaus' 'Sutter' 'Tehama' 'Tulare'
 'Tuolumne' 'Ventura' 'Yolo' 'Yuba']


In [9]:
# rates.to_csv('../../data/valley_fever_rates/valley_fever_cases_06_02_24.csv', index=False)

In [10]:
# Display the (rows, columns) of the year-filtered weather frame.
weather.shape

(9750922, 73)

In [11]:
# Display the (rows, columns) of the rates frame after the stray index column was dropped.
rates.shape

(1056, 3)

In [12]:
# List the distinct years left in the weather frame after filtering.
years_present = weather['Year'].unique()
print(years_present)

[2000 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 2011 2012 2013
 2014 2015 2016 2017 2018 2019 2020 2021 2022]


In [13]:
# Check for missing data: print how many distinct years each county has in the weather frame.
weather_years_by_county = weather.groupby('County')['Year'].unique()
for county, county_years in weather_years_by_county.items():
    print(f"County: {county}, Number of unique years: {len(county_years)}")

County: Alameda, Number of unique years: 23
County: Amador, Number of unique years: 23
County: Berkeley, Number of unique years: 23
County: Butte, Number of unique years: 23
County: Calaveras, Number of unique years: 23
County: Colusa, Number of unique years: 23
County: ContraCosta, Number of unique years: 23
County: ElDorado, Number of unique years: 23
County: Fresno, Number of unique years: 23
County: Humboldt, Number of unique years: 23
County: Imperial, Number of unique years: 23
County: Kern, Number of unique years: 23
County: Kings, Number of unique years: 23
County: Lassen, Number of unique years: 23
County: LongBeach, Number of unique years: 23
County: Madera, Number of unique years: 23
County: Marin, Number of unique years: 23
County: Mendocino, Number of unique years: 23
County: Merced, Number of unique years: 23
County: Monterey, Number of unique years: 23
County: Napa, Number of unique years: 23
County: Nevada, Number of unique years: 23
County: Orange, Number of unique yea

In [14]:
# Number of distinct counties in the weather frame.
weather_counties = weather['County'].unique()
print(len(weather_counties))

48


# math 
The weather dataset has a total of:
    - 9750922 instances
    - 48 counties
    - 23 years 
    - 365 days in a year
    - 24 hours in a day

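To check whether the number of instances is plausible,
we can compute 48 * 23 * 365 * 24 = 9671040.

Note that this does not match the 9750922 rows actually present: there are 79882 more rows than expected.
The six leap days between 2000 and 2022 account for at most 6 * 24 * 48 = 6912 of them, so the remainder
(possibly duplicated timestamps) still needs to be checked.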


In [15]:
# Number of distinct counties in the rates frame.
rates_counties = rates['County'].unique()
print(len(rates_counties))

48


In [16]:
# Print how many distinct years each county has in the rates frame.
rates_years_by_county = rates.groupby('County')['Year'].unique()
for county, county_years in rates_years_by_county.items():
    print(f"County: {county}, Number of unique years: {len(county_years)}")

County: Alameda, Number of unique years: 22
County: Amador, Number of unique years: 22
County: Berkeley, Number of unique years: 22
County: Butte, Number of unique years: 22
County: Calaveras, Number of unique years: 22
County: Colusa, Number of unique years: 22
County: ContraCosta, Number of unique years: 22
County: ElDorado, Number of unique years: 22
County: Fresno, Number of unique years: 22
County: Humboldt, Number of unique years: 22
County: Imperial, Number of unique years: 22
County: Kern, Number of unique years: 22
County: Kings, Number of unique years: 22
County: Lassen, Number of unique years: 22
County: LongBeach, Number of unique years: 22
County: Madera, Number of unique years: 22
County: Marin, Number of unique years: 22
County: Mendocino, Number of unique years: 22
County: Merced, Number of unique years: 22
County: Monterey, Number of unique years: 22
County: Napa, Number of unique years: 22
County: Nevada, Number of unique years: 22
County: Orange, Number of unique yea

# This block creates the dataset without aggregation

In [20]:
# Build the un-aggregated dataset: one sample per (county, season), where a season
# runs from Sept 1 of `year` through Aug 31 of `year + 1` and is labelled with the
# 'Cases' value that `rates` holds for that county in the season's end year.

# Work on a copy so the `weather` frame inspected in earlier cells is not modified.
df = weather.copy()
# Rebuild a datetime column from the split date parts so seasons can be sliced by date.
df['Date'] = pd.to_datetime(df[['Year', 'Month', 'Day']])
df = df.drop(['Year', 'Month', 'Day'], axis=1)

# Per-sample feature matrices and targets; stacked into tensors after the loop.
x_list = []
y_list = []
# (county, label year, reason) for every county-season that could not become a sample,
# so that dropped data is reported instead of disappearing silently.
skipped = []

# Every sample is cut to this many rows so all samples share one shape.
hours_per_year = 24 * 365

# Used for data normalization.
scaler = MinMaxScaler()

# Outer loop ranges over the season start years.
for year in range(2000, 2022):
    # Season starts on Sept 1 and ends on Aug 31 of the following year.
    start_date = pd.Timestamp(year=year, month=9, day=1)
    end_date = pd.Timestamp(year=year + 1, month=8, day=31)

    # All weather rows that fall inside this season.
    yearly_data = df[(df['Date'] >= start_date) & (df['Date'] <= end_date)]

    # Pair each county's weather for the season with that county's case count.
    for county, group in yearly_data.groupby('County'):
        rate = rates[(rates['County'] == county) & (rates['Year'] == end_date.year)]['Cases']
        if rate.empty:
            skipped.append((county, end_date.year, 'no case count in rates'))
            continue

        # 'County' and 'Date' are not numeric weather features, so they are removed.
        group = group.drop(['County', 'Date'], axis=1)

        if group.shape[0] < hours_per_year:
            skipped.append((county, end_date.year, f'only {group.shape[0]} rows'))
            continue

        # Keep the first `hours_per_year` rows so there is no extra data.
        # NOTE(review): this relies on the rows already being in chronological order,
        # and seasons with more rows than 24 * 365 lose their trailing rows - confirm.
        group = group.iloc[:hours_per_year]

        # NOTE(review): the scaler is re-fit on every sample, so each feature is scaled
        # to that sample's own min/max rather than to a dataset-wide range.
        scaled = scaler.fit_transform(group)

        # `fit_transform` already returns a NumPy array, so it is converted directly.
        x_arr = scaled.astype(np.float32)
        y_arr = rate.values[0]
        x_list.append(torch.from_numpy(x_arr))
        y_list.append(torch.tensor(y_arr, dtype=torch.float32))

if skipped:
    print(f'Skipped {len(skipped)} county-season pairs:')
    for skipped_county, skipped_year, reason in skipped:
        print(f'  {skipped_county} {skipped_year}: {reason}')

if not x_list:
    raise RuntimeError('No samples were built; check the weather/rates county and year overlap.')

# Convert the lists into tensors with the sample index as the first dimension.
x_tensor = torch.stack(x_list, dim=0)
y_tensor = torch.stack(y_list, dim=0)

torch.save(x_tensor, '../../data/dataset/x_06_04_24.pt')
torch.save(y_tensor, '../../data/dataset/y_06_04_24.pt')

print(f'X Shape: {x_tensor.shape}')
print(f'Y Shape: {y_tensor.shape}')

X Shape: torch.Size([1056, 8760, 69])
Y Shape: torch.Size([1056])


In [30]:
def add_input_noise(X, noise_level=0.01, generator=None):
    """Return ``X`` plus zero-mean Gaussian noise; ``X`` itself is not modified.

    Args:
        X: Tensor to perturb.
        noise_level: Factor applied to standard-normal samples, i.e. the
            standard deviation of the added noise.
        generator: Optional ``torch.Generator`` used to draw the noise so the
            result can be reproduced without touching the global RNG. When
            ``None`` the global RNG is used, as before. A supplied generator
            must live on the same device as ``X``.

    Returns:
        A new tensor with the same shape as ``X``.
    """
    # Draw the noise on X's device so the addition also works for non-CPU tensors.
    noise = torch.randn(X.shape, generator=generator, device=X.device) * noise_level
    return X + noise

# Seed the global RNG so the augmented dataset that gets saved is the same on every run.
AUGMENTATION_SEED = 0
torch.manual_seed(AUGMENTATION_SEED)

# First pass: append a noisy copy (default noise level) of every sample, doubling the set.
x_augmented = add_input_noise(x_tensor)
x_combined = torch.cat([x_tensor, x_augmented], dim=0)
y_combined = torch.cat([y_tensor, y_tensor], dim=0)

# Second pass: append a noisy copy (noise level 0.001) of the doubled set, giving 4x the
# original sample count. Targets are repeated unchanged because only inputs are perturbed.
x_augmented_second = add_input_noise(x_combined, 0.001)
x_combined_final = torch.cat([x_combined, x_augmented_second], dim=0)
y_combined_final = torch.cat([y_combined, y_combined], dim=0)

In [28]:
# Display the shape of the augmented inputs; the first dimension should be 4x that of x_tensor.
x_combined_final.shape

torch.Size([4224, 8760, 69])

In [31]:
# Display the shape of the augmented targets; its length should match x_combined_final's first dimension.
y_combined_final.shape

torch.Size([4224])

In [33]:
# Persist the augmented inputs and targets as separate .pt files.
augmented_outputs = (
    (x_combined_final, '../../data/augmented_dataset/x_augmented_06_04_24.pt'),
    (y_combined_final, '../../data/augmented_dataset/y_augmented_06_04_24.pt'),
)
for tensor_to_save, out_path in augmented_outputs:
    torch.save(tensor_to_save, out_path)