In [88]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split



In [2]:
# combine the 2 datasets 

df = pd.read_excel('../../data/original/rates.xlsx')
print(df.shape)
df.head()

(1410, 4)


Unnamed: 0,"Valley Fever Cases and Incidence Rates by Local Health Jurisdiction, California, 2001-2022",Unnamed: 1,Unnamed: 2,Unnamed: 3
0,Local Health Jurisdiction,Year of Estimated Illness Onset,Cases,Rate
1,ALAMEDA,2001,7,0.5*
2,ALAMEDA,2002,2,0.1*
3,ALAMEDA,2003,9,0.7*
4,ALAMEDA,2004,12,0.9*


In [3]:
# the column titles are improperly aligned

df = df.rename(columns={
    'Valley Fever Cases and Incidence Rates by Local Health Jurisdiction, California, 2001-2022 ' : 'county',
    'Unnamed: 1' : 'year', 
    'Unnamed: 2' : 'cases', 
    'Unnamed: 3' : 'rates'
},)
df.columns

Index(['county', 'year', 'cases', 'rates'], dtype='object')

In [4]:
# remove rates because the sight said thate the rate is unreliable
df.drop('rates', axis=1, inplace=True)
df.head()

Unnamed: 0,county,year,cases
0,Local Health Jurisdiction,Year of Estimated Illness Onset,Cases
1,ALAMEDA,2001,7
2,ALAMEDA,2002,2
3,ALAMEDA,2003,9
4,ALAMEDA,2004,12


In [5]:
# drop first and last row because they are not instances of the data

df.drop(index=[df.index[0], df.index[-1]], axis=0, inplace=True)
df.head()

Unnamed: 0,county,year,cases
1,ALAMEDA,2001,7
2,ALAMEDA,2002,2
3,ALAMEDA,2003,9
4,ALAMEDA,2004,12
5,ALAMEDA,2005,11


In [6]:
df.tail()

Unnamed: 0,county,year,cases
1404,YUBA,2018,2
1405,YUBA,2019,5
1406,YUBA,2020,2
1407,YUBA,2021,2
1408,YUBA,2022,2


In [7]:
df['county'].unique()

array(['ALAMEDA', 'ALAMEDA COUNTY TOTAL', 'ALPINE', 'AMADOR', 'BERKELEY',
       'BUTTE', 'CALAVERAS', 'CALIFORNIA TOTAL', 'COLUSA', 'CONTRA COSTA',
       'DEL NORTE', 'EL DORADO', 'FRESNO', 'GLENN', 'HUMBOLDT',
       'IMPERIAL', 'INYO', 'KERN', 'KINGS', 'LAKE', 'LASSEN',
       'LONG BEACH', 'LOS ANGELES', 'LOS ANGELES COUNTY TOTAL', 'MADERA',
       'MARIN', 'MARIPOSA', 'MENDOCINO', 'MERCED', 'MODOC', 'MONO',
       'MONTEREY', 'NAPA', 'NEVADA', 'ORANGE', 'PASADENA', 'PLACER',
       'PLUMAS', 'RIVERSIDE', 'SACRAMENTO', 'SAN BENITO',
       'SAN BERNARDINO', 'SAN DIEGO', 'SAN FRANCISCO', 'SAN JOAQUIN',
       'SAN LUIS OBISPO', 'SAN MATEO', 'SANTA BARBARA', 'SANTA CLARA',
       'SANTA CRUZ', 'SHASTA', 'SIERRA', 'SISKIYOU', 'SOLANO', 'SONOMA',
       'STANISLAUS', 'SUTTER', 'TEHAMA', 'TRINITY', 'TULARE', 'TUOLUMNE',
       'VENTURA', 'YOLO', 'YUBA'], dtype=object)

In [8]:
df['county'] = df['county'].str.lower()
df['county'].unique()

array(['alameda', 'alameda county total', 'alpine', 'amador', 'berkeley',
       'butte', 'calaveras', 'california total', 'colusa', 'contra costa',
       'del norte', 'el dorado', 'fresno', 'glenn', 'humboldt',
       'imperial', 'inyo', 'kern', 'kings', 'lake', 'lassen',
       'long beach', 'los angeles', 'los angeles county total', 'madera',
       'marin', 'mariposa', 'mendocino', 'merced', 'modoc', 'mono',
       'monterey', 'napa', 'nevada', 'orange', 'pasadena', 'placer',
       'plumas', 'riverside', 'sacramento', 'san benito',
       'san bernardino', 'san diego', 'san francisco', 'san joaquin',
       'san luis obispo', 'san mateo', 'santa barbara', 'santa clara',
       'santa cruz', 'shasta', 'sierra', 'siskiyou', 'solano', 'sonoma',
       'stanislaus', 'sutter', 'tehama', 'trinity', 'tulare', 'tuolumne',
       'ventura', 'yolo', 'yuba'], dtype=object)

In [9]:
df.shape

(1408, 3)

In [10]:
cases = df.copy()

In [11]:
df = pd.read_csv('../../data/original/weather.csv')
print(df.shape)
df.head()

(19241397, 28)


Unnamed: 0,dt,dt_iso,timezone,city_name,lat,lon,temp,visibility,dew_point,feels_like,...,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,weather_icon
0,283996800,1979-01-01 00:00:00 +0000 UTC,-28800,Alameda County,37.601689,-121.719546,6.63,,-0.92,5.57,...,,,,,,0,800,Clear,sky is clear,01d
1,284000400,1979-01-01 01:00:00 +0000 UTC,-28800,Alameda County,37.601689,-121.719546,7.63,,-3.67,7.63,...,,,,,,0,800,Clear,sky is clear,01n
2,284004000,1979-01-01 02:00:00 +0000 UTC,-28800,Alameda County,37.601689,-121.719546,6.93,,-2.69,6.93,...,,,,,,0,800,Clear,sky is clear,01n
3,284007600,1979-01-01 03:00:00 +0000 UTC,-28800,Alameda County,37.601689,-121.719546,-0.11,,-4.71,-0.11,...,,,,,,0,800,Clear,sky is clear,01n
4,284011200,1979-01-01 04:00:00 +0000 UTC,-28800,Alameda County,37.601689,-121.719546,1.05,,-4.79,1.05,...,,,,,,0,800,Clear,sky is clear,01n


In [12]:
# drop all unnessarcary columns

cols = ['dt', 'timezone', 'lat', 'lon', 'weather_icon', 'sea_level', 'grnd_level']
df.drop(cols, axis=1, inplace=True)

df.head()

Unnamed: 0,dt_iso,city_name,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,1979-01-01 00:00:00 +0000 UTC,Alameda County,6.63,,-0.92,5.57,5.69,7.32,1024,58,...,14,,,,,,0,800,Clear,sky is clear
1,1979-01-01 01:00:00 +0000 UTC,Alameda County,7.63,,-3.67,7.63,5.8,9.45,1023,43,...,59,,,,,,0,800,Clear,sky is clear
2,1979-01-01 02:00:00 +0000 UTC,Alameda County,6.93,,-2.69,6.93,3.75,9.11,1024,49,...,111,,,,,,0,800,Clear,sky is clear
3,1979-01-01 03:00:00 +0000 UTC,Alameda County,-0.11,,-4.71,-0.11,-1.63,1.41,1025,68,...,156,,,,,,0,800,Clear,sky is clear
4,1979-01-01 04:00:00 +0000 UTC,Alameda County,1.05,,-4.79,1.05,-0.88,2.99,1025,62,...,132,,,,,,0,800,Clear,sky is clear


In [13]:
# fill all missing valuues
df = df.fillna(0)
df.head()

Unnamed: 0,dt_iso,city_name,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,1979-01-01 00:00:00 +0000 UTC,Alameda County,6.63,0.0,-0.92,5.57,5.69,7.32,1024,58,...,14,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
1,1979-01-01 01:00:00 +0000 UTC,Alameda County,7.63,0.0,-3.67,7.63,5.8,9.45,1023,43,...,59,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
2,1979-01-01 02:00:00 +0000 UTC,Alameda County,6.93,0.0,-2.69,6.93,3.75,9.11,1024,49,...,111,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
3,1979-01-01 03:00:00 +0000 UTC,Alameda County,-0.11,0.0,-4.71,-0.11,-1.63,1.41,1025,68,...,156,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
4,1979-01-01 04:00:00 +0000 UTC,Alameda County,1.05,0.0,-4.79,1.05,-0.88,2.99,1025,62,...,132,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear


In [14]:
df['dt_iso']

0           1979-01-01 00:00:00 +0000 UTC
1           1979-01-01 01:00:00 +0000 UTC
2           1979-01-01 02:00:00 +0000 UTC
3           1979-01-01 03:00:00 +0000 UTC
4           1979-01-01 04:00:00 +0000 UTC
                        ...              
19241392    2024-05-30 19:00:00 +0000 UTC
19241393    2024-05-30 20:00:00 +0000 UTC
19241394    2024-05-30 21:00:00 +0000 UTC
19241395    2024-05-30 22:00:00 +0000 UTC
19241396    2024-05-30 23:00:00 +0000 UTC
Name: dt_iso, Length: 19241397, dtype: object

In [15]:
df['dt_iso'] = pd.to_datetime(df['dt_iso'], format='%Y-%m-%d %H:%M:%S %z UTC').dt.strftime('%Y-%m-%d %H')


df.dtypes

dt_iso                  object
city_name               object
temp                   float64
visibility             float64
dew_point              float64
feels_like             float64
temp_min               float64
temp_max               float64
pressure                 int64
humidity                 int64
wind_speed             float64
wind_deg                 int64
wind_gust              float64
rain_1h                float64
rain_3h                float64
snow_1h                float64
snow_3h                float64
clouds_all               int64
weather_id               int64
weather_main            object
weather_description     object
dtype: object

In [16]:
df.head()

Unnamed: 0,dt_iso,city_name,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,1979-01-01 00,Alameda County,6.63,0.0,-0.92,5.57,5.69,7.32,1024,58,...,14,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
1,1979-01-01 01,Alameda County,7.63,0.0,-3.67,7.63,5.8,9.45,1023,43,...,59,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
2,1979-01-01 02,Alameda County,6.93,0.0,-2.69,6.93,3.75,9.11,1024,49,...,111,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
3,1979-01-01 03,Alameda County,-0.11,0.0,-4.71,-0.11,-1.63,1.41,1025,68,...,156,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
4,1979-01-01 04,Alameda County,1.05,0.0,-4.79,1.05,-0.88,2.99,1025,62,...,132,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear


In [17]:
df['dt_iso'] = pd.to_datetime(df['dt_iso'])
df.dtypes

dt_iso                 datetime64[ns]
city_name                      object
temp                          float64
visibility                    float64
dew_point                     float64
feels_like                    float64
temp_min                      float64
temp_max                      float64
pressure                        int64
humidity                        int64
wind_speed                    float64
wind_deg                        int64
wind_gust                     float64
rain_1h                       float64
rain_3h                       float64
snow_1h                       float64
snow_3h                       float64
clouds_all                      int64
weather_id                      int64
weather_main                   object
weather_description            object
dtype: object

In [18]:
df.head()

Unnamed: 0,dt_iso,city_name,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,1979-01-01 00:00:00,Alameda County,6.63,0.0,-0.92,5.57,5.69,7.32,1024,58,...,14,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
1,1979-01-01 01:00:00,Alameda County,7.63,0.0,-3.67,7.63,5.8,9.45,1023,43,...,59,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
2,1979-01-01 02:00:00,Alameda County,6.93,0.0,-2.69,6.93,3.75,9.11,1024,49,...,111,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
3,1979-01-01 03:00:00,Alameda County,-0.11,0.0,-4.71,-0.11,-1.63,1.41,1025,68,...,156,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
4,1979-01-01 04:00:00,Alameda County,1.05,0.0,-4.79,1.05,-0.88,2.99,1025,62,...,132,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear


In [19]:
df['dt_iso']

0          1979-01-01 00:00:00
1          1979-01-01 01:00:00
2          1979-01-01 02:00:00
3          1979-01-01 03:00:00
4          1979-01-01 04:00:00
                   ...        
19241392   2024-05-30 19:00:00
19241393   2024-05-30 20:00:00
19241394   2024-05-30 21:00:00
19241395   2024-05-30 22:00:00
19241396   2024-05-30 23:00:00
Name: dt_iso, Length: 19241397, dtype: datetime64[ns]

In [20]:
df = df.rename(columns={'city_name':'county'})
df.head()

Unnamed: 0,dt_iso,county,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,1979-01-01 00:00:00,Alameda County,6.63,0.0,-0.92,5.57,5.69,7.32,1024,58,...,14,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
1,1979-01-01 01:00:00,Alameda County,7.63,0.0,-3.67,7.63,5.8,9.45,1023,43,...,59,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
2,1979-01-01 02:00:00,Alameda County,6.93,0.0,-2.69,6.93,3.75,9.11,1024,49,...,111,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
3,1979-01-01 03:00:00,Alameda County,-0.11,0.0,-4.71,-0.11,-1.63,1.41,1025,68,...,156,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear
4,1979-01-01 04:00:00,Alameda County,1.05,0.0,-4.79,1.05,-0.88,2.99,1025,62,...,132,0.0,0.0,0.0,0.0,0.0,0,800,Clear,sky is clear


In [21]:
encoder = LabelEncoder()
df['weather_main'] = encoder.fit_transform(df['weather_main'])
df['weather_description'] = encoder.fit_transform(df['weather_description'])

df.head()

Unnamed: 0,dt_iso,county,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,1979-01-01 00:00:00,Alameda County,6.63,0.0,-0.92,5.57,5.69,7.32,1024,58,...,14,0.0,0.0,0.0,0.0,0.0,0,800,0,30
1,1979-01-01 01:00:00,Alameda County,7.63,0.0,-3.67,7.63,5.8,9.45,1023,43,...,59,0.0,0.0,0.0,0.0,0.0,0,800,0,30
2,1979-01-01 02:00:00,Alameda County,6.93,0.0,-2.69,6.93,3.75,9.11,1024,49,...,111,0.0,0.0,0.0,0.0,0.0,0,800,0,30
3,1979-01-01 03:00:00,Alameda County,-0.11,0.0,-4.71,-0.11,-1.63,1.41,1025,68,...,156,0.0,0.0,0.0,0.0,0.0,0,800,0,30
4,1979-01-01 04:00:00,Alameda County,1.05,0.0,-4.79,1.05,-0.88,2.99,1025,62,...,132,0.0,0.0,0.0,0.0,0.0,0,800,0,30


In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19241397 entries, 0 to 19241396
Data columns (total 21 columns):
 #   Column               Dtype         
---  ------               -----         
 0   dt_iso               datetime64[ns]
 1   county               object        
 2   temp                 float64       
 3   visibility           float64       
 4   dew_point            float64       
 5   feels_like           float64       
 6   temp_min             float64       
 7   temp_max             float64       
 8   pressure             int64         
 9   humidity             int64         
 10  wind_speed           float64       
 11  wind_deg             int64         
 12  wind_gust            float64       
 13  rain_1h              float64       
 14  rain_3h              float64       
 15  snow_1h              float64       
 16  snow_3h              float64       
 17  clouds_all           int64         
 18  weather_id           int64         
 19  weather_main       

In [23]:
df.columns

Index(['dt_iso', 'county', 'temp', 'visibility', 'dew_point', 'feels_like',
       'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'wind_deg', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
       'clouds_all', 'weather_id', 'weather_main', 'weather_description'],
      dtype='object')

In [24]:
df = df.set_index('dt_iso')
daily_averages = df.groupby('county').resample('D').mean()
daily_averages.head(50)

Unnamed: 0_level_0,Unnamed: 1_level_0,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
county,dt_iso,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Alameda County,1979-01-01,5.233333,0.0,-2.91625,3.992083,3.474583,7.02625,1026.375,54.958333,1.763333,73.625,0.0,0.0,0.0,0.0,0.0,11.916667,800.5,0.166667,26.75
Alameda County,1979-01-02,4.5,0.0,-1.149167,2.939583,2.980833,6.405,1026.333333,67.375,1.892917,93.666667,0.0,0.0,0.0,0.0,0.0,95.166667,803.875,1.0,18.541667
Alameda County,1979-01-03,6.033333,0.0,-2.309167,4.630417,4.344167,8.6275,1019.291667,54.791667,1.73875,106.041667,0.0,0.009583,0.0,0.0,0.0,93.416667,791.208333,1.25,16.416667
Alameda County,1979-01-04,8.742083,0.0,2.02875,7.550833,6.7275,11.1575,1019.541667,63.5,1.90125,110.625,0.0,0.086667,0.0,0.0,0.0,91.875,702.5,3.0,16.083333
Alameda County,1979-01-05,8.525833,0.0,5.631667,7.390833,7.196667,10.41125,1012.625,82.625,2.016667,77.791667,0.0,0.432083,0.0,0.0,0.0,99.916667,588.791667,5.25,15.958333
Alameda County,1979-01-06,7.796667,0.0,5.388333,6.862083,6.724583,9.253333,1013.375,85.458333,1.689583,101.041667,0.0,0.0,0.0,0.0,0.0,55.375,802.541667,1.0,11.541667
Alameda County,1979-01-07,9.375833,0.0,6.9425,8.557917,8.047917,11.13375,1017.5,85.291667,1.702083,154.625,0.0,0.185833,0.0,0.0,0.0,97.25,715.291667,2.75,16.75
Alameda County,1979-01-08,10.429583,0.0,9.325833,9.65375,9.264583,11.882917,1019.041667,92.875,3.82625,151.541667,0.0,0.814583,0.0,0.0,0.0,99.833333,538.333333,6.25,15.958333
Alameda County,1979-01-09,10.230417,0.0,9.030833,9.83,9.157083,11.687083,1020.583333,92.625,1.695,176.916667,0.0,0.297083,0.0,0.0,0.0,87.791667,727.833333,2.5,14.625
Alameda County,1979-01-10,10.388333,0.0,8.22625,9.72875,8.880417,11.928333,1024.375,87.375,1.67625,181.083333,0.0,0.0,0.0,0.0,0.0,93.583333,803.875,1.0,18.541667


In [54]:
df = daily_averages.copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 796176 entries, ('Alameda County', Timestamp('1979-01-01 00:00:00')) to ('Yuba County', Timestamp('2024-05-30 00:00:00'))
Data columns (total 19 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   temp                 796176 non-null  float64
 1   visibility           796176 non-null  float64
 2   dew_point            796176 non-null  float64
 3   feels_like           796176 non-null  float64
 4   temp_min             796176 non-null  float64
 5   temp_max             796176 non-null  float64
 6   pressure             796176 non-null  float64
 7   humidity             796176 non-null  float64
 8   wind_speed           796176 non-null  float64
 9   wind_deg             796176 non-null  float64
 10  wind_gust            796176 non-null  float64
 11  rain_1h              796176 non-null  float64
 12  rain_3h              796176 non-null  float64
 13  snow_1h              7961

In [55]:
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
county,dt_iso,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Alameda County,1979-01-01,5.233333,0.0,-2.91625,3.992083,3.474583,7.02625,1026.375,54.958333,1.763333,73.625,0.0,0.0,0.0,0.0,0.0,11.916667,800.5,0.166667,26.75
Alameda County,1979-01-02,4.5,0.0,-1.149167,2.939583,2.980833,6.405,1026.333333,67.375,1.892917,93.666667,0.0,0.0,0.0,0.0,0.0,95.166667,803.875,1.0,18.541667
Alameda County,1979-01-03,6.033333,0.0,-2.309167,4.630417,4.344167,8.6275,1019.291667,54.791667,1.73875,106.041667,0.0,0.009583,0.0,0.0,0.0,93.416667,791.208333,1.25,16.416667
Alameda County,1979-01-04,8.742083,0.0,2.02875,7.550833,6.7275,11.1575,1019.541667,63.5,1.90125,110.625,0.0,0.086667,0.0,0.0,0.0,91.875,702.5,3.0,16.083333
Alameda County,1979-01-05,8.525833,0.0,5.631667,7.390833,7.196667,10.41125,1012.625,82.625,2.016667,77.791667,0.0,0.432083,0.0,0.0,0.0,99.916667,588.791667,5.25,15.958333


In [56]:
df = df.reset_index(level=['county', 'dt_iso'])

df = df.rename(columns={'dt_iso': 'date'})
df.head()

Unnamed: 0,county,date,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,Alameda County,1979-01-01,5.233333,0.0,-2.91625,3.992083,3.474583,7.02625,1026.375,54.958333,...,73.625,0.0,0.0,0.0,0.0,0.0,11.916667,800.5,0.166667,26.75
1,Alameda County,1979-01-02,4.5,0.0,-1.149167,2.939583,2.980833,6.405,1026.333333,67.375,...,93.666667,0.0,0.0,0.0,0.0,0.0,95.166667,803.875,1.0,18.541667
2,Alameda County,1979-01-03,6.033333,0.0,-2.309167,4.630417,4.344167,8.6275,1019.291667,54.791667,...,106.041667,0.0,0.009583,0.0,0.0,0.0,93.416667,791.208333,1.25,16.416667
3,Alameda County,1979-01-04,8.742083,0.0,2.02875,7.550833,6.7275,11.1575,1019.541667,63.5,...,110.625,0.0,0.086667,0.0,0.0,0.0,91.875,702.5,3.0,16.083333
4,Alameda County,1979-01-05,8.525833,0.0,5.631667,7.390833,7.196667,10.41125,1012.625,82.625,...,77.791667,0.0,0.432083,0.0,0.0,0.0,99.916667,588.791667,5.25,15.958333


In [57]:
df['county'].unique()

array(['Alameda County', 'Amador County', 'Berkeley', 'Butte Valley',
       'Calaveras County', 'Colusa County', 'Contra Costa County',
       'El Dorado County', 'Fresno County', 'Humboldt County',
       'Imperial County', 'Kern County', 'Kings County', 'Lassen County',
       'Long Beach', 'Madera County', 'Marin County', 'Mendocino County',
       'Merced County', 'Monterey County', 'Napa County', 'Nevada County',
       'Orange County', 'Pasadena', 'Placer County', 'Riverside County',
       'Sacramento County', 'San Benito County', 'San Bernardino County',
       'San Diego County', 'San Francisco County', 'San Joaquin County',
       'San Luis Obispo County', 'San Mateo County',
       'Santa Barbara County', 'Santa Clara County', 'Santa Cruz County',
       'Siskiyou County', 'Solano County', 'Sonoma County',
       'Stanislaus County', 'Sutter County', 'Tehama County',
       'Tulare County', 'Tuolumne County', 'Ventura County',
       'Yolo County', 'Yuba County'], dtype=obj

In [58]:
cases['county'].unique()

array(['alameda', 'alameda county total', 'alpine', 'amador', 'berkeley',
       'butte', 'calaveras', 'california total', 'colusa', 'contra costa',
       'del norte', 'el dorado', 'fresno', 'glenn', 'humboldt',
       'imperial', 'inyo', 'kern', 'kings', 'lake', 'lassen',
       'long beach', 'los angeles', 'los angeles county total', 'madera',
       'marin', 'mariposa', 'mendocino', 'merced', 'modoc', 'mono',
       'monterey', 'napa', 'nevada', 'orange', 'pasadena', 'placer',
       'plumas', 'riverside', 'sacramento', 'san benito',
       'san bernardino', 'san diego', 'san francisco', 'san joaquin',
       'san luis obispo', 'san mateo', 'santa barbara', 'santa clara',
       'santa cruz', 'shasta', 'sierra', 'siskiyou', 'solano', 'sonoma',
       'stanislaus', 'sutter', 'tehama', 'trinity', 'tulare', 'tuolumne',
       'ventura', 'yolo', 'yuba'], dtype=object)

In [62]:
relabelled_df = df.copy()
relabelled_df['county'] = df['county'].str.replace(' County', '')
relabelled_df['county'] = relabelled_df['county'].str.lower()
relabelled_df.head()

Unnamed: 0,county,date,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,alameda,1979-01-01,5.233333,0.0,-2.91625,3.992083,3.474583,7.02625,1026.375,54.958333,...,73.625,0.0,0.0,0.0,0.0,0.0,11.916667,800.5,0.166667,26.75
1,alameda,1979-01-02,4.5,0.0,-1.149167,2.939583,2.980833,6.405,1026.333333,67.375,...,93.666667,0.0,0.0,0.0,0.0,0.0,95.166667,803.875,1.0,18.541667
2,alameda,1979-01-03,6.033333,0.0,-2.309167,4.630417,4.344167,8.6275,1019.291667,54.791667,...,106.041667,0.0,0.009583,0.0,0.0,0.0,93.416667,791.208333,1.25,16.416667
3,alameda,1979-01-04,8.742083,0.0,2.02875,7.550833,6.7275,11.1575,1019.541667,63.5,...,110.625,0.0,0.086667,0.0,0.0,0.0,91.875,702.5,3.0,16.083333
4,alameda,1979-01-05,8.525833,0.0,5.631667,7.390833,7.196667,10.41125,1012.625,82.625,...,77.791667,0.0,0.432083,0.0,0.0,0.0,99.916667,588.791667,5.25,15.958333


In [63]:
relabelled_df['county'] = relabelled_df['county'].str.replace(' valley', '')
relabelled_df.head()

Unnamed: 0,county,date,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,wind_deg,wind_gust,rain_1h,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description
0,alameda,1979-01-01,5.233333,0.0,-2.91625,3.992083,3.474583,7.02625,1026.375,54.958333,...,73.625,0.0,0.0,0.0,0.0,0.0,11.916667,800.5,0.166667,26.75
1,alameda,1979-01-02,4.5,0.0,-1.149167,2.939583,2.980833,6.405,1026.333333,67.375,...,93.666667,0.0,0.0,0.0,0.0,0.0,95.166667,803.875,1.0,18.541667
2,alameda,1979-01-03,6.033333,0.0,-2.309167,4.630417,4.344167,8.6275,1019.291667,54.791667,...,106.041667,0.0,0.009583,0.0,0.0,0.0,93.416667,791.208333,1.25,16.416667
3,alameda,1979-01-04,8.742083,0.0,2.02875,7.550833,6.7275,11.1575,1019.541667,63.5,...,110.625,0.0,0.086667,0.0,0.0,0.0,91.875,702.5,3.0,16.083333
4,alameda,1979-01-05,8.525833,0.0,5.631667,7.390833,7.196667,10.41125,1012.625,82.625,...,77.791667,0.0,0.432083,0.0,0.0,0.0,99.916667,588.791667,5.25,15.958333


In [64]:
df = relabelled_df.copy()
df['county'].unique()

array(['alameda', 'amador', 'berkeley', 'butte', 'calaveras', 'colusa',
       'contra costa', 'el dorado', 'fresno', 'humboldt', 'imperial',
       'kern', 'kings', 'lassen', 'long beach', 'madera', 'marin',
       'mendocino', 'merced', 'monterey', 'napa', 'nevada', 'orange',
       'pasadena', 'placer', 'riverside', 'sacramento', 'san benito',
       'san bernardino', 'san diego', 'san francisco', 'san joaquin',
       'san luis obispo', 'san mateo', 'santa barbara', 'santa clara',
       'santa cruz', 'siskiyou', 'solano', 'sonoma', 'stanislaus',
       'sutter', 'tehama', 'tulare', 'tuolumne', 'ventura', 'yolo',
       'yuba'], dtype=object)

In [65]:
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['day'] = df['date'].dt.day

df.shape

(796176, 24)

In [66]:
df.tail()

Unnamed: 0,county,date,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,rain_3h,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,year,month,day
796171,yuba,2024-05-26,17.932917,5998.75,9.315,17.349167,16.892083,19.286667,1014.833333,60.0,...,0.0,0.0,0.0,1.666667,746.583333,3.458333,12.916667,2024,5,26
796172,yuba,2024-05-27,21.892083,7804.25,9.457083,21.178333,20.345,23.061667,1013.708333,47.958333,...,0.0,0.0,0.0,0.833333,780.291667,1.291667,23.166667,2024,5,27
796173,yuba,2024-05-28,21.905417,6193.75,10.600833,21.289583,20.375,23.88875,1013.791667,51.833333,...,0.0,0.0,0.0,1.791667,793.5,0.458333,27.958333,2024,5,28
796174,yuba,2024-05-29,22.66875,8735.666667,8.403333,21.872917,21.579167,23.684167,1015.333333,43.791667,...,0.0,0.0,0.0,4.791667,797.0,0.458333,23.5,2024,5,29
796175,yuba,2024-05-30,23.311667,9166.666667,4.610417,22.104167,21.902917,24.907917,1013.583333,33.375,...,0.0,0.0,0.0,4.0,800.208333,0.208333,24.583333,2024,5,30


In [67]:
weather = df.copy()

merged_df = pd.merge(weather, cases, on=['county','year'],how='inner')
print(merged_df.shape)

(385680, 25)


In [68]:
merged_df.tail()

Unnamed: 0,county,date,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,year,month,day,cases
385675,yuba,2022-12-27,11.722727,5871.030303,10.57303,11.321515,10.936364,13.400303,1011.636364,93.0,...,0.0,0.0,100.0,583.060606,6.181818,15.727273,2022,12,27,2
385676,yuba,2022-12-28,8.491667,8750.0,6.945833,7.20625,7.870417,9.822083,1013.541667,90.416667,...,0.0,0.0,13.5,800.708333,0.333333,25.791667,2022,12,28,2
385677,yuba,2022-12-29,7.817714,8556.171429,6.982571,6.495714,7.108571,8.674857,1013.942857,94.628571,...,0.0,0.0,85.428571,622.228571,4.142857,15.628571,2022,12,29,2
385678,yuba,2022-12-30,10.258611,7048.638889,9.916111,8.996389,9.820556,11.912778,1012.861111,97.805556,...,0.0,0.0,98.611111,592.583333,6.25,16.861111,2022,12,30,2
385679,yuba,2022-12-31,13.0656,2744.96,12.1236,12.6052,12.3736,13.8116,1007.72,94.04,...,0.0,0.0,99.8,508.76,6.96,16.48,2022,12,31,2


In [69]:
len(merged_df['county'].unique())

48

In [38]:
merged_df['county'].unique()

array([], dtype=object)

In [70]:
len(merged_df['year'].unique())

22

In [71]:
day_threshold = 31
month_threshold = 8

# Create a boolean mask
mask = (merged_df['date'].dt.day <= day_threshold) & (merged_df['date'].dt.month <= month_threshold)

# Filter the DataFrame
filtered_merged_df = merged_df.loc[mask]
filtered_merged_df.head()

Unnamed: 0,county,date,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,year,month,day,cases
0,alameda,2001-01-01,8.39,4866.666667,1.287917,7.492083,6.574167,10.485833,1019.333333,61.833333,...,0.0,0.0,17.708333,781.0,1.5,20.25,2001,1,1,7
1,alameda,2001-01-02,8.55125,4883.333333,0.8625,7.215833,6.980417,10.632083,1023.583333,61.041667,...,0.0,0.0,5.166667,790.375,0.875,20.625,2001,1,2,7
2,alameda,2001-01-03,10.125,3416.666667,0.801667,9.01125,8.376667,12.681667,1026.916667,52.75,...,0.0,0.0,0.0,786.833333,0.833333,26.166667,2001,1,3,7
3,alameda,2001-01-04,10.165,3333.333333,-1.030417,8.8075,8.11125,13.0,1024.291667,46.041667,...,0.0,0.0,5.291667,800.25,0.166667,27.583333,2001,1,4,7
4,alameda,2001-01-05,11.111667,3566.666667,0.440417,9.98875,8.65375,13.720417,1020.458333,48.083333,...,0.0,0.0,0.0,790.125,0.625,27.125,2001,1,5,7


In [72]:
filtered_merged_df.shape

(256848, 25)

In [73]:
df = filtered_merged_df.copy()

len(df['month'].unique())

8

In [74]:
df.to_csv('df.csv', index=False)

In [75]:
df.columns

Index(['county', 'date', 'temp', 'visibility', 'dew_point', 'feels_like',
       'temp_min', 'temp_max', 'pressure', 'humidity', 'wind_speed',
       'wind_deg', 'wind_gust', 'rain_1h', 'rain_3h', 'snow_1h', 'snow_3h',
       'clouds_all', 'weather_id', 'weather_main', 'weather_description',
       'year', 'month', 'day', 'cases'],
      dtype='object')

In [76]:
df.shape

(256848, 25)

In [77]:
cols_to_normalize = [
    # 'temp',  
    # 'visibility', 
    # 'dew_point', 
    # 'feels_like',
    'temp_min', 
    'temp_max', 
    'pressure', 
    'humidity', 
    'wind_speed',
    'wind_deg', 
    'wind_gust', 
    'rain_1h', 
    'rain_3h', 
    'snow_1h', 
    'snow_3h',
    'clouds_all', 
    'weather_id', 
    'weather_main', 
    # 'weather_description'
    ]

scaler = MinMaxScaler()

df[cols_to_normalize] = scaler.fit_transform(df[cols_to_normalize])
df.head()

Unnamed: 0,county,date,temp,visibility,dew_point,feels_like,temp_min,temp_max,pressure,humidity,...,snow_1h,snow_3h,clouds_all,weather_id,weather_main,weather_description,year,month,day,cases
0,alameda,2001-01-01,8.39,4866.666667,1.287917,7.492083,0.414704,0.446981,0.585039,0.595049,...,0.0,0.0,0.177083,0.935525,0.166667,20.25,2001,1,1,7
1,alameda,2001-01-02,8.55125,4883.333333,0.8625,7.215833,0.421649,0.449506,0.657022,0.586649,...,0.0,0.0,0.051667,0.961806,0.097222,20.625,2001,1,2,7
2,alameda,2001-01-03,10.125,3416.666667,0.801667,9.01125,0.445518,0.484903,0.713479,0.498674,...,0.0,0.0,0.0,0.951877,0.092593,26.166667,2001,1,3,7
3,alameda,2001-01-04,10.165,3333.333333,-1.030417,8.8075,0.440981,0.490401,0.669019,0.427498,...,0.0,0.0,0.052917,0.989488,0.018519,27.583333,2001,1,4,7
4,alameda,2001-01-05,11.111667,3566.666667,0.440417,9.98875,0.450255,0.502842,0.604093,0.44916,...,0.0,0.0,0.0,0.961105,0.069444,27.125,2001,1,5,7


In [78]:
grouped = df.groupby(['county', 'year'])
x_list = []
y_list = []

for _, group_df in grouped:
    x = group_df.drop(['county', 'year', 'cases', 'month', 'date', 'day'], axis=1)
    x = x.iloc[:243]
    x = x.values 
    x_list.append(x)

    y = group_df['cases'].values[-1] 
    y_list.append(y)

x = np.stack(x_list) 
y = np.array(y_list)

x_tensor = torch.from_numpy(x).float()
y_tensor = torch.from_numpy(y).float()

print(x_tensor.shape, y_tensor.shape)


torch.Size([1056, 243, 19]) torch.Size([1056])


In [79]:
print(y_tensor)

tensor([7., 2., 9.,  ..., 2., 2., 2.])


In [80]:
print(x_tensor[0][1])

tensor([8.5513e+00, 4.8833e+03, 8.6250e-01, 7.2158e+00, 4.2165e-01, 4.4951e-01,
        6.5702e-01, 5.8665e-01, 1.6835e-01, 2.2360e-01, 0.0000e+00, 0.0000e+00,
        0.0000e+00, 0.0000e+00, 0.0000e+00, 5.1667e-02, 9.6181e-01, 9.7222e-02,
        2.0625e+01])


In [84]:


aug = []
for i in range(4):
    noise = torch.randn_like(x_tensor) * 0.001
    aug.append(x_tensor + noise)

aug.append(x_tensor)
augmented_x = torch.cat(aug, dim=0)

augmented_y = y_tensor.repeat(5)

print(augmented_x.shape)  
print(augmented_y.shape)  


torch.Size([5280, 243, 19])
torch.Size([5280])


In [89]:
dataset = TensorDataset(augmented_x, augmented_y)

total_size = len(dataset)
train_size = int(total_size * 0.8)
val_size = int(total_size * 0.15)
test_size = total_size - train_size - val_size

train, val, test = random_split(dataset, [train_size, val_size, test_size])

print(f'Train Size: {len(train)}, Val Size: {len(val)}, Test Size: {len(test)}')

Train Size: 4224, Val Size: 792, Test Size: 264


In [90]:
torch.save(train, '../../data/cleaned/train.pt')
torch.save(val, '../../data/cleaned/val.pt')
torch.save(test, '../../data/cleaned/test.pt')

print('Train, val, and test datasets saved')

Train, val, and test datasets saved
