This Notebook shows how we can find the stations with the most hot days in them. We can then pick out stations to analyze based on the number of hot days.

In [11]:
import netCDF4 as nc 

import numpy as np
import datetime as dt
import pandas as pd

import matplotlib.pylab as plt
 %matplotlib inline
    
import pickle

In [13]:
#directory to data CSV files
ghcnd_csv_dir = '/glade/p/work/ddvento/ML/McKinnon_data/ghcnd/ghcnd_all_csv/'

#empty dictionary to save stations and number of hot days to
station_dict = {}

#starting and ending dates for our analysis
start_year = 1982
end_year   = 2015
        
start_doy  = 175
end_doy    = 234
   
#temperature anomaly which classifies a day as hot or not-hot
cut_off = 6.5

#counter variable
count = 0

In [15]:
#we see that we have 1613 stations
for file in os.listdir(ghcnd_csv_dir):
    print(count, file.replace('.csv', ''))
    
    count = count + 1
    
    #read in file, replace nan values
    station = pd.read_csv(ghcnd_csv_dir+file, na_values=-9999)
    
    #create datetime dates so that we can subset for years and calendar days
    station['date']=pd.to_datetime(station['YYYY'].astype(str)+'-'+station['MM'].astype(str)+'-'+station['DD'].astype(str))
    station['jday'] = station['date'].dt.dayofyear.apply(lambda x: str(x).zfill(3)).astype(int)

    #subset for the correct years and calendar days
    yrs_data = station[(station['YYYY']>=start_year) & (station['YYYY']<=end_year)]
    stn_data= yrs_data[(yrs_data['jday']>=start_doy) & (yrs_data['jday']<=end_doy)]
    
    #calculate temperature anomaly
    var_anom= "TMAXANOM"
    means=stn_data.groupby(['MM','DD'])['TMAX'].transform('mean')
    stn_data[var_anom]= stn_data['TMAX'] - means
    
    #create cutoff label
    stn_data['HOT'] = np.where(stn_data['TMAXANOM']>= cut_off,1,0)
    
    #only keep stations that have some percentage of hot days
    if (stn_data.HOT.shape[0]==2040):
       station_dict[np.count_nonzero(stn_data.HOT)] = file

0 USC00231145
1 USC00121030
2 USC00467730
3 USC00364992
4 USW00003849
5 USC00315923
6 USC00233043
7 USC00035186
8 USC00079605
9 USC00391777
10 USC00130364
11 USC00274480
12 USC00036768
13 USC00441593
14 USC00134705
15 USC00033466
16 USC00191386
17 USC00162534
18 USC00144104
19 USC00408562
20 USC00145692
21 USC00234694
22 USC00445096
23 USC00460580
24 USC00316256
25 USC00318744
26 USW00013733
27 USC00407184
28 USC00365902
29 USC00404876
30 USC00251825
31 USC00445851
32 USW00003940
33 USC00135992
34 USC00368905
35 USC00473038
36 USC00366508
37 USC00011566
38 USC00253735
39 USC00468777
40 USC00118740
41 USC00253595
42 USC00470904
43 USC00435416
44 USC00014502
45 USC00125407
46 USW00093814
47 USC00136327
48 USC00225987
49 USC00035563
50 USC00238603
51 USC00410586
52 USW00013899
53 USC00218323
54 USC00113335
55 USC00368449
56 USC00471568
57 USC00221865
58 USC00178965
59 USC00057515
60 USW00014820
61 USC00216287
62 USC00461215
63 USC00140682
64 USW00013971
65 USW00093997
66 USW00094728
67 US

In [8]:
#print out our reduced list of stations with number of hot days
for key, value in station_dict.items():
    print(key, value)

45 USC00120200.csv
8 USW00093842.csv
22 USC00192501.csv
56 USC00032930.csv
71 USC00190736.csv
0 USC00410613.csv
35 USC00442941.csv
21 USW00003820.csv
28 USW00093738.csv
98 USC00258320.csv
5 USC00011620.csv
39 USC00123082.csv
49 USC00478027.csv
138 USC00393832.csv
46 USC00035200.csv
48 USC00123418.csv
15 USW00013876.csv
2 USC00312635.csv
4 USW00013748.csv
40 USC00235027.csv
64 USW00014925.csv
55 USC00252145.csv
50 USW00093822.csv
67 USC00398980.csv
72 USW00014933.csv
25 USC00386293.csv
18 USW00013723.csv
43 USC00145536.csv
115 USC00253910.csv
81 USW00014764.csv
37 USC00032794.csv
52 USC00118684.csv
3 USW00013970.csv
12 USW00013871.csv
30 USW00093720.csv
83 USC00250365.csv
1 USC00011080.csv
33 USW00093739.csv
61 USC00130157.csv
11 USW00003812.csv
63 USC00213174.csv
121 USC00328913.csv
93 USW00014916.csv
27 USW00003952.csv
85 USC00251450.csv
97 USC00254335.csv
47 USW00093730.csv
34 USC00192451.csv
132 USW00013984.csv
68 USC00174086.csv
19 USC00387631.csv
99 USW00014944.csv
69 USC00125174.

In [9]:
print(len(station_dict))

133


In [17]:
#we can sort the dictionary's values which show how many hot days there were
hot_list = list(station_dict.keys())
hot_sort = sorted(hot_list, reverse=True)

In [19]:
#then we can list the number of days 
print(hot_sort[:20])

[167, 162, 159, 154, 150, 148, 143, 142, 138, 136, 133, 132, 130, 124, 123, 121, 119, 118, 117, 116]


In [30]:
#we can use `station_dict` to find particular stations based on the number of hot days
print(station_dict[159])

USC00143527.csv


In [24]:
#then we can save this dictionary to file
dict_dir = '/glade/work/jakidxav/IPython/McKinnon/station_dict_text.txt'

with open(dict_dir, 'wb') as h:
    pickle.dump(station_dict, h)

In [27]:
#let's open the file and see if we saved it correctly
with open(dict_dir, 'rb') as filehandle:  
    # read the data as binary data stream
    stat_dict = pickle.load(filehandle)

for key, value in stat_dict.items():
    print(key, value)

45 USC00120200.csv
8 USW00093842.csv
22 USC00192501.csv
56 USC00032930.csv
71 USC00190736.csv
0 USC00410613.csv
35 USC00442941.csv
21 USW00003820.csv
28 USW00093738.csv
98 USC00258320.csv
5 USC00011620.csv
39 USC00123082.csv
49 USC00478027.csv
138 USC00393832.csv
46 USC00035200.csv
48 USC00123418.csv
15 USW00013876.csv
2 USC00312635.csv
4 USW00013748.csv
40 USC00235027.csv
64 USW00014925.csv
55 USC00252145.csv
50 USW00093822.csv
67 USC00398980.csv
72 USW00014933.csv
25 USC00386293.csv
18 USW00013723.csv
43 USC00145536.csv
115 USC00253910.csv
81 USW00014764.csv
37 USC00032794.csv
52 USC00118684.csv
3 USW00013970.csv
12 USW00013871.csv
30 USW00093720.csv
83 USC00250365.csv
1 USC00011080.csv
33 USW00093739.csv
61 USC00130157.csv
11 USW00003812.csv
63 USC00213174.csv
121 USC00328913.csv
93 USW00014916.csv
27 USW00003952.csv
85 USC00251450.csv
97 USC00254335.csv
47 USW00093730.csv
34 USC00192451.csv
132 USW00013984.csv
68 USC00174086.csv
19 USC00387631.csv
99 USW00014944.csv
69 USC00125174.