# Compress Data for Transfer

The goal of this notebook is to resample the downloaded CSVs by taking the maximum hourly wind speed and then averaging across the 3 stations per county. This should yield roughly a 50x compression factor, making the data much more manageable to transfer.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [9]:
# Resolve the raw input directory for the chosen state and list its files.
state = "Ohio"
path = "/Users/julianschmitt/Downloads/Direcho/{}".format(state)
files = os.listdir(path)
paths = [os.path.join(path, f) for f in files]

# Make sure the output directory exists. The previous bare `except:` printed
# "Directory already made." for *any* failure (missing parent directory,
# permission error, ...), which hid real problems until every later write
# failed. Check explicitly instead, and create missing parents as needed.
processed_dir = "/Users/julianschmitt/Downloads/Direcho/processed/{}".format(state)
if os.path.isdir(processed_dir):
    print("Directory already made.")
else:
    os.makedirs(processed_dir)

In [10]:
# Collect the unique "<State>_<County>" keys found in the raw filenames
# (filenames look like "<id>_<State>_<County>_5.csv").
# Only CSV names are considered: a stray entry such as ".DS_Store" would
# otherwise contribute the bogus key "Store", which can even sort first and
# become counties[0] for states whose name sorts after "S".
counties = np.unique(
    ["_".join(file.split("_")[1:3]) for file in files if file.endswith(".csv")]
)

def process_raw(path):
    """Collapse one raw station CSV into one row per (Year, Month, Day, Hour).

    The file at ``path`` is read with pandas and grouped by the four
    calendar columns. Within each group the three wind-speed columns
    (10m, 40m, 100m) are reduced with ``max`` and the 10m air temperature
    is reduced with ``mean``.

    Returns a DataFrame indexed by (Year, Month, Day, Hour) whose columns
    are, in order, the three wind-speed columns followed by the 10m air
    temperature column.
    """
    raw = pd.read_csv(path, low_memory=False)
    # One aggregation pass; dict order fixes the output column order.
    reducers = {
        "wind speed at 10m (m/s)": "max",
        "wind speed at 40m (m/s)": "max",
        "wind speed at 100m (m/s)": "max",
        "air temperature at 10m (C)": "mean",
    }
    hourly = raw.groupby(["Year", "Month", "Day", "Hour"]).agg(reducers)
    return hourly

In [11]:
#ar = [i for i in files if counties[0] in i]
# Compress every raw station file to hourly rows and write the result, under
# the same filename, into the per-state "processed" directory.

# Short output column names, keyed by the raw column names that
# process_raw returns.
column_names = {
    "wind speed at 10m (m/s)": "wind_10ms",
    "wind speed at 40m (m/s)": "wind_40ms",
    "wind speed at 100m (m/s)": "wind_100ms",
    "air temperature at 10m (C)": "temp_10",
}

for fname, fpath in tqdm(zip(files, paths), total=len(paths)):
    try:
        processed = process_raw(fpath)
        # Flatten the (Year, Month, Day, Hour) index into ordinary columns in
        # one vectorized step instead of rebuilding each column row by row.
        cleaned = processed.reset_index().rename(columns=column_names)
        new_loc = "/Users/julianschmitt/Downloads/Direcho/processed/{}/{}".format(state, fname)
        # The default RangeIndex is still written as the first CSV column,
        # matching the files produced by earlier runs.
        cleaned.to_csv(new_loc)
    except Exception as err:
        # `Exception` rather than a bare `except:` so that interrupting this
        # long-running loop (KeyboardInterrupt) actually stops it, and the
        # reason for each skipped file is reported instead of hidden.
        print("{} could not be loaded ({}: {})".format(fname, type(err).__name__, err))

100%|██████████| 179/179 [13:26<00:00,  4.51s/it]


In [9]:
# Every raw path whose text contains the first county key.
lst = list(filter(lambda candidate: counties[0] in candidate, paths))
lst

['/Users/julianschmitt/Downloads/Direcho/Nebraska/50861_Nebraska_Adams_5.csv',
 '/Users/julianschmitt/Downloads/Direcho/Nebraska/51019_Nebraska_Adams_5.csv',
 '/Users/julianschmitt/Downloads/Direcho/Nebraska/50862_Nebraska_Adams_5.csv']

In [14]:
# Directory part of the first raw file path (the head of the head/tail split).
os.path.split(paths[0])[0]


'/Users/julianschmitt/Downloads/Direcho/Nebraska'

In [82]:
# Names present in the raw listing but absent from the compact output folder.
files2 = os.listdir("../../direcho_compact")
dif = set(files) - set(files2)
dif

{'.DS_Store'}

In [70]:
# Exploratory check on a single station file.
# `ar` was only ever defined in a commented-out line elsewhere in the
# notebook, so this cell raised NameError on a fresh kernel; it is rebuilt
# here from that commented definition.
# NOTE(review): assumes the names in `files` also exist under
# ../../direcho_data - confirm before relying on this cell.
ar = [i for i in files if counties[0] in i]
wind = pd.read_csv(f"../../direcho_data/{ar[0]}")
# Hourly maximum of each selected column (note: this takes the max of the
# 100m air temperature as well, not its mean).
processed = wind.groupby(["Year","Month","Day","Hour"])[["wind speed at 10m (m/s)",'wind speed at 40m (m/s)',"wind speed at 100m (m/s)",'air temperature at 100m (C)']].max()

In [71]:
# Plain-text dump of the hourly aggregate, to inspect its grouped index and columns.
print(processed)

                     wind speed at 10m (m/s)  wind speed at 40m (m/s)  \
Year Month Day Hour                                                     
2007 1     1   0                        7.26                     9.81   
               1                        6.96                     9.44   
               2                        7.08                     9.58   
               3                        6.84                     9.37   
               4                        6.63                     9.12   
...                                      ...                      ...   
2014 12    31  19                       6.24                     7.25   
               20                       6.34                     7.44   
               21                       6.12                     7.44   
               22                       5.48                     7.18   
               23                       3.73                     6.64   

                     wind speed at 100m (m/s)  air

In [72]:
# Confirm the aggregate is a DataFrame, then flatten it into plain columns.
print(type(processed))
# Split each 4-tuple index key into its Year / Month / Day / Hour parts.
year, month, day, hour = [[key[level] for key in processed.index] for level in range(4)]
# Split each value row positionally into the four aggregated measurements.
wind_10, wind_40, wind_100, temp_100 = [[row[col] for row in processed.values] for col in range(4)]
pd.DataFrame({
    "Year": year,
    "Month": month,
    "Day": day,
    "Hour": hour,
    'wind_10ms': wind_10,
    'wind_40ms': wind_40,
    'wind_100ms': wind_100,
    'temp_100': temp_100,
})

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,Year,Month,Day,Hour,wind_10ms,wind_40ms,wind_100ms,temp_100
0,2007,1,1,0,7.26,9.81,11.85,-1.85
1,2007,1,1,1,6.96,9.44,11.55,-2.19
2,2007,1,1,2,7.08,9.58,11.70,-2.59
3,2007,1,1,3,6.84,9.37,11.57,-2.55
4,2007,1,1,4,6.63,9.12,11.44,-2.49
...,...,...,...,...,...,...,...,...
70075,2014,12,31,19,6.24,7.25,7.84,-3.51
70076,2014,12,31,20,6.34,7.44,8.09,-3.07
70077,2014,12,31,21,6.12,7.44,8.35,-2.89
70078,2014,12,31,22,5.48,7.18,8.62,-2.92


In [37]:
# First 24 raw rows, restricted to the hour and the 10m wind speed.
wind[['Hour', 'wind speed at 10m (m/s)']].head(24)

Unnamed: 0,Hour,wind speed at 10m (m/s)
0,0,6.28
1,0,6.5
2,0,6.92
3,0,7.13
4,0,7.13
5,0,7.18
6,0,7.26
7,0,6.87
8,0,6.96
9,0,6.71


In [74]:
# Display the raw directory listing currently held in `files`.
files

['43780_Kansas_Jackson_5.csv',
 '41464_Kansas_Russell_5.csv',
 '36362_Kansas_Haskell_5.csv',
 '46878_Kansas_Republic_5.csv',
 '31911_Kansas_Chautauqua_5.csv',
 '39397_Kansas_Finney_5.csv',
 '41353_Kansas_Ellsworth_5.csv',
 '38830_Kansas_Pawnee_5.csv',
 '43161_Kansas_Riley_5.csv',
 '32521_Kansas_Morton_5.csv',
 '35967_Kansas_Kingman_5.csv',
 '39245_Kansas_Lyon_5.csv',
 '41456_Kansas_Ness_5.csv',
 '34930_Kansas_Ford_5.csv',
 '38327_Kansas_Greenwood_5.csv',
 '43977_Kansas_Rooks_5.csv',
 '31780_Kansas_Montgomery_5.csv',
 '45566_Kansas_Marshall_5.csv',
 '37995_Kansas_Hodgeman_5.csv',
 '45915_Kansas_Smith_5.csv',
 '44184_Kansas_Rooks_5.csv',
 '39904_Kansas_Franklin_5.csv',
 '33028_Kansas_Barber_5.csv',
 '31771_Kansas_Morton_5.csv',
 '32143_Kansas_Stevens_5.csv',
 '47967_Kansas_Cheyenne_5.csv',
 '46987_Kansas_Republic_5.csv',
 '43873_Kansas_Sheridan_5.csv',
 '33291_Kansas_Barber_5.csv',
 '46339_Kansas_Rawlins_5.csv',
 '36612_Kansas_Grant_5.csv',
 '33299_Kansas_Kingman_5.csv',
 '47269_Kansas_D