<h1><center><font color = 'blue'>Recreate 2D df (one line per level) CH7</font></center></h1>

In [1]:
import pandas as pd
import psutil
import boto3
import time
import os

from math import trunc

Pandas consumes a lot of RAM when dealing with large datasets, so let's define a function to keep an eye on how much memory we are using.

In [2]:
def print_ram_usage():
    """Print the resident memory currently used by this Python process.

    The resident set size (RSS) reported by ``psutil`` for the current PID is
    divided by 2**30 and rounded to two decimals before being printed.
    """
    current_process = psutil.Process(os.getpid())
    rss_bytes = current_process.memory_info().rss
    bytes_per_unit = float(2**30)
    print("RAM usage: {}GB".format(round(rss_bytes / bytes_per_unit, 2)))

### Import data

In [3]:
start = time.time()

# The dataset is stored as three HDF5 part files that share the same layout.
part_paths = ["../efs/data/amsua_n18_ch7_4pts_part1.h5",
              "../efs/data/amsua_n18_ch7_4pts_part2.h5",
              "../efs/data/amsua_n18_ch7_4pts_part3.h5"]

# The parts are read inside the call so that no notebook-level name keeps them
# alive after concatenation. ignore_index=True gives the result a fresh
# 0..n-1 row index directly, replacing the separate in-place reset_index step.
df = pd.concat([pd.read_hdf(path) for path in part_paths],
               axis=0, ignore_index=True)

end = time.time()
# Round the total duration first and only then split it into minutes and
# seconds; rounding the remainder on its own could print "60 sec".
elapsed_min, elapsed_sec = divmod(round(end - start), 60)
print("Data imported in: {} min and {} sec".format(elapsed_min, elapsed_sec))

print_ram_usage()

Data imported in: 4 min and 1 sec
RAM usage: 9.66GB


### Compute the sensitivity and hour features

In [4]:
# Derived features, added to df as two new columns:
#   SENSITIVITY - IMPACT divided element-wise by OMF
#   hour        - hour component of the DATETIME timestamps
df["SENSITIVITY"] = df["IMPACT"].div(df["OMF"])
df["hour"] = df["DATETIME"].dt.hour

In [5]:
# Display the last five rows to check the two newly added columns.
df.tail(n=5)

Unnamed: 0,DATETIME,IMPACT,LATITUDE,LONGITUDE,OMF,delp_0.015,delp_0.026,delp_0.04,delp_0.057,delp_0.078,...,v_873.975,v_888.632,v_903.289,v_917.946,v_932.602,v_947.258,v_961.913,v_976.62,SENSITIVITY,hour
4199834,2015-02-28 18:00:00,-4e-06,88.25,-96.519997,0.329822,1.0,1.27,1.488501,1.8415,2.3345,...,-0.010676,0.593499,1.136518,1.505242,1.942371,2.311975,2.691252,2.824533,-1.1e-05,18
4199835,2015-02-28 18:00:00,-0.000165,88.419998,111.849998,0.196722,1.0,1.27,1.488501,1.8415,2.3345,...,-4.826113,-4.162806,-3.729328,-3.426113,-2.980954,-2.648366,-2.550055,-2.323054,-0.000839,18
4199836,2015-02-28 18:00:00,-7.8e-05,89.189995,-171.569992,0.156673,1.0,1.27,1.488501,1.8415,2.3345,...,-4.714759,-5.14755,-5.511382,-5.582294,-6.103623,-6.21734,-5.986819,-5.48258,-0.000495,18
4199837,2015-02-28 18:00:00,-6.3e-05,89.269997,46.529999,0.084857,1.0,1.27,1.488501,1.8415,2.3345,...,5.708429,5.440742,5.119752,4.757587,4.508724,4.032255,3.689447,3.532344,-0.000743,18
4199838,2015-02-28 18:00:00,-6.4e-05,89.32,-62.049999,0.129242,1.0,1.27,1.488501,1.8415,2.3345,...,2.84172,2.137918,1.564459,1.39559,2.00031,2.625103,2.80118,2.76616,-0.000496,18


### Recreate 2D dataframe

In [6]:
start = time.time()

# Each vertical-profile variable is stored as N_LEVELS consecutive columns
# named "<variable>_<level>" (e.g. "delp_0.015").
N_LEVELS = 72

# Profile columns are recognised by the digits of the level in their name.
cols_2D = [col for col in df.columns if any(char.isdigit() for char in col)]
level_cols = ["delp", "u", "v", "tv", "sphu", "ozone", "qitot", "qltot"]
levels = [colname.split("_")[-1] for colname in cols_2D[:N_LEVELS]]

# Fail loudly rather than silently building misaligned columns when the
# profile columns cannot be cut into complete groups of N_LEVELS.
if not cols_2D or len(cols_2D) % N_LEVELS != 0:
    raise ValueError(
        "Expected a non-zero multiple of {} level columns, found {}".format(
            N_LEVELS, len(cols_2D)))

stacked_vars = []
for i in range(0, len(cols_2D), N_LEVELS):
    block_cols = cols_2D[i: i + N_LEVELS]
    var_name = block_cols[0].split("_")[0]
    if any(col.split("_")[0] != var_name for col in block_cols):
        raise ValueError(
            "Columns {}..{} do not all belong to variable '{}'".format(
                block_cols[0], block_cols[-1], var_name))
    # Row-major flattening puts the N_LEVELS values of one original row next
    # to each other, i.e. one output line per (row, level). Unlike
    # DataFrame.stack(), which drops NaN by default on older pandas, this keeps
    # every value, so all variables stay aligned even with missing data.
    stacked_vars.append(
        pd.Series(df[block_cols].to_numpy().ravel(), name=var_name))

# Repeat every row of the remaining (non-profile) columns N_LEVELS times.
# Repeating by position needs no sort and does not depend on df's index
# being sorted or unique.
row_positions = pd.RangeIndex(len(df)).repeat(N_LEVELS).to_numpy()
duplicate_2D = df.drop(cols_2D, axis=1).iloc[row_positions].reset_index(drop=True)

# Profile variables first, then the repeated per-observation columns.
df_2D = pd.concat(stacked_vars + [duplicate_2D], axis=1)

# Release the large intermediates; they would otherwise stay in kernel memory.
del stacked_vars, row_positions

end = time.time()
# Round the total duration first and only then split it into minutes and
# seconds; rounding the remainder on its own could print "60 sec".
elapsed_min, elapsed_sec = divmod(round(end - start), 60)
print("2D df recreated in: {} min and {} sec".format(elapsed_min, elapsed_sec))

print_ram_usage()

df_2D.tail()

2D df recreated in: 5 min and 42 sec
RAM usage: 81.79GB


Unnamed: 0,delp,ozone,qitot,qltot,sphu,tv,u,v,DATETIME,IMPACT,...,frocean,frseaice,hs_stdv,lat,lon,phis,ps,ts,SENSITIVITY,hour
302388403,1505.413818,0.023409,2e-06,7e-06,0.000402,247.908279,-3.216168,1.39559,2015-02-28 18:00:00,-6.4e-05,...,1.0,0.957943,0.0,89.5,-61.875,0.0,100251.609375,251.866562,-0.000496,18
302388404,1505.370972,0.023913,7e-06,2.5e-05,0.000332,245.818283,-2.74884,2.00031,2015-02-28 18:00:00,-6.4e-05,...,1.0,0.957943,0.0,89.5,-61.875,0.0,100251.609375,251.866562,-0.000496,18
302388405,1505.416626,0.024183,1.2e-05,1.9e-05,0.000345,246.367722,-3.042329,2.625103,2015-02-28 18:00:00,-6.4e-05,...,1.0,0.957943,0.0,89.5,-61.875,0.0,100251.609375,251.866562,-0.000496,18
302388406,1505.447754,0.024248,7e-06,9e-06,0.000376,247.490265,-3.394604,2.80118,2015-02-28 18:00:00,-6.4e-05,...,1.0,0.957943,0.0,89.5,-61.875,0.0,100251.609375,251.866562,-0.000496,18
302388407,1503.786255,0.024266,3e-06,3e-06,0.000398,248.628876,-3.535062,2.76616,2015-02-28 18:00:00,-6.4e-05,...,1.0,0.957943,0.0,89.5,-61.875,0.0,100251.609375,251.866562,-0.000496,18


### Save data

In [None]:
start = time.time()

# Write the long-format frame to HDF5 under key "df" with compression level 9.
df_2D.to_hdf("../efs/df_2D.h5", key="df", complevel=9)

end = time.time()
# Round the total duration first and only then split it into minutes and
# seconds; rounding the remainder on its own could print "60 sec".
elapsed_min, elapsed_sec = divmod(round(end - start), 60)
print("2D df saved and compressed in: {} min and {} sec".format(elapsed_min, elapsed_sec))