In [4]:
#! /usr/bin/env python3

import imports

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by the
# functions below - confirm that is intended.
# NOTE(review): `import imports` only binds the name `imports`; `warnings`,
# `ic`, `pd`, `time`, `os` and `display` must be brought into scope elsewhere
# (perhaps `from imports import *` was intended) - confirm.
warnings.filterwarnings("ignore")
# Ask `ic` (presumably icecream's debug printer - confirm) to include
# call-site context in its output.
ic.configureOutput(includeContext=True)


def get_data_train(path):
    """Load a training CSV and return it sorted by station and date.

    Parameters
    ----------
    path : str or path-like
        Location of a CSV file containing at least the columns
        ``number_sta`` and ``date``.

    Returns
    -------
    pandas.DataFrame
        The file contents with ``date`` parsed as datetimes, rows ordered by
        ``number_sta`` then ``date``, and ``number_sta`` cast to ``category``.
    """
    # `infer_datetime_format` is intentionally not passed: it is deprecated
    # since pandas 2.0, where consistent-format inference is the default, so
    # the parsed result is the same without the deprecated keyword.
    df = pd.read_csv(path, parse_dates=["date"])

    # Order chronologically within each station.
    df = df.sort_values(by=["number_sta", "date"])

    # Cast after sorting so the sort runs on the raw station values.
    df["number_sta"] = df["number_sta"].astype("category")

    return df


def get_data_test(path):
    """Read the CSV file at ``path`` and return it as a DataFrame.

    No date parsing, sorting or dtype conversion is applied.
    """
    return pd.read_csv(path)


def get_observations(x, displ=False):
    """Build a baseline that reuses each station's previous ``precip`` value.

    Parameters
    ----------
    x : pandas.DataFrame
        Must contain the columns ``number_sta``, ``date`` and ``precip``.
        It is not modified.
    displ : bool, default False
        When true, show the result with ``display`` before returning it.

    Returns
    -------
    pandas.DataFrame
        Columns ``number_sta``, ``date`` and ``precip``, sorted by
        ``number_sta`` then ``date``. ``precip`` holds the value of the
        previous row of the same station (``NaN`` for a station's first row).

    Notes
    -----
    NOTE(review): the shift is by one *row* in the incoming order of ``x``,
    so it equals "previous observation in time" only when ``x`` is already
    ordered by date within each station - confirm against callers.
    """
    # Select with a list: a set indexer raises TypeError on pandas >= 2.0 and
    # produced an arbitrary column order before that. The copy guarantees the
    # assignments below never write into (a view of) the caller's frame.
    obs = x[["number_sta", "date", "precip"]].copy()

    obs["date"] = obs["date"].astype("category")
    obs["number_sta"] = obs["number_sta"].astype("category")

    # Previous row's precipitation within each station.
    obs["baseline_obs"] = obs.groupby(["number_sta"])["precip"].shift(1)

    obs = obs.sort_values(by=["number_sta", "date"])

    # Replace the observed precipitation by the shifted baseline.
    obs = obs.drop(columns="precip").rename(columns={"baseline_obs": "precip"})

    if displ:
        display(obs)

    return obs

In [8]:
def preprocess_merge_x_y(x, y=None, sort=True, data_type="train", verbose=False, save_path=False):
    """Aggregate station records per ``Id`` and optionally join the targets.

    Parameters
    ----------
    x : pandas.DataFrame, str or path-like
        Feature records, or the path of a CSV to load. Must contain an ``Id``
        column of underscore-separated strings; only the first two fields are
        kept (presumably ``<station>_<day>_<hour>`` - TODO confirm). A
        ``date`` column, if present, is dropped. A DataFrame passed here is
        modified in place (``Id`` is shortened, columns are cast/added).
    y : pandas.DataFrame, str, path-like or None, default None
        Targets, or the path of a CSV to load. Must contain ``Id`` and
        ``number_sta``. A DataFrame passed here has those two columns cast to
        ``category`` in place. When ``None`` no merge is performed.
    sort : bool, default True
        Sort the result by ``number_sta`` then ``day``.
    data_type : str, default "train"
        ``"train"`` loads paths with ``get_data_train``; any other value uses
        ``get_data_test``.
    verbose : bool, default False
        Print progress and timing information.
    save_path : str, path-like or False, default False
        When truthy, the parent directory of this path is created.
        NOTE(review): the CSV itself is not written - the ``to_csv`` call has
        been disabled in this function; confirm whether it should be restored.

    Returns
    -------
    pandas.DataFrame
        One row per shortened ``Id`` holding the per-``Id`` sum of the
        remaining columns of ``x``, the columns of ``y`` (left join on
        ``Id``) when given, plus ``day`` and ``number_sta``.
    """
    t_total = time.time()

    def _log(message):
        # Single place for the verbosity check.
        if verbose:
            print(message)

    def _log_elapsed(t_start):
        _log(f"elapsed : {time.time() - t_start:.2f}s ; total : {time.time() - t_total:.2f}s")

    # Acquire data from file if the input is a path.
    if isinstance(x, (str, os.PathLike)):
        t = time.time()
        _log("type x is str, try to retrieve from path")
        # differentiate x_train from x_test
        x = get_data_train(path=x) if data_type == "train" else get_data_test(path=x)
        _log_elapsed(t)
    if isinstance(y, (str, os.PathLike)):
        t = time.time()
        _log("type y is str, try to retrieve from path")
        # differentiate y_train from y_test
        y = get_data_train(path=y) if data_type == "train" else get_data_test(path=y)
        _log_elapsed(t)

    # ---- preprocess x ----
    t = time.time()
    _log("\npreprocess x...")
    _log("\tprocess Id")
    # Keep only the first two underscore-separated fields of the Id.
    x["Id"] = x["Id"].apply(lambda raw_id: "_".join(raw_id.split("_")[:2]))
    # Derive number_sta from the Id when the column is absent.
    if "number_sta" not in x.columns:
        x["number_sta"] = x["Id"].apply(lambda raw_id: int(raw_id.split("_")[0]))
    _log("\tset astype category")
    x["Id"] = x["Id"].astype("category")
    x["number_sta"] = x["number_sta"].astype("category")
    if "month" in x.columns:
        # NOTE(review): a categorical column cannot be summed by the
        # aggregation below; depending on the pandas version it is silently
        # dropped or raises - confirm the intended handling of `month`.
        x["month"] = x["month"].astype("category")
    if "date" in x.columns:
        _log("\tdrop date")
        x = x.drop("date", axis=1)
    _log("\taggregate")
    # "sum" selects the built-in group-by sum; passing the `pd.Series.sum`
    # callable is deprecated in recent pandas.
    x = x.drop("number_sta", axis=1).groupby("Id").agg("sum")
    _log_elapsed(t)

    if y is not None:
        # ---- preprocess y ----
        t = time.time()
        _log("\npreprocess y")
        _log("\tset astype category")
        y["number_sta"] = y["number_sta"].astype("category")
        y["Id"] = y["Id"].astype("category")
        if "date" in y.columns:
            _log("\tdrop date")
            y = y.drop("date", axis=1)
        _log_elapsed(t)

        # ---- merge x and y ----
        t = time.time()
        _log("merge x and y")
        # `Id` is the index of x (from the group-by) and a column of y.
        x = x.merge(y, how="left", on="Id")
        # x and y are expected to have the same number of rows; a mismatch is
        # only reported, and only in verbose mode.
        if len(x) != len(y):
            _log("DimensionWarning : len(x) != len(y) : {} != {}".format(len(x), len(y)))
        _log_elapsed(t)
    else:
        # Bring `Id` back as a regular column.
        x = x.reset_index()

    # ---- derive day from the second Id field ----
    t = time.time()
    _log("\nget day")
    x["day"] = x["Id"].apply(lambda raw_id: int(raw_id.split("_")[1]))
    _log_elapsed(t)

    # number_sta was dropped before aggregating; without a merge with y it
    # has to be rebuilt from the first Id field.
    if "number_sta" not in x.columns:
        x["number_sta"] = x["Id"].apply(lambda raw_id: int(raw_id.split("_")[0]))

    if sort:
        t = time.time()
        _log("sorting by number_sta then day")
        x = x.sort_values(["number_sta", "day"])
        _log_elapsed(t)

    _log(f"total time elapsed : {time.time() - t_total:.2f}s")

    if save_path:
        out_dir = os.path.dirname(save_path)
        # makedirs creates missing intermediate directories and tolerates an
        # existing one; an empty dirname means "current directory".
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # NOTE(review): writing the file is disabled, so only the directory
        # is prepared - confirm whether `x.to_csv(save_path)` should run here.
    return x

In [9]:
# Build the training frame: station features are summed per Id and left-joined
# with the targets. Both paths are loaded through get_data_train because
# data_type == "train".
# NOTE(review): save_path only causes its parent directory to be created; the
# to_csv call inside preprocess_merge_x_y is commented out - confirm whether
# the file is expected to be written.
x_train = preprocess_merge_x_y(x = '../Train/Train/X_station_train.csv',
                               y = '../Train/Train/Y_train.csv',
                               sort      = True,
                               data_type = "train",
                               verbose   = False,
                               save_path = "../preprocessed/aggregated/X_station.csv")

In [7]:
# Build the test frame: no y is passed, so the features are only summed per
# Id, given `day` / `number_sta` columns and sorted (sort defaults to True).
# The path is loaded through get_data_test because data_type != "train".
# NOTE(review): this cell is numbered In [7] but calls a function defined in
# the cell numbered In [8] - the notebook was apparently executed out of
# order; confirm it survives Restart Kernel -> Run All.
# NOTE(review): save_path only causes its parent directory to be created; the
# to_csv call inside preprocess_merge_x_y is commented out.
x_test = preprocess_merge_x_y(x = "../Test/Test/X_station_test.csv",
                              data_type = "test",
                              verbose   = False,
                              save_path = "../preprocessed/aggregated/X_test.csv")