In [1]:
import numpy as np
import pandas as pd

from typing import List
from typing import Union
from typing import Tuple
from hacktops.settings import NB_SAMPLES, SHIFT_STEP, WINDOW_LENGTH
import pickle
from time import sleep


def get_well_relevant_windows(top_index: int, df_well: pd.DataFrame, nb_samples: int = NB_SAMPLES,
                              shift: int = SHIFT_STEP, ratio: Union[None, float] = None) \
        -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """
    Cut labelled GR windows out of a single well around a known top.

    Window centres run over range(top_index - nb_samples, top_index + nb_samples, shift).
    Each window holds the 2 * WINDOW_LENGTH + 1 consecutive 'GR' samples centred on
    its centre row and is returned as an array of shape (2 * WINDOW_LENGTH + 1, 1).
    A window is labelled True when the 'DEPTH' of its centre row is less than 4 away
    from the 'DEPTH' at top_index, and False otherwise.

    Windows that would extend past either end of the well are skipped.

    :param top_index: int, positional row of the top inside df_well
    :param df_well: pd.DataFrame(columns=['wellName', 'DEPTH', 'GR'])
    :param nb_samples: int, number of rows scanned on each side of top_index
    :param shift: int=SHIFT_STEP, step between two consecutive window centres
    :param ratio: Union[None, float]=None, when truthy, negative windows are
        filtered according to the running share of negatives seen so far
        (positive windows are always kept)
    :return: (windows, labels), two lists of equal length; every label is a
        0-d boolean array
    """
    gr_values = df_well['GR'].values
    depth_values = df_well['DEPTH'].values
    nb_rows = len(gr_values)

    windows = []
    labels = []
    positives = 0
    negatives = 0
    for center in range(top_index - nb_samples, top_index + nb_samples, shift):
        left_limit = center - WINDOW_LENGTH
        right_limit = center + WINDOW_LENGTH
        # Explicit bounds check: a negative limit must never reach the slice
        # below, because negative indices wrap around and would silently return
        # a full-length window taken from the END of the well.
        if left_limit < 0 or right_limit >= nb_rows:
            continue

        # Copy so the returned window does not alias the DataFrame's buffer.
        window = gr_values[left_limit:right_limit + 1].reshape(-1, 1).copy()
        label = abs(depth_values[center] - depth_values[top_index]) < 4

        keep = True
        if ratio and not label:
            # NOTE(review): a negative window is dropped while the share of
            # negatives seen so far (dropped ones included) is below `ratio`
            # and kept once that share reaches it. Behaviour preserved as
            # found -- confirm this is the intended direction.
            keep = not (negatives / max(positives + negatives, 1) < ratio)

        if keep:
            windows.append(window)
            labels.append(np.array(label))

        # The counters track every in-bounds window, whether kept or dropped.
        if label:
            positives += 1
        else:
            negatives += 1

    return windows, labels


# TODO: check index and len depth

def generate_top_dataset(df_logs: pd.DataFrame, df_tops: pd.DataFrame,
                         top: str = 'CONRAD', ratio: Union[None, float] = None,
                         nb_samples: int = 100, shift: int = 1) \
        -> Tuple[List[np.ndarray], List[np.ndarray]]:
    """
    Build the labelled window dataset of one top over every well of df_logs.

    For each distinct 'wellName' of df_logs, the depth of `top` is read from
    df_tops (looked up by well name through its index), the row of the well
    whose 'DEPTH' is nearest to it is located, and the windows around that row
    are collected with get_well_relevant_windows (see that function for how
    windows are selected and labelled).

    A well is skipped, with a message printed, when:
     - its top depth is NaN in df_tops ("NAN FOUND");
     - the nearest logged depth is more than 3 away from the top depth
       ("DATA BAD LABELLED").

    df_logs contains : 'wellName', 'DEPTH', 'GR'
    df_tops contains a column top

    :param df_logs: pd.DataFrame
    :param df_tops: pd.DataFrame
    :param top: str='CONRAD'
    :param ratio: Union[None, float]=None, forwarded to get_well_relevant_windows
    :param nb_samples: int=100, rows scanned on each side of the top
    :param shift: int=1, step between two consecutive window centres
    :return: (windows, labels) concatenated over all the wells kept
    :raises KeyError: if a well of df_logs has no row in df_tops
    """
    windows = []
    labels = []
    for well_name in df_logs['wellName'].drop_duplicates().tolist():
        df_well = df_logs[df_logs['wellName'] == well_name]
        top_position = df_tops.loc[well_name][top]
        if np.isnan(top_position):
            print("NAN FOUND")
            continue

        # The top depth is not always an exact sample of the log, so take the
        # nearest logged depth. A single vectorized pass yields its position
        # directly (first occurrence on ties).
        depths = df_well['DEPTH'].values
        top_index = int(np.nanargmin(np.abs(depths - top_position)))
        real_top_position = depths[top_index]
        print("Real ", real_top_position)
        if abs(real_top_position - top_position) > 3:
            print("DATA BAD LABELLED")
            continue
        print(top_index)

        windows_, labels_ = get_well_relevant_windows(top_index=top_index, df_well=df_well,
                                                      shift=shift, nb_samples=nb_samples,
                                                      ratio=ratio)
        windows += windows_
        labels += labels_
    return windows, labels


def get_location_dataset(df_loc: pd.DataFrame, df_tops: pd.DataFrame, top: str):
    """
    Attach the depth of one top to the well locations.

    df_loc and df_tops are inner-joined on their indexes, rows whose `top`
    value is null are dropped, and the index is then replaced by a fresh
    0..n-1 range (the original index labels are discarded).

    result :
     - index : 0..n-1
     - columns : the columns of df_loc, followed by top
    top must be a column of df_tops

    :param df_loc: pd.DataFrame
    :param df_tops: pd.DataFrame
    :param top: str
    :return: pd.DataFrame
    """
    assert top in df_tops
    top_column = df_tops[[top]]
    located = df_loc.merge(top_column, how='inner', left_index=True, right_index=True)
    located = located.dropna(subset=[top])
    return located.reset_index(drop=True)

def change_shape(window_data) -> np.ndarray:
    """
    Stack a sequence of windows into a 2-D array with one row per window.

    The input is converted with np.array and reshaped to
    (number of windows, size of the second axis), which drops a trailing axis
    of length 1 such as the one of windows shaped (length, 1).

    :param window_data: sequence of equally shaped windows
    :return: np.ndarray with 2 dimensions; an empty input gives an empty
        array of shape (0, 0)
    """
    stacked = np.array(window_data)
    if stacked.size == 0 and stacked.ndim < 2:
        # np.array([]) is 1-D, so there is no second axis to read the window
        # length from; return an empty 2-D array.
        return stacked.reshape(0, 0)
    return stacked.reshape(stacked.shape[0], stacked.shape[1])

def clean_df_logs(df_logs_):
    """
    Return the rows of df_logs_ that pass the sanity filters.

    A row is kept when its 'GR' value is not negative and its 'DEPTH' lies
    between 5000 and 8000, both bounds included. The input frame is not
    modified.

    :param df_logs_: pd.DataFrame with at least the columns 'GR' and 'DEPTH'
    :return: pd.DataFrame, the filtered rows in their original order
    """
    gr_is_valid = df_logs_['GR'] >= 0
    depth = df_logs_['DEPTH']
    depth_in_range = (depth >= 5000) & (depth <= 8000)
    return df_logs_[gr_is_valid & depth_in_range]


if __name__ == '__main__':
    # Top to build the dataset for; it must be a column of tops.parquet.
    # ('MARCEL' and 'SYLVAIN' were the alternatives left commented out here.)
    top_ = 'CONRAD'

    # Nominal split sizes: the first `train_size` windows are the training
    # set and the last `test_size` windows are the verification set.
    train_size = 100000
    test_size = 10000

    # Load the logs and the tops, then drop the unusable log rows.
    df_logs_ = pd.read_parquet("data/logs.parquet")
    df_tops_ = pd.read_parquet("data/tops.parquet")
    df_logs_ = clean_df_logs(df_logs_)

    X, y = generate_top_dataset(df_logs=df_logs_, df_tops=df_tops_, top=top_)
    X = change_shape(X)

    print("Total data number is {}".format(len(X)))
    print("The window length is {}".format(WINDOW_LENGTH))

    # Keep the two sets disjoint. Slicing X[:100000] and X[-10000:] directly
    # makes them overlap as soon as fewer than 110,000 windows are available,
    # which puts the verification samples inside the training set. With
    # enough data the nominal sizes are used unchanged; otherwise the same
    # 10:1 proportion is applied to what is available.
    nb_total = len(X)
    if nb_total >= train_size + test_size:
        nb_train, nb_test = train_size, test_size
    else:
        nb_test = nb_total * test_size // (train_size + test_size)
        nb_train = nb_total - nb_test

    # The test slice starts at an explicit index: X[-0:] would be the whole
    # array when nb_test is 0.
    X_train = np.array(X[:nb_train])
    X_test = np.array(X[nb_total - nb_test:])
    y_train = np.array(y[:nb_train])
    y_test = np.array(y[nb_total - nb_test:])

    dataset = [X_train, y_train, X_test, y_test]

    # Save the dataset; the file name records the top, the window length and
    # the number of training samples.
    with open("dataset_{}_{}_{}.pickle".format(top_, WINDOW_LENGTH, len(X_train)), "wb") as f:
        pickle.dump(dataset, f)



Real  6661.0
3322
Real  6676.0
3352
Real  6793.0
3586
Real  6816.0
3632
Real  6789.0
3578
Real  6760.0
3520
Real  6735.0
3470
Real  6782.0
3564
NAN FOUND
Real  6710.0
3420
Real  6606.0
3212
Real  6528.0
3056
Real  6520.0
3040
Real  6282.0
2564
Real  6554.0
3108
Real  6546.0
3092
Real  6660.0
3320
Real  6714.0
3428
Real  6812.0
3624
Real  6776.0
3552
Real  6602.0
3204
Real  6606.0
3212
Real  6566.0
3132
Real  6761.0
3522
Real  6794.0
3588
Real  6734.0
3468
Real  6765.0
3530
Real  6842.0
3684
Real  6788.0
3576
Real  6280.0
2560
Real  6626.0
3252
Real  6612.0
3224
Real  6641.0
3282
Real  6795.0
3590
Real  6755.0
3510
Real  6786.0
3572
Real  6755.0
3510
Real  6770.0
3540
Real  6254.0
2508
Real  6238.0
2476
Real  6630.0
3260
Real  6730.0
3460
Real  6832.0
3664
Real  6820.0
3640
Real  6767.0
3534
Real  6774.0
3548
Real  6800.0
3600
Real  6754.0
3508
Real  6302.0
2604
Real  6770.0
3540
Real  6756.0
3512
Real  6745.0
3490
Real  6276.0
2552
Real  6630.0
3260
Real  6638.0
3276
Real  6634.0
3268
