In [23]:
import pandas as pd
from tools.config_file import NewUserPredictParams
from tools.normalize import normalize_by_columns
# Project configuration object; its train_processed_csv / test_processed_csv
# attributes are the paths handed to pd.read_csv further down.
params = NewUserPredictParams()

In [24]:
def divide(df_csv: pd.DataFrame, eids: list) -> pd.DataFrame:
    """
    Select the rows of ``df_csv`` whose ``eid`` appears in ``eids``.

    Rows are grouped in the order the eids are listed (all rows for
    ``eids[0]`` first, then ``eids[1]``, ...), and the result gets a fresh
    0..n-1 index. An eid listed more than once contributes its rows once per
    occurrence.

    :author Micros0ft
    :date 2023/8/27
    :param df_csv: Processed dataset (train or test) with an ``eid`` column;
        udmap has been divided into 9 different keys
    :param eids: List of eids whose rows should be extracted
    :return: One DataFrame holding the matching rows; an empty DataFrame with
        the same columns when ``eids`` is empty
    """
    # pd.concat raises ValueError when given no frames, so an empty eid list
    # is answered directly with a zero-row slice that keeps columns and dtypes.
    if len(eids) == 0:
        return df_csv.iloc[0:0].reset_index(drop=True)

    # One boolean-mask selection per eid, kept in the order given by ``eids``.
    per_eid_frames = [df_csv[df_csv['eid'] == eid] for eid in eids]

    return pd.concat(per_eid_frames, ignore_index=True)

In [25]:
# eid values partitioned into three groups; each group is paired below with
# its own list of key columns to drop.
key2_key3 = [26, 40, 3, 38, 25, 12, 7, 0, 27, 34]
key4_key5 = [2, 5]
unknown = [
    41, 36, 31, 30, 4, 1, 19, 13, 15, 20, 10, 9, 29, 37, 32, 21,
    39, 35, 11, 8, 33, 42, 28, 14, 16, 23, 6, 22, 18, 17, 24,
]

# Columns removed from the key2_key3, key4_key5 and unknown groups respectively.
columns_1 = ['key1', 'key4', 'key5', 'key6', 'key7', 'key8', 'key9']
columns_2 = ['key6', 'key7', 'key8', 'key9']
columns_3 = ['key1', 'key2', 'key3', 'key4', 'key5', 'key6', 'key7', 'key8', 'key9']

# Load both processed datasets; the training frame must come first in `dataset`
# because the loop below treats the first frame as train and the rest as test.
train_processed_csv = pd.read_csv(params.train_processed_csv)
test_processed_csv = pd.read_csv(params.test_processed_csv)
dataset = [train_processed_csv, test_processed_csv]

train_df_list = []
test_df_list = []
is_train = True

for df in dataset:
    key2_key3_df = divide(df, key2_key3)
    key4_key5_df = divide(df, key4_key5)
    unknown_df = divide(df, unknown)

    # Per-group frames with that group's key columns dropped, in a fixed order:
    # [key2_key3, key4_key5, unknown].
    trimmed_frames = [
        key2_key3_df.drop(columns=columns_1),
        key4_key5_df.drop(columns=columns_2),
        unknown_df.drop(columns=columns_3),
    ]

    if not is_train:
        test_df_list.extend(trimmed_frames)
    else:
        train_df_list.extend(trimmed_frames)
        is_train = False

# Stack each group's train rows on top of its test rows and drop 'one_hot'.
combined_df_list = []
for i, train_part in enumerate(train_df_list):
    stacked = pd.concat([train_part, test_df_list[i]], ignore_index=True)
    combined_df_list.append(stacked.drop(columns=['one_hot']))

In [26]:
# combined_df_list[1] is the key4_key5 eid group (the second frame appended
# for each dataset). 'uuid' and 'target' are not in the list passed to
# normalize_by_columns.
columns_to_normalize = [
    'eid',
    'x1', 'x2', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8',
    'key1', 'key2', 'key3', 'key4', 'key5',
    'date', 'hour', 'weekday',
]
normalized_dataset = normalize_by_columns(combined_df_list[1], columns_to_normalize)

In [27]:
# Show the normalized frame as the cell output (the rendered table is truncated).
normalized_dataset

Unnamed: 0,uuid,eid,x1,x2,x3,x4,x5,x6,x7,x8,key1,key2,key3,key4,key5,date,hour,weekday,target
0,19,0.0,0.25,0.666667,0.545455,0.946667,0.840194,1.0,0.888889,0.0,0.000000,0.000000,0.000000,0.75,0.75,0.866667,0.391304,1.000000,0.0
1,21,0.0,1.00,0.333333,0.545455,1.000000,0.607748,1.0,0.444444,0.0,0.000000,0.000000,0.000000,0.65,0.30,0.933333,0.173913,0.000000,0.0
2,32,0.0,0.25,1.000000,0.545455,0.246667,0.711864,0.0,0.666667,0.0,0.000000,0.000000,0.000000,1.00,0.85,0.933333,0.130435,0.000000,0.0
3,52,0.0,1.00,1.000000,0.545455,0.706667,0.237288,0.0,0.666667,0.0,0.000000,0.000000,0.000000,0.95,0.55,0.800000,0.173913,0.833333,0.0
4,94,0.0,0.00,0.000000,0.545455,0.733333,0.934625,0.0,0.666667,0.0,0.000000,0.000000,0.000000,0.20,0.25,1.000000,0.521739,0.166667,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112282,206653,1.0,1.00,1.000000,0.545455,0.333333,0.738499,0.0,0.666667,0.0,0.214844,0.139521,0.414802,0.35,0.40,0.866667,0.434783,1.000000,
112283,206668,1.0,1.00,0.000000,0.545455,0.226667,0.629540,0.0,0.666667,0.0,0.624349,0.543331,0.414802,0.70,0.65,0.733333,0.434783,0.666667,
112284,206695,1.0,1.00,0.000000,0.545455,0.686667,0.699758,1.0,0.777778,0.0,0.431641,0.379840,0.414802,0.75,0.75,0.600000,0.000000,0.333333,
112285,206708,1.0,1.00,0.333333,0.545455,1.000000,0.762712,0.0,0.000000,0.0,0.751302,0.547019,0.414802,0.95,0.55,0.866667,0.173913,1.000000,
