In [None]:
### Imports for the Data Preprocessing
import numpy as np
import pandas as pd
import matplotlib as plt
import os
import gc


In [None]:
# get working directory and remove last folder
# TODO: make this more robust
wd = os.path.dirname(os.getcwd())
os.chdir(wd)
print('Working Directory: ', os.getcwd())

In [None]:
dtypes = {
    'level': np.uint8,
    "level_group": "category",
    'event_name': np.uint8,
    'name': np.uint8,
    'fqid': np.uint8,
    'room_fqid': np.uint8,           
    "text_fqid": np.uint8,
    'fullscreen': np.uint8,
    'hq': np.uint8,
    'music': np.uint8,
    'hover_duration_mean': np.float32,
    'difference_clicks_mean': np.float32,
    'elapsed_time_std': np.float32,
    'page_std': np.float32,
    'room_coor_x_std': np.float32,
    'room_coor_y_std': np.float32,
    'screen_coor_x_std': np.float32,
    'screen_coor_y_std': np.float32,
    'hover_duration_std': np.float32,
    'difference_clicks_std': np.float32,
    'index_sum_of_actions': np.float32,
    'difference_clicks_max': np.float32,
    'elapsed_time_max': np.float32,
    'clicks_per_second': np.float32}

dataset_df = pd.read_csv("data/processed/dataset_df_level.csv", dtype=dtypes)

#load the label dataset
labels = pd.read_csv("data/processed/labels.csv")
dataset_df_0_4 = pd.read_csv("data/processed/df_0_4.csv", dtype=dtypes, index_col= 0)
dataset_df_0_4 = dataset_df_0_4.reset_index(drop=True)

In [None]:
def split_dataset(dataset, test_ratio=0.20):
    USER_LIST = dataset_df.index.unique()
    split = int(len(USER_LIST) * (1 - 0.20))
    return dataset.loc[USER_LIST[:split]], dataset.loc[USER_LIST[split:]]

train_x, valid_x = split_dataset(dataset_df)
print("{} examples in training, {} examples in testing.".format(
    len(train_x), len(valid_x)))

In [None]:
# Fetch the unique list of user sessions in the validation dataset. We assigned 
# `session_id` as the index of our feature engineered dataset. Hence fetching 
# the unique values in the index column will give us a list of users in the 
# validation set.
VALID_USER_LIST = valid_x.index.unique()

# Create a dataframe for storing the predictions of each question for all users
# in the validation set.
# For this, the required size of the data frame is: 
# (no: of users in validation set  x no of questions).
# We will initialize all the predicted values in the data frame to zero.
# The dataframe's index column is the user `session_id`s. 
prediction_df = pd.DataFrame(data=np.zeros((len(VALID_USER_LIST),18)), index=VALID_USER_LIST)

# Create an empty dictionary to store the models created for each question.
models = {}

# Create an empty dictionary to store the evaluation score for each question.
evaluation_dict ={}

In [None]:
#get labels and data together
# Iterate through questions 1 to 18 to train models for each question, evaluate
# the trained model and store the predicted values.
for q_no in range(1,19):

    # Select level group for the question based on the q_no.
    if q_no<=3: grp = '0-4'
    elif q_no<=13: grp = '5-12'
    elif q_no<=22: grp = '13-22'
    print("### q_no", q_no, "grp", grp)
    
        
    # Filter the rows in the datasets based on the selected level group. 
    train_df = train_x.loc[train_x.level_group == grp]
    train_users = train_df.index.values
    valid_df = valid_x.loc[valid_x.level_group == grp]
    valid_users = valid_df.index.values

    # Select the labels for the related q_no.
    train_labels = labels.loc[labels.q==q_no].set_index('session').loc[train_users]
    valid_labels = labels.loc[labels.q==q_no].set_index('session').loc[valid_users]

    # Add the label to the filtered datasets.
    train_df["correct"] = train_labels["correct"]
    valid_df["correct"] = valid_labels["correct"]

In [190]:
def combine_rows(df, n_flatten=5, only_one=None, drop=None):
    """
    Combines every n_flatten rows of a Pandas DataFrame into a new DataFrame, with each row containing the combined values from the n_flatten rows.

    Args:
        df (pandas.DataFrame): The DataFrame to combine.
        n_flatten (int): The number of rows to be combined into a single row.
        only_one (list): A list of column names to keep only the first occurrence of in the output DataFrame.
        drop (list): A list of column names to drop from the input DataFrame before performing the calculation.

    Returns:
        pandas.DataFrame: A new DataFrame containing one row for every n_flatten rows of the input DataFrame, with each row containing the combined values from the n_flatten rows.
    """
    # Create a copy of the input DataFrame to modify
    df = df.copy()

    # Use value_counts() to get the count of each session_id
    counts = df['session_id'].value_counts()

    # Check if each group has the same number of rows
    if (counts % n_flatten).any():
        # Get the session_ids that need to be generated
        need_generating = counts[counts < n_flatten].index.tolist()
        num_generated_rows = 0
        
        # Loop through the session_ids that need to be generated
        for session_id in need_generating:
            # Generate a new row for this session_id
            new_row = {"session_id": session_id}
            for col in df.columns:
                if col == "session_id":
                    continue
                elif df[col].dtype.name == "category":
                    # Categorical column - set value to "generated"
                    new_row[col] = "generated"
                else:
                    # Numeric column - set value to average of other values in column with the same level
                    level_values = df.loc[df["session_id"] == session_id, "level"].unique()
                    for level in level_values:
                        if level == "generated":
                            continue
                        other_values = df.loc[(df["session_id"] == session_id) & (df["level"] == level), col]
                        if other_values.dtype.kind in 'biufc':
                            new_value = other_values.mean()
                            new_row[col] = new_value
                            break
            df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
            num_generated_rows += 1

        print(f"Generated {num_generated_rows} rows with indices: {list(range(len(df) - num_generated_rows, len(df)))}\n{new_row}")


    # Drop specified columns from input DataFrame
    if drop:
        df = df.drop(columns=drop)

    # Determine the number of rows and columns in the input DataFrame
    num_rows, num_cols = df.shape

    # Determine the number of new rows in the output DataFrame
    num_new_rows = num_rows // n_flatten

    # Reshape the flattened values into a new array with the desired shape
    values = df.values.flatten()
    new_values = values.reshape(num_new_rows, n_flatten*num_cols)

    # Create a new DataFrame from the reshaped values
    new_df = pd.DataFrame(new_values, columns=[f"{col}_{i}" for i in range(1, n_flatten+1) for col in df.columns])

    # Drop specified columns from output DataFrame
    if only_one:
        for col in only_one:
            keep_col = f"{col}_1"
            drop_cols = [f"{col}_{i}" for i in range(2, n_flatten+1)]
            new_df = new_df.drop(columns=drop_cols)

    return new_df



In [204]:
# Example usage

new_df = combine_rows(dataset_df_0_4,n_flatten= 5 ,drop= ["level"], only_one= ["level_group","music", "hq", "fullscreen"])
#print(new_df)

UnboundLocalError: local variable 'num_generated_rows' referenced before assignment

In [193]:
df_13_22 = pd.read_csv("data\processed\df_13_22.csv", dtype=dtypes, index_col= 0)
df_13_22 = df_13_22.reset_index(drop=True)
ex = ["level_group","music", "hq", "fullscreen"]
drop = ["level"]

In [203]:
df_13_22 = combine_rows(df_13_22,n_flatten= 10 ,drop= drop, only_one= ex)


KeyError: 'session_id'

In [202]:
def combine_rows(df, n_flatten=5, only_one=None, drop=None):
    """
    Combines every n_flatten rows of a Pandas DataFrame into a new DataFrame, with each row containing the combined values from the n_flatten rows.

    Args:
        df (pandas.DataFrame): The DataFrame to combine.
        n_flatten (int): The number of rows to be combined into a single row.
        only_one (list): A list of column names to keep only the first occurrence of in the output DataFrame.
        drop (list): A list of column names to drop from the input DataFrame before performing the calculation.

    Returns:
        pandas.DataFrame: A new DataFrame containing one row for every n_flatten rows of the input DataFrame, with each row containing the combined values from the n_flatten rows.
    """
    # Create a copy of the input DataFrame to modify
    df = df.copy()

    # Use value_counts() to get the count of each session_id
    counts = df['session_id'].value_counts()

    # Check if each group has the same number of rows
    if (counts % n_flatten).any():
        # Get the session_ids that need to be generated
        need_generating = counts[counts < n_flatten].index.tolist()
        num_generated_rows = 0
        
        # Loop through the session_ids that need to be generated
        for session_id in need_generating:
            # Generate new rows for this session_id
            num_missing_rows = n_flatten - counts[session_id]
            new_rows = []
            for i in range(num_missing_rows):
                new_row = {"session_id": session_id}
                for col in df.columns:
                    if col == "session_id":
                        continue
                    elif df[col].dtype.name == "category":
                        # Categorical column - set value to "generated"
                        new_row[col] = "generated"
                    else:
                        # Numeric column - set value to average of other values in column with the same level
                        level_values = df.loc[df["session_id"] == session_id, "level"].unique()
                        for level in level_values:
                            if level == "generated":
                                continue
                            other_values = df.loc[(df["session_id"] == session_id) & (df["level"] == level), col]
                            if other_values.dtype.kind in 'biufc':
                                new_value = other_values.mean()
                                new_row[col] = new_value
                                break
                new_rows.append(new_row)
            df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
            num_generated_rows += num_missing_rows

        print(f"Generated {num_generated_rows} rows with indices: {list(range(len(df) - num_generated_rows, len(df)))}\n{new_rows}")




    # Drop specified columns from input DataFrame
    if drop:
        df = df.drop(columns=drop)

    # Determine the number of rows and columns in the input DataFrame
    num_rows, num_cols = df.shape

    # Determine the number of new rows in the output DataFrame
    num_new_rows = (num_rows + num_generated_rows) // n_flatten

    # Reshape the flattened values into a new array with the desired shape
    values = df.values.flatten()
    new_values = values.reshape(num_new_rows, n_flatten*num_cols)

    # Create a new DataFrame from the reshaped values
    new_df = pd.DataFrame(new_values, columns=[f"{col}_{i}" for i in range(1, n_flatten+1) for col in df.columns])

    # Drop specified columns from output DataFrame
    if only_one:
        for col in only_one:
            keep_col = f"{col}_1"
            drop_cols = [f"{col}_{i}" for i in range(2, n_flatten+1)]
            new_df = new_df.drop(columns=drop_cols)

    return new_df