# Import dependencies

In [1]:
# Data Manipulations and Preprocessing
import numpy as np
import pandas as pd

# Data Visualization
import seaborn as sns
import matplotlib.pyplot as plt

# Services
import services.ModelStageService as sds

# utils
import utils.EDAUtils as eda_utils

# Plot styling: dark grid background plus the pastel palette as the default
# color cycle. Note that `sns.color_palette` merely returns a palette object;
# `sns.set_palette` is the call that actually applies it to subsequent plots.
sns.set_style('darkgrid')
sns.set_palette('pastel')

# Stage I/O helper, constructed with the EDA stage as the previous stage and
# the preprocessing stage as the current one.
stage_data_io_service = sds.ModelStageService(
    previous_stage_name=sds.EDA_STAGE,
    current_stage_name=sds.PREPROCESSING_STAGE,
)

# Load data

In [None]:
# Obtain the working DataFrame for this stage through the stage I/O service.
# NOTE(review): `reload_stage=False` presumably reuses previously stored stage
# data instead of recomputing it — confirm against ModelStageService.
df = stage_data_io_service.run_or_load_stage_data(reload_stage=False)

# Data preprocessing

## Drop useless/empty columns

In [None]:
# Remove the columns considered useless/empty, then show the data summary.
df = df.drop(columns=[
    'recurringAuthInd',
    'posOnPremises',
    'merchantName',
    'merchantZip',
    'merchantState',
    'merchantCity',
    'echoBuffer',
    'cardLast4Digits',
])
eda_utils.data_summary(df)

In [None]:
# accountNumber and customerId appear to hold the same identifier.
# If no row disagrees, customerId is redundant, so drop it.

if not (df['accountNumber'] != df['customerId']).any():
    df = df.drop(columns=['customerId'])
    eda_utils.data_summary(df)

## Visualize null values

In [None]:

# Horizontal bar chart of the number of missing values in each column.
plt.figure(figsize=(15, 6))
missing_counts = df.isnull().sum()
ax = missing_counts.plot.barh(color='black')
ax.set_xlabel('Count of Missing Values', fontsize=14)
ax.set_ylabel('Attributes', fontsize=14)
ax.set_title('Count of Missing Values vs Attributes', fontsize=18)
plt.show()

## Fill N/A

In [None]:
def fill_na_in_transaction_type(df):
    """
    Fill missing 'transactionType' values by comparing each transaction's balance with the
    balance of the same account's next transaction.

    Parameters:
    df (pd.DataFrame): Transaction data containing the columns 'accountNumber',
                       'transactionDateTime', 'transactionAmount', 'availableMoney'
                       and 'transactionType'.

    Returns:
    pd.DataFrame: A new DataFrame sorted by 'accountNumber' and 'transactionDateTime' with a
                  fresh 0..n-1 index and the missing 'transactionType' values filled.
                  The input DataFrame itself is not modified.

    Process:
    1. Sort by 'accountNumber' and 'transactionDateTime' so that every account's transactions
       are contiguous and in chronological order.
    2. For each row whose 'transactionType' is missing (and whose 'accountNumber' is present),
       apply the first matching rule:
         - **DEFAULT_TYPE**: the row is the last transaction of its account, so there is
           nothing to compare with.
         - **ADDRESS_VERIFICATION**: 'availableMoney' equals the next transaction's value.
         - **PURCHASE**: 'availableMoney' is higher than in the next transaction, or
           'availableMoney' minus 'transactionAmount' equals the next value.
         - **REVERSAL**: 'availableMoney' is lower than in the next transaction.
       If none applies (a balance needed for the comparison is missing), the type stays missing.
    3. Rows that already have a type, and rows without an 'accountNumber', are left untouched.

    Usage Example:
        df = fill_na_in_transaction_type(df)
        df.drop(df[df['transactionType'] == 'DEFAULT_TYPE'].index, inplace=True)

    Notes:
    - 'DEFAULT_TYPE' is a placeholder meant to be handled (e.g. dropped) by the caller.
    """
    # Sort by account number and transaction date/time
    df = df.sort_values(['accountNumber', 'transactionDateTime']).reset_index(drop=True)

    needs_type = df['transactionType'].isna() & df['accountNumber'].notna()
    if not needs_type.any():
        return df

    # After sorting, each account occupies one contiguous run of rows, so the row
    # directly below is the account's next transaction unless the current row is
    # the last occurrence of its account number.
    is_last_of_account = ~df.duplicated(subset='accountNumber', keep='last')

    available = df['availableMoney']
    amount = df['transactionAmount']
    next_available = available.shift(-1)

    # Comparisons involving a missing balance evaluate to False, which leaves the
    # corresponding rows unfilled.
    unchanged = available == next_available
    spent = (available > next_available) | (available - amount == next_available)
    # Once "unchanged" and "spent" are ruled out, a strictly lower balance is the
    # only remaining way to be a reversal, so no amount-based check is needed here.
    refunded = available < next_available

    has_next = needs_type & ~is_last_of_account
    df.loc[has_next & unchanged, 'transactionType'] = 'ADDRESS_VERIFICATION'
    df.loc[has_next & ~unchanged & spent, 'transactionType'] = 'PURCHASE'
    df.loc[has_next & ~unchanged & ~spent & refunded, 'transactionType'] = 'REVERSAL'

    # No following transaction to compare with: mark with the placeholder type.
    df.loc[needs_type & is_last_of_account, 'transactionType'] = 'DEFAULT_TYPE'

    return df

# Infer the missing transaction types, then remove the rows that were given
# the 'DEFAULT_TYPE' placeholder, and show the data summary.
df = fill_na_in_transaction_type(df)
df.drop(index=df.index[df['transactionType'] == 'DEFAULT_TYPE'], inplace=True)
eda_utils.data_summary(df)

In [None]:
def _most_frequent_value(values, default=None):
    """Return the first mode of `values` (missing entries ignored), or `default` if there is none."""
    modes = values.mode()
    return default if modes.empty else modes.iloc[0]


def fill_na_in_account_related_columns(df):
    """
    Fills missing values in account-related columns ('acqCountry', 'merchantCountryCode',
    'posEntryMode', 'posConditionCode') with the most frequent (mode) value of the same account.

    Parameters:
    df (pd.DataFrame): The DataFrame containing transaction data, which must include the columns
                       'accountNumber', 'acqCountry', 'merchantCountryCode', 'posEntryMode'
                       and 'posConditionCode'.

    Returns:
    pd.DataFrame: The same DataFrame object that was passed in; it is modified in place.

    Process:
    - For each of the four columns, the mode is computed per 'accountNumber'. When an account
      has no non-missing value in the column, the mode of the whole column is used instead.
    - Every missing entry is replaced by the value chosen for its account.
    - Rows with a missing 'accountNumber' belong to no account and are left unchanged.
    - A column without any non-missing value (including the case of an empty DataFrame)
      offers nothing to fill with and is left unchanged.
    """
    columns_to_fill = ('acqCountry', 'merchantCountryCode', 'posEntryMode', 'posConditionCode')

    for column in columns_to_fill:
        missing = df[column].isna()
        if not missing.any():
            continue

        overall_mode = _most_frequent_value(df[column])
        if overall_mode is None:
            # Column is entirely missing: there is no value to fill with.
            continue

        # One value per account: its own mode, or the column-wide mode as fallback.
        account_modes = df.groupby('accountNumber')[column].agg(
            lambda values: _most_frequent_value(values, overall_mode)
        )
        replacements = df['accountNumber'].map(account_modes)

        # Select by explicit missing-value mask so NaN and None are both covered,
        # and assign positionally so index labels play no role.
        fillable = missing & replacements.notna()
        df.loc[fillable, column] = replacements[fillable].to_numpy()

    return df

# Fill the missing values in the account-related columns, then run the
# data summary helper on the result.
df = fill_na_in_account_related_columns(df)
eda_utils.data_summary(df)

In [None]:
# Pass the preprocessed DataFrame to the stage I/O service for writing.
# NOTE(review): presumably this persists the data as the preprocessing stage's
# output — confirm against ModelStageService.
stage_data_io_service.write_stage_data(df)