In [25]:
import os

In [None]:
!pip install tabulate



In [None]:
import pandas as pd

def aggregate_abm_data(file_path):
    """
    Reads ABM data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the ABM data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        # Read the ABM data into a pandas DataFrame. Adjust the 'sep' parameter if your file uses a different delimiter.
        abm_data = pd.read_csv(file_path, sep=',')  # Assuming CSV, change if needed

        # Ensure required columns exist. Handle cases where columns might be named slightly differently.
        if not all(col in abm_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # Aggregate transactions by customer ID
        aggregated_data = abm_data.groupby('customer_id').agg(
            abm_credit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            abm_debit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        # Handle potential missing values after aggregation
        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_card_data(file_path):
    """
    Reads Card data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Card data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        card_data = pd.read_csv(file_path, sep=',')

        if not all(col in card_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = card_data.groupby('customer_id').agg(
            card_credit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            card_debit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_eft_data(file_path):
    """
    Reads EFT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EFT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        eft_data = pd.read_csv(file_path, sep=',')

        if not all(col in eft_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = eft_data.groupby('customer_id').agg(
            eft_credit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            eft_debit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_cheque_data(file_path):
    """
    Reads Cheque data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Cheque data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        cheque_data = pd.read_csv(file_path, sep=',')

        if not all(col in cheque_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = cheque_data.groupby('customer_id').agg(
            cheque_credit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            cheque_debit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_emt_data(file_path):
    """
    Reads EMT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EMT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        emt_data = pd.read_csv(file_path, sep=',')

        if not all(col in emt_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # For EMT data, the debit/credit column might be represented differently (e.g., 'C' for credit, 'D' for debit)
        aggregated_data = emt_data.groupby('customer_id').agg(
            emt_credit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'C'].sum()),
            emt_debit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'D'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_wire_data(file_path):
    """
    Reads Wire data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Wire data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        wire_data = pd.read_csv(file_path, sep=',')

        if not all(col in wire_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = wire_data.groupby('customer_id').agg(
            wire_credit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            wire_debit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

# Example usage:
file_path_abm = 'abm.csv'  # Replace with your actual file path
file_path_card = 'card.csv'
file_path_eft = 'eft.csv'
file_path_cheque = 'cheque.csv'
file_path_emt = 'emt.csv'
file_path_wire = 'wire.csv'
df_kyc=pd.read_csv('kyc.csv')
df_kyc_industry_codes=pd.read_csv('kyc_industry_codes.csv')

aggregated_abm_df = aggregate_abm_data(file_path_abm)
aggregated_card_df = aggregate_card_data(file_path_card)
aggregated_eft_df = aggregate_eft_data(file_path_eft)
aggregated_cheque_df = aggregate_cheque_data(file_path_cheque)
aggregated_emt_df = aggregate_emt_data(file_path_emt)
aggregated_wire_df = aggregate_wire_data(file_path_wire)

# Combine the dataframes
from functools import reduce

if all(df is not None for df in [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df]):
    combined_df = reduce(lambda left, right: pd.merge(left, right, on=['customer_id'], how='outer'),
                         [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df])
    combined_df.fillna(0, inplace=True)  # Handle any missing values after merging

    # Convert industry_code in df_kyc to numeric
    df_kyc['industry_code'] = pd.to_numeric(df_kyc['industry_code'], errors='coerce')

    # Merge the KYC and industry information into the aggregated dataframe
    combined_df = pd.merge(combined_df, df_kyc, on='customer_id', how='left')
    combined_df = pd.merge(combined_df, df_kyc_industry_codes, on='industry_code', how='left')

    print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
    print(combined_df.info())

| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | country   | province   | city       | industry_code   | employee_count   | sales   | established_date   | onboard_date   | industry                          |
|:-----------------|:-------------|:------------|:--------------|:-------------|:-------------|:------------|:----------------|:---------------|:-------------|:------------|:--------------|:-------------|:----------|:-----------|:-----------|:----------------|:-----------------|:--------|:-------------------|:---------------|:----------------------------------|
| SYNCID0000000000 | 0            | 0           | 0             | 0            | 0            | 0           | 0               | 415.24         | 0            | 0           | 0.03          | 6316.04      | CA        | ON         | NORTH YORK | 7292            | 0          

In [None]:
# Payment methods for iteration
payment_methods = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

# Iterate through payment methods to handle the removal of rows where both credit and debit are zero
for method in payment_methods:
    credit_col = f'{method}_credit'
    debit_col = f'{method}_debit'

    # Identify rows where both credit and debit are 0 for the current method
    rows_to_remove = combined_df[(combined_df[credit_col] == 0) & (combined_df[debit_col] == 0)]

    # Remove these rows from combined_df
    combined_df = combined_df.drop(rows_to_remove.index)

# Calculate credit debit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_credit_debit_ratio'] = combined_df['abm_credit'] / (combined_df['abm_debit'] )
combined_df['card_credit_debit_ratio'] = combined_df['card_credit'] / (combined_df['card_debit'] )
combined_df['cheque_credit_debit_ratio'] = combined_df['cheque_credit'] / (combined_df['cheque_debit'] )
combined_df['eft_credit_debit_ratio'] = combined_df['eft_credit'] / (combined_df['eft_debit'] )
combined_df['emt_credit_debit_ratio'] = combined_df['emt_credit'] / (combined_df['emt_debit'] )
combined_df['wire_credit_debit_ratio'] = combined_df['wire_credit'] / (combined_df['wire_debit'] )

# Calculate debit credit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_debit_credit_ratio'] = combined_df['abm_debit'] / (combined_df['abm_credit'])
combined_df['card_debit_credit_ratio'] = combined_df['card_debit'] / (combined_df['card_credit'])
combined_df['cheque_debit_credit_ratio'] = combined_df['cheque_debit'] / (combined_df['cheque_credit'])
combined_df['eft_debit_credit_ratio'] = combined_df['eft_debit'] / (combined_df['eft_credit'])
combined_df['emt_debit_credit_ratio'] = combined_df['emt_debit'] / (combined_df['emt_credit'])
combined_df['wire_debit_credit_ratio'] = combined_df['wire_debit'] / (combined_df['wire_credit'])

# Replace infinite values with 0 (this line is still good as a general cleanup)
combined_df.replace([float('inf'), float('-inf')], 0, inplace=True)

In [None]:
print(combined_df)

            customer_id  abm_credit  abm_debit  card_credit  card_debit  \
34     SYNCID0000000038        0.00     104.55       -71.46    31861.18   
58     SYNCID0000000063     1050.38      52.66         0.00      106.98   
262    SYNCID0000000276    12626.19    5069.09         0.00     5726.39   
845    SYNCID0000000890    55053.74    2047.86      -200.42    21480.28   
870    SYNCID0000000917      103.71     691.12      9633.76    26585.04   
...                 ...         ...        ...          ...         ...   
15366  SYNCID0000016272      526.68     215.34      3436.52    12320.55   
15370  SYNCID0000016276      214.91     344.76      1713.94    60157.64   
15373  SYNCID0000016281      820.47    1454.74      -177.71     7627.44   
15520  SYNCID0000016435        0.00     898.44     -1041.23    18895.42   
15604  SYNCID0000016523        0.00     106.52      -118.30    17443.51   

       eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
34       10910.03   1

In [None]:
print(combined_df[combined_df['customer_id'] == 'SYNCID0000005244'])

           customer_id  abm_credit  abm_debit  card_credit  card_debit  \
4953  SYNCID0000005244     4235.09     886.93     22699.75    79740.08   

      eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
4953     1523.06    3984.26       10109.06       2750.65    31901.87  ...   

      cheque_credit_debit_ratio  eft_credit_debit_ratio  \
4953                   3.675153                0.382269   

      emt_credit_debit_ratio wire_credit_debit_ratio abm_debit_credit_ratio  \
4953                0.648714                0.471693               0.209424   

     card_debit_credit_ratio  cheque_debit_credit_ratio  \
4953                3.512818                   0.272098   

      eft_debit_credit_ratio  emt_debit_credit_ratio wire_debit_credit_ratio  
4953                2.615957                 1.54151                2.120021  

[1 rows x 34 columns]


In [None]:
import pandas as pd

def aggregate_abm_data(file_path):
    """
    Reads ABM data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the ABM data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        # Read the ABM data into a pandas DataFrame. Adjust the 'sep' parameter if your file uses a different delimiter.
        abm_data = pd.read_csv(file_path, sep=',')  # Assuming CSV, change if needed

        # Ensure required columns exist. Handle cases where columns might be named slightly differently.
        if not all(col in abm_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # Aggregate transactions by customer ID
        aggregated_data = abm_data.groupby('customer_id').agg(
            abm_credit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            abm_debit=('amount_cad', lambda x: x[abm_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        # Handle potential missing values after aggregation
        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_card_data(file_path):
    """
    Reads Card data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Card data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        card_data = pd.read_csv(file_path, sep=',')

        if not all(col in card_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = card_data.groupby('customer_id').agg(
            card_credit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            card_debit=('amount_cad', lambda x: x[card_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_eft_data(file_path):
    """
    Reads EFT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EFT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        eft_data = pd.read_csv(file_path, sep=',')

        if not all(col in eft_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = eft_data.groupby('customer_id').agg(
            eft_credit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            eft_debit=('amount_cad', lambda x: x[eft_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_cheque_data(file_path):
    """
    Reads Cheque data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Cheque data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        cheque_data = pd.read_csv(file_path, sep=',')

        if not all(col in cheque_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = cheque_data.groupby('customer_id').agg(
            cheque_credit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            cheque_debit=('amount_cad', lambda x: x[cheque_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_emt_data(file_path):
    """
    Reads EMT data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the EMT data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        emt_data = pd.read_csv(file_path, sep=',')

        if not all(col in emt_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        # For EMT data, the debit/credit column might be represented differently (e.g., 'C' for credit, 'D' for debit)
        aggregated_data = emt_data.groupby('customer_id').agg(
            emt_credit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'C'].sum()),
            emt_debit=('amount_cad', lambda x: x[emt_data.loc[x.index, 'debit_credit'] == 'D'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

def aggregate_wire_data(file_path):
    """
    Reads Wire data, aggregates transactions by customer ID, and calculates total credit and debit amounts.

    Args:
        file_path (str): The path to the Wire data file (e.g., CSV).

    Returns:
        pandas.DataFrame: A DataFrame with customer ID, total credit amount, and total debit amount.
                         Returns None if there's an error during file reading or processing.
    """
    try:
        wire_data = pd.read_csv(file_path, sep=',')

        if not all(col in wire_data.columns for col in ['customer_id', 'amount_cad', 'debit_credit']):
            print("Error: Required columns ('customer_id', 'amount_cad', 'debit_credit') not found.")
            return None

        aggregated_data = wire_data.groupby('customer_id').agg(
            wire_credit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'credit'].sum()),
            wire_debit=('amount_cad', lambda x: x[wire_data.loc[x.index, 'debit_credit'] == 'debit'].sum())
        ).reset_index()

        aggregated_data.fillna(0, inplace=True)

        return aggregated_data

    except FileNotFoundError:
        print(f"Error: File not found at '{file_path}'")
        return None
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return None

In [None]:
def calculate_abm_stats(df):
    """
    Calculates ABM statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing ABM transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added ABM statistics columns.
    """
    df_abm = pd.read_csv('abm.csv')  # Reload ABM data
    df_abm['transaction_date'] = pd.to_datetime(df_abm['transaction_date'])

    # Group by customer_id and calculate ABM statistics
    df_abm_grouped = df_abm.groupby('customer_id').agg(
        abm_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        abm_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        abm_average_credit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        abm_average_debit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_abm_credit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_abm_debit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_abm_credit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_abm_debit=('amount_cad', lambda x: x[df_abm.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    # Calculate account active days
    df_abm_grouped_days = df_abm.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='abm_account_active_days')

    # Merge the calculated statistics into the combined dataframe
    df = pd.merge(df, df_abm_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_abm_grouped_days, on='customer_id', how='left')

    return df


def calculate_card_stats(df):
    """
    Calculates Card statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing Card transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added Card statistics columns.
    """
    df_card = pd.read_csv('card.csv')
    df_card['transaction_date'] = pd.to_datetime(df_card['transaction_date'])

    df_card_grouped = df_card.groupby('customer_id').agg(
        card_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        card_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        card_average_credit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        card_average_debit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_card_credit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_card_debit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_card_credit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_card_debit=('amount_cad', lambda x: x[df_card.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_card_grouped_days = df_card.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='card_account_active_days')

    df = pd.merge(df, df_card_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_card_grouped_days, on='customer_id', how='left')

    return df


def calculate_eft_stats(df):
    """
    Calculates EFT statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing EFT transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added EFT statistics columns.
    """
    df_eft = pd.read_csv('eft.csv')
    df_eft['transaction_date'] = pd.to_datetime(df_eft['transaction_date'])

    df_eft_grouped = df_eft.groupby('customer_id').agg(
        eft_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        eft_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        eft_average_credit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        eft_average_debit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_eft_credit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_eft_debit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_eft_credit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_eft_debit=('amount_cad', lambda x: x[df_eft.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_eft_grouped_days = df_eft.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='eft_account_active_days')

    df = pd.merge(df, df_eft_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_eft_grouped_days, on='customer_id', how='left')

    return df


def calculate_cheque_stats(df):
    """
    Calculates Cheque statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing Cheque transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added Cheque statistics columns.
    """
    df_cheque = pd.read_csv('cheque.csv')
    df_cheque['transaction_date'] = pd.to_datetime(df_cheque['transaction_date'])

    df_cheque_grouped = df_cheque.groupby('customer_id').agg(
        cheque_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        cheque_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        cheque_average_credit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        cheque_average_debit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_cheque_credit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_cheque_debit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_cheque_credit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_cheque_debit=('amount_cad', lambda x: x[df_cheque.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_cheque_grouped_days = df_cheque.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='cheque_account_active_days')

    df = pd.merge(df, df_cheque_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_cheque_grouped_days, on='customer_id', how='left')

    return df

def calculate_emt_stats(df):
    """
    Calculates EMT statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing EMT transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added EMT statistics columns.
    """
    df_emt = pd.read_csv('emt.csv')
    df_emt['transaction_date'] = pd.to_datetime(df_emt['transaction_date'])

    # For EMT data, the debit/credit column might be represented differently (e.g., 'C' for credit, 'D' for debit)
    df_emt_grouped = df_emt.groupby('customer_id').agg(
        emt_credit_transaction_count=('debit_credit', lambda x: (x == 'C').sum()),
        emt_debit_transaction_count=('debit_credit', lambda x: (x == 'D').sum()),
        emt_average_credit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'C'].mean()),
        emt_average_debit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'D'].mean()),
        max_emt_credit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'C'].max()),
        max_emt_debit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'D'].max()),
        min_emt_credit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'C'].min()),
        min_emt_debit=('amount_cad', lambda x: x[df_emt.loc[x.index, 'debit_credit'] == 'D'].min())
    ).reset_index()

    df_emt_grouped_days = df_emt.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='emt_account_active_days')

    df = pd.merge(df, df_emt_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_emt_grouped_days, on='customer_id', how='left')

    return df


def calculate_wire_stats(df):
    """
    Calculates Wire statistics: credit/debit transaction counts, average credit/debit,
    max/min credit/debit, and account active days.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing Wire transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added Wire statistics columns.
    """
    df_wire = pd.read_csv('wire.csv')
    df_wire['transaction_date'] = pd.to_datetime(df_wire['transaction_date'])

    df_wire_grouped = df_wire.groupby('customer_id').agg(
        wire_credit_transaction_count=('debit_credit', lambda x: (x == 'credit').sum()),
        wire_debit_transaction_count=('debit_credit', lambda x: (x == 'debit').sum()),
        wire_average_credit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'credit'].mean()),
        wire_average_debit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'debit'].mean()),
        max_wire_credit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'credit'].max()),
        max_wire_debit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'debit'].max()),
        min_wire_credit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'credit'].min()),
        min_wire_debit=('amount_cad', lambda x: x[df_wire.loc[x.index, 'debit_credit'] == 'debit'].min())
    ).reset_index()

    df_wire_grouped_days = df_wire.groupby('customer_id')['transaction_date'].agg(
        lambda x: (x.max() - x.min()).days
    ).reset_index(name='wire_account_active_days')

    df = pd.merge(df, df_wire_grouped, on='customer_id', how='left')
    df = pd.merge(df, df_wire_grouped_days, on='customer_id', how='left')

    return df


def calculate_rolling_stats(df):
    """
    Calculates rolling statistics for each transaction type:
    - Rolling average for 30 days and 7 days
    - Rolling sum ratio for 7 days
    - Active days ratio

    Args:
        df (pandas.DataFrame): The combined DataFrame containing all transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added rolling statistics columns.
    """

    # ABM Rolling Stats
    df['abm_amount_rolling_avg_30d'] = (
        df['abm_credit'] + df['abm_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['abm_amount_rolling_avg_7d'] = (
        df['abm_credit'] + df['abm_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_abm_amount'] = (df['abm_credit'] + df['abm_debit']) / (
        df['abm_credit_transaction_count'] + df['abm_debit_transaction_count']
    )
    df['abm_rolling_sum_7d_ratio'] = (
        df['abm_credit'] + df['abm_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['abm_credit'] + df['abm_debit'])
    df['abm_active_days_ratio'] = df['abm_account_active_days'] / (
        df['abm_credit_transaction_count'] + df['abm_debit_transaction_count']
    )

    # Card Rolling Stats
    df['card_amount_rolling_avg_30d'] = (
        df['card_credit'] + df['card_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['card_amount_rolling_avg_7d'] = (
        df['card_credit'] + df['card_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_card_amount'] = (df['card_credit'] + df['card_debit']) / (
        df['card_credit_transaction_count'] + df['card_debit_transaction_count']
    )
    df['card_rolling_sum_7d_ratio'] = (
        df['card_credit'] + df['card_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['card_credit'] + df['card_debit'])
    df['card_active_days_ratio'] = df['card_account_active_days'] / (
        df['card_credit_transaction_count'] + df['card_debit_transaction_count']
    )

    # EFT Rolling Stats
    df['eft_amount_rolling_avg_30d'] = (
        df['eft_credit'] + df['eft_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['eft_amount_rolling_avg_7d'] = (
        df['eft_credit'] + df['eft_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_eft_amount'] = (df['eft_credit'] + df['eft_debit']) / (
        df['eft_credit_transaction_count'] + df['eft_debit_transaction_count']
    )
    df['eft_rolling_sum_7d_ratio'] = (
        df['eft_credit'] + df['eft_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['eft_credit'] + df['eft_debit'])
    df['eft_active_days_ratio'] = df['eft_account_active_days'] / (
        df['eft_credit_transaction_count'] + df['eft_debit_transaction_count']
    )

    # Cheque Rolling Stats
    df['cheque_amount_rolling_avg_30d'] = (
        df['cheque_credit'] + df['cheque_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['cheque_amount_rolling_avg_7d'] = (
        df['cheque_credit'] + df['cheque_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_cheque_amount'] = (df['cheque_credit'] + df['cheque_debit']) / (
        df['cheque_credit_transaction_count'] + df['cheque_debit_transaction_count']
    )
    df['cheque_rolling_sum_7d_ratio'] = (
        df['cheque_credit'] + df['cheque_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['cheque_credit'] + df['cheque_debit'])
    df['cheque_active_days_ratio'] = df['cheque_account_active_days'] / (
        df['cheque_credit_transaction_count'] + df['cheque_debit_transaction_count']
    )

    # EMT Rolling Stats
    df['emt_amount_rolling_avg_30d'] = (
        df['emt_credit'] + df['emt_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['emt_amount_rolling_avg_7d'] = (
        df['emt_credit'] + df['emt_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_emt_amount'] = (df['emt_credit'] + df['emt_debit']) / (
        df['emt_credit_transaction_count'] + df['emt_debit_transaction_count']
    )
    df['emt_rolling_sum_7d_ratio'] = (
        df['emt_credit'] + df['emt_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['emt_credit'] + df['emt_debit'])
    df['emt_active_days_ratio'] = df['emt_account_active_days'] / (
        df['emt_credit_transaction_count'] + df['emt_debit_transaction_count']
    )

    # Wire Rolling Stats
    df['wire_amount_rolling_avg_30d'] = (
        df['wire_credit'] + df['wire_debit']
    ).rolling(window=30, min_periods=1).mean()
    df['wire_amount_rolling_avg_7d'] = (
        df['wire_credit'] + df['wire_debit']
    ).rolling(window=7, min_periods=1).mean()
    df['avg_wire_amount'] = (df['wire_credit'] + df['wire_debit']) / (
        df['wire_credit_transaction_count'] + df['wire_debit_transaction_count']
    )
    df['wire_rolling_sum_7d_ratio'] = (
        df['wire_credit'] + df['wire_debit']
    ).rolling(window=7, min_periods=1).sum() / (df['wire_credit'] + df['wire_debit'])
    df['wire_active_days_ratio'] = df['wire_account_active_days'] / (
        df['wire_credit_transaction_count'] + df['wire_debit_transaction_count']
    )

    return df

# Example usage:
file_path_abm = 'abm.csv'  # Replace with your actual file path
file_path_card = 'card.csv'
file_path_eft = 'eft.csv'
file_path_cheque = 'cheque.csv'
file_path_emt = 'emt.csv'
file_path_wire = 'wire.csv'


In [None]:

aggregated_abm_df = aggregate_abm_data(file_path_abm)
aggregated_card_df = aggregate_card_data(file_path_card)
aggregated_eft_df = aggregate_eft_data(file_path_eft)
aggregated_cheque_df = aggregate_cheque_data(file_path_cheque)
aggregated_emt_df = aggregate_emt_data(file_path_emt)
aggregated_wire_df = aggregate_wire_data(file_path_wire)

In [None]:
# Combine the dataframes
from functools import reduce

if all(df is not None for df in [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df]):
    combined_df = reduce(lambda left, right: pd.merge(left, right, on=['customer_id'], how='outer'),
                         [aggregated_abm_df, aggregated_card_df, aggregated_eft_df, aggregated_cheque_df, aggregated_emt_df, aggregated_wire_df])
    combined_df.fillna(0, inplace=True)  # Handle any missing values after merging

    # Calculate statistics for all transaction types
    combined_df = calculate_abm_stats(combined_df)
    combined_df = calculate_card_stats(combined_df)
    combined_df = calculate_eft_stats(combined_df)
    combined_df = calculate_cheque_stats(combined_df)
    combined_df = calculate_emt_stats(combined_df)
    combined_df = calculate_wire_stats(combined_df)

    # Calculate rolling statistics
    combined_df = calculate_rolling_stats(combined_df)

    print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
    print(combined_df.info())

| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | abm_credit_transaction_count   | abm_debit_transaction_count   | abm_average_credit   | abm_average_debit   | max_abm_credit   | max_abm_debit   | min_abm_credit   | min_abm_debit   | abm_account_active_days   | card_credit_transaction_count   | card_debit_transaction_count   | card_average_credit   | card_average_debit   | max_card_credit   | max_card_debit   | min_card_credit   | min_card_debit   | card_account_active_days   | eft_credit_transaction_count   | eft_debit_transaction_count   | eft_average_credit   | eft_average_debit   | max_eft_credit   | max_eft_debit   | min_eft_credit   | min_eft_debit   | eft_account_active_days   | cheque_credit_transaction_count   | cheque_debit_transaction_count   | cheque_average_credit   | cheque_average_debit   | max_cheque_credit   | max_ch

In [None]:
import pandas as pd
import numpy as np

def add_kyc_data(df):
    """
    Adds KYC and industry information to the combined dataframe.

    Args:
        df (pandas.DataFrame): The combined DataFrame containing transaction data.

    Returns:
        pandas.DataFrame: The DataFrame with added KYC and industry information.
    """
    df_kyc = pd.read_csv('kyc.csv')
    df_kyc_industry_codes = pd.read_csv('kyc_industry_codes.csv')
    # Check if 'industry_code' exists in df before merging
    if 'industry_code' in df.columns:
        # Ensure 'industry_code' in both dataframes is of the same type
        df['industry_code'] = df['industry_code'].astype(float)
        df_kyc_industry_codes['industry_code'] = df_kyc_industry_codes['industry_code'].astype(float)

        # Merge the industry information into the aggregated dataframe
        df = pd.merge(df, df_kyc_industry_codes, on='industry_code', how='left')
    else:
        print("Warning: 'industry_code' not found in combined_df. Skipping industry code merge.")

    # Merge the KYC  information into the aggregated dataframe
    df = pd.merge(df, df_kyc, on='customer_id', how='left') #This line was missing
    return df

In [None]:
# Add KYC data to combined_df
combined_df = add_kyc_data(combined_df)

# Print the first 5 rows of the combined dataframe with KYC data
print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
print(combined_df.info())

| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | abm_credit_transaction_count   | abm_debit_transaction_count   | abm_average_credit   | abm_average_debit   | max_abm_credit   | max_abm_debit   | min_abm_credit   | min_abm_debit   | abm_account_active_days   | card_credit_transaction_count   | card_debit_transaction_count   | card_average_credit   | card_average_debit   | max_card_credit   | max_card_debit   | min_card_credit   | min_card_debit   | card_account_active_days   | eft_credit_transaction_count   | eft_debit_transaction_count   | eft_average_credit   | eft_average_debit   | max_eft_credit   | max_eft_debit   | min_eft_credit   | min_eft_debit   | eft_account_active_days   | cheque_credit_transaction_count   | cheque_debit_transaction_count   | cheque_average_credit   | cheque_average_debit   | max_cheque_credit   | max_ch

In [None]:
import pandas as pd
import numpy as np

payment_methods = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

# --- 1. Calculate log1p(account_active_days) for each transaction type ---

for method in payment_methods:
    active_days_col = f'{method}_account_active_days'
    log1p_active_days_col = f'log1p_{method}_account_active_days'

    # Apply log1p transformation to the 'account_active_days' column
    # Handle potential missing values (NaN) by filling with 0 before log1p
    combined_df[log1p_active_days_col] = combined_df[active_days_col].fillna(0).apply(np.log1p)


# --- 2. Interaction terms: log1p(account_active_days) * (total_transaction_amount / kyc_sales) ---

for method in payment_methods:
    recency_proxy_col = f'log1p_{method}_account_active_days' # Using active days as proxy for recency
    # Fix: Correctly concatenate column names for total volume
    volume_col = f'{method}_credit'  # Start with the credit column
    volume_col_debit = f'{method}_debit'
    interaction_col = f'interaction_active_days_volume_kyc_sales_{method}' # Descriptive interaction column name

    combined_df[interaction_col] = 0.0 # Initialize interaction column

    for index, row in combined_df.iterrows():
        recency_proxy_value = row[recency_proxy_col]
        # Fix: Access credit and debit columns separately and sum them
        volume_value = row[volume_col] + row[volume_col_debit]
        kyc_sales_value = row['sales'] # Assuming 'kyc_sales' column exists

        if pd.notna(recency_proxy_value) and pd.notna(volume_value) and pd.notna(kyc_sales_value):
            if kyc_sales_value != 0:
                interaction_value = recency_proxy_value * (volume_value / kyc_sales_value)
                combined_df.at[index, interaction_col] = interaction_value
            else:
                combined_df.at[index, interaction_col] = 0.0 # If kyc_sales is zero, interaction is zero



# Display the newly created columns (first few rows)
print(combined_df[[col for col in combined_df.columns if 'log1p_' in col or 'interaction_active_days' in col]].head())

   log1p_abm_account_active_days  log1p_card_account_active_days  \
0                            0.0                        0.000000   
1                            0.0                        3.891820   
2                            0.0                        0.000000   
3                            0.0                        4.465908   
4                            0.0                        0.000000   

   log1p_cheque_account_active_days  log1p_eft_account_active_days  \
0                          0.000000                       0.000000   
1                          0.000000                       0.000000   
2                          4.477337                       0.000000   
3                          0.000000                       0.000000   
4                          0.000000                       4.521789   

   log1p_emt_account_active_days  log1p_wire_account_active_days  \
0                            0.0                         3.78419   
1                            0.0  

In [None]:
import pandas as pd
import numpy as np

def calculate_card_stats(df):
    """
    Calculates card-specific statistics and adds them to the DataFrame.
    """
    # Read the card data again to access merchant_category and ecommerce_ind
    card_data = pd.read_csv('card.csv')

    # Calculate num_unique_card_merchant_categories
    # Group by 'customer_id' and get the number of unique merchant categories
    num_unique_categories = card_data.groupby('customer_id')['merchant_category'].nunique().reset_index()
    # Rename the column to 'num_unique_card_merchant_categories'
    num_unique_categories = num_unique_categories.rename(columns={'merchant_category': 'num_unique_card_merchant_categories'})

    # Merge num_unique_categories into df
    df = pd.merge(df, num_unique_categories, on='customer_id', how='left')

    # Calculate pct_card_ecommerce
    # Group by 'customer_id' and calculate the percentage of ecommerce transactions
    pct_ecommerce = card_data.groupby('customer_id')['ecommerce_ind'].mean().reset_index()
    # Rename the column to 'pct_card_ecommerce'
    pct_ecommerce = pct_ecommerce.rename(columns={'ecommerce_ind': 'pct_card_ecommerce'})

    # Merge pct_ecommerce into df
    df = pd.merge(df, pct_ecommerce, on='customer_id', how='left')

    return df

# Assuming 'combined_df' is already defined and populated
combined_df = calculate_card_stats(combined_df)

In [None]:
import numpy as np

# 1. log1p(kyc_employee_count)
combined_df['log1p_kyc_employee_count'] = np.log1p(combined_df['employee_count'])

# Display the updated dataframe
print(combined_df.head().to_markdown(index=False, numalign="left", stralign="left"))
print(combined_df.info())


| customer_id      | abm_credit   | abm_debit   | card_credit   | card_debit   | eft_credit   | eft_debit   | cheque_credit   | cheque_debit   | emt_credit   | emt_debit   | wire_credit   | wire_debit   | abm_credit_transaction_count   | abm_debit_transaction_count   | abm_average_credit   | abm_average_debit   | max_abm_credit   | max_abm_debit   | min_abm_credit   | min_abm_debit   | abm_account_active_days   | card_credit_transaction_count   | card_debit_transaction_count   | card_average_credit   | card_average_debit   | max_card_credit   | max_card_debit   | min_card_credit   | min_card_debit   | card_account_active_days   | eft_credit_transaction_count   | eft_debit_transaction_count   | eft_average_credit   | eft_average_debit   | max_eft_credit   | max_eft_debit   | min_eft_credit   | min_eft_debit   | eft_account_active_days   | cheque_credit_transaction_count   | cheque_debit_transaction_count   | cheque_average_credit   | cheque_average_debit   | max_cheque_credit   | max_ch

In [None]:
# Payment methods for iteration
payment_methods = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

# Calculate credit debit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_credit_debit_ratio'] = combined_df['abm_credit'] / (combined_df['abm_debit'])
combined_df['card_credit_debit_ratio'] = combined_df['card_credit'] / (combined_df['card_debit'])
combined_df['cheque_credit_debit_ratio'] = combined_df['cheque_credit'] / (combined_df['cheque_debit'])
combined_df['eft_credit_debit_ratio'] = combined_df['eft_credit'] / (combined_df['eft_debit'])
combined_df['emt_credit_debit_ratio'] = combined_df['emt_credit'] / (combined_df['emt_debit'])
combined_df['wire_credit_debit_ratio'] = combined_df['wire_credit'] / (combined_df['wire_debit'])

# Calculate debit credit ratios AFTER removing rows with 0 credit and 0 debit
combined_df['abm_debit_credit_ratio'] = combined_df['abm_debit'] / (combined_df['abm_credit'])
combined_df['card_debit_credit_ratio'] = combined_df['card_debit'] / (combined_df['card_credit'])
combined_df['cheque_debit_credit_ratio'] = combined_df['cheque_debit'] / (combined_df['cheque_credit'])
combined_df['eft_debit_credit_ratio'] = combined_df['eft_debit'] / (combined_df['eft_credit'])
combined_df['emt_debit_credit_ratio'] = combined_df['emt_debit'] / (combined_df['emt_credit'])
combined_df['wire_debit_credit_ratio'] = combined_df['wire_debit'] / (combined_df['wire_credit'])

# Replace infinite values with 0 (this line is still good as a general cleanup)
combined_df.replace([float('inf'), float('-inf')], 0, inplace=True)

In [None]:
print(combined_df)

            customer_id  abm_credit  abm_debit  card_credit  card_debit  \
0      SYNCID0000000000         0.0        0.0         0.00        0.00   
1      SYNCID0000000001         0.0        0.0         0.00      291.39   
2      SYNCID0000000002         0.0        0.0         0.00        0.00   
3      SYNCID0000000004         0.0        0.0      8805.86     7534.12   
4      SYNCID0000000005         0.0        0.0         0.00        0.00   
...                 ...         ...        ...          ...         ...   
16221  SYNCID0000017178         0.0        0.0         0.00        0.00   
16222  SYNCID0000017179         0.0        0.0         0.00        0.00   
16223  SYNCID0000017180         0.0        0.0         0.00        0.00   
16224  SYNCID0000017181         0.0        0.0         0.00        0.00   
16225  SYNCID0000017182         0.0        0.0         0.00        0.00   

       eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
0            0.00    

In [None]:
# Replace NaN values with 0 in specified columns
combined_df['num_unique_card_merchant_categories'].fillna(0, inplace=True)
combined_df['pct_card_ecommerce'].fillna(0, inplace=True)
combined_df['log1p_kyc_employee_count'].fillna(0, inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['num_unique_card_merchant_categories'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  combined_df['pct_card_ecommerce'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediat

In [None]:
# prompt: print(combined_df[combined_df['customer_id'] == 'SYNCID0000005251']) print as a dictionary with column name as key and

selected_customer_data = combined_df[combined_df['customer_id'] == 'SYNCID0000005251'].iloc[0].to_dict()
selected_customer_data


{'customer_id': 'SYNCID0000005251',
 'abm_credit': 5999.209999999999,
 'abm_debit': 7904.359999999999,
 'card_credit': -96.74,
 'card_debit': 24414.909999999996,
 'eft_credit': 4816622.92,
 'eft_debit': 5286783.3100000005,
 'cheque_credit': 16315.13,
 'cheque_debit': 2176.4,
 'emt_credit': 0.0,
 'emt_debit': 0.0,
 'wire_credit': 0.0,
 'wire_debit': 0.0,
 'abm_credit_transaction_count': 3.0,
 'abm_debit_transaction_count': 14.0,
 'abm_average_credit': 1999.7366666666665,
 'abm_average_debit': 564.5971428571428,
 'max_abm_credit': 4905.54,
 'max_abm_debit': 1191.13,
 'min_abm_credit': 325.19,
 'min_abm_debit': 87.71,
 'abm_account_active_days': 72.0,
 'card_credit_transaction_count': 1.0,
 'card_debit_transaction_count': 116.0,
 'card_average_credit': -96.74,
 'card_average_debit': 210.47336206896549,
 'max_card_credit': -96.74,
 'max_card_debit': 5395.3,
 'min_card_credit': -96.74,
 'min_card_debit': 0.0,
 'card_account_active_days': 89.0,
 'eft_credit_transaction_count': 4.0,
 'eft_deb

In [None]:
combined_df.to_csv('combined_df.csv', index=False)

In [None]:
print(list(combined_df.columns))

['customer_id', 'abm_credit', 'abm_debit', 'card_credit', 'card_debit', 'eft_credit', 'eft_debit', 'cheque_credit', 'cheque_debit', 'emt_credit', 'emt_debit', 'wire_credit', 'wire_debit', 'abm_credit_transaction_count', 'abm_debit_transaction_count', 'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit', 'abm_account_active_days', 'card_credit_transaction_count', 'card_debit_transaction_count', 'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit', 'card_account_active_days', 'eft_credit_transaction_count', 'eft_debit_transaction_count', 'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit', 'eft_account_active_days', 'cheque_credit_transaction_count', 'cheque_debit_transaction_count', 'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_de

In [None]:
print(combined_df)

            customer_id  abm_credit  abm_debit  card_credit  card_debit  \
0      SYNCID0000000000         0.0        0.0         0.00        0.00   
1      SYNCID0000000001         0.0        0.0         0.00      291.39   
2      SYNCID0000000002         0.0        0.0         0.00        0.00   
3      SYNCID0000000004         0.0        0.0      8805.86     7534.12   
4      SYNCID0000000005         0.0        0.0         0.00        0.00   
...                 ...         ...        ...          ...         ...   
16221  SYNCID0000017178         0.0        0.0         0.00        0.00   
16222  SYNCID0000017179         0.0        0.0         0.00        0.00   
16223  SYNCID0000017180         0.0        0.0         0.00        0.00   
16224  SYNCID0000017181         0.0        0.0         0.00        0.00   
16225  SYNCID0000017182         0.0        0.0         0.00        0.00   

       eft_credit  eft_debit  cheque_credit  cheque_debit  emt_credit  ...  \
0            0.00    

In [None]:
combined_df.fillna(0, inplace=True)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}

# Dictionary to store anomalies for each group
all_group_anomalies = {}

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())


    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    silhouette_scores = []
    cluster_range = range(2, 21)  # Increased range
    for k in cluster_range:
        model = KMeans(n_clusters=k, n_init=20, random_state=1, max_iter=500)
        cluster_labels = model.fit_predict(data_scaled)
        silhouette_avg = silhouette_score(data_scaled, cluster_labels)
        silhouette_scores.append(silhouette_avg)

    optimal_k = cluster_range[np.argmax(silhouette_scores)]
    print(f"  Optimal number of clusters: {optimal_k}")

    model = KMeans(n_clusters=optimal_k, n_init=20, random_state=4, max_iter=500)
    cluster_numbers = model.fit_predict(data_scaled)
    df_group['Cluster'] = cluster_numbers

    group_anomalies = []  # Store anomalies *for this group*
    for i in range(optimal_k):
        cluster_data = df_group[df_group['Cluster'] == i].drop(columns=['Cluster'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies[group_name] = group_anomalies
    print(f"Anomalies for {group_name}: {len(group_anomalies)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups ---
all_anomalies_combined = []
for anomalies in all_group_anomalies.values():
    all_anomalies_combined.extend(anomalies)

unique_anomalies = list(set(all_anomalies_combined))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups: {len(unique_anomalies)}")
print(f"Unique anomaly customer IDs: {unique_anomalies}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts):")
for group_name, anomalies in all_group_anomalies.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group1_transaction_counts
  Optimal number of clusters: 2
Anomalies for group1_transaction_counts: 163

Processing group: group2_transaction_amounts
  Optimal number of clusters: 2
Anomalies for group2_transaction_amounts: 163

Processing group: group3_transaction_timing
  Optimal number of clusters: 20
Anomalies for group3_transaction_timing: 174

Processing group: group4_credit_debit_ratios
  Optimal number of clusters: 3
Anomalies for group4_credit_debit_ratios: 165

Processing group: group5_overall_amount_averages
  Optimal number of clusters: 3
Anomalies for group5_overall_amount_averages: 163

Processing group: group7_log_active_days
  Optimal number of clusters: 20
Anomalies for group7_log_active_days: 170

Total number of unique anomalies across all groups: 788
Unique anomaly customer IDs: ['SYNCID0000013895', 'SYNCID0000009082', 'SYNCID0000000871', 'SYNCID0000007656', 'SYNCID0000015917', 'SYNCID0000003382', 'SYNCID0000003100', 'SYNCID0000014090', 'SYNCID0000

In [None]:
len(unique_anomalies)

788

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

variable_groups_ml = {
    'group1_high_value_transactions': [
        'max_abm_credit', 'max_abm_debit', 'max_card_credit', 'max_card_debit',
        'max_eft_credit', 'max_eft_debit', 'max_cheque_credit', 'max_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'max_wire_credit', 'max_wire_debit'
    ],
    'group2_frequent_transactions': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group3_rapid_velocity': [
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d',
        'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d',
        'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio',
        'eft_rolling_sum_7d_ratio', 'cheque_rolling_sum_7d_ratio',
        'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group4_inconsistent_activity': [
         'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio',
         'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
         'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio',
         'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio',
        'abm_active_days_ratio', 'card_active_days_ratio',
        'eft_active_days_ratio', 'cheque_active_days_ratio',
        'emt_active_days_ratio', 'wire_active_days_ratio'
    ],
    'group5_round_number_transactions': [  # Placeholder - Requires feature engineering
        'abm_average_credit', 'abm_average_debit',
        'card_average_credit', 'card_average_debit',
    ],
     'group6_international_transactions' : [
         'wire_credit_transaction_count', 'wire_debit_transaction_count',
         'wire_average_credit', 'wire_average_debit', 'max_wire_credit',
         'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
         'wire_amount_rolling_avg_30d','wire_amount_rolling_avg_7d',
         'wire_rolling_sum_7d_ratio','wire_active_days_ratio',
         'wire_credit_debit_ratio', 'wire_debit_credit_ratio',
         'avg_wire_amount','wire_account_active_days',
         'log1p_wire_account_active_days'

     ]
}

# Dictionary to store anomalies for each group
all_group_anomalies = {}

# Process each group
for group_name, columns in variable_groups_ml.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())


    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    silhouette_scores = []
    cluster_range = range(2, 21)  # Increased range
    for k in cluster_range:
        model = KMeans(n_clusters=k, n_init=20, random_state=1, max_iter=500)
        cluster_labels = model.fit_predict(data_scaled)
        silhouette_avg = silhouette_score(data_scaled, cluster_labels)
        silhouette_scores.append(silhouette_avg)

    optimal_k = cluster_range[np.argmax(silhouette_scores)]
    print(f"  Optimal number of clusters: {optimal_k}")

    model = KMeans(n_clusters=optimal_k, n_init=20, random_state=4, max_iter=500)
    cluster_numbers = model.fit_predict(data_scaled)
    df_group['Cluster'] = cluster_numbers

    group_anomalies = []  # Store anomalies *for this group*
    for i in range(optimal_k):
        cluster_data = df_group[df_group['Cluster'] == i].drop(columns=['Cluster'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies[group_name] = group_anomalies
    print(f"Anomalies for {group_name}: {len(group_anomalies)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups ---
all_anomalies_combined = []
for anomalies in all_group_anomalies.values():
    all_anomalies_combined.extend(anomalies)

unique_anomalies_strategy_2 = list(set(all_anomalies_combined))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups: {len(unique_anomalies)}")
print(f"Unique anomaly customer IDs: {unique_anomalies}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts):")
for group_name, anomalies in all_group_anomalies.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group1_high_value_transactions
  Optimal number of clusters: 2
Anomalies for group1_high_value_transactions: 163

Processing group: group2_frequent_transactions
  Optimal number of clusters: 2
Anomalies for group2_frequent_transactions: 163

Processing group: group3_rapid_velocity
  Optimal number of clusters: 3
Anomalies for group3_rapid_velocity: 163

Processing group: group4_inconsistent_activity
  Optimal number of clusters: 7
Anomalies for group4_inconsistent_activity: 166

Processing group: group5_round_number_transactions
  Optimal number of clusters: 2
Anomalies for group5_round_number_transactions: 163

Processing group: group6_international_transactions
  Optimal number of clusters: 2
Anomalies for group6_international_transactions: 163

Total number of unique anomalies across all groups: 788
Unique anomaly customer IDs: ['SYNCID0000013895', 'SYNCID0000009082', 'SYNCID0000000871', 'SYNCID0000007656', 'SYNCID0000015917', 'SYNCID0000003382', 'SYNCID0000003100

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

variable_groups_v3 = {
    'group_abm_profile': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'abm_account_active_days', 'avg_abm_amount', 'abm_credit', 'abm_debit',
        'abm_credit_debit_ratio', 'abm_debit_credit_ratio'
    ],
    'group_card_profile': [
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'card_account_active_days', 'avg_card_amount', 'card_credit', 'card_debit',
        'card_credit_debit_ratio', 'card_debit_credit_ratio'
    ],
    'group_eft_profile': [
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'eft_account_active_days', 'avg_eft_amount', 'eft_credit', 'eft_debit',
        'eft_credit_debit_ratio', 'eft_debit_credit_ratio'
    ],
    'group_cheque_profile': [
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'cheque_account_active_days', 'avg_cheque_amount', 'cheque_credit', 'cheque_debit',
        'cheque_credit_debit_ratio', 'cheque_debit_credit_ratio'
    ],
    'group_emt_profile': [
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'emt_account_active_days', 'avg_emt_amount', 'emt_credit', 'emt_debit',
        'emt_credit_debit_ratio', 'emt_debit_credit_ratio'
    ],
    'group_wire_profile': [
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'wire_account_active_days', 'avg_wire_amount', 'wire_credit', 'wire_debit',
        'wire_credit_debit_ratio', 'wire_debit_credit_ratio'
    ],
    'group_transaction_intensity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_account_duration': [
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days',
        'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days',
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ],
    'group_amount_rolling_ratios': [
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_extreme_amounts': [
        'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit'
    ],
    'group_low_activity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_active_days_ratio', 'card_active_days_ratio', 'eft_active_days_ratio',
        'cheque_active_days_ratio', 'emt_active_days_ratio', 'wire_active_days_ratio',
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount'
    ]
}


# Dictionary to store anomalies for each group
all_group_anomalies = {}

# Process each group
for group_name, columns in variable_groups_v3.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())


    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    silhouette_scores = []
    cluster_range = range(2, 21)  # Increased range
    for k in cluster_range:
        model = KMeans(n_clusters=k, n_init=20, random_state=1, max_iter=500)
        cluster_labels = model.fit_predict(data_scaled)
        silhouette_avg = silhouette_score(data_scaled, cluster_labels)
        silhouette_scores.append(silhouette_avg)

    optimal_k = cluster_range[np.argmax(silhouette_scores)]
    print(f"  Optimal number of clusters: {optimal_k}")

    model = KMeans(n_clusters=optimal_k, n_init=20, random_state=4, max_iter=500)
    cluster_numbers = model.fit_predict(data_scaled)
    df_group['Cluster'] = cluster_numbers

    group_anomalies = []  # Store anomalies *for this group*
    for i in range(optimal_k):
        cluster_data = df_group[df_group['Cluster'] == i].drop(columns=['Cluster'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies[group_name] = group_anomalies
    print(f"Anomalies for {group_name}: {len(group_anomalies)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups ---
all_anomalies_combined = []
for anomalies in all_group_anomalies.values():
    all_anomalies_combined.extend(anomalies)

unique_anomalies_strategy_3 = list(set(all_anomalies_combined))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups: {len(unique_anomalies)}")
print(f"Unique anomaly customer IDs: {unique_anomalies}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts):")
for group_name, anomalies in all_group_anomalies.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group_abm_profile
  Optimal number of clusters: 2
Anomalies for group_abm_profile: 163

Processing group: group_card_profile
  Optimal number of clusters: 2
Anomalies for group_card_profile: 163

Processing group: group_eft_profile
  Optimal number of clusters: 2
Anomalies for group_eft_profile: 163

Processing group: group_cheque_profile
  Optimal number of clusters: 2
Anomalies for group_cheque_profile: 163

Processing group: group_emt_profile
  Optimal number of clusters: 2
Anomalies for group_emt_profile: 163

Processing group: group_wire_profile
  Optimal number of clusters: 3
Anomalies for group_wire_profile: 164

Processing group: group_transaction_intensity
  Optimal number of clusters: 2
Anomalies for group_transaction_intensity: 163

Processing group: group_account_duration
  Optimal number of clusters: 20
Anomalies for group_account_duration: 173

Processing group: group_amount_rolling_ratios
  Optimal number of clusters: 3
Anomalies for group_amount_rolli

In [None]:
len(unique_anomalies)+len(unique_anomalies_strategy_2)+len(unique_anomalies_strategy_3)

3049

In [None]:
anamolies = (unique_anomalies)+(unique_anomalies_strategy_2)+(unique_anomalies_strategy_3)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in anamolies else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")


Contingency Table (Anomaly Status vs. industry_code):

industry_code     0  0112  0115  0119  0122  0129  0131  0132  0133  0139  \
anomaly_status                                                              
Anomaly          50     4     0     3     2     3     3     0     1     2   
Non-Anomaly     337    57    15    12    29    20    10    11    13    23   

industry_code   ...  9931  9942  9949  9951  9952  9953  9959  9961  9999  \
anomaly_status  ...                                                         
Anomaly         ...     5     6     3     1     1    16     9     4    11   
Non-Anomaly     ...    32    30    35    15    10   126    55    17    83   

industry_code   other  
anomaly_status         
Anomaly           239  
Non-Anomaly      1692  

[2 rows x 253 columns]

Chi-squared statistic: 364.8448
P-value: 0.0000
Degrees of freedom: 252

Significance level (alpha): 0.05
Reject null hypothesis (p < 0.05).
There is a statistically significant association between Anomaly

In [None]:
# prompt: find commaon anomolies in both unique_anomalies_strategy_2 and unique_anomalies. also add uuique anomolies in each

common_anomalies = list(set(unique_anomalies) & set(unique_anomalies_strategy_2)& set(unique_anomalies_strategy_3))
unique_to_strategy_1 = list(set(unique_anomalies) - set(unique_anomalies_strategy_2)-set(unique_anomalies_strategy_3))
unique_to_strategy_2 = list(set(unique_anomalies_strategy_2) - set(unique_anomalies)-set(unique_anomalies_strategy_3))
unique_to_strategy_3 = list(set(unique_anomalies_strategy_3)-set(unique_anomalies_strategy_2) - set(unique_anomalies))
print(f"Common Anomalies: {common_anomalies}")
print(f"Number of Common Anomalies: {len(common_anomalies)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3)}")


Common Anomalies: ['SYNCID0000001064', 'SYNCID0000000871', 'SYNCID0000006429', 'SYNCID0000015688', 'SYNCID0000007029', 'SYNCID0000003100', 'SYNCID0000014090', 'SYNCID0000012732', 'SYNCID0000002736', 'SYNCID0000010770', 'SYNCID0000008299', 'SYNCID0000007031', 'SYNCID0000010138', 'SYNCID0000002396', 'SYNCID0000004994', 'SYNCID0000005538', 'SYNCID0000002923', 'SYNCID0000013984', 'SYNCID0000007082', 'SYNCID0000003966', 'SYNCID0000013957', 'SYNCID0000009260', 'SYNCID0000005967', 'SYNCID0000011150', 'SYNCID0000013392', 'SYNCID0000008051', 'SYNCID0000004446', 'SYNCID0000013129', 'SYNCID0000005809', 'SYNCID0000005779', 'SYNCID0000011167', 'SYNCID0000007519', 'SYNCID0000001070', 'SYNCID0000013011', 'SYNCID0000008509', 'SYNCID0000005730', 'SYNCID0000000545', 'SYNCID0000001889', 'SYNCID0000015466', 'SYNCID0000005498', 'SYNCID0000014308', 'SYNCID0000011409', 'SYNCID0000005244', 'SYNCID0000005293', 'SYNCID0000002220', 'SYNCID0000015228', 'SYNCID0000002810', 'SYNCID0000004428', 'SYNCID0000007646', '

In [None]:
len(common_anomalies)+len(unique_to_strategy_1)+len(unique_to_strategy_2)+len(unique_to_strategy_3)

1525

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}


# Dictionary to store anomalies for each group
all_group_anomalies_birch = {}

# Custom Birch Clustering class and silhouette scorer (from user's notebook)
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    # Parameter grid for Birch
    param_grid = {
        'threshold': [0.1, 0.3, 0.5, 0.7],
        'branching_factor': [5, 10, 25, 50, 75],
        'n_clusters': [None, 3, 4, 5]}

    # Grid search for optimal Birch parameters
    grid_search = GridSearchCV(BirchClustering(), param_grid, cv=3, scoring=silhouette_scorer, n_jobs=-1) # Reduced cv for speed
    grid_search.fit(data_scaled)

    best_params_birch = grid_search.best_params_
    best_birch_model = grid_search.best_estimator_
    print("  Best Birch Parameters:", best_params_birch)

    # Use Birch with optimal parameters
    birch_model = BirchClustering(**best_params_birch) # Use best parameters
    cluster_numbers_birch = birch_model.fit_predict(data_scaled)
    df_group['ClusterBIRCH'] = cluster_numbers_birch

    group_anomalies_birch = []  # Store anomalies *for this group*
    unique_clusters_birch = df_group['ClusterBIRCH'].unique() # Get unique cluster labels
    for i in unique_clusters_birch: # Iterate through unique cluster labels
        cluster_data = df_group[df_group['ClusterBIRCH'] == i].drop(columns=['ClusterBIRCH'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue # This line was missing proper indentation causing the error.
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies_birch.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies_birch[group_name] = group_anomalies_birch
    print(f"Anomalies for {group_name} using Birch: {len(group_anomalies_birch)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups using Birch ---
all_anomalies_combined_birch = []
for anomalies in all_group_anomalies_birch.values():
    all_anomalies_combined_birch.extend(anomalies)

unique_anomalies_birch = list(set(all_anomalies_combined_birch))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups using Birch: {len(unique_anomalies_birch)}")
print(f"Unique anomaly customer IDs (Birch): {unique_anomalies_birch}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts) using Birch:")
for group_name, anomalies in all_group_anomalies_birch.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group1_transaction_counts
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group1_transaction_counts using Birch: 164

Processing group: group2_transaction_amounts
  Best Birch Parameters: {'branching_factor': 10, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group2_transaction_amounts using Birch: 164

Processing group: group3_transaction_timing
  Best Birch Parameters: {'branching_factor': 50, 'n_clusters': None, 'threshold': 0.7}
Anomalies for group3_transaction_timing using Birch: 194

Processing group: group4_credit_debit_ratios
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': None, 'threshold': 0.1}
Anomalies for group4_credit_debit_ratios using Birch: 163

Processing group: group5_overall_amount_averages
  Best Birch Parameters: {'branching_factor': 25, 'n_clusters': 3, 'threshold': 0.3}
Anomalies for group5_overall_amount_averages using Birch: 163

Processing group: group7_log_active_days
  Bes

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_high_value_transactions': [
        'max_abm_credit', 'max_abm_debit', 'max_card_credit', 'max_card_debit',
        'max_eft_credit', 'max_eft_debit', 'max_cheque_credit', 'max_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'max_wire_credit', 'max_wire_debit'
    ],
    'group2_frequent_transactions': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group3_rapid_velocity': [
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d',
        'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d',
        'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio',
        'eft_rolling_sum_7d_ratio', 'cheque_rolling_sum_7d_ratio',
        'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group4_inconsistent_activity': [
         'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio',
         'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
         'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio',
         'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio',
        'abm_active_days_ratio', 'card_active_days_ratio',
        'eft_active_days_ratio', 'cheque_active_days_ratio',
        'emt_active_days_ratio', 'wire_active_days_ratio'
    ],
    'group5_round_number_transactions': [  # Placeholder - Requires feature engineering
        'abm_average_credit', 'abm_average_debit',
        'card_average_credit', 'card_average_debit',
    ],
     'group6_international_transactions' : [
         'wire_credit_transaction_count', 'wire_debit_transaction_count',
         'wire_average_credit', 'wire_average_debit', 'max_wire_credit',
         'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
         'wire_amount_rolling_avg_30d','wire_amount_rolling_avg_7d',
         'wire_rolling_sum_7d_ratio','wire_active_days_ratio',
         'wire_credit_debit_ratio', 'wire_debit_credit_ratio',
         'avg_wire_amount','wire_account_active_days',
         'log1p_wire_account_active_days'

     ]
}


# Dictionary to store anomalies for each group
all_group_anomalies_birch = {}

# Custom Birch Clustering class and silhouette scorer (from user's notebook)
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    # Parameter grid for Birch
    param_grid = {
        'threshold': [0.1, 0.3, 0.5, 0.7],
        'branching_factor': [5, 10, 25, 50, 75],
        'n_clusters': [None, 3, 4, 5]}

    # Grid search for optimal Birch parameters
    grid_search = GridSearchCV(BirchClustering(), param_grid, cv=3, scoring=silhouette_scorer, n_jobs=-1) # Reduced cv for speed
    grid_search.fit(data_scaled)

    best_params_birch = grid_search.best_params_
    best_birch_model = grid_search.best_estimator_
    print("  Best Birch Parameters:", best_params_birch)

    # Use Birch with optimal parameters
    birch_model = BirchClustering(**best_params_birch) # Use best parameters
    cluster_numbers_birch = birch_model.fit_predict(data_scaled)
    df_group['ClusterBIRCH'] = cluster_numbers_birch

    group_anomalies_birch = []  # Store anomalies *for this group*
    unique_clusters_birch = df_group['ClusterBIRCH'].unique() # Get unique cluster labels
    for i in unique_clusters_birch: # Iterate through unique cluster labels
        cluster_data = df_group[df_group['ClusterBIRCH'] == i].drop(columns=['ClusterBIRCH'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue # This line was missing proper indentation causing the error.
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies_birch.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies_birch[group_name] = group_anomalies_birch
    print(f"Anomalies for {group_name} using Birch: {len(group_anomalies_birch)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups using Birch ---
all_anomalies_combined_birch = []
for anomalies in all_group_anomalies_birch.values():
    all_anomalies_combined_birch.extend(anomalies)

unique_anomalies_birch_2 = list(set(all_anomalies_combined_birch))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups using Birch: {len(unique_anomalies_birch)}")
print(f"Unique anomaly customer IDs (Birch): {unique_anomalies_birch}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts) using Birch:")
for group_name, anomalies in all_group_anomalies_birch.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group1_high_value_transactions
  Best Birch Parameters: {'branching_factor': 75, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group1_high_value_transactions using Birch: 164

Processing group: group2_frequent_transactions
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group2_frequent_transactions using Birch: 164

Processing group: group3_rapid_velocity
  Best Birch Parameters: {'branching_factor': 50, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group3_rapid_velocity using Birch: 163

Processing group: group4_inconsistent_activity




  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': 3, 'threshold': 0.3}




Anomalies for group4_inconsistent_activity using Birch: 163

Processing group: group5_round_number_transactions
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': None, 'threshold': 0.1}
Anomalies for group5_round_number_transactions using Birch: 163

Processing group: group6_international_transactions
  Best Birch Parameters: {'branching_factor': 50, 'n_clusters': 4, 'threshold': 0.1}
Anomalies for group6_international_transactions using Birch: 164

Total number of unique anomalies across all groups using Birch: 874
Unique anomaly customer IDs (Birch): ['SYNCID0000009082', 'SYNCID0000008221', 'SYNCID0000015917', 'SYNCID0000003100', 'SYNCID0000014090', 'SYNCID0000003598', 'SYNCID0000008000', 'SYNCID0000002923', 'SYNCID0000001447', 'SYNCID0000002701', 'SYNCID0000013984', 'SYNCID0000010531', 'SYNCID0000009362', 'SYNCID0000001367', 'SYNCID0000009260', 'SYNCID0000009135', 'SYNCID0000004446', 'SYNCID0000005809', 'SYNCID0000001905', 'SYNCID0000012487', 'SYNCID0000015866', 'SYNCID

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()


variable_groups = {
    'group_abm_profile': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'abm_account_active_days', 'avg_abm_amount', 'abm_credit', 'abm_debit',
        'abm_credit_debit_ratio', 'abm_debit_credit_ratio'
    ],
    'group_card_profile': [
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'card_account_active_days', 'avg_card_amount', 'card_credit', 'card_debit',
        'card_credit_debit_ratio', 'card_debit_credit_ratio'
    ],
    'group_eft_profile': [
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'eft_account_active_days', 'avg_eft_amount', 'eft_credit', 'eft_debit',
        'eft_credit_debit_ratio', 'eft_debit_credit_ratio'
    ],
    'group_cheque_profile': [
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'cheque_account_active_days', 'avg_cheque_amount', 'cheque_credit', 'cheque_debit',
        'cheque_credit_debit_ratio', 'cheque_debit_credit_ratio'
    ],
    'group_emt_profile': [
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'emt_account_active_days', 'avg_emt_amount', 'emt_credit', 'emt_debit',
        'emt_credit_debit_ratio', 'emt_debit_credit_ratio'
    ],
    'group_wire_profile': [
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'wire_account_active_days', 'avg_wire_amount', 'wire_credit', 'wire_debit',
        'wire_credit_debit_ratio', 'wire_debit_credit_ratio'
    ],
    'group_transaction_intensity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_account_duration': [
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days',
        'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days',
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ],
    'group_amount_rolling_ratios': [
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_extreme_amounts': [
        'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit'
    ],
    'group_low_activity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_active_days_ratio', 'card_active_days_ratio', 'eft_active_days_ratio',
        'cheque_active_days_ratio', 'emt_active_days_ratio', 'wire_active_days_ratio',
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount'
    ]
}


# Dictionary to store anomalies for each group
all_group_anomalies_birch = {}

# Custom Birch Clustering class and silhouette scorer (from user's notebook)
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)

# Process each group
for group_name, columns in variable_groups.items():
    print(f"\nProcessing group: {group_name}")
    df_group = df[columns + ['customer_id']].copy()

    # Handle missing values (using median imputation for this example)
    # Exclude 'customer_id' from imputation
    numeric_columns = df_group.select_dtypes(include=np.number).columns
    df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

    scaler = MinMaxScaler()
    data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)

    # Parameter grid for Birch
    param_grid = {
        'threshold': [0.1, 0.3, 0.5, 0.7],
        'branching_factor': [5, 10, 25, 50, 75],
        'n_clusters': [None, 3, 4, 5]}

    # Grid search for optimal Birch parameters
    grid_search = GridSearchCV(BirchClustering(), param_grid, cv=3, scoring=silhouette_scorer, n_jobs=-1) # Reduced cv for speed
    grid_search.fit(data_scaled)

    best_params_birch = grid_search.best_params_
    best_birch_model = grid_search.best_estimator_
    print("  Best Birch Parameters:", best_params_birch)

    # Use Birch with optimal parameters
    birch_model = BirchClustering(**best_params_birch) # Use best parameters
    cluster_numbers_birch = birch_model.fit_predict(data_scaled)
    df_group['ClusterBIRCH'] = cluster_numbers_birch

    group_anomalies_birch = []  # Store anomalies *for this group*
    unique_clusters_birch = df_group['ClusterBIRCH'].unique() # Get unique cluster labels
    for i in unique_clusters_birch: # Iterate through unique cluster labels
        cluster_data = df_group[df_group['ClusterBIRCH'] == i].drop(columns=['ClusterBIRCH'])
        if cluster_data.empty:
            print(f"  Cluster {i} is empty. Skipping.")
            continue # This line was missing proper indentation causing the error.
        clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=200)  # Adjusted parameters
        clf.fit(cluster_data.drop(columns=['customer_id']))
        y_pred = clf.predict(cluster_data.drop(columns=['customer_id']))
        cluster_anomalies = cluster_data['customer_id'][y_pred == -1].tolist()
        group_anomalies_birch.extend(cluster_anomalies)

    # Store the anomalies for this group
    all_group_anomalies_birch[group_name] = group_anomalies_birch
    print(f"Anomalies for {group_name} using Birch: {len(group_anomalies_birch)}") # Print number of anomalies

# --- Combine and find unique anomalies across ALL groups using Birch ---
all_anomalies_combined_birch = []
for anomalies in all_group_anomalies_birch.values():
    all_anomalies_combined_birch.extend(anomalies)

unique_anomalies_birch_3 = list(set(all_anomalies_combined_birch))  # Get unique customer_ids

print(f"\nTotal number of unique anomalies across all groups using Birch: {len(unique_anomalies_birch)}")
print(f"Unique anomaly customer IDs (Birch): {unique_anomalies_birch}")

# --- (Optional) Print anomalies per group for detailed inspection ---
print("\nAnomalies per group (counts) using Birch:")
for group_name, anomalies in all_group_anomalies_birch.items():
    print(f"{group_name}: {len(anomalies)}")


Processing group: group_abm_profile
  Best Birch Parameters: {'branching_factor': 25, 'n_clusters': 3, 'threshold': 0.3}
Anomalies for group_abm_profile using Birch: 164

Processing group: group_card_profile
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': 3, 'threshold': 0.5}
Anomalies for group_card_profile using Birch: 164

Processing group: group_eft_profile




  Best Birch Parameters: {'branching_factor': 10, 'n_clusters': 4, 'threshold': 0.3}




Anomalies for group_eft_profile using Birch: 163

Processing group: group_cheque_profile




  Best Birch Parameters: {'branching_factor': 10, 'n_clusters': 4, 'threshold': 0.3}




Anomalies for group_cheque_profile using Birch: 163

Processing group: group_emt_profile
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': None, 'threshold': 0.5}
Anomalies for group_emt_profile using Birch: 163

Processing group: group_wire_profile
  Best Birch Parameters: {'branching_factor': 50, 'n_clusters': 3, 'threshold': 0.1}
Anomalies for group_wire_profile using Birch: 164

Processing group: group_transaction_intensity
  Best Birch Parameters: {'branching_factor': 10, 'n_clusters': 4, 'threshold': 0.1}
Anomalies for group_transaction_intensity using Birch: 165

Processing group: group_account_duration
  Best Birch Parameters: {'branching_factor': 75, 'n_clusters': None, 'threshold': 0.5}
Anomalies for group_account_duration using Birch: 233

Processing group: group_amount_rolling_ratios
  Best Birch Parameters: {'branching_factor': 5, 'n_clusters': None, 'threshold': 0.1}
Anomalies for group_amount_rolling_ratios using Birch: 163

Processing group: group_extreme_a

In [None]:
len(unique_anomalies_birch)+len(unique_anomalies_birch_2)+len(unique_anomalies_birch_3)

3159

In [None]:
anamolies_birch = (unique_anomalies_birch)+(unique_anomalies_birch_2)+(unique_anomalies_birch_3)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in anamolies_birch else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")


Contingency Table (Anomaly Status vs. industry_code):

industry_code     0  0112  0115  0119  0122  0129  0131  0132  0133  0139  \
anomaly_status                                                              
Anomaly          52     5     0     3     1     3     0     0     1     1   
Non-Anomaly     335    56    15    12    30    20    13    11    13    24   

industry_code   ...  9931  9942  9949  9951  9952  9953  9959  9961  9999  \
anomaly_status  ...                                                         
Anomaly         ...     6     5     3     1     1    22     9     3    12   
Non-Anomaly     ...    31    31    35    15    10   120    55    18    82   

industry_code   other  
anomaly_status         
Anomaly           234  
Non-Anomaly      1697  

[2 rows x 253 columns]

Chi-squared statistic: 331.1830
P-value: 0.0006
Degrees of freedom: 252

Significance level (alpha): 0.05
Reject null hypothesis (p < 0.05).
There is a statistically significant association between Anomaly

In [None]:
# prompt: find commaon anomolies in both unique_anomalies_strategy_2 and unique_anomalies. also add uuique anomolies in each

common_anomalies_birch = list(set(unique_anomalies_birch) & set(unique_anomalies_birch_2)& set(unique_anomalies_birch_3))
unique_to_strategy_1_birch = list(set(unique_anomalies_birch) - set(unique_anomalies_birch_2)-set(unique_anomalies_birch_3))
unique_to_strategy_2_birch = list(set(unique_anomalies_birch_2) - set(unique_anomalies_birch)-set(unique_anomalies_birch_3))
unique_to_strategy_3_birch = list(set(unique_anomalies_birch_3)-set(unique_anomalies_birch_2) - set(unique_anomalies_birch))
print(f"Common Anomalies: {common_anomalies_birch}")
print(f"Number of Common Anomalies: {len(common_anomalies_birch)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_birch}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_birch)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_birch}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_birch)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3_birch}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3_birch)}")


Common Anomalies: ['SYNCID0000016034', 'SYNCID0000013397', 'SYNCID0000003100', 'SYNCID0000014090', 'SYNCID0000006829', 'SYNCID0000012732', 'SYNCID0000003598', 'SYNCID0000010770', 'SYNCID0000007031', 'SYNCID0000005538', 'SYNCID0000002923', 'SYNCID0000008000', 'SYNCID0000010531', 'SYNCID0000013417', 'SYNCID0000013984', 'SYNCID0000009995', 'SYNCID0000007082', 'SYNCID0000003966', 'SYNCID0000013957', 'SYNCID0000009260', 'SYNCID0000016084', 'SYNCID0000011150', 'SYNCID0000006266', 'SYNCID0000008051', 'SYNCID0000009135', 'SYNCID0000004446', 'SYNCID0000005809', 'SYNCID0000013392', 'SYNCID0000005779', 'SYNCID0000011167', 'SYNCID0000011115', 'SYNCID0000007519', 'SYNCID0000013011', 'SYNCID0000001070', 'SYNCID0000005730', 'SYNCID0000012670', 'SYNCID0000001889', 'SYNCID0000016364', 'SYNCID0000011409', 'SYNCID0000005244', 'SYNCID0000004200', 'SYNCID0000005293', 'SYNCID0000002220', 'SYNCID0000007646', 'SYNCID0000006554', 'SYNCID0000004191', 'SYNCID0000011703', 'SYNCID0000012928', 'SYNCID0000001066', '

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}

# --- Autoencoder Anomaly Detection Function ---
def detect_anomalies_autoencoder(df, variable_groups):
    all_group_anomalies_ae = {}

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2) # Adjusted hidden dimension
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder
        for epoch in range(50): # Reduced epochs
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()
            #print(f'Epoch {epoch+1}, Loss: {loss.item()}') # Optional: Print epoch loss

        # Calculate reconstruction error
        with torch.no_grad():
            reconstructed = model_ae(X_tensor)
            reconstruction_error = torch.mean((X_tensor - reconstructed) ** 2, dim=1)

        threshold_ae = torch.quantile(reconstruction_error, 0.995) # 95th percentile threshold
        outliers_ae = reconstruction_error > threshold_ae
        anomaly_customer_ids_ae = df_group['customer_id'][outliers_ae.numpy()].tolist()

        all_group_anomalies_ae[group_name] = anomaly_customer_ids_ae
        print(f"Anomalies for {group_name} using Autoencoder: {len(anomaly_customer_ids_ae)}")

    # Combine and find unique anomalies across ALL groups for Autoencoder
    all_anomalies_combined_ae = []
    for anomalies in all_group_anomalies_ae.values():
        all_anomalies_combined_ae.extend(anomalies)
    unique_anomalies_ae = list(set(all_anomalies_combined_ae))

    print(f"\nTotal number of unique anomalies across all groups using Autoencoder: {len(unique_anomalies_ae)}")
    print(f"Unique anomaly customer IDs (Autoencoder): {unique_anomalies_ae}")

    print("\nAnomalies per group (counts) using Autoencoder:")
    for group_name, anomalies in all_group_anomalies_ae.items():
        print(f"{group_name}: {len(anomalies)}")

    return unique_anomalies_ae, all_group_anomalies_ae


class Autoencoder(nn.Module): # Autoencoder class - keep it outside the function for reusability if needed later
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# --- Run Autoencoder Anomaly Detection ---
unique_anomalies_ae, all_group_anomalies_ae = detect_anomalies_autoencoder(df, variable_groups)



Processing group for Autoencoder: group1_transaction_counts
Anomalies for group1_transaction_counts using Autoencoder: 82

Processing group for Autoencoder: group2_transaction_amounts
Anomalies for group2_transaction_amounts using Autoencoder: 82

Processing group for Autoencoder: group3_transaction_timing
Anomalies for group3_transaction_timing using Autoencoder: 82

Processing group for Autoencoder: group4_credit_debit_ratios
Anomalies for group4_credit_debit_ratios using Autoencoder: 82

Processing group for Autoencoder: group5_overall_amount_averages
Anomalies for group5_overall_amount_averages using Autoencoder: 82

Processing group for Autoencoder: group7_log_active_days
Anomalies for group7_log_active_days using Autoencoder: 82

Total number of unique anomalies across all groups using Autoencoder: 422
Unique anomaly customer IDs (Autoencoder): ['SYNCID0000017015', 'SYNCID0000001552', 'SYNCID0000008330', 'SYNCID0000014090', 'SYNCID0000003100', 'SYNCID0000003598', 'SYNCID00000080

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_high_value_transactions': [
        'max_abm_credit', 'max_abm_debit', 'max_card_credit', 'max_card_debit',
        'max_eft_credit', 'max_eft_debit', 'max_cheque_credit', 'max_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'max_wire_credit', 'max_wire_debit'
    ],
    'group2_frequent_transactions': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group3_rapid_velocity': [
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d',
        'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d',
        'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio',
        'eft_rolling_sum_7d_ratio', 'cheque_rolling_sum_7d_ratio',
        'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group4_inconsistent_activity': [
         'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio',
         'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
         'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio',
         'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio',
        'abm_active_days_ratio', 'card_active_days_ratio',
        'eft_active_days_ratio', 'cheque_active_days_ratio',
        'emt_active_days_ratio', 'wire_active_days_ratio'
    ],
    'group5_round_number_transactions': [  # Placeholder - Requires feature engineering
        'abm_average_credit', 'abm_average_debit',
        'card_average_credit', 'card_average_debit',
    ],
     'group6_international_transactions' : [
         'wire_credit_transaction_count', 'wire_debit_transaction_count',
         'wire_average_credit', 'wire_average_debit', 'max_wire_credit',
         'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
         'wire_amount_rolling_avg_30d','wire_amount_rolling_avg_7d',
         'wire_rolling_sum_7d_ratio','wire_active_days_ratio',
         'wire_credit_debit_ratio', 'wire_debit_credit_ratio',
         'avg_wire_amount','wire_account_active_days',
         'log1p_wire_account_active_days'

     ]
}

# --- Autoencoder Anomaly Detection Function ---
def detect_anomalies_autoencoder(df, variable_groups):
    all_group_anomalies_ae = {}

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2) # Adjusted hidden dimension
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder
        for epoch in range(50): # Reduced epochs
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()
            #print(f'Epoch {epoch+1}, Loss: {loss.item()}') # Optional: Print epoch loss

        # Calculate reconstruction error
        with torch.no_grad():
            reconstructed = model_ae(X_tensor)
            reconstruction_error = torch.mean((X_tensor - reconstructed) ** 2, dim=1)

        threshold_ae = torch.quantile(reconstruction_error, 0.995) # 95th percentile threshold
        outliers_ae = reconstruction_error > threshold_ae
        anomaly_customer_ids_ae = df_group['customer_id'][outliers_ae.numpy()].tolist()

        all_group_anomalies_ae[group_name] = anomaly_customer_ids_ae
        print(f"Anomalies for {group_name} using Autoencoder: {len(anomaly_customer_ids_ae)}")

    # Combine and find unique anomalies across ALL groups for Autoencoder
    all_anomalies_combined_ae = []
    for anomalies in all_group_anomalies_ae.values():
        all_anomalies_combined_ae.extend(anomalies)
    unique_anomalies_ae = list(set(all_anomalies_combined_ae))

    print(f"\nTotal number of unique anomalies across all groups using Autoencoder: {len(unique_anomalies_ae)}")
    print(f"Unique anomaly customer IDs (Autoencoder): {unique_anomalies_ae}")

    print("\nAnomalies per group (counts) using Autoencoder:")
    for group_name, anomalies in all_group_anomalies_ae.items():
        print(f"{group_name}: {len(anomalies)}")

    return unique_anomalies_ae, all_group_anomalies_ae


class Autoencoder(nn.Module): # Autoencoder class - keep it outside the function for reusability if needed later
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# --- Run Autoencoder Anomaly Detection ---
unique_anomalies_ae_2, all_group_anomalies_ae_2 = detect_anomalies_autoencoder(df, variable_groups)



Processing group for Autoencoder: group1_high_value_transactions
Anomalies for group1_high_value_transactions using Autoencoder: 82

Processing group for Autoencoder: group2_frequent_transactions
Anomalies for group2_frequent_transactions using Autoencoder: 82

Processing group for Autoencoder: group3_rapid_velocity
Anomalies for group3_rapid_velocity using Autoencoder: 82

Processing group for Autoencoder: group4_inconsistent_activity
Anomalies for group4_inconsistent_activity using Autoencoder: 82

Processing group for Autoencoder: group5_round_number_transactions
Anomalies for group5_round_number_transactions using Autoencoder: 82

Processing group for Autoencoder: group6_international_transactions
Anomalies for group6_international_transactions using Autoencoder: 82

Total number of unique anomalies across all groups using Autoencoder: 436
Unique anomaly customer IDs (Autoencoder): ['SYNCID0000000871', 'SYNCID0000001552', 'SYNCID0000008330', 'SYNCID0000014090', 'SYNCID0000003100',

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group_abm_profile': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'abm_account_active_days', 'avg_abm_amount', 'abm_credit', 'abm_debit',
        'abm_credit_debit_ratio', 'abm_debit_credit_ratio'
    ],
    'group_card_profile': [
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'card_account_active_days', 'avg_card_amount', 'card_credit', 'card_debit',
        'card_credit_debit_ratio', 'card_debit_credit_ratio'
    ],
    'group_eft_profile': [
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'eft_account_active_days', 'avg_eft_amount', 'eft_credit', 'eft_debit',
        'eft_credit_debit_ratio', 'eft_debit_credit_ratio'
    ],
    'group_cheque_profile': [
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'cheque_account_active_days', 'avg_cheque_amount', 'cheque_credit', 'cheque_debit',
        'cheque_credit_debit_ratio', 'cheque_debit_credit_ratio'
    ],
    'group_emt_profile': [
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'emt_account_active_days', 'avg_emt_amount', 'emt_credit', 'emt_debit',
        'emt_credit_debit_ratio', 'emt_debit_credit_ratio'
    ],
    'group_wire_profile': [
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'wire_account_active_days', 'avg_wire_amount', 'wire_credit', 'wire_debit',
        'wire_credit_debit_ratio', 'wire_debit_credit_ratio'
    ],
    'group_transaction_intensity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_account_duration': [
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days',
        'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days',
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ],
    'group_amount_rolling_ratios': [
        'abm_rolling_sum_7d_ratio', 'card_rolling_sum_7d_ratio', 'eft_rolling_sum_7d_ratio',
        'cheque_rolling_sum_7d_ratio', 'emt_rolling_sum_7d_ratio', 'wire_rolling_sum_7d_ratio'
    ],
    'group_extreme_amounts': [
        'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'max_wire_credit', 'max_wire_debit', 'min_wire_credit', 'min_wire_debit'
    ],
    'group_low_activity': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count',
        'abm_active_days_ratio', 'card_active_days_ratio', 'eft_active_days_ratio',
        'cheque_active_days_ratio', 'emt_active_days_ratio', 'wire_active_days_ratio',
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount'
    ]
}

# --- Autoencoder Anomaly Detection Function ---
def detect_anomalies_autoencoder(df, variable_groups):
    all_group_anomalies_ae = {}

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2) # Adjusted hidden dimension
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder
        for epoch in range(50): # Reduced epochs
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()
            #print(f'Epoch {epoch+1}, Loss: {loss.item()}') # Optional: Print epoch loss

        # Calculate reconstruction error
        with torch.no_grad():
            reconstructed = model_ae(X_tensor)
            reconstruction_error = torch.mean((X_tensor - reconstructed) ** 2, dim=1)

        threshold_ae = torch.quantile(reconstruction_error, 0.995) # 95th percentile threshold
        outliers_ae = reconstruction_error > threshold_ae
        anomaly_customer_ids_ae = df_group['customer_id'][outliers_ae.numpy()].tolist()

        all_group_anomalies_ae[group_name] = anomaly_customer_ids_ae
        print(f"Anomalies for {group_name} using Autoencoder: {len(anomaly_customer_ids_ae)}")

    # Combine and find unique anomalies across ALL groups for Autoencoder
    all_anomalies_combined_ae = []
    for anomalies in all_group_anomalies_ae.values():
        all_anomalies_combined_ae.extend(anomalies)
    unique_anomalies_ae = list(set(all_anomalies_combined_ae))

    print(f"\nTotal number of unique anomalies across all groups using Autoencoder: {len(unique_anomalies_ae)}")
    print(f"Unique anomaly customer IDs (Autoencoder): {unique_anomalies_ae}")

    print("\nAnomalies per group (counts) using Autoencoder:")
    for group_name, anomalies in all_group_anomalies_ae.items():
        print(f"{group_name}: {len(anomalies)}")

    return unique_anomalies_ae, all_group_anomalies_ae


class Autoencoder(nn.Module): # Autoencoder class - keep it outside the function for reusability if needed later
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim)
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


# --- Run Autoencoder Anomaly Detection ---
unique_anomalies_ae_3, all_group_anomalies_ae_3 = detect_anomalies_autoencoder(df, variable_groups)



Processing group for Autoencoder: group_abm_profile
Anomalies for group_abm_profile using Autoencoder: 82

Processing group for Autoencoder: group_card_profile
Anomalies for group_card_profile using Autoencoder: 82

Processing group for Autoencoder: group_eft_profile
Anomalies for group_eft_profile using Autoencoder: 82

Processing group for Autoencoder: group_cheque_profile
Anomalies for group_cheque_profile using Autoencoder: 82

Processing group for Autoencoder: group_emt_profile
Anomalies for group_emt_profile using Autoencoder: 82

Processing group for Autoencoder: group_wire_profile
Anomalies for group_wire_profile using Autoencoder: 82

Processing group for Autoencoder: group_transaction_intensity
Anomalies for group_transaction_intensity using Autoencoder: 82

Processing group for Autoencoder: group_account_duration
Anomalies for group_account_duration using Autoencoder: 82

Processing group for Autoencoder: group_amount_rolling_ratios
Anomalies for group_amount_rolling_ratios

In [None]:
len(unique_anomalies_ae)+len(unique_anomalies_ae_2)+len(unique_anomalies_ae_3)

1500

In [None]:
anamolies_ae = (unique_anomalies_ae)+(unique_anomalies_ae_2)+(unique_anomalies_ae_3)

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in anamolies_ae else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")


Contingency Table (Anomaly Status vs. industry_code):

industry_code     0  0112  0115  0119  0122  0129  0131  0132  0133  0139  \
anomaly_status                                                              
Anomaly          19     3     1     2     0     0     0     0     1     0   
Non-Anomaly     368    58    14    13    31    23    13    11    13    25   

industry_code   ...  9931  9942  9949  9951  9952  9953  9959  9961  9999  \
anomaly_status  ...                                                         
Anomaly         ...     1     2     1     1     1     5     6     1     4   
Non-Anomaly     ...    36    34    37    15    10   137    58    20    90   

industry_code   other  
anomaly_status         
Anomaly            96  
Non-Anomaly      1835  

[2 rows x 253 columns]

Chi-squared statistic: 297.4260
P-value: 0.0260
Degrees of freedom: 252

Significance level (alpha): 0.05
Reject null hypothesis (p < 0.05).
There is a statistically significant association between Anomaly

In [None]:
# prompt: find commaon anomolies in both unique_anomalies_strategy_2 and unique_anomalies. also add uuique anomolies in each

common_anomalies_ae = list(set(unique_anomalies_ae) & set(unique_anomalies_ae_2)& set(unique_anomalies_ae_3))
unique_to_strategy_1_ae = list(set(unique_anomalies_ae) - set(unique_anomalies_ae_2)-set(unique_anomalies_ae_3))
unique_to_strategy_2_ae = list(set(unique_anomalies_ae_2) - set(unique_anomalies_ae)-set(unique_anomalies_ae_3))
unique_to_strategy_3_ae = list(set(unique_anomalies_ae_3)-set(unique_anomalies_ae_2) - set(unique_anomalies_ae))
print(f"Common Anomalies: {common_anomalies_ae}")
print(f"Number of Common Anomalies: {len(common_anomalies_ae)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_ae}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_ae)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_ae}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_ae)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3_ae}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3_ae)}")


Common Anomalies: ['SYNCID0000003677', 'SYNCID0000013995', 'SYNCID0000015688', 'SYNCID0000007029', 'SYNCID0000014090', 'SYNCID0000003100', 'SYNCID0000010770', 'SYNCID0000003598', 'SYNCID0000010138', 'SYNCID0000008000', 'SYNCID0000004719', 'SYNCID0000013984', 'SYNCID0000014504', 'SYNCID0000013957', 'SYNCID0000014637', 'SYNCID0000006266', 'SYNCID0000008666', 'SYNCID0000015818', 'SYNCID0000005779', 'SYNCID0000014271', 'SYNCID0000015037', 'SYNCID0000014308', 'SYNCID0000010612', 'SYNCID0000005282', 'SYNCID0000004191', 'SYNCID0000013967', 'SYNCID0000012928', 'SYNCID0000011636', 'SYNCID0000012431', 'SYNCID0000013970', 'SYNCID0000012577', 'SYNCID0000013961', 'SYNCID0000013974', 'SYNCID0000004512', 'SYNCID0000005135', 'SYNCID0000000703', 'SYNCID0000006926', 'SYNCID0000005776', 'SYNCID0000006473', 'SYNCID0000008048', 'SYNCID0000004322', 'SYNCID0000005251', 'SYNCID0000011111', 'SYNCID0000005347', 'SYNCID0000011351', 'SYNCID0000004258', 'SYNCID0000009654', 'SYNCID0000009708', 'SYNCID0000008674', '

In [None]:
all_anomolies=[]

In [None]:
all_anomolies=common_anomalies_ae+unique_to_strategy_1_ae+unique_to_strategy_2_ae+unique_to_strategy_3_ae+common_anomalies_birch+unique_to_strategy_1_birch+unique_to_strategy_2_birch+unique_to_strategy_3_birch+common_anomalies+unique_to_strategy_1+unique_to_strategy_2+unique_to_strategy_3

In [None]:
kmeans_anomolies=common_anomalies+unique_to_strategy_1+unique_to_strategy_2+unique_to_strategy_3

In [None]:
birch_anomalies=common_anomalies_birch+unique_to_strategy_1_birch+unique_to_strategy_2_birch+unique_to_strategy_3_birch

In [None]:
ae_anomolies=common_anomalies_ae+unique_to_strategy_1_ae+unique_to_strategy_2_ae+unique_to_strategy_3_ae

In [None]:
print(all_anomolies)

['SYNCID0000003677', 'SYNCID0000013995', 'SYNCID0000015688', 'SYNCID0000007029', 'SYNCID0000014090', 'SYNCID0000003100', 'SYNCID0000010770', 'SYNCID0000003598', 'SYNCID0000010138', 'SYNCID0000008000', 'SYNCID0000004719', 'SYNCID0000013984', 'SYNCID0000014504', 'SYNCID0000013957', 'SYNCID0000014637', 'SYNCID0000006266', 'SYNCID0000008666', 'SYNCID0000015818', 'SYNCID0000005779', 'SYNCID0000014271', 'SYNCID0000015037', 'SYNCID0000014308', 'SYNCID0000010612', 'SYNCID0000005282', 'SYNCID0000004191', 'SYNCID0000013967', 'SYNCID0000012928', 'SYNCID0000011636', 'SYNCID0000012431', 'SYNCID0000013970', 'SYNCID0000012577', 'SYNCID0000013961', 'SYNCID0000013974', 'SYNCID0000004512', 'SYNCID0000005135', 'SYNCID0000000703', 'SYNCID0000006926', 'SYNCID0000005776', 'SYNCID0000006473', 'SYNCID0000008048', 'SYNCID0000004322', 'SYNCID0000005251', 'SYNCID0000011111', 'SYNCID0000005347', 'SYNCID0000011351', 'SYNCID0000004258', 'SYNCID0000009654', 'SYNCID0000009708', 'SYNCID0000008674', 'SYNCID0000013960',

In [None]:
len(all_anomolies)

3658

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming 'unique_anomalies' is a list of customer IDs identified as anomalies
# and 'combined_df' is your DataFrame with customer data

# 1. Define your two categorical variables:
#    - 'anomaly_status':  Anomaly (in unique_anomalies) or Non-Anomaly
#    - 'industry_code':  Industry sector of the customer (or replace with your chosen categorical column)

categorical_column = 'industry_code' # **Replace 'industry_code' with your actual categorical column name**

combined_df['anomaly_status'] = combined_df['customer_id'].apply(lambda x: 'Anomaly' if x in all_anomolies else 'Non-Anomaly') # Use your anomaly list
combined_df = combined_df.dropna(subset=[categorical_column]) # Remove rows with NaN in the categorical column to avoid errors

# 2. Create a contingency table using pd.crosstab with TWO categorical variables
contingency_table = pd.crosstab(combined_df['anomaly_status'], combined_df[categorical_column])

print("\nContingency Table (Anomaly Status vs. {}):\n".format(categorical_column))
print(contingency_table)

# 3. Perform Chi-squared test on the contingency table
chi2, p, dof, expected = chi2_contingency(contingency_table)

print(f"\nChi-squared statistic: {chi2:.4f}")
print(f"P-value: {p:.4f}")
print(f"Degrees of freedom: {dof}")

# 4. Interpret the results
alpha = 0.05  # Significance level
print(f"\nSignificance level (alpha): {alpha}")

if p < alpha:
    print("Reject null hypothesis (p < {}).".format(alpha))
    print(f"There is a statistically significant association between Anomaly Status and {categorical_column}.")
    print("This suggests that anomalies are not evenly distributed across {} and might represent a non-random pattern.".format(categorical_column))
else:
    print("Fail to reject null hypothesis (p >= {}).".format(alpha))
    print(f"No statistically significant association found between Anomaly Status and {categorical_column}.")
    print("This *could* suggest anomalies are distributed similarly across {}. Further investigation is needed.".format(categorical_column))

print("\n--- Interpretation Notes ---")
print("A significant p-value suggests that the distribution of anomalies across {} is different from the distribution of non-anomalies across {}.".format(categorical_column, categorical_column))
print("This *might* indicate that your anomaly detection method is identifying something beyond random chance related to {}".format(categorical_column))
print("However, a non-significant p-value does *not* mean your anomalies are not valid, it just means there's no statistically significant association with {} based on the Chi-squared test.".format(categorical_column))
print("Always consider domain expertise and further investigation to validate your findings.")


Contingency Table (Anomaly Status vs. industry_code):

industry_code     0  0112  0115  0119  0122  0129  0131  0132  0133  0139  \
anomaly_status                                                              
Anomaly          57     5     1     2     2     1     2     0     2     1   
Non-Anomaly     330    56    14    13    29    22    11    11    12    24   

industry_code   ...  9931  9942  9949  9951  9952  9953  9959  9961  9999  \
anomaly_status  ...                                                         
Anomaly         ...     4     5     5     3     1    21     9     5    16   
Non-Anomaly     ...    33    31    33    13    10   121    55    16    78   

industry_code   other  
anomaly_status         
Anomaly           265  
Non-Anomaly      1666  

[2 rows x 253 columns]

Chi-squared statistic: 310.3269
P-value: 0.0071
Degrees of freedom: 252

Significance level (alpha): 0.05
Reject null hypothesis (p < 0.05).
There is a statistically significant association between Anomaly

In [None]:
common_anomalies_final = list(set(kmeans_anomolies) & set(birch_anomalies)& set(ae_anomolies))
unique_to_strategy_1_final = list(set(kmeans_anomolies) - set(birch_anomalies)-set(ae_anomolies))
unique_to_strategy_2_final = list(set(birch_anomalies) - set(kmeans_anomolies)-set(ae_anomolies))
unique_to_strategy_3_final = list(set(ae_anomolies)-set(birch_anomalies) - set(kmeans_anomolies))
print(f"Common Anomalies: {common_anomalies_ae}")
print(f"Number of Common Anomalies: {len(common_anomalies_final)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_final}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_final)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_final}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_final)}")
print(f"Unique to unique_anomalies_strategy_3 (Strategy 3): {unique_to_strategy_3_final}")
print(f"Number of Unique to Strategy 3: {len(unique_to_strategy_3_final)}")


Common Anomalies: ['SYNCID0000003677', 'SYNCID0000013995', 'SYNCID0000015688', 'SYNCID0000007029', 'SYNCID0000014090', 'SYNCID0000003100', 'SYNCID0000010770', 'SYNCID0000003598', 'SYNCID0000010138', 'SYNCID0000008000', 'SYNCID0000004719', 'SYNCID0000013984', 'SYNCID0000014504', 'SYNCID0000013957', 'SYNCID0000014637', 'SYNCID0000006266', 'SYNCID0000008666', 'SYNCID0000015818', 'SYNCID0000005779', 'SYNCID0000014271', 'SYNCID0000015037', 'SYNCID0000014308', 'SYNCID0000010612', 'SYNCID0000005282', 'SYNCID0000004191', 'SYNCID0000013967', 'SYNCID0000012928', 'SYNCID0000011636', 'SYNCID0000012431', 'SYNCID0000013970', 'SYNCID0000012577', 'SYNCID0000013961', 'SYNCID0000013974', 'SYNCID0000004512', 'SYNCID0000005135', 'SYNCID0000000703', 'SYNCID0000006926', 'SYNCID0000005776', 'SYNCID0000006473', 'SYNCID0000008048', 'SYNCID0000004322', 'SYNCID0000005251', 'SYNCID0000011111', 'SYNCID0000005347', 'SYNCID0000011351', 'SYNCID0000004258', 'SYNCID0000009654', 'SYNCID0000009708', 'SYNCID0000008674', '

In [None]:
final_anomolies=common_anomalies_final+unique_to_strategy_1_final+unique_to_strategy_2_final+unique_to_strategy_3_final

In [None]:
print(len(final_anomolies))

1520


In [None]:
# Create a DataFrame from the list
df_anomalies = pd.DataFrame({'customer_id': final_anomolies})

# Save the DataFrame to a CSV file
df_anomalies.to_csv('final_anomalies.csv', index=False)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans, Birch
from sklearn.ensemble import IsolationForest
from sklearn.metrics import silhouette_score
import numpy as np
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.model_selection import GridSearchCV

import torch
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# Assuming combined_df is already loaded and has a 'customer_id' column
df = combined_df.copy()

# Define the variable groups - using strategy 1 as example
variable_groups = {
    'group1_transaction_counts': [
        'abm_credit_transaction_count', 'abm_debit_transaction_count',
        'card_credit_transaction_count', 'card_debit_transaction_count',
        'eft_credit_transaction_count', 'eft_debit_transaction_count',
        'cheque_credit_transaction_count', 'cheque_debit_transaction_count',
        'emt_credit_transaction_count', 'emt_debit_transaction_count',
        'wire_credit_transaction_count', 'wire_debit_transaction_count'
    ],
    'group2_transaction_amounts': [
        'abm_average_credit', 'abm_average_debit', 'max_abm_credit', 'max_abm_debit', 'min_abm_credit', 'min_abm_debit',
        'card_average_credit', 'card_average_debit', 'max_card_credit', 'max_card_debit', 'min_card_credit', 'min_card_debit',
        'eft_average_credit', 'eft_average_debit', 'max_eft_credit', 'max_eft_debit', 'min_eft_credit', 'min_eft_debit',
        'cheque_average_credit', 'cheque_average_debit', 'max_cheque_credit', 'max_cheque_debit', 'min_cheque_credit', 'min_cheque_debit',
        'emt_average_credit', 'emt_average_debit', 'max_emt_credit', 'max_emt_debit', 'min_emt_credit', 'min_emt_debit',
        'wire_average_credit', 'wire_average_debit', 'max_wire_credit', 'max_wire_debit', 'min_wire_debit'
    ],
    'group3_transaction_timing': [
        'abm_amount_rolling_avg_30d', 'abm_amount_rolling_avg_7d', 'abm_rolling_sum_7d_ratio', 'abm_active_days_ratio',
        'card_amount_rolling_avg_30d', 'card_amount_rolling_avg_7d', 'card_rolling_sum_7d_ratio', 'card_active_days_ratio',
        'eft_amount_rolling_avg_30d', 'eft_amount_rolling_avg_7d', 'eft_rolling_sum_7d_ratio', 'eft_active_days_ratio',
        'cheque_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_7d', 'cheque_rolling_sum_7d_ratio', 'cheque_active_days_ratio',
        'emt_amount_rolling_avg_30d', 'emt_amount_rolling_avg_7d', 'emt_rolling_sum_7d_ratio', 'emt_active_days_ratio',
        'wire_amount_rolling_avg_30d', 'wire_amount_rolling_avg_7d', 'wire_rolling_sum_7d_ratio', 'wire_active_days_ratio',
        'abm_account_active_days', 'card_account_active_days', 'eft_account_active_days', 'cheque_account_active_days', 'emt_account_active_days', 'wire_account_active_days'
    ],
    'group4_credit_debit_ratios': [
        'abm_credit_debit_ratio', 'card_credit_debit_ratio', 'cheque_credit_debit_ratio', 'eft_credit_debit_ratio', 'emt_credit_debit_ratio', 'wire_credit_debit_ratio',
        'abm_debit_credit_ratio', 'card_debit_credit_ratio', 'cheque_debit_credit_ratio', 'eft_debit_credit_ratio', 'emt_debit_credit_ratio', 'wire_debit_credit_ratio'
    ],
    'group5_overall_amount_averages': [
        'avg_abm_amount', 'avg_card_amount', 'avg_eft_amount', 'avg_cheque_amount', 'avg_emt_amount', 'avg_wire_amount',
        'abm_amount_rolling_avg_30d', 'card_amount_rolling_avg_30d', 'eft_amount_rolling_avg_30d', 'cheque_amount_rolling_avg_30d', 'emt_amount_rolling_avg_30d', 'wire_amount_rolling_avg_30d',
        'abm_amount_rolling_avg_7d', 'card_amount_rolling_avg_7d', 'eft_amount_rolling_avg_7d', 'cheque_amount_rolling_avg_7d', 'emt_amount_rolling_avg_7d', 'wire_amount_rolling_avg_7d'
    ],
    'group7_log_active_days': [
        'log1p_abm_account_active_days', 'log1p_card_account_active_days', 'log1p_cheque_account_active_days',
        'log1p_eft_account_active_days', 'log1p_emt_account_active_days', 'log1p_wire_account_active_days'
    ]
}


In [None]:
class Autoencoder(nn.Module): # Autoencoder class - keep it outside for reusability
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(Autoencoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim), # Added extra hidden layer
            nn.ReLU()
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), # Adjusted to match encoder
            nn.ReLU(),
            nn.Linear(hidden_dim, output_dim), # Output dimension should match input dimension
            nn.ReLU() #added a relu layer here as well
            # Output layer - adjust activation if needed
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded # Return decoded output for reconstruction loss

    def get_embedding(self, x): # Function to get embeddings
        return self.encoder(x)

In [None]:
# --- Function to Get Autoencoder Embeddings ---
def get_autoencoder_embeddings(df, variable_groups):
    all_group_embeddings = {}
    trained_models = {} # Store trained autoencoder models for later embedding extraction

    for group_name, columns in variable_groups.items():
        print(f"\nProcessing group for Autoencoder Embeddings: {group_name}")
        df_group = df[columns + ['customer_id']].copy()

        # Handle missing values
        numeric_columns = df_group.select_dtypes(include=np.number).columns
        df_group[numeric_columns] = df_group[numeric_columns].fillna(df_group[numeric_columns].median())

        scaler = MinMaxScaler()
        data_scaled = pd.DataFrame(scaler.fit_transform(df_group.drop(columns=['customer_id'])), columns=df_group.drop(columns=['customer_id']).columns)
        X_tensor = torch.tensor(data_scaled.values, dtype=torch.float32)

        input_dim = data_scaled.shape[1]
        hidden_dim = int(data_scaled.shape[1]/2)
        output_dim = data_scaled.shape[1]

        model_ae = Autoencoder(input_dim, hidden_dim, output_dim)
        criterion_ae = nn.MSELoss()
        optimizer_ae = torch.optim.Adam(model_ae.parameters(), lr=0.001)

        # Train Autoencoder (same training as before)
        for epoch in range(50):
            for i in range(0, len(X_tensor), 120):
                x = X_tensor[i:i+120]
                reconstructed = model_ae(x)
                loss = criterion_ae(reconstructed, x)
                optimizer_ae.zero_grad()
                loss.backward()
                optimizer_ae.step()

        trained_models[group_name] = model_ae # Store trained model

        # Get embeddings
        with torch.no_grad():
            embeddings_tensor = model_ae.get_embedding(X_tensor) # Use get_embedding function
            embeddings = embeddings_tensor.numpy() # Convert to numpy array

        all_group_embeddings[group_name] = pd.DataFrame(embeddings, index=df_group['customer_id'], columns=[f'emb_{i}' for i in range(hidden_dim)]) # Store embeddings as DataFrame with customer_ids

    return all_group_embeddings, trained_models

In [None]:
# --- Custom Birch Clustering class and silhouette scorer (from previous code, keep it) ---
class BirchClustering(BaseEstimator, ClusterMixin):
    def __init__(self, threshold=0.5, branching_factor=50, n_clusters=None):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters

    def fit(self, X, y=None):
        self.model_ = Birch(threshold=self.threshold,
                            branching_factor=self.branching_factor,
                            n_clusters=self.n_clusters)
        self.model_.fit(X)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

def silhouette_scorer(estimator, X):
    labels = estimator.fit_predict(X)
    if len(set(labels)) < 2:  # Silhouette score requires at least 2 clusters
        return -1  # Assign a low score for invalid clustering
    return silhouette_score(X, labels)


def detect_anomalies_clustering_embeddings(all_group_embeddings, clustering_method='KMeans'): # Generic function for clustering on embeddings
    all_group_anomalies_embedding_cluster = {}

    for group_name, embeddings_df in all_group_embeddings.items():
        print(f"\nProcessing group for {clustering_method} Clustering on Embeddings: {group_name}")
        data_scaled_embeddings = embeddings_df.copy() # Embeddings are already scaled by AE, no need to scale again

        silhouette_scores = []
        cluster_range = range(2, 11) # Reduced cluster range for embeddings (experiment)

        #Added this to bypass the silhouette score computation if only one cluster is detected.
        #This is likely to happen due to Birch being more sensitive to density and potentially finding a single cluster in the embedding space.
        num_clusters_found = len(set(BirchClustering(n_clusters=None).fit_predict(data_scaled_embeddings)))
        if num_clusters_found < 2 and clustering_method == 'Birch':
            print(f"Only 1 cluster found by Birch. Skipping silhouette score computation and using n_clusters=2")
            optimal_k = 2
        else: #Proceed with normal silhouette analysis if more than one cluster is found.
            for k in cluster_range:
                if clustering_method == 'KMeans':
                    model = KMeans(n_clusters=k, n_init=10, random_state=1, max_iter=300) # Reduced n_init and max_iter for speed
                elif clustering_method == 'Birch':
                    model = BirchClustering(n_clusters=k) # Use Birch directly with n_clusters
                else:
                    raise ValueError("Invalid clustering_method. Choose 'KMeans' or 'Birch'.")

                cluster_labels = model.fit_predict(data_scaled_embeddings)
                silhouette_avg = silhouette_score(data_scaled_embeddings, cluster_labels)
                silhouette_scores.append(silhouette_avg)

            optimal_k = cluster_range[np.argmax(silhouette_scores)]
            print(f"  Optimal number of clusters (Embeddings - {clustering_method}): {optimal_k}")

        if clustering_method == 'KMeans':
            model = KMeans(n_clusters=optimal_k, n_init=10, random_state=4, max_iter=300)
        elif clustering_method == 'Birch':
            model = BirchClustering(n_clusters=optimal_k)

        cluster_numbers = model.fit_predict(data_scaled_embeddings)
        embeddings_df['Cluster'] = cluster_numbers # Add cluster labels to embeddings dataframe

        group_anomalies_embedding_cluster = [] # Anomaly list for this group and clustering method
        unique_clusters = embeddings_df['Cluster'].unique()
        for i in unique_clusters:
            cluster_data = embeddings_df[embeddings_df['Cluster'] == i].drop(columns=['Cluster'])
            if cluster_data.empty:
                print(f"  Cluster {i} is empty. Skipping.")
                continue
            clf = IsolationForest(contamination=0.01, random_state=42, n_estimators=100) # Reduced n_estimators for speed
            clf.fit(cluster_data) # Fit IsolationForest on embeddings
            y_pred = clf.predict(cluster_data)
            cluster_anomalies = cluster_data.index[y_pred == -1].tolist() # Get customer IDs from index
            group_anomalies_embedding_cluster.extend(cluster_anomalies)

        all_group_anomalies_embedding_cluster[group_name] = group_anomalies_embedding_cluster
        print(f"Anomalies for {group_name} using {clustering_method} on Embeddings: {len(group_anomalies_embedding_cluster)}")

    return all_group_anomalies_embedding_cluster

In [None]:
# --- 1. Get Autoencoder Embeddings ---
all_group_embeddings_ae, trained_models_ae = get_autoencoder_embeddings(df, variable_groups)

# --- 2. Anomaly Detection using K-Means on Embeddings ---
all_group_anomalies_kmeans_embeddings = detect_anomalies_clustering_embeddings(all_group_embeddings_ae, clustering_method='KMeans')

# --- 3. Anomaly Detection using Birch on Embeddings ---
all_group_anomalies_birch_embeddings = detect_anomalies_clustering_embeddings(all_group_embeddings_ae, clustering_method='Birch')


Processing group for Autoencoder Embeddings: group1_transaction_counts

Processing group for Autoencoder Embeddings: group2_transaction_amounts

Processing group for Autoencoder Embeddings: group3_transaction_timing

Processing group for Autoencoder Embeddings: group4_credit_debit_ratios

Processing group for Autoencoder Embeddings: group5_overall_amount_averages

Processing group for Autoencoder Embeddings: group7_log_active_days

Processing group for KMeans Clustering on Embeddings: group1_transaction_counts
  Optimal number of clusters (Embeddings - KMeans): 2
Anomalies for group1_transaction_counts using KMeans on Embeddings: 163

Processing group for KMeans Clustering on Embeddings: group2_transaction_amounts
  Optimal number of clusters (Embeddings - KMeans): 2
Anomalies for group2_transaction_amounts using KMeans on Embeddings: 164

Processing group for KMeans Clustering on Embeddings: group3_transaction_timing
  Optimal number of clusters (Embeddings - KMeans): 2
Anomalies for



Anomalies for group1_transaction_counts using Birch on Embeddings: 163

Processing group for Birch Clustering on Embeddings: group2_transaction_amounts
Only 1 cluster found by Birch. Skipping silhouette score computation and using n_clusters=2




Anomalies for group2_transaction_amounts using Birch on Embeddings: 163

Processing group for Birch Clustering on Embeddings: group3_transaction_timing
  Optimal number of clusters (Embeddings - Birch): 2
Anomalies for group3_transaction_timing using Birch on Embeddings: 163

Processing group for Birch Clustering on Embeddings: group4_credit_debit_ratios
Only 1 cluster found by Birch. Skipping silhouette score computation and using n_clusters=2




Anomalies for group4_credit_debit_ratios using Birch on Embeddings: 157

Processing group for Birch Clustering on Embeddings: group5_overall_amount_averages
  Optimal number of clusters (Embeddings - Birch): 4
Anomalies for group5_overall_amount_averages using Birch on Embeddings: 164

Processing group for Birch Clustering on Embeddings: group7_log_active_days




  Optimal number of clusters (Embeddings - Birch): 7
Anomalies for group7_log_active_days using Birch on Embeddings: 156


In [None]:
print(len(all_group_anomalies_kmeans_embeddings))


6


In [None]:

# Combine unique values from all_group_anomalies_kmeans_embeddings into a single list
all_anomalies_list_kmeans_embeddings = []
for group_name, anomalies in all_group_anomalies_kmeans_embeddings.items():
    all_anomalies_list_kmeans_embeddings.extend(anomalies)

unique_anomalies_list_kmeans_embeddings = list(set(all_anomalies_list_kmeans_embeddings))
print(f"Total number of unique anomalies across groups: {len(unique_anomalies_list_kmeans_embeddings)}")
print(unique_anomalies_list_kmeans_embeddings)


Total number of unique anomalies across groups: 907
['SYNCID0000017015', 'SYNCID0000009082', 'SYNCID0000001552', 'SYNCID0000014044', 'SYNCID0000005445', 'SYNCID0000003382', 'SYNCID0000011790', 'SYNCID0000016283', 'SYNCID0000009840', 'SYNCID0000017020', 'SYNCID0000015558', 'SYNCID0000010531', 'SYNCID0000002923', 'SYNCID0000014803', 'SYNCID0000001447', 'SYNCID0000004360', 'SYNCID0000004417', 'SYNCID0000001367', 'SYNCID0000007192', 'SYNCID0000005809', 'SYNCID0000001299', 'SYNCID0000016112', 'SYNCID0000001405', 'SYNCID0000007442', 'SYNCID0000010627', 'SYNCID0000004367', 'SYNCID0000003319', 'SYNCID0000008520', 'SYNCID0000010578', 'SYNCID0000008297', 'SYNCID0000003827', 'SYNCID0000001833', 'SYNCID0000015997', 'SYNCID0000011539', 'SYNCID0000001198', 'SYNCID0000011290', 'SYNCID0000013579', 'SYNCID0000008558', 'SYNCID0000013492', 'SYNCID0000015563', 'SYNCID0000015907', 'SYNCID0000003297', 'SYNCID0000007254', 'SYNCID0000001092', 'SYNCID0000008054', 'SYNCID0000016995', 'SYNCID0000010268', 'SYNCID

In [None]:
print(all_group_anomalies_birch_embeddings)

{'group1_transaction_counts': ['SYNCID0000000047', 'SYNCID0000000084', 'SYNCID0000000144', 'SYNCID0000000269', 'SYNCID0000000431', 'SYNCID0000000486', 'SYNCID0000000491', 'SYNCID0000000564', 'SYNCID0000000717', 'SYNCID0000000844', 'SYNCID0000000904', 'SYNCID0000001032', 'SYNCID0000001062', 'SYNCID0000001258', 'SYNCID0000001292', 'SYNCID0000001466', 'SYNCID0000001647', 'SYNCID0000001703', 'SYNCID0000001841', 'SYNCID0000001948', 'SYNCID0000002219', 'SYNCID0000002435', 'SYNCID0000002454', 'SYNCID0000002485', 'SYNCID0000002736', 'SYNCID0000002742', 'SYNCID0000002810', 'SYNCID0000002898', 'SYNCID0000002931', 'SYNCID0000003114', 'SYNCID0000003150', 'SYNCID0000003175', 'SYNCID0000003244', 'SYNCID0000003350', 'SYNCID0000003614', 'SYNCID0000003740', 'SYNCID0000004105', 'SYNCID0000004116', 'SYNCID0000004143', 'SYNCID0000004322', 'SYNCID0000004341', 'SYNCID0000004450', 'SYNCID0000004466', 'SYNCID0000004924', 'SYNCID0000005069', 'SYNCID0000005170', 'SYNCID0000005251', 'SYNCID0000005277', 'SYNCID00

In [None]:
# Combine unique values from all_group_anomalies_kmeans_embeddings into a single list
all_anomalies_list_birch_embeddings = []
for group_name, anomalies in all_group_anomalies_birch_embeddings.items():
    all_anomalies_list_birch_embeddings.extend(anomalies)

unique_anomalies_list_birch_embeddings = list(set(all_anomalies_list_birch_embeddings))
print(f"Total number of unique anomalies across groups: {len(unique_anomalies_list_birch_embeddings)}")
print(unique_anomalies_list_birch_embeddings)

Total number of unique anomalies across groups: 900
['SYNCID0000017015', 'SYNCID0000009082', 'SYNCID0000000871', 'SYNCID0000004635', 'SYNCID0000001552', 'SYNCID0000002910', 'SYNCID0000008330', 'SYNCID0000005445', 'SYNCID0000009840', 'SYNCID0000002736', 'SYNCID0000016283', 'SYNCID0000017020', 'SYNCID0000015558', 'SYNCID0000010531', 'SYNCID0000000629', 'SYNCID0000001447', 'SYNCID0000010530', 'SYNCID0000004360', 'SYNCID0000001367', 'SYNCID0000004417', 'SYNCID0000007192', 'SYNCID0000013200', 'SYNCID0000005809', 'SYNCID0000013890', 'SYNCID0000001405', 'SYNCID0000016112', 'SYNCID0000007442', 'SYNCID0000010627', 'SYNCID0000004367', 'SYNCID0000014271', 'SYNCID0000008520', 'SYNCID0000014308', 'SYNCID0000004200', 'SYNCID0000004903', 'SYNCID0000008297', 'SYNCID0000003827', 'SYNCID0000015997', 'SYNCID0000011539', 'SYNCID0000001198', 'SYNCID0000011290', 'SYNCID0000008558', 'SYNCID0000015563', 'SYNCID0000003297', 'SYNCID0000007254', 'SYNCID0000015907', 'SYNCID0000001092', 'SYNCID0000008054', 'SYNCID

In [None]:
embedding_clustering_anomalies = list(set(unique_anomalies_list_kmeans_embeddings) | set(unique_anomalies_list_birch_embeddings))

In [None]:
len(embedding_clustering_anomalies)

1229

In [None]:
common_anomalies_final_after_embedding = list(set(final_anomolies) & set(embedding_clustering_anomalies))
unique_to_strategy_1_final_after_embedding = list(set(final_anomolies) - set(embedding_clustering_anomalies))
unique_to_strategy_2_final_after_embedding = list(set(embedding_clustering_anomalies) - set(final_anomolies))
print(f"Common Anomalies: {common_anomalies_final_after_embedding}")
print(f"Number of Common Anomalies: {len(common_anomalies_final_after_embedding)}")
print(f"Unique to unique_anomalies (Strategy 1): {unique_to_strategy_1_final_after_embedding}")
print(f"Number of Unique to Strategy 1: {len(unique_to_strategy_1_final_after_embedding)}")
print(f"Unique to unique_anomalies_strategy_2 (Strategy 2): {unique_to_strategy_2_final_after_embedding}")
print(f"Number of Unique to Strategy 2: {len(unique_to_strategy_2_final_after_embedding)}")

Common Anomalies: ['SYNCID0000017015', 'SYNCID0000009082', 'SYNCID0000000871', 'SYNCID0000009840', 'SYNCID0000016283', 'SYNCID0000002736', 'SYNCID0000010531', 'SYNCID0000002923', 'SYNCID0000001367', 'SYNCID0000016112', 'SYNCID0000013890', 'SYNCID0000014271', 'SYNCID0000014308', 'SYNCID0000004200', 'SYNCID0000015997', 'SYNCID0000001198', 'SYNCID0000008558', 'SYNCID0000003297', 'SYNCID0000010268', 'SYNCID0000008512', 'SYNCID0000002778', 'SYNCID0000001132', 'SYNCID0000009654', 'SYNCID0000003142', 'SYNCID0000004034', 'SYNCID0000013414', 'SYNCID0000012019', 'SYNCID0000009803', 'SYNCID0000008206', 'SYNCID0000008588', 'SYNCID0000016583', 'SYNCID0000012956', 'SYNCID0000008526', 'SYNCID0000003114', 'SYNCID0000015019', 'SYNCID0000001703', 'SYNCID0000001590', 'SYNCID0000000624', 'SYNCID0000013134', 'SYNCID0000017139', 'SYNCID0000003826', 'SYNCID0000016855', 'SYNCID0000000617', 'SYNCID0000007458', 'SYNCID0000013237', 'SYNCID0000002454', 'SYNCID0000016728', 'SYNCID0000006008', 'SYNCID0000005508', '

In [None]:
anamolies_after_embedding=common_anomalies_final_after_embedding+unique_to_strategy_1_final_after_embedding+unique_to_strategy_2_final_after_embedding

In [None]:
import pandas as pd

# Assuming anamolies_after_embedding is your list of anomaly IDs
anamolies_after_embedding_df = pd.DataFrame(anamolies_after_embedding, columns=['customer_id'])
# Create a DataFrame with the list

anamolies_after_embedding_df.to_csv('final_anomalies_after_embedding.csv', index=False)
# Save the DataFrame to CSV

In [None]:
# prompt: find the top industry_code counts

# Assuming 'combined_df' is already loaded and has an 'industry_code' column.
top_industry_codes = combined_df['industry_code'].value_counts()
print(top_industry_codes)


industry_code
other    1931
7215     1099
7771      768
4561      556
7292      517
         ... 
5319        9
3192        9
3741        9
6032        9
6223        9
Name: count, Length: 253, dtype: int64


In [28]:
# Load the CSV file into a DataFrame
df = pd.read_csv('final_anomalies_after_embedding.csv')

# Convert the 'customer_id' column to strings
df['customer_id'] = df['customer_id'].astype(str)

# Save the DataFrame to a TXT file without the header and index
df.to_csv('task_1_cluster_output.txt', header=False, index=False)


In [29]:


# Determine output directory
OUTPUT_DIR = '/mnt/output' if os.path.exists('/mnt/output') else '.'
print(f"Using output directory: {OUTPUT_DIR}")

# Create the output directory if it doesn't exist (important for local runs)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Construct the full output file path
output_path = os.path.join(OUTPUT_DIR, "task_1_cluster_output.txt")


Using output directory: .


In [None]:
# prompt: filter customer_id's that only have final_anomalies_after_embedding in combined_df and sort the rwos based on the values counts of the industry. print the top five industry.

import pandas as pd

# Load the final anomalies after embedding
final_anomalies_after_embedding_df = pd.read_csv('final_anomalies_after_embedding.csv')

# Filter combined_df to include only customer IDs present in final_anomalies_after_embedding
filtered_df = combined_df[combined_df['customer_id'].isin(final_anomalies_after_embedding_df['customer_id'])]

# Calculate industry value counts and get the top five
top_five_industries = filtered_df['industry_code'].value_counts()
print(top_five_industries)


industry_code
other    285
7215     140
7771     118
4561      93
7761      78
        ... 
2831       1
4259       1
3999       1
6232       1
8654       1
Name: count, Length: 233, dtype: int64


In [None]:
# prompt: divide top_five_industries by  top_industry_codes when industry code is same

# Merge top_five_industries and top_industry_codes based on index (industry_code)
merged_industries = pd.merge(top_five_industries, top_industry_codes, left_index=True, right_index=True, how='left')

# Rename columns for clarity
merged_industries.columns = ['top_five_industries', 'top_industry_codes']

# Divide top_five_industries by top_industry_codes
merged_industries['ratio'] = merged_industries['top_five_industries'] / merged_industries['top_industry_codes']

print(merged_industries.sort_values(by=['ratio']))

               top_five_industries  top_industry_codes     ratio
industry_code                                                   
0122                             1                  31  0.032258
6342                             1                  28  0.035714
0139                             1                  25  0.040000
8654                             1                  24  0.041667
9611                             2                  44  0.045455
...                            ...                 ...       ...
9951                             5                  16  0.312500
6561                             7                  19  0.368421
4569                             9                  23  0.391304
4236                             5                  12  0.416667
5231                             5                  11  0.454545

[233 rows x 3 columns]


In [36]:
import pandas as pd
import numpy as np

# --- Data Loading (ADAPT PATHS) ---
abm_df = pd.read_csv("abm.csv")
card_df = pd.read_csv("card.csv")
cheque_df = pd.read_csv("cheque.csv")
eft_df = pd.read_csv("eft.csv")
emt_df = pd.read_csv("emt.csv")
wire_df = pd.read_csv("wire.csv")
kyc_df = pd.read_csv("kyc.csv")
industry_codes_df = pd.read_csv("kyc_industry_codes.csv")

In [37]:
# --- Data Cleaning and Consistency ---

# 1. Standardize Column Names
def standardize_column_names(df, prefix=None):
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    if prefix:
        df.columns = [f"{prefix}_{col}" if col not in ['customer_id', 'industry_code'] else col for col in df.columns]  # Keep customer_id and industry_code
    return df

all_dfs = [abm_df, card_df, cheque_df, eft_df, emt_df, wire_df, kyc_df, industry_codes_df]
prefixes = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire', 'kyc', 'industry']

for df, prefix in zip(all_dfs, prefixes):
    standardize_column_names(df, prefix)

In [38]:
# 2. Consistent Data Types and Transaction Type Flags
def ensure_consistent_types(df, id_col, date_cols, amount_col=None, transaction_type=None):
    if id_col in df.columns:
        df[id_col] = df[id_col].astype(str)
    if 'customer_id' in df.columns:
        df['customer_id'] = df['customer_id'].astype(str)

    for col in date_cols:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce')

    if amount_col and amount_col in df.columns:
        df[amount_col] = pd.to_numeric(df[amount_col], errors='coerce').fillna(0)

    # Add transaction type flag
    if transaction_type:
        df[f'is_{transaction_type}'] = 1
    return df

transaction_dfs = [abm_df, card_df, cheque_df, eft_df, emt_df, wire_df]
transaction_id_cols = ['abm_abm_id', 'card_card_trxn_id', 'cheque_cheque_id', 'eft_eft_id', 'emt_emt_id', 'wire_wire_id']
transaction_date_cols = ['abm_transaction_date','card_transaction_date','cheque_transaction_date','eft_transaction_date','emt_transaction_date','wire_transaction_date']  # Use prefixed names
amount_col_names = ['abm_amount_cad', 'card_amount_cad','cheque_amount_cad','eft_amount_cad', 'emt_amount_cad', 'wire_amount_cad']
transaction_types = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

for df, id_col, amount_col, txn_type, date_col in zip(transaction_dfs, transaction_id_cols, amount_col_names, transaction_types, transaction_date_cols):
    ensure_consistent_types(df, id_col, [date_col], amount_col, txn_type) #pass date_col as a list
    if amount_col in df.columns:  # Consistent amount column name
        df.rename(columns={amount_col: f'{txn_type}_transaction_amount'}, inplace=True)

# Ensure all transaction dataframes have the same columns for merging
for df in transaction_dfs:
    for txn_type in transaction_types:
        if f'is_{txn_type}' not in df.columns:
            df[f'is_{txn_type}'] = 0

ensure_consistent_types(kyc_df, 'customer_id', ['kyc_established_date', 'kyc_onboard_date'])
ensure_consistent_types(industry_codes_df, 'industry_code', [])



Unnamed: 0,industry_code,industry_industry
0,112,Cattle Farms
1,115,Sheep and Goat Farms
2,119,Livestock Combination Farms
3,122,Horse and Other Equine Farms
4,129,Other Animal Specialty Farms n.e.c.
...,...,...
246,9952,Window Cleaning Services
247,9953,Janitorial Services
248,9959,Other Services to Buildings and Dwellings
249,9961,Ticket and Travel Agencies


In [39]:
# --- Data Merging ---

# 1. Concatenate *after* standardizing
merged_transactions = pd.concat(transaction_dfs, ignore_index=True)

# 2. Merge with KYC
merged_df = pd.merge(merged_transactions, kyc_df, on='customer_id', how='left')

# 3. Merge with Industry Codes
merged_df = pd.merge(merged_df, industry_codes_df, on='industry_code', how='left')

print(merged_df.info())

import pandas as pd
# read detected unusual customers
unusual_cust = pd.read_csv('final_anomalies_after_embedding.csv')

unusual_cust = unusual_cust.iloc[0:200]
unusual_cust.set_index('customer_id', inplace=True)
unusual_cust

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494097 entries, 0 to 1494096
Data columns (total 54 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   abm_abm_id                 20657 non-null    object        
 1   customer_id                1494097 non-null  object        
 2   abm_transaction_amount     20657 non-null    float64       
 3   abm_debit_credit           20657 non-null    object        
 4   abm_cash_indicator         20657 non-null    object        
 5   abm_country                18867 non-null    object        
 6   abm_province               12059 non-null    object        
 7   abm_city                   18867 non-null    object        
 8   abm_transaction_date       20657 non-null    datetime64[ns]
 9   abm_transaction_time       20657 non-null    object        
 10  is_abm                     1494097 non-null  int64         
 11  is_card                    1494097 no

SYNCID0000004455
SYNCID0000014641
SYNCID0000005724
SYNCID0000012431
SYNCID0000004467
...
SYNCID0000013800
SYNCID0000009587
SYNCID0000007785
SYNCID0000002602
SYNCID0000008579


In [40]:
# import pandas as pd
# import numpy as np

# # --- Data Loading (ADAPT PATHS) ---
# abm_df = pd.read_csv("abm.csv")
# card_df = pd.read_csv("card.csv")
# cheque_df = pd.read_csv("cheque.csv")
# eft_df = pd.read_csv("eft.csv")
# emt_df = pd.read_csv("emt.csv")
# wire_df = pd.read_csv("wire.csv")
# kyc_df = pd.read_csv("kyc.csv")
# industry_codes_df = pd.read_csv("kyc_industry_codes.csv")

# # --- Data Cleaning and Consistency ---

# # 1. Standardize Column Names
# def standardize_column_names(df, prefix=None):
#     df.columns = df.columns.str.lower().str.replace(' ', '_')
#     if prefix:
#         df.columns = [f"{prefix}_{col}" if col not in ['customer_id', 'industry_code'] else col for col in df.columns]  # Keep customer_id and industry_code
#     return df

# all_dfs = [abm_df, card_df, cheque_df, eft_df, emt_df, wire_df, kyc_df, industry_codes_df]
# prefixes = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire', 'kyc', 'industry']

# for df, prefix in zip(all_dfs, prefixes):
#     standardize_column_names(df, prefix)

# # 2. Consistent Data Types and Transaction Type Flags
# def ensure_consistent_types(df, id_col, date_cols, amount_col=None, transaction_type=None):
#     if id_col in df.columns:
#         df[id_col] = df[id_col].astype(str)
#     if 'customer_id' in df.columns:
#         df['customer_id'] = df['customer_id'].astype(str)

#     for col in date_cols:
#         if col in df.columns:
#             df[col] = pd.to_datetime(df[col], errors='coerce')

#     if amount_col and amount_col in df.columns:
#         df[amount_col] = pd.to_numeric(df[amount_col], errors='coerce').fillna(0)
#     # Add transaction type flag
#     if transaction_type:
#         df[f'is_{transaction_type}'] = 1
#     return df

# transaction_dfs = [abm_df, card_df, cheque_df, eft_df, emt_df, wire_df]
# transaction_id_cols = ['abm_abm_id', 'card_card_trxn_id', 'cheque_cheque_id', 'eft_eft_id', 'emt_emt_id', 'wire_wire_id']
# transaction_date_cols = ['abm_transaction_date','card_transaction_date','cheque_transaction_date','eft_transaction_date','emt_transaction_date','wire_transaction_date']  # Use prefixed names
# amount_col_names = ['abm_amount_cad', 'card_amount_cad','cheque_amount_cad','eft_amount_cad', 'emt_amount_cad', 'wire_amount_cad']
# transaction_types = ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']

# for df, id_col, amount_col, txn_type, date_col in zip(transaction_dfs, transaction_id_cols, amount_col_names, transaction_types, transaction_date_cols):
#     ensure_consistent_types(df, id_col, [date_col], amount_col, txn_type) #pass date_col as a list
#     if amount_col in df.columns:  # Consistent amount column name
#         df.rename(columns={amount_col: f'{txn_type}_transaction_amount'}, inplace=True)


# ensure_consistent_types(kyc_df, 'customer_id', ['kyc_established_date', 'kyc_onboard_date'])
# ensure_consistent_types(industry_codes_df, 'industry_code', [])



# # --- Data Merging ---

# # 1. Concatenate *after* standardizing
# merged_transactions = pd.concat(transaction_dfs, ignore_index=True)

# # 2. Merge with KYC
# merged_df = pd.merge(merged_transactions, kyc_df, on='customer_id', how='left')

# # 3. Merge with Industry Codes
# merged_df = pd.merge(merged_df, industry_codes_df, on='industry_code', how='left')

# print(merged_df.info())

# --- Feature Engineering ---

# 1. Aggregate Transaction Features (for each transaction type)
for txn_type in transaction_types:
    amount_col = f'{txn_type}_transaction_amount'
    if amount_col in merged_df.columns:  # Check if the column exists
        customer_agg = merged_df.groupby('customer_id')[amount_col].agg(
            ['sum', 'mean', 'count', 'max', 'min', 'std']
        ).reset_index()
        customer_agg.columns = [
            'customer_id',
            f'total_{txn_type}_amount',
            f'avg_{txn_type}_amount',
            f'{txn_type}_count',
            f'max_{txn_type}_amount',
            f'min_{txn_type}_amount',
            f'std_{txn_type}_amount'
        ]
        merged_df = pd.merge(merged_df, customer_agg, on='customer_id', how='left')

# Fill NaN values resulting from aggregation with 0 (where std might be NaN)
for col in merged_df.columns:
    if 'std_' in col:
         merged_df[col] = merged_df[col].fillna(0)

print(merged_df.info())

# 2. Time-Based Features (for each transaction type)

# Find the global maximum date across all transaction types
global_max_date = pd.NaT  # Initialize as Not a Time
for date_col in transaction_date_cols:
    if date_col in merged_df.columns:
        max_date = merged_df[date_col].max()
        if pd.isna(global_max_date) or max_date > global_max_date:
            global_max_date = max_date

for txn_type, date_col in zip(transaction_types, transaction_date_cols):
    if date_col in merged_df.columns:
        # Days since last transaction (for this type)
        merged_df[f'days_since_last_{txn_type}'] = (merged_df.groupby('customer_id')[date_col].transform('max') - merged_df[date_col]).dt.days
        merged_df[f'days_since_last_{txn_type}'] = merged_df[f'days_since_last_{txn_type}'].fillna(0)

        # Transactions per month (for this type)  Need count from aggregation step.
        count_col = f'{txn_type}_count'
        if count_col in merged_df.columns:
            # Account active months (using global max date, and kyc onboard date)
              merged_df['account_active_months'] = (global_max_date - merged_df['kyc_onboard_date']).dt.days / 30.44
              merged_df['account_active_months'] = merged_df['account_active_months'].fillna(merged_df['account_active_months'].median()).clip(lower=0.5) # Replace 0 and NaN with 0.5
              merged_df[f'transactions_per_month_{txn_type}'] = merged_df[count_col] / merged_df['account_active_months']
              merged_df[f'transactions_per_month_{txn_type}'] = merged_df[f'transactions_per_month_{txn_type}'].fillna(0)

#Account age days
merged_df['account_age_days'] = (global_max_date - merged_df['kyc_onboard_date']).dt.days
merged_df['account_age_days'] = merged_df['account_age_days'].fillna(merged_df['account_age_days'].median())

merged_df

# 3. Debit/Credit Ratio (for each transaction type)
for txn_type in transaction_types:
    debit_credit_col = f'{txn_type}_debit_credit'
    amount_col = f'{txn_type}_transaction_amount'

    if debit_credit_col in merged_df.columns and amount_col in merged_df.columns:
        # Ensure consistent values ('debit' and 'credit')
        merged_df[debit_credit_col] = merged_df[debit_credit_col].str.lower()

        # Aggregate by customer and debit/credit, summing the transaction amounts
        debit_credit_agg = merged_df.groupby(['customer_id', debit_credit_col])[amount_col].sum().unstack(fill_value=0)

        # Calculate the debit/credit ratio, handling cases where 'debit' or 'credit' might be missing
        if 'debit' in debit_credit_agg.columns and 'credit' in debit_credit_agg.columns:
            debit_credit_agg[f'{txn_type}_debit_credit_ratio'] = debit_credit_agg['debit'] / (debit_credit_agg['credit'] + 1e-9)
        else:
             debit_credit_agg[f'{txn_type}_debit_credit_ratio'] = 0  # Default value if either is missing

        debit_credit_agg = debit_credit_agg[[f'{txn_type}_debit_credit_ratio']].reset_index() # Keep only the ratio
        merged_df = pd.merge(merged_df, debit_credit_agg, on='customer_id', how='left') #Merge back to main df
    else:
      merged_df[f'{txn_type}_debit_credit_ratio'] = 0

merged_df

# 3b. Credit/Debit Ratio (for each transaction type) - NEW CODE
for txn_type in transaction_types:
    debit_credit_col = f'{txn_type}_debit_credit'  # Same column as before
    amount_col = f'{txn_type}_transaction_amount'    # Same column as before

    if debit_credit_col in merged_df.columns and amount_col in merged_df.columns:
        # No need to lowercase again; it's done in the debit/credit ratio section
        credit_debit_agg = merged_df.groupby(['customer_id', debit_credit_col])[amount_col].sum().unstack(fill_value=0)

        # Calculate credit/debit ratio.  Key change: credit / (debit + 1e-9)
        if 'credit' in credit_debit_agg.columns and 'debit' in credit_debit_agg.columns:
            credit_debit_agg[f'{txn_type}_credit_debit_ratio'] = credit_debit_agg['credit'] / (credit_debit_agg['debit'] + 1e-9)
        else:
            credit_debit_agg[f'{txn_type}_credit_debit_ratio'] = 0  # Default if either is missing

        credit_debit_agg = credit_debit_agg[[f'{txn_type}_credit_debit_ratio']].reset_index()
        merged_df = pd.merge(merged_df, credit_debit_agg, on='customer_id', how='left')
    else:
        merged_df[f'{txn_type}_credit_debit_ratio'] = 0

merged_df

import pandas as pd
import numpy as np

# --- Industry Riskiness (based on Credit/Debit Ratio) ---

industry_risk_mapping_cd = {
    'HIGH': 0.7,
    'MEDIUM': 0.4,
    'LOW': 0.1,
    'UNKNOWN': 0.2
}

merged_df['industry_code'] = merged_df['industry_code'].fillna('MISSING')

for txn_type in transaction_types:
    ratio_col = f'{txn_type}_credit_debit_ratio'  # Use credit/debit ratio
    if ratio_col in merged_df.columns:
        # 1. Calculate industry stats OUTSIDE apply for efficiency
        industry_stats_cd = merged_df.groupby('industry_code')[ratio_col].agg(['mean', 'std']).fillna(0)

        # Rename columns for easier access
        industry_stats_cd.columns = ['mean_ratio', 'std_ratio']

        # 2. Merge industry stats into merged_df for vectorized operations
        merged_df = pd.merge(merged_df, industry_stats_cd, left_on='industry_code', right_index=True, how='left', suffixes=('', '_industry'))

        # 3. Vectorized risk calculation
        mean_col = 'mean_ratio'
        std_col = 'std_ratio'
        risk_col_name = f'{txn_type}_industry_risk_cd'

        # Initialize risk column with 'UNKNOWN' as default (vectorized)
        merged_df[risk_col_name] = 'UNKNOWN'

        # Vectorized conditions and assignment using boolean indexing and .loc
        condition_high = merged_df[ratio_col] > (merged_df[mean_col] + 2 * merged_df[std_col])
        condition_medium = (merged_df[ratio_col] > (merged_df[mean_col] + merged_df[std_col])) & (~condition_high) # Medium but not High
        condition_low = ~(condition_high | condition_medium) # Not High and Not Medium, so Low (or already UNKNOWN if industry_code was missing originally)

        merged_df.loc[condition_high, risk_col_name] = 'HIGH'
        merged_df.loc[condition_medium, risk_col_name] = 'MEDIUM'
        merged_df.loc[condition_low & (merged_df[risk_col_name] == 'UNKNOWN'), risk_col_name] = 'LOW' # Only set to LOW if it was UNKNOWN initially, otherwise it remains UNKNOWN if industry code was missing

        # 4. Vectorized risk score mapping
        score_col_name = f'{txn_type}_industry_risk_score_cd'
        merged_df[score_col_name] = merged_df[risk_col_name].map(industry_risk_mapping_cd).fillna(industry_risk_mapping_cd['UNKNOWN'])

        # 5. Remove merged stats columns (optional cleanup - if you don't need mean_ratio and std_ratio in the final df)
        # merged_df.drop(columns=['mean_ratio', 'std_ratio'], inplace=True)


print(merged_df.head()) # Print head to verify, remove for full dataset processing
print(merged_df.info()) # Print info to verify data types, remove for full dataset processing

import pandas as pd
import numpy as np

# --- Industry Riskiness (based on Debit/Credit Ratio) ---
industry_risk_mapping_dc = {
    'HIGH': 0.8,
    'MEDIUM': 0.5,
    'LOW': 0.2,
    'UNKNOWN': 0.3
}

merged_df['industry_code'] = merged_df['industry_code'].fillna('MISSING')

for txn_type in transaction_types:
    ratio_col = f'{txn_type}_debit_credit_ratio'  # Use debit/credit ratio
    if ratio_col in merged_df.columns:
        # 1. Calculate industry stats OUTSIDE apply for efficiency
        industry_stats_dc = merged_df.groupby('industry_code')[ratio_col].agg(['mean', 'std']).fillna(0)

        # Rename columns for easier access
        industry_stats_dc.columns = ['mean_ratio', 'std_ratio']

        # 2. Merge industry stats into merged_df for vectorized operations
        merged_df = pd.merge(merged_df, industry_stats_dc, left_on='industry_code', right_index=True, how='left', suffixes=('', '_industry_dc')) # Added suffix to distinguish

        # 3. Vectorized risk calculation
        mean_col = 'mean_ratio_industry_dc' # Use suffixed column names
        std_col = 'std_ratio_industry_dc'  # Use suffixed column names
        risk_col_name = f'{txn_type}_industry_risk_dc'

        # Initialize risk column with 'UNKNOWN' as default (vectorized)
        merged_df[risk_col_name] = 'UNKNOWN'

        # Vectorized conditions and assignment using boolean indexing and .loc
        condition_high = merged_df[ratio_col] > (merged_df[mean_col] + 2 * merged_df[std_col])
        condition_medium = (merged_df[ratio_col] > (merged_df[mean_col] + merged_df[std_col])) & (~condition_high) # Medium but not High
        condition_low = ~(condition_high | condition_medium) # Not High and Not Medium, so Low (or already UNKNOWN if industry_code was missing originally)

        merged_df.loc[condition_high, risk_col_name] = 'HIGH'
        merged_df.loc[condition_medium, risk_col_name] = 'MEDIUM'
        merged_df.loc[condition_low & (merged_df[risk_col_name] == 'UNKNOWN'), risk_col_name] = 'LOW' # Only set to LOW if it was UNKNOWN initially, otherwise it remains UNKNOWN if industry code was missing

        # 4. Vectorized risk score mapping
        score_col_name = f'{txn_type}_industry_risk_score_dc'
        merged_df[score_col_name] = merged_df[risk_col_name].map(industry_risk_mapping_dc).fillna(industry_risk_mapping_dc['UNKNOWN'])

        # 5. Remove merged stats columns (optional cleanup)
        merged_df.drop(columns=['mean_ratio_industry_dc', 'std_ratio_industry_dc'], inplace=True) # Dropping suffixed columns


print(merged_df.head()) # Print head to verify, remove for full dataset processing
print(merged_df.info()) # Print info to verify data types, remove for full dataset processing

# 5. Location Risk (Example)
high_risk_countries = ['other', 'GB','SE','IE', 'US']
merged_df['location_risk_score'] = 0
if 'abm_country' in merged_df:
  merged_df.loc[merged_df['abm_country'].isin(high_risk_countries), 'location_risk_score'] = 0.7
if 'card_country' in merged_df:
  merged_df.loc[merged_df['card_country'].isin(high_risk_countries), 'location_risk_score'] = 0.7

# 6. KYC Data Features
if 'established_date' in merged_df:
    merged_df['days_since_established'] = (merged_df['transaction_date'].max() - merged_df['established_date']).dt.days
    merged_df['days_since_established'] = merged_df['days_since_established'].fillna(merged_df['days_since_established'].median())
else:
    merged_df['days_since_established'] = 0

# --- Handling Missing Values ---
for col in merged_df.select_dtypes(include=np.number).columns:
    merged_df[col] = merged_df[col].fillna(merged_df[col].median())
# for col in merged_df.select_dtypes(include='object').columns:
#     merged_df[col] = merged_df[col].fillna('UNKNOWN')

print(merged_df.head())
print(merged_df.info())
print(merged_df.isnull().sum())

final_df = merged_df


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1494097 entries, 0 to 1494096
Data columns (total 90 columns):
 #   Column                     Non-Null Count    Dtype         
---  ------                     --------------    -----         
 0   abm_abm_id                 20657 non-null    object        
 1   customer_id                1494097 non-null  object        
 2   abm_transaction_amount     20657 non-null    float64       
 3   abm_debit_credit           20657 non-null    object        
 4   abm_cash_indicator         20657 non-null    object        
 5   abm_country                18867 non-null    object        
 6   abm_province               12059 non-null    object        
 7   abm_city                   18867 non-null    object        
 8   abm_transaction_date       20657 non-null    datetime64[ns]
 9   abm_transaction_time       20657 non-null    object        
 10  is_abm                     1494097 non-null  int64         
 11  is_card                    1494097 no

  merged_df.loc[merged_df['abm_country'].isin(high_risk_countries), 'location_risk_score'] = 0.7


             abm_abm_id       customer_id  abm_transaction_amount  \
0  ABM00000000000000006  SYNCID0000000014                   25.41   
1  ABM00000000000000008  SYNCID0000000034                  238.17   
2  ABM00000000000000009  SYNCID0000000034                 1655.43   
3  ABM00000000000000010  SYNCID0000000034                  620.69   
4  ABM00000000000000011  SYNCID0000000034                  323.70   

  abm_debit_credit abm_cash_indicator abm_country abm_province abm_city  \
0           credit              False          CA           SK   REGINA   
1            debit               True          CA           ON  TORONTO   
2           credit              False          CA          NaN    other   
3           credit               True          CA          NaN    other   
4            debit               True          CA          NaN    other   

  abm_transaction_date abm_transaction_time  ...  cheque_industry_risk_dc  \
0           2022-11-16             17:37:41  ...         

In [9]:
final_df.to_csv("merged_data.csv")

In [41]:
transaction_columns = [col for col in merged_df.columns if col.endswith('_transaction_amount')]

industry_columns = [col for col in merged_df.columns if col.endswith('_industry_risk_score_dc')]

location_columns = [col for col in merged_df.columns if (col.endswith('_country') or col.endswith('_province')  or col.endswith('_city'))]

dummy_cols = [col for col in merged_df.columns if col.startswith('is_')]

id_cols = [col for col in merged_df.columns if col.endswith('_id')]

id_cols.remove('customer_id')

date_cols = [col for col in merged_df.columns if col.endswith('_date')]

debit_credit_cols = [col for col in merged_df.columns if col.endswith('_debit_credit')]

timing_cols = [col for col in merged_df.columns if col.endswith('_transaction_time')]

days_since_cols = [col for col in merged_df.columns if col.startswith('days_since_last_')]


merged_df.drop(transaction_columns, inplace = True,axis =1)

merged_df.drop(days_since_cols, inplace = True,axis =1)

merged_df.drop(location_columns, inplace = True,axis =1)

merged_df.drop(dummy_cols, inplace = True,axis =1)

merged_df.drop(id_cols, inplace = True,axis =1)

merged_df.drop(date_cols, inplace = True,axis =1)

merged_df.drop(debit_credit_cols, inplace = True,axis =1)

merged_df.drop(timing_cols, inplace = True,axis =1)

merged_df.drop(['card_merchant_category','abm_cash_indicator'],inplace=True,axis=1)

merged_df.drop('location_risk_score', inplace=True, axis=1)

merged_df.drop('card_ecommerce_ind', inplace=True, axis=1)

merged_df.drop('industry_industry', inplace=True, axis=1)

merged_df = merged_df.drop_duplicates()

merged_df.shape


(16226, 97)

In [10]:
first_rows_df = merged_df

        customer_id abm_abm_id  abm_transaction_amount abm_debit_credit  \
0  SYNCID0000000000       None                  523.41             None   
1  SYNCID0000000001       None                  523.41             None   
2  SYNCID0000000002       None                  523.41             None   
3  SYNCID0000000004       None                  523.41             None   
4  SYNCID0000000005       None                  523.41             None   

  abm_cash_indicator abm_country abm_province abm_city abm_transaction_date  \
0               None        None         None     None                  NaT   
1               None        None         None     None                  NaT   
2               None        None         None     None                  NaT   
3               None        None         None     None                  NaT   
4               None        None         None     None                  NaT   

  abm_transaction_time  ...  cheque_industry_risk_dc  \
0                 

In [16]:
print(first_rows_df)

            customer_id abm_abm_id  abm_transaction_amount abm_debit_credit  \
0      SYNCID0000000000       None                  523.41             None   
1      SYNCID0000000001       None                  523.41             None   
2      SYNCID0000000002       None                  523.41             None   
3      SYNCID0000000004       None                  523.41             None   
4      SYNCID0000000005       None                  523.41             None   
...                 ...        ...                     ...              ...   
16221  SYNCID0000017178       None                  523.41             None   
16222  SYNCID0000017179       None                  523.41             None   
16223  SYNCID0000017180       None                  523.41             None   
16224  SYNCID0000017181       None                  523.41             None   
16225  SYNCID0000017182       None                  523.41             None   

      abm_cash_indicator abm_country abm_province a

In [23]:

first_few_rows = first_rows_df.head()

# Save to a txt file (tab-separated)
first_few_rows.to_csv("customer_embeddings.txt", sep='\t', index=False)


In [27]:


# Determine output directory
OUTPUT_DIR = '/mnt/output' if os.path.exists('/mnt/output') else '.'
print(f"Using output directory: {OUTPUT_DIR}")

# Create the output directory if it doesn't exist (important for local runs)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Construct the full output file path
output_path = os.path.join(OUTPUT_DIR, "customer_embeddings.txt")


Using output directory: .


In [18]:
df=first_rows_df

In [20]:
import random

class AML_Env:
    def __init__(self, df, initial_thresholds, workload_range):
        self.df = df
        self.thresholds = initial_thresholds
        self.workload_range = workload_range
        self.state = self.get_initial_state()
        self.action_space = [0, 1, 2, 3, 4, 5]
        self.reset()

    def get_initial_state(self):
        return [
            self.thresholds["cheque_credit_amount"],
            self.thresholds["cheque_debit_amount"],
            self.thresholds["cheque_credit_debit_ratio"],
            sum(self.workload_range) // 2
        ]

    def reset(self):
        self.state = self.get_initial_state()
        print("Environment reset. Initial state:", self.state)
        return self.state

    def step(self, action):
        print(f"Applying action {action}")
        self.apply_action(action)
        flagged_companies = self.run_rule_based_filtering()
        print("Flagged companies count:", len(flagged_companies))
        print("Flagged companies sample:", flagged_companies[['customer_id']].head().to_dict('records'))
        feedback = self.simulate_human_review(flagged_companies)
        print("Human review feedback:", feedback)
        reward = self.calculate_reward(feedback)
        print("Calculated reward:", reward)
        self.state = self.update_state(feedback)
        done = self.is_done()
        return self.state, reward, done, {}

    def apply_action(self, action):
        adjustments = [1.1, 0.9, 1.1, 0.9, 1.1, 0.9]
        keys = ["cheque_credit_amount", "cheque_debit_amount", "cheque_credit_debit_ratio"]
        index = action // 2
        self.thresholds[keys[index]] *= adjustments[action]
        print(f"Updated thresholds: {self.thresholds}")

    def run_rule_based_filtering(self):
        # Access 'cheque_transaction_amount' based on 'cheque_debit_credit' column
        flagged = self.df[
            (self.df['cheque_debit_credit'] == 'credit') & (self.df['cheque_transaction_amount'] > self.thresholds["cheque_credit_amount"]) |
            (self.df['cheque_debit_credit'] == 'debit') & (self.df['cheque_transaction_amount'] > self.thresholds["cheque_debit_amount"]) |
            (self.df['cheque_credit_debit_ratio'] > self.thresholds["cheque_credit_debit_ratio"])
        ]
        return flagged

    def simulate_human_review(self, flagged_companies):
        """Simulates human review, handling missing 'days_since_creation'."""

        if flagged_companies.empty:
            return {"true_positives": [], "false_positives": []}

        analyst_skill = random.choice(["novice", "intermediate", "expert"])
        confirmation_bias = random.uniform(0.0, 0.2)

        # --- Feature Probabilities ---

        # 1. Flagging Score (as before)
        flagged_companies['risk_score'] = (
            (flagged_companies['cheque_transaction_amount'] / self.thresholds["cheque_credit_amount"]) +  # Use 'cheque_transaction_amount' here
            (flagged_companies['cheque_transaction_amount'] / self.thresholds["cheque_debit_amount"]) +    # Use 'cheque_transaction_amount' here
            (flagged_companies['cheque_credit_debit_ratio'] / self.thresholds["cheque_credit_debit_ratio"])
        )
        # 2. New Account (Handle Missing 'days_since_creation')
        if 'days_since_creation' in flagged_companies.columns:
            flagged_companies['new_account_prob'] = 1.0 / (1.0 + flagged_companies['days_since_creation'])
        else:
            flagged_companies['new_account_prob'] = 0.5



        # --- Combine Probabilities ---
        flagged_companies['true_positive_prob'] = (
            0.7 * flagged_companies['risk_score'] +
            0.3 * flagged_companies['new_account_prob']
        )

        if analyst_skill == "novice":
            flagged_companies['true_positive_prob'] *= 1.2  # Novices are less accurate
        elif analyst_skill == "expert":
            flagged_companies['true_positive_prob'] *= 0.8  # Experts are more accurate

        flagged_companies['true_positive_prob'] += confirmation_bias
        flagged_companies['true_positive_prob'] = flagged_companies['true_positive_prob'].clip(0, 1)  # Ensure 0-1 range

        # --- Determine True/False Positives ---
        # Ensure at least 10% are true positives, but don't exceed the flagged count.
        min_true_positives = int(len(flagged_companies) * 0.1)
        num_true_positives = min_true_positives + sum(random.random() < prob for prob in flagged_companies['true_positive_prob'])
        num_true_positives = min(num_true_positives, len(flagged_companies))
        num_true_positives = max(num_true_positives, 0) # Ensure non-negative

        # Get indices of top probabilities, then select customer IDs.
        true_positive_indices = np.argsort(flagged_companies['true_positive_prob'])[::-1][:num_true_positives]
        true_positives = flagged_companies.iloc[true_positive_indices]['customer_id'].tolist()


        false_positives = [
                    cid for cid in flagged_companies['customer_id'].tolist()
                    if cid not in true_positives
                ]

        # Introduce error rate
        error_rate = 0.05
        final_true_positives = []
        final_false_positives = []

        for tp in true_positives:
            if random.random() < error_rate:
                final_false_positives.append(tp)  # Misclassified as FP
            else:
                final_true_positives.append(tp)

        for fp in false_positives:
            if random.random() < error_rate:
                final_true_positives.append(fp)  # Misclassified as TP
            else:
                final_false_positives.append(fp)

        return {"true_positives": final_true_positives, "false_positives": final_false_positives}

    def calculate_reward(self, feedback):
        tp, fp = len(feedback["true_positives"]), len(feedback["false_positives"])
        workload = tp + fp
        reward = tp * 10 - fp * 5

        if workload < self.workload_range[0]:
            reward -= (self.workload_range[0] - workload) * 2  # Penalize low workload
        elif workload > self.workload_range[1]:
            reward -= (workload - self.workload_range[1]) * 2  # Penalize high workload

        return reward

    def update_state(self, feedback):
        workload = len(feedback["true_positives"]) + len(feedback["false_positives"])
        self.state[-1] = workload  # Update workload in the state
        print("Updated state:", self.state)
        return self.state

    def is_done(self):
        return True  # Or define your stopping criteria (e.g., number of steps)

# Initialize thresholds and environment
initial_thresholds = {
    "cheque_credit_amount": 50000,
    "cheque_debit_amount": 50000,
    "cheque_credit_debit_ratio": 5.0
}
workload_range = (5, 50)

env = AML_Env(df, initial_thresholds, workload_range)

# --- Q-Learning ---
q_table = {}  # (state, action) -> Q-value
num_episodes = 100
#alpha = 0.1  # Learning rate
#gamma = 0.9  # Discount factor
#epsilon = 0.1 # Exploration rate

for episode in range(num_episodes):
    state = tuple(env.reset())  # Ensure state is hashable
    done = False

    while not done:
        # Choose action (epsilon-greedy)
        if state in q_table:
            # Exploit: Choose action with highest Q-value
            action = max(q_table[state], key=q_table[state].get, default=random.choice(env.action_space))
        else:
            # Explore: Choose a random action
            action = random.choice(env.action_space)

        print(f"Training Episode {episode} | State: {state} | Taking Action: {action}")

        # Take action and observe the next state, reward, and done signal
        next_state, reward, done, _ = env.step(action)
        next_state = tuple(next_state) # Make next_state hashable

        # Q-table update
        if state not in q_table:
            q_table[state] = {}  # Initialize actions for this state

        # No need for alpha, gamma or epsilon because this a simple Q-table
        q_table[state][action] = reward

        state = next_state

# --- Testing ---
print("\n--- Testing ---")
state = tuple(env.reset())  # Reset environment for testing
done = False
while not done:
    if state in q_table:
        action = max(q_table[state], key=q_table[state].get, default=random.choice(env.action_space)) # Exploit learned policy
    else:
        action = random.choice(env.action_space)  # Fallback to random if state unseen
    print(f"Testing with state {state}, taking action {action}")
    next_state, reward, done, _ = env.step(action)
    print(f"New state: {next_state}, Reward: {reward}, Done: {done}")
    state = tuple(next_state)


Environment reset. Initial state: [50000, 50000, 5.0, 27]
Environment reset. Initial state: [50000, 50000, 5.0, 27]
Training Episode 0 | State: (50000, 50000, 5.0, 27) | Taking Action: 0
Applying action 0
Updated thresholds: {'cheque_credit_amount': 55000.00000000001, 'cheque_debit_amount': 50000, 'cheque_credit_debit_ratio': 5.0}
Flagged companies count: 2189
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005653', 'SYNCID0000005672', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005648', 'SYNCID0000005347', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005615', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'S

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005871', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005766', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005876', 'SYNCID0000005578', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006011', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005743', 'SYNCID0000005742', 'SYNCID0000005585', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005595', 'SYNCID0000005600', 'SYNCID0000005605', 'SYNCID0000005613', 'SYNCID0000005615', 'SYNCID0000005621', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005654', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005653', 'SYNCID0000005358', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005766', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005615', 'SYNCID0000005613', 'SYNCID0000005605', 'SYNCID0000005523', 'SYNCID0000005373', 'SYNCID0000005389', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005465', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005654', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005653', 'SYNCID0000005358', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005766', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005615', 'SYNCID0000005613', 'SYNCID0000005605', 'SYNCID0000005523', 'SYNCID0000005373', 'SYNCID0000005389', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005465', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005493

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000006022', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006011', 'SYNCID0000006027', 'SYNCID0000005857', 'SYNCID0000006046', 'SYNCID0000006055', 'SYNCID0000006059', 'SYNCID0000006062', 'SYNCID0000006115', 'SYNCID0000006122', 'SYNCID0000006129', 'SYNCID0000006134', 'SYNCID0000006139', 'SYNCID0000006144', 'SYNCID0000006146', 'SYNCID0000006149', 'SYNCID0000005866', 'SYNCID0000005853', 'SYNCID0000006167', 'SYNCID0000005727', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005734

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005712', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005766', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005743', 'SYNCID0000005734', 'SYNCID0000005727', 'SYNCID0000005724', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005876', 'SYNCID0000005918', 'SYNCID0000005915', 'SYNCID0000005899', 'SYNCID0000005891', 'SYNCID0000005886', 'SYNCID0000005877', 'SYNCID0000005871', 'SYNCID0000005836', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005721', 'SYNCID0000005705', 'SYNCID0000005956', 'SYNCID0000005684', 'SYNCID0000005591', 'SYNCID0000005587', 'SYNCID0000005585', 'SYNCID0000005584', 'SYNCID0000005578', 'SYNCID0000005575', 'SYNCID0000005571', 'SYNCID0000005570', 'SYNCID0000005564', 'SYNCID0000005557', 'SYNCID0000005548

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005734', 'SYNCID0000005727', 'SYNCID0000005724', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005712', 'SYNCID0000005705', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005742', 'SYNCID0000005744', 'SYNCID0000005435', 'SYNCID0000005751', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005836', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005766', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005653', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005548', 'SYNCID0000005540', 'SYNCID0000005537', 'SYNCID0000005534', 'SYNCID0000005524', 'SYNCID0000005523', 'SYNCID0000005522', 'SYNCID0000005521', 'SYNCID0000005517', 'SYNCID0000005494

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000006062', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006011', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006055', 'SYNCID0000006059', 'SYNCID0000006084', 'SYNCID0000005891', 'SYNCID0000006115', 'SYNCID0000006122', 'SYNCID0000006129', 'SYNCID0000006134', 'SYNCID0000006139', 'SYNCID0000006142', 'SYNCID0000006144', 'SYNCID0000006146', 'SYNCID0000006149', 'SYNCID0000006156', 'SYNCID0000006167', 'SYNCID0000006172', 'SYNCID0000006177', 'SYNCID0000006189', 'SYNCID0000005899', 'SYNCID0000005877', 'SYNCID0000006209', 'SYNCID0000005751', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000006055', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005960', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006011', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006059', 'SYNCID0000005876', 'SYNCID0000006062', 'SYNCID0000006084', 'SYNCID0000006115', 'SYNCID0000006122', 'SYNCID0000006129', 'SYNCID0000006134', 'SYNCID0000006139', 'SYNCID0000006142', 'SYNCID0000006144', 'SYNCID0000006146', 'SYNCID0000006149', 'SYNCID0000006156', 'SYNCID0000006167', 'SYNCID0000006172', 'SYNCID0000005877', 'SYNCID0000005871', 'SYNCID0000006189', 'SYNCID0000005743', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000006129', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006011', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006055', 'SYNCID0000006059', 'SYNCID0000006062', 'SYNCID0000006084', 'SYNCID0000006115', 'SYNCID0000006122', 'SYNCID0000006134', 'SYNCID0000006228', 'SYNCID0000006142', 'SYNCID0000006144', 'SYNCID0000006146', 'SYNCID0000006149', 'SYNCID0000006156', 'SYNCID0000006167', 'SYNCID0000006172', 'SYNCID0000006177', 'SYNCID0000006189', 'SYNCID0000006203', 'SYNCID0000006209', 'SYNCID0000006212', 'SYNCID0000006217', 'SYNCID0000005956', 'SYNCID0000005922', 'SYNCID0000005918', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005766', 'SYNCID0000005771', 'SYNCID0000005773

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005789', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006011', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006055', 'SYNCID0000006059', 'SYNCID0000006062', 'SYNCID0000006084', 'SYNCID0000006115', 'SYNCID0000006122', 'SYNCID0000006129', 'SYNCID0000005605', 'SYNCID0000005778', 'SYNCID0000005621', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005995', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005985', 'SYNCID0000006011', 'SYNCID0000005853', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006055', 'SYNCID0000006059', 'SYNCID0000006062', 'SYNCID0000006084', 'SYNCID0000006115', 'SYNCID0000006122', 'SYNCID0000006129', 'SYNCID0000006134', 'SYNCID0000006139', 'SYNCID0000006142', 'SYNCID0000006144', 'SYNCID0000005857', 'SYNCID0000005848', 'SYNCID0000005723', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000006115', 'SYNCID0000006144', 'SYNCID0000006142', 'SYNCID0000006139', 'SYNCID0000006134', 'SYNCID0000006129', 'SYNCID0000006122', 'SYNCID0000006084', 'SYNCID0000005995', 'SYNCID0000006062', 'SYNCID0000006059', 'SYNCID0000006055', 'SYNCID0000006046', 'SYNCID0000006027', 'SYNCID0000006022', 'SYNCID0000006146', 'SYNCID0000006149', 'SYNCID0000006156', 'SYNCID0000006167', 'SYNCID0000006172', 'SYNCID0000006177', 'SYNCID0000006189', 'SYNCID0000006203', 'SYNCID0000006209', 'SYNCID0000006212', 'SYNCID0000006217', 'SYNCID0000006224', 'SYNCID0000006228', 'SYNCID0000006234', 'SYNCID0000006238', 'SYNCID0000006241', 'SYNCID0000006259', 'SYNCID0000006011', 'SYNCID0000005985', 'SYNCID0000004683', 'SYNCID0000005755', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005751', 'SYNCID0000005967', 'SYNCID0000005744', 'SYNCID0000005743', 'SYNCID0000005742

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000006011', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005836', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005936', 'SYNCID0000005985', 'SYNCID0000005967', 'SYNCID0000005963', 'SYNCID0000005960', 'SYNCID0000005959', 'SYNCID0000005956', 'SYNCID0000005922', 'SYNCID0000005876', 'SYNCID0000005918', 'SYNCID0000005915', 'SYNCID0000005899', 'SYNCID0000005891', 'SYNCID0000005886', 'SYNCID0000005877', 'SYNCID0000005743', 'SYNCID0000005742', 'SYNCID0000005734', 'SYNCID0000005595', 'SYNCID0000005621', 'SYNCID0000005615', 'SYNCID0000005613', 'SYNCID0000005605', 'SYNCID0000005600', 'SYNCID0000005591', 'SYNCID0000005642', 'SYNCID0000005587', 'SYNCID0000005585', 'SYNCID0000005584

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005877', 'SYNCID0000005956', 'SYNCID0000005918', 'SYNCID0000005899', 'SYNCID0000005891', 'SYNCID0000005886', 'SYNCID0000005876', 'SYNCID0000005960', 'SYNCID0000005871', 'SYNCID0000005866', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005959', 'SYNCID0000005963', 'SYNCID0000006142', 'SYNCID0000006059', 'SYNCID0000006134', 'SYNCID0000006129', 'SYNCID0000006122', 'SYNCID0000006115', 'SYNCID0000006084', 'SYNCID0000006062', 'SYNCID0000006055', 'SYNCID0000005967', 'SYNCID0000006046', 'SYNCID0000006027', 'SYNCID0000006022', 'SYNCID0000006011', 'SYNCID0000005995', 'SYNCID0000005985', 'SYNCID0000005841', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005653', 'SYNCID0000005672', 'SYNCID0000005671', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005648', 'SYNCID0000005780', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005629', 'SYNCID0000005621

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005724', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005653', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005723', 'SYNCID0000005727', 'SYNCID0000006331', 'SYNCID0000005734', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005836', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005743', 'SYNCID0000005742', 'SYNCID0000005639', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005615', 'SYNCID0000005534', 'SYNCID0000005524', 'SYNCID0000005523', 'SYNCID0000005522', 'SYNCID0000005521', 'SYNCID0000005517', 'SYNCID0000005494', 'SYNCID0000005493', 'SYNCID0000005471', 'SYNCID0000005465', 'SYNCID0000005457

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005648', 'SYNCID0000005734', 'SYNCID0000005727', 'SYNCID0000005724', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005712', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005807', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005836', 'SYNCID0000005789', 'SYNCID0000005751', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005643', 'SYNCID0000005871', 'SYNCID0000005642', 'SYNCID0000005548', 'SYNCID0000005540', 'SYNCID0000005537', 'SYNCID0000005534', 'SYNCID0000005524', 'SYNCID0000005523', 'SYNCID0000005522', 'SYNCID0000005521', 'SYNCID0000005494', 'SYNCID0000005493', 'SYNCID0000005476

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005836', 'SYNCID0000005866', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005877', 'SYNCID0000005751', 'SYNCID0000005960', 'SYNCID0000006011', 'SYNCID0000005995', 'SYNCID0000005985', 'SYNCID0000005967', 'SYNCID0000005963', 'SYNCID0000005959', 'SYNCID0000005886', 'SYNCID0000005956', 'SYNCID0000005922', 'SYNCID0000005918', 'SYNCID0000005915', 'SYNCID0000005899', 'SYNCID0000005755', 'SYNCID0000006027', 'SYNCID0000005613', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005615', 'SYNCID0000005605', 'SYNCID0000005653', 'SYNCID0000005600', 'SYNCID0000005595', 'SYNCID0000005591', 'SYNCID0000005587', 'SYNCID0000005585', 'SYNCID0000005584', 'SYNCID0000005648

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005876', 'SYNCID0000005922', 'SYNCID0000005918', 'SYNCID0000005899', 'SYNCID0000005886', 'SYNCID0000005871', 'SYNCID0000005959', 'SYNCID0000005866', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005956', 'SYNCID0000005960', 'SYNCID0000006142', 'SYNCID0000006055', 'SYNCID0000006129', 'SYNCID0000006122', 'SYNCID0000006115', 'SYNCID0000006062', 'SYNCID0000006059', 'SYNCID0000006046', 'SYNCID0000005963', 'SYNCID0000006027', 'SYNCID0000006022', 'SYNCID0000006011', 'SYNCID0000005995', 'SYNCID0000005985', 'SYNCID0000005967', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005648', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005653', 'SYNCID0000005643', 'SYNCID0000005778', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005613

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005956', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005743', 'SYNCID0000005742', 'SYNCID0000005727', 'SYNCID0000005724', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005836', 'SYNCID0000005877', 'SYNCID0000005922', 'SYNCID0000005918', 'SYNCID0000005915', 'SYNCID0000005899', 'SYNCID0000005891', 'SYNCID0000005886', 'SYNCID0000005876', 'SYNCID0000005841', 'SYNCID0000005871', 'SYNCID0000005866', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005721', 'SYNCID0000005712', 'SYNCID0000005571', 'SYNCID0000005591', 'SYNCID0000005587', 'SYNCID0000005585', 'SYNCID0000005584', 'SYNCID0000005578', 'SYNCID0000005575', 'SYNCID0000005570', 'SYNCID0000005600', 'SYNCID0000005564', 'SYNCID0000005557', 'SYNCID0000005548

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005848', 'SYNCID0000005876', 'SYNCID0000005871', 'SYNCID0000005866', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005841', 'SYNCID0000005886', 'SYNCID0000005836', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005756', 'SYNCID0000005967', 'SYNCID0000006027', 'SYNCID0000006022', 'SYNCID0000006018', 'SYNCID0000006011', 'SYNCID0000005995', 'SYNCID0000005985', 'SYNCID0000005963', 'SYNCID0000005899', 'SYNCID0000005960', 'SYNCID0000005959', 'SYNCID0000005956', 'SYNCID0000005922', 'SYNCID0000005918', 'SYNCID0000005915', 'SYNCID0000005771', 'SYNCID0000005755', 'SYNCID0000006055', 'SYNCID0000005615', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005613', 'SYNCID0000005654', 'SYNCID0000005605', 'SYNCID0000005600

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2375
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005727', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005712', 'SYNCID0000005705', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005653', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005724', 'SYNCID0000005734', 'SYNCID0000006338', 'SYNCID0000005742', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005836', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005743', 'SYNCID00000056

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005671', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005712', 'SYNCID0000005705', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005667', 'SYNCID0000005724', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005653', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005723', 'SYNCID0000005727', 'SYNCID0000005639', 'SYNCID0000005773', 'SYNCID0000005841', 'SYNCID0000005836', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005778', 'SYNCID0000005771', 'SYNCID0000005734', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005743', 'SYNCID0000005742', 'SYNCID0000005642', 'SYNCID0000005629', 'SYNCID0000005853', 'SYNCID0000005494', 'SYNCID0000005534', 'SYNCID0000005524', 'SYNCID0000005523', 'SYNCID0000005522', 'SYNCID0000005521', 'SYNCID0000005517', 'SYNCID0000005493', 'SYNCID0000005540', 'SYNCID0000005476', 'SYNCID0000005471

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2302
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005742', 'SYNCID0000005780', 'SYNCID0000005734', 'SYNCID0000005727', 'SYNCID0000005724', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005778', 'SYNCID0000005789', 'SYNCID0000005922', 'SYNCID0000005871', 'SYNCID0000005915', 'SYNCID0000005899', 'SYNCID0000005891', 'SYNCID0000005886', 'SYNCID0000005877', 'SYNCID0000005876', 'SYNCID0000005866', 'SYNCID0000005807', 'SYNCID0000005857', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005836', 'SYNCID0000005705', 'SYNCID0000005684', 'SYNCID00000056

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005727', 'SYNCID0000005755', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005734', 'SYNCID0000005724', 'SYNCID0000005771', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005712', 'SYNCID0000005705', 'SYNCID0000005684', 'SYNCID0000005756', 'SYNCID0000005773', 'SYNCID0000005671', 'SYNCID0000005854', 'SYNCID0000005886', 'SYNCID0000005877', 'SYNCID0000005876', 'SYNCID0000005871', 'SYNCID0000005866', 'SYNCID0000005857', 'SYNCID0000005853', 'SYNCID0000005778', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005780', 'SYNCID0000005672', 'SYNCID0000005667', 'SYNCID0000005494', 'SYNCID0000005548', 'SYNCID0000005578', 'SYNCID0000005575', 'SYNCID0000005571', 'SYNCID0000005570', 'SYNCID0000005564', 'SYNCID0000005557', 'SYNCID0000005540', 'SYNCID0000005585', 'SYNCID0000005537', 'SYNCID0000005534', 'SYNCID0000005524', 'SYNCID0000005523', 'SYNCID0000005522

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005684', 'SYNCID0000005724', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005712', 'SYNCID0000005705', 'SYNCID0000005672', 'SYNCID0000005734', 'SYNCID0000005671', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005653', 'SYNCID0000005727', 'SYNCID0000005742', 'SYNCID0000005643', 'SYNCID0000005780', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005807', 'SYNCID0000005789', 'SYNCID0000005778', 'SYNCID0000005743', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005751', 'SYNCID0000005744', 'SYNCID0000005648', 'SYNCID0000005642', 'SYNCID0000005857', 'SYNCID0000005521', 'SYNCID0000005540', 'SYNCID0000005537', 'SYNCID0000005534', 'SYNCID0000005524', 'SYNCID0000005523', 'SYNCID0000005522', 'SYNCID0000005557', 'SYNCID0000005494', 'SYNCID0000005493', 'SYNCID0000005476', 'SYNCID0000005471', 'SYNCID0000005457

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005876', 'SYNCID0000005918', 'SYNCID0000005915', 'SYNCID0000005899', 'SYNCID0000005891', 'SYNCID0000005886', 'SYNCID0000005877', 'SYNCID0000005871', 'SYNCID0000005956', 'SYNCID0000005866', 'SYNCID0000005854', 'SYNCID0000005853', 'SYNCID0000005848', 'SYNCID0000005841', 'SYNCID0000005922', 'SYNCID0000005959', 'SYNCID0000005789', 'SYNCID0000006027', 'SYNCID0000006115', 'SYNCID0000006062', 'SYNCID0000006059', 'SYNCID0000006055', 'SYNCID0000006046', 'SYNCID0000006022', 'SYNCID0000006018', 'SYNCID0000006011', 'SYNCID0000005995', 'SYNCID0000005985', 'SYNCID0000005807', 'SYNCID0000005780', 'SYNCID0000006129', 'SYNCID0000005643', 'SYNCID0000005667', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005653', 'SYNCID0000005648', 'SYNCID0000005642', 'SYNCID0000005672', 'SYNCID0000005639', 'SYNCID0000005629', 'SYNCID0000005621', 'SYNCID0000005615', 'SYNCID0000005613', 'SYNCID0000005605', 'SYNCID0000005671

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2222
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005671', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005705', 'SYNCID0000005684', 'SYNCID0000005672', 'SYNCID0000005854', 'SYNCID0000005656', 'SYNCID0000005655', 'SYNCID0000005654', 'SYNCID0000005653', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005642', 'SYNCID0000005540', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005465', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005493', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005548', 'SYNCID0000005621', 'SYNCID0000005564

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005734', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005742', 'SYNCID0000005866', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548', 'SYNCID0000005564', 'SYNCID0000005570

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005734', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005742', 'SYNCID0000005866', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005648', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2153
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005722', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005723', 'SYNCID0000005721', 'SYNCID0000005963', 'SYNCID0000005705', 'SYNCID0000005548', 'SYNCID0000005564', 'SYNCID00000055

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2159
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005705', 'SYNCID0000005621', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005721', 'SYNCID0000005613', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005615', 'SYNCID0000005605', 'SYNCID0000005346', 'SYNCID0000005523', 'SYNCID00000053

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000005807', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005841', 'SYNCID0000005537', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005723', 'SYNCID0000005722', 'SYNCID0000005721', 'SYNCID0000005705', 'SYNCID0000005564', 'SYNCID0000005570', 'SYNCID0000005575', 'SYNCID0000005578', 'SYNCID0000005584', 'SYNCID0000005585', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005595', 'SYNCID0000005605', 'SYNCID0000005613', 'SYNCID0000005615', 'SYNCID0000005621

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005807', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005841', 'SYNCID0000005722', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005723', 'SYNCID0000005721', 'SYNCID0000005963', 'SYNCID0000005605', 'SYNCID0000005540', 'SYNCID0000005548', 'SYNCID0000005564', 'SYNCID0000005570', 'SYNCID0000005571', 'SYNCID0000005575', 'SYNCID0000005578', 'SYNCID0000005584', 'SYNCID0000005585', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005595', 'SYNCID0000005600', 'SYNCID0000005613', 'SYNCID0000005705

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005853', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005643', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005564', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548', 'SYNCID0000005570

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005639', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005642', 'SYNCID0000005629', 'SYNCID0000006319', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005642', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005643', 'SYNCID0000005639', 'SYNCID0000008465', 'SYNCID0000005629', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005721', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005751', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005705', 'SYNCID0000005960', 'SYNCID0000005684', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548', 'SYNCID0000005564', 'SYNCID0000005570', 'SYNCID0000005571', 'SYNCID0000005575', 'SYNCID0000005578', 'SYNCID0000005584', 'SYNCID0000005585', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005595', 'SYNCID0000005600', 'SYNCID0000005605', 'SYNCID0000005615', 'SYNCID0000005621', 'SYNCID0000005639

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000006084', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006059', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005654', 'SYNCID0000005585', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005595', 'SYNCID0000005600', 'SYNCID0000005605', 'SYNCID0000005613', 'SYNCID0000005615', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005722

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005615', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005621', 'SYNCID0000005613', 'SYNCID0000005605', 'SYNCID0000005373', 'SYNCID0000005389', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005615', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005613', 'SYNCID0000006313', 'SYNCID0000005605', 'SYNCID0000005362', 'SYNCID0000005373', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005615', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005621', 'SYNCID0000005613', 'SYNCID0000006313', 'SYNCID0000005605', 'SYNCID0000005362', 'SYNCID0000005373', 'SYNCID0000005389', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000005918', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005922', 'SYNCID0000006129', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006022', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006055', 'SYNCID0000006059', 'SYNCID0000006062', 'SYNCID0000005778', 'SYNCID0000005773', 'SYNCID0000005771', 'SYNCID0000005756', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005595', 'SYNCID0000005600', 'SYNCID0000005605', 'SYNCID0000005613', 'SYNCID0000005615', 'SYNCID0000005621', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005672', 'SYNCID0000005684

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2027
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}, {'customer_id': 'SYNCID0000000068'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005773', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005877', 'SYNCID0000005891', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005956', 'SYNCID0000005959', 'SYNCID0000005960', 'SYNCID0000005963', 'SYNCID0000005967', 'SYNCID0000005995', 'SYNCID0000006027', 'SYNCID0000006046', 'SYNCID0000006055', 'SYNCID0000006059', 'SYNCID0000006062', 'SYNCID0000005778', 'SYNCID0000005771', 'SYNCID0000008716', 'SYNCID0000005585', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005595', 'SYNCID0000005605', 'SYNCID00000056

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] += confirmation_bias
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = flagged_companies['true_positive_prob'].clip(0, 1)  # Ensure 0-1 range
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score']

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005724', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005727', 'SYNCID0000005639', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005629', 'SYNCID0000005389', 'SYNCID0000005548', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005564', 'SYNCID0000005621', 'SYNCID0000005570', 'SYNCID0000005575

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2022
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}, {'customer_id': 'SYNCID0000000068'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005727', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005734', 'SYNCID0000005642', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005643', 'SYNCID0000005639', 'SYNCID0000006319', 'SYNCID0000005564', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID00000054

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005613', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005734', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005615', 'SYNCID0000005605', 'SYNCID0000005346', 'SYNCID0000005600', 'SYNCID0000005358', 'SYNCID0000005362', 'SYNCID0000005373', 'SYNCID0000005389', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005615', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005756', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005621', 'SYNCID0000005613', 'SYNCID0000005347', 'SYNCID0000005605', 'SYNCID0000005362', 'SYNCID0000005373', 'SYNCID0000005389', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005734', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005742', 'SYNCID0000005648', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005653', 'SYNCID0000005643', 'SYNCID0000005857', 'SYNCID0000005571', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548', 'SYNCID0000005564', 'SYNCID0000005570', 'SYNCID0000005575

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005595', 'SYNCID0000005605', 'SYNCID0000005613', 'SYNCID0000005615', 'SYNCID0000005621', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005591', 'SYNCID0000005771', 'SYNCID0000005587', 'SYNCID0000005342', 'SYNCID0000005346', 'SYNCID0000005347', 'SYNCID0000005358', 'SYNCID0000005362', 'SYNCID0000005373', 'SYNCID0000005389', 'SYNCID0000005391', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005724', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005727', 'SYNCID0000005642', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005807', 'SYNCID0000005643', 'SYNCID0000005639', 'SYNCID0000005848', 'SYNCID0000005564', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548', 'SYNCID0000005570', 'SYNCID0000005629', 'SYNCID0000005571', 'SYNCID0000005578', 'SYNCID0000005584

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000008475', 'SYNCID0000005643', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005684', 'SYNCID0000005705', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005780', 'SYNCID0000005789', 'SYNCID0000005807', 'SYNCID0000005642', 'SYNCID0000005639', 'SYNCID0000005548', 'SYNCID0000005431', 'SYNCID0000005435', 'SYNCID0000005446', 'SYNCID0000005457', 'SYNCID0000005471', 'SYNCID0000005476', 'SYNCID0000005494', 'SYNCID0000005517', 'SYNCID0000005523', 'SYNCID0000005524', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005564', 'SYNCID0000005621', 'SYNCID0000005570

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Flagged companies count: 2143
Flagged companies sample: [{'customer_id': 'SYNCID0000000006'}, {'customer_id': 'SYNCID0000000008'}, {'customer_id': 'SYNCID0000000019'}, {'customer_id': 'SYNCID0000000038'}, {'customer_id': 'SYNCID0000000065'}]
Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005615', 'SYNCID0000005621', 'SYNCID0000005629', 'SYNCID0000005639', 'SYNCID0000005642', 'SYNCID0000005643', 'SYNCID0000005648', 'SYNCID0000005653', 'SYNCID0000005654', 'SYNCID0000005655', 'SYNCID0000005656', 'SYNCID0000005667', 'SYNCID0000005671', 'SYNCID0000005672', 'SYNCID0000005705', 'SYNCID0000005780', 'SYNCID0000005721', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005755', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005613', 'SYNCID0000005605', 'SYNCID0000005600', 'SYNCID0000005595', 'SYNCID00000053

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

Human review feedback: {'true_positives': ['SYNCID0000017182', 'SYNCID0000005780', 'SYNCID0000005722', 'SYNCID0000005723', 'SYNCID0000005724', 'SYNCID0000005727', 'SYNCID0000005734', 'SYNCID0000005742', 'SYNCID0000005743', 'SYNCID0000005744', 'SYNCID0000005751', 'SYNCID0000005756', 'SYNCID0000005771', 'SYNCID0000005773', 'SYNCID0000005778', 'SYNCID0000005705', 'SYNCID0000005807', 'SYNCID0000005841', 'SYNCID0000005848', 'SYNCID0000005853', 'SYNCID0000005857', 'SYNCID0000005866', 'SYNCID0000005871', 'SYNCID0000005876', 'SYNCID0000005899', 'SYNCID0000005918', 'SYNCID0000005922', 'SYNCID0000005721', 'SYNCID0000005684', 'SYNCID0000005523', 'SYNCID0000005595', 'SYNCID0000005534', 'SYNCID0000005537', 'SYNCID0000005540', 'SYNCID0000005548', 'SYNCID0000005564', 'SYNCID0000005570', 'SYNCID0000005575', 'SYNCID0000005578', 'SYNCID0000005584', 'SYNCID0000005585', 'SYNCID0000005587', 'SYNCID0000005591', 'SYNCID0000005600', 'SYNCID0000005672', 'SYNCID0000005605', 'SYNCID0000005615', 'SYNCID0000005621

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['risk_score'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['new_account_prob'] = 0.5
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  flagged_companies['true_positive_prob'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc

In [21]:
from datetime import time

import time
import random
import pandas as pd
import numpy as np
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from datetime import time, datetime

# --- Agent Classes --- (No changes here)
class ABMAgent:
    def __init__(self):
        self.name = "ABM Agent"

    def calculate_risk(self, df):
        original_index = df.index
        #df['abm_agent_risk'] = 0
        if 'abm_cash_indicator' in df.columns:
            df['abm_cash_indicator'] = df['abm_cash_indicator'].astype(str).str.lower() == 'true'
            df.loc[(df['abm_cash_indicator'] == True) & (df['abm_debit_credit'].isin(['debit', '0'])), 'abm_agent_risk'] += 0.7

            ######## NEW ###############
            df.loc[(df["abm_cash_indicator"] == True) & (df["abm_transaction_amount"] >= 9000) & (df["abm_debit_credit"] == "credit"), 'abm_agent_risk'] += 0.3
            df.loc[(df["abm_cash_indicator"] == True) & (df["abm_transaction_amount"] >= 5000) & (df["abm_debit_credit"] == "debt"), 'abm_agent_risk'] += 0.5

            # NIGHT
            df['abm_transaction_time'] = pd.to_datetime(df['abm_transaction_time'], format='%H:%M:%S').dt.time
            start_time = time(23, 0)  # 23:00
            end_time = time(6, 0)  # 06:00
            df['abm_rest_time'] = df["abm_transaction_time"].apply(lambda x: x >= start_time or x < end_time)
            df.loc[(df["abm_cash_indicator"] == True) & (df['abm_rest_time'] == True), 'abm_agent_risk'] += 0.1


            df['abm_transaction_date'] = pd.to_datetime(df['abm_transaction_date'], format='%m/%d/%Y')
            df['abm_full_datetime'] = df['abm_transaction_date'].astype(str) + ' ' + df['abm_transaction_time'].astype(str)
            df['abm_full_datetime'] = pd.to_datetime(df['abm_full_datetime'])

            # freq
            time_window = '72H'
            df['abm_transaction_count'] = df.groupby(['customer_id', pd.Grouper(key='abm_full_datetime', freq=time_window)])['abm_transaction_amount'].transform('count')
            threshold = 5
            df.loc[(df['abm_transaction_count'] > threshold) & (df["abm_cash_indicator"] == True), 'abm_agent_risk'] += 0.2

            # dumpliacted amount
            duplicate_threshold = 3
            df['abm_duplicate_count'] = df.groupby(['customer_id', pd.Grouper(key='abm_full_datetime', freq=time_window), 'abm_transaction_amount'])['abm_transaction_amount'].transform('count')
            df.loc[df['abm_duplicate_count'] >= duplicate_threshold, 'abm_agent_risk'] += 0.2


        if 'abm_country' in df.columns:
            df['abm_country'] = df['abm_country'].fillna('UNKNOWN')
            df.loc[~df['abm_country'].isin(['Canada', 'USA', 'UNKNOWN']), 'abm_agent_risk'] += 0.3 ###### I don't understand, we only have CA, unknown, other in dataset

        ############## New #####################
        df.loc[df['abm_industry_risk_score_cd'] > 0.7, 'abm_agent_risk'] += 0.2
        df.loc[df['abm_industry_risk_score_dc'] > 0.7, 'abm_agent_risk'] += 0.2
        df.loc[df['avg_abm_amount'] > 8000, 'abm_agent_risk'] += 0.1
        df.loc[df['max_abm_amount'] > 10000, 'abm_agent_risk'] += 0.1
        df.loc[df['abm_count'] > 25, 'abm_agent_risk'] += 0.1
        df.loc[df['std_abm_amount'] > 6000, 'abm_agent_risk'] += 0.1
        df.index = original_index  # Restore the original index
        #df.reset_index(drop=False, inplace=True)
        return df['abm_agent_risk'], original_index

class CardAgent:
    def __init__(self):
        self.name = "Card Agent"

    def calculate_risk(self, df):
        original_index = df.index
        #df['card_agent_risk'] = 0

        if 'card_merchant_category' in df.columns:
            df['card_merchant_category'] = df['card_merchant_category'].fillna('UNKNOWN')
            high_risk_categories = ['gambling', 'jewelry', 'UNKNOWN']
            df.loc[df['card_merchant_category'].isin(high_risk_categories), 'card_agent_risk'] += 0.6

        if 'card_ecommerce_ind' in df.columns:
            df['card_ecommerce_ind'] = df['card_ecommerce_ind'].astype(str).str.lower() == 'true'
            df.loc[df['card_ecommerce_ind'] == True, 'card_agent_risk'] += 0.4

        ############## New ##################

        threshold_amount = 10000
        df.loc[df['card_transaction_amount'].abs() > threshold_amount, 'card_agent_risk'] += 0.4

        df.loc[df['card_country'] != df['kyc_country'], 'card_agent_risk'] += 0.1

        # Changes in consumption patterns are detected
        df['card_change_in_spending_pattern'] = df.groupby('customer_id')['card_transaction_amount'].rolling(5).std().reset_index(level=0, drop=True)
        df.loc[df['card_change_in_spending_pattern'] > df.groupby('customer_id')['card_transaction_amount'].transform('std'), 'card_agent_risk'] += 0.2


        # Convert transaction time and date to datetime format and combine them
        df['card_transaction_time'] = pd.to_datetime(df['card_transaction_time'], format='%H:%M:%S').dt.time
        df['card_transaction_date'] = pd.to_datetime(df['card_transaction_date'], format='%Y-%m-%d')
        df['card_full_datetime'] = pd.to_datetime(df['card_transaction_date'].astype(str) + ' ' + df['card_transaction_time'].astype(str))

        # Set the index to card_full_datetime for better manipulation
        df.set_index('card_full_datetime', inplace=True, drop=False)

        # Calculate frequent transactions within a 72-hour period
        df['card_frequent_transactions'] = df.groupby(['customer_id', pd.Grouper(freq='72H'), 'card_transaction_amount'])['card_transaction_amount'].transform('count')

        # Reset the index to merge back if needed
        df.reset_index(drop=True, inplace=True)

        # Apply risk score adjustment
        df.loc[df['card_frequent_transactions'] > 5, 'card_agent_risk'] += 0.4


        ########### New ########################
        df.loc[df['card_industry_risk_score_cd'] > 0.7, 'card_agent_risk'] += 0.2
        df.loc[df['card_industry_risk_score_dc'] > 0.7, 'card_agent_risk'] += 0.2
        df.loc[df['avg_card_amount'] > 11000, 'card_agent_risk'] += 0.2
        df.loc[df['avg_card_amount'] < 0, 'card_agent_risk'] += 0.2
        df.loc[df['max_card_amount'] > 100000, 'card_agent_risk'] += 0.2
        df.loc[df['card_count'] > 150, 'card_agent_risk'] += 0.5
        df.loc[df['std_card_amount'] > 1500, 'card_agent_risk'] += 0.1
        df.index = original_index  # Restore the original index
        #df.reset_index(drop=False, inplace=True)
        return df['card_agent_risk'], original_index


# class ChequeAgent:
#     def __init__(self):
#         self.name = "Cheque Agent"

#     def calculate_risk(self, df):
#         original_index = df.index
#         #df['cheque_agent_risk'] = 0

#         ######### New ###########
#         df['cheque_transaction_date'] = pd.to_datetime(df['cheque_transaction_date'])
#         df.set_index('cheque_transaction_date', inplace=True, drop=False)

#         # 使用 transform 而不是 size，确保返回的 Series 与原始 DataFrame 的索引一致
#         df['cheque_transaction_frequency'] = df.groupby('customer_id')['cheque_transaction_date'].transform(lambda x: x.resample('M').count())

#         df.loc[df['cheque_transaction_frequency'] > 7, 'cheque_agent_risk'] += 0.3

        # # df['quick_succession'] = df.groupby('customer_id').rolling('72H')['cheque_transaction_amount'].count()
        # # df.loc[df['quick_succession'] > 5, 'cheque_agent_risk'] += 0.3  # Adjust risk if more than 5 cheques in 2 days
        # ##################################################

        # df.loc[(df['cheque_debit_credit'] == "credit") & (df["cheque_transaction_amount"] > 5000), 'cheque_agent_risk'] += 0.1
        # df.loc[(df['cheque_debit_credit'] == "debt") & (df["cheque_transaction_amount"] > 5000), 'cheque_agent_risk'] += 0.1

        # df.loc[df["cheque_transaction_amount"] > 20000, 'cheque_agent_risk'] += 0.2

        # df['cheque_transaction_date'] = pd.to_datetime(df['cheque_transaction_date'])
        # df['cheque_is_weekday'] = df['cheque_transaction_date'].apply(lambda x: x.weekday() < 5)
        # df.loc[(df['cheque_is_weekday'] == False) & (df["cheque_transaction_amount"] > 10000), 'cheque_agent_risk'] += 0.5

        # ####### New ##########
        # df.loc[df['cheque_industry_risk_score_cd'] > 0.7, 'cheque_agent_risk'] += 0.2
        # df.loc[df['cheque_industry_risk_score_dc'] > 0.7, 'cheque_agent_risk'] += 0.2
        # df.loc[df['avg_cheque_amount'] > 8000, 'cheque_agent_risk'] += 0.2
        # df.loc[df['max_cheque_amount'] > 50000, 'cheque_agent_risk'] += 0.2
        # df.loc[df['cheque_count'] > 150, 'cheque_agent_risk'] += 0.5
        # df.loc[df['std_cheque_amount'] > 50000, 'cheque_agent_risk'] += 0.1
        # df.index = original_index  # Restore the original index
        # #df.reset_index(drop=False, inplace=True)
        # return df['cheque_agent_risk'], original_index
class ChequeAgent:
    def __init__(self):
        self.name = "Cheque Agent"

    def calculate_risk(self, df):
        original_index = df.index
        #df['cheque_agent_risk'] = 0

        ######### New ###########
        df['cheque_transaction_date'] = pd.to_datetime(df['cheque_transaction_date'])
        df.set_index('cheque_transaction_date', inplace=True, drop=False)

        # 使用 transform 而不是 size，确保返回的 Series 与原始 DataFrame 的索引一致
        df['cheque_transaction_frequency'] = df.groupby('customer_id')['cheque_transaction_date'].transform(lambda x: x.resample('M').count())

        df.loc[df['cheque_transaction_frequency'] > 7, 'cheque_agent_risk'] += 0.3

        # 使用 groupby 和 transform 计算 72 小时内的交易次数
        df['quick_succession'] = df.groupby('customer_id')['cheque_transaction_date'].transform(
    lambda x: x.groupby(pd.Grouper(freq='72H')).count()
)

        df.loc[df['quick_succession'] > 5, 'cheque_agent_risk'] += 0.3  # 如果 72 小时内超过 5 笔交易，增加风险
        ##################################################

        df.loc[(df['cheque_debit_credit'] == "credit") & (df["cheque_transaction_amount"] > 5000), 'cheque_agent_risk'] += 0.1
        df.loc[(df['cheque_debit_credit'] == "debt") & (df["cheque_transaction_amount"] > 5000), 'cheque_agent_risk'] += 0.1

        df.loc[df["cheque_transaction_amount"] > 20000, 'cheque_agent_risk'] += 0.2

        df['cheque_transaction_date'] = pd.to_datetime(df['cheque_transaction_date'])
        df['cheque_is_weekday'] = df['cheque_transaction_date'].apply(lambda x: x.weekday() < 5)
        df.loc[(df['cheque_is_weekday'] == False) & (df["cheque_transaction_amount"] > 10000), 'cheque_agent_risk'] += 0.5

        ####### New ##########
        df.loc[df['cheque_industry_risk_score_cd'] > 0.7, 'cheque_agent_risk'] += 0.2
        df.loc[df['cheque_industry_risk_score_dc'] > 0.7, 'cheque_agent_risk'] += 0.2
        df.loc[df['avg_cheque_amount'] > 8000, 'cheque_agent_risk'] += 0.2
        df.loc[df['max_cheque_amount'] > 50000, 'cheque_agent_risk'] += 0.2
        df.loc[df['cheque_count'] > 150, 'cheque_agent_risk'] += 0.5
        df.loc[df['std_cheque_amount'] > 50000, 'cheque_agent_risk'] += 0.1
        df.index = original_index  # Restore the original index
        #df.reset_index(drop=False, inplace=True)
        return df['cheque_agent_risk'], original_index

class EFTAEMTAgent: ############### why put together?
    def __init__(self, name):
        self.name = name

    def calculate_risk(self, df):
        original_index = df.index
        #df[f'{self.name.lower()}_agent_risk'] = 0
        ######################
        df['transaction_date1'] = pd.to_datetime(df[f'{self.name.lower()}_transaction_date'])
        df['is_weekday'] = df['transaction_date1'].apply(lambda x: x.weekday() < 5)

        df["transaction_time1"] = pd.to_datetime(df[f'{self.name.lower()}_transaction_time'], format='%H:%M:%S').dt.time
        start_time = time(23, 0)  # 23:00
        end_time = time(6, 0)  # 06:00
        df['rest_time'] = df["transaction_time1"].apply(lambda x: x >= start_time or x < end_time)

        df.loc[(df['is_weekday'] == False) & (df[f'{self.name.lower()}_transaction_amount'] > 1000), f'{self.name.lower()}_agent_risk'] += 0.1
        df.loc[(df['rest_time'] == True) & (df[f'{self.name.lower()}_transaction_amount'] > 1000), f'{self.name.lower()}_agent_risk'] += 0.1
        #####################
        df.loc[df[f'{self.name.lower()}_transaction_amount'] > 2000, f'{self.name.lower()}_agent_risk'] += 0.2


        ########## New ##########
        df.loc[df[f'{self.name.lower()}_industry_risk_score_cd'] > 0.7, f'{self.name.lower()}_agent_risk'] += 0.2
        df.loc[df[f'{self.name.lower()}_industry_risk_score_dc'] > 0.7, f'{self.name.lower()}_agent_risk'] += 0.2
        df.loc[df[f'avg_{self.name.lower()}_amount'] > 2000, f'{self.name.lower()}_agent_risk'] += 0.2
        df.loc[df[f'max_{self.name.lower()}_amount'] > 50000, f'{self.name.lower()}_agent_risk'] += 0.2
        df.loc[df[f'{self.name.lower()}_count'] > 1200, f'{self.name.lower()}_agent_risk'] += 0.2

        df.index = original_index  # Restore the original index
        #df.reset_index(drop=False, inplace=True)
        return df[f'{self.name.lower()}_agent_risk'], original_index


# class WireAgent:
#     def __init__(self):
#         self.name = "Wire Agent"

#     def calculate_risk(self, df):
#         original_index = df.index
#         #df['wire_agent_risk'] = 0
#         if 'wire_transaction_amount' in df.columns:
#             df.loc[df['wire_transaction_amount'] > 50000, 'wire_agent_risk'] += 0.2
#             df.loc[df['wire_transaction_amount'] > 100000, 'wire_agent_risk'] += 0.5

#         ################## New ##########
#         df['wire_transaction_date'] = pd.to_datetime(df['wire_transaction_date'])
#         df['wire_is_weekday'] = df['wire_transaction_date'].apply(lambda x: x.weekday() < 5)
#         df.sort_values('wire_transaction_date', inplace=True)
#         df.set_index('wire_transaction_date', inplace=True)
#         df.loc[df['wire_is_weekday'] == False, 'wire_agent_risk'] += 0.5

#         # Frequent, large wire transfers
#         ###############
#         # df['frequent_large_transfers'] = df.groupby('customer_id')['wire_transaction_amount'].transform(
#         #     lambda x: (x > 50000).rolling('24H').sum())
#         # df.loc[df['frequent_large_transfers'] >= 3, 'wire_agent_risk'] += 0.4

#         high_value_transfers = df[df['wire_transaction_amount'] > 50000].groupby(['customer_id', df.index.date]).size()
#         high_value_transfers = high_value_transfers.reset_index(name='counts')
#         high_value_transfers.columns = ['customer_id', 'transaction_date', 'high_value_counts']

#         # 重新将高交易次数信息合并回原始DataFrame
#         df = df.merge(high_value_transfers, how='left', left_on=['customer_id', df.index.date], right_on=['customer_id', 'transaction_date'])
#         df.loc[df['high_value_counts'] >= 3, 'wire_agent_risk'] += 0.4

#         ############### New ##
#         df.loc[df['wire_industry_risk_score_cd'] > 0.7, 'wire_agent_risk'] += 0.2
#         df.loc[df['wire_industry_risk_score_dc'] > 0.7, 'wire_agent_risk'] += 0.2
#         df.loc[df['avg_wire_amount'] > 150000, 'wire_agent_risk'] += 0.2
#         df.loc[df['max_wire_amount'] > 500000, 'wire_agent_risk'] += 0.2
#         df.loc[df['wire_count'] > 2, 'wire_agent_risk'] += 0.1
#         df.loc[df['std_wire_amount'] > 20000, 'wire_agent_risk'] += 0.2
#         #df.reset_index(drop=False, inplace=True)
#         df.index = original_index  # Restore the original index
#         return df['wire_agent_risk'], original_index
class WireAgent:
    def __init__(self):
        self.name = "Wire Agent"

    def calculate_risk(self, df):
        original_index = df.index
        #df['wire_agent_risk'] = 0
        if 'wire_transaction_amount' in df.columns:
            df.loc[df['wire_transaction_amount'] > 50000, 'wire_agent_risk'] += 0.2
            df.loc[df['wire_transaction_amount'] > 100000, 'wire_agent_risk'] += 0.5

        ################## New ##########
        df['wire_transaction_date'] = pd.to_datetime(df['wire_transaction_date'])
        df['wire_is_weekday'] = df['wire_transaction_date'].apply(lambda x: x.weekday() < 5)
        df.sort_values('wire_transaction_date', inplace=True)
        df.set_index('wire_transaction_date', inplace=True)
        df.loc[df['wire_is_weekday'] == False, 'wire_agent_risk'] += 0.5

        # Frequent, large wire transfers
        ###############
        # 过滤出高价值交易
        high_value_df = df[df['wire_transaction_amount'] > 50000]

        # 确保分组依据和过滤后的 DataFrame 长度一致
        high_value_transfers = high_value_df.groupby(['customer_id', high_value_df.index.date]).size()
        high_value_transfers = high_value_transfers.reset_index(name='counts')
        high_value_transfers.columns = ['customer_id', 'transaction_date', 'high_value_counts']

        # 重新将高交易次数信息合并回原始 DataFrame
        df = df.merge(high_value_transfers, how='left', left_on=['customer_id', df.index.date], right_on=['customer_id', 'transaction_date'])
        df.loc[df['high_value_counts'] >= 3, 'wire_agent_risk'] += 0.4

        ############### New ##
        df.loc[df['wire_industry_risk_score_cd'] > 0.7, 'wire_agent_risk'] += 0.2
        df.loc[df['wire_industry_risk_score_dc'] > 0.7, 'wire_agent_risk'] += 0.2
        df.loc[df['avg_wire_amount'] > 150000, 'wire_agent_risk'] += 0.2
        df.loc[df['max_wire_amount'] > 500000, 'wire_agent_risk'] += 0.2
        df.loc[df['wire_count'] > 2, 'wire_agent_risk'] += 0.1
        df.loc[df['std_wire_amount'] > 20000, 'wire_agent_risk'] += 0.2
        #df.reset_index(drop=False, inplace=True)
        df.index = original_index  # Restore the original index
        return df['wire_agent_risk'], original_index

# Cell 2: RiskAssessmentAgent (with detailed print statements for debugging)
class RiskAssessmentAgent:
    def __init__(self, thresholds):
        self.thresholds = thresholds  # Initial thresholds
        self.abm_agent = ABMAgent()
        self.card_agent = CardAgent()
        self.cheque_agent = ChequeAgent()
        self.eft_agent = EFTAEMTAgent("EFT")
        self.emt_agent = EFTAEMTAgent("EMT")
        self.wire_agent = WireAgent()
        self.agents = [self.abm_agent, self.card_agent, self.cheque_agent,
                       self.eft_agent, self.emt_agent, self.wire_agent]

    def calculate_risk_score(self, df):
        # Pre-allocate agent risk columns
        for agent in self.agents:
            df.loc[:, f"{agent.name.lower().replace(' ', '_')}_risk"] = 0.0

        df['abm_agent_risk'] = 0
        df['card_agent_risk'] = 0
        df['cheque_agent_risk'] = 0
        df['eft_agent_risk'] = 0
        df['emt_agent_risk'] = 0
        df['wire_agent_risk'] = 0

        # Calculate individual agent risks
        for agent in self.agents:
            if agent == self.abm_agent:
                risks, indices = agent.calculate_risk(df.loc[df["is_abm"] == 1])
                df.loc[df.loc[df["is_abm"] == 1].index, 'abm_agent_risk'] = risks  # Update the main DataFrame
            if agent == self.card_agent:
                risks, indices = agent.calculate_risk(df.loc[df["is_card"] == 1])
                df.loc[df.loc[df["is_card"] == 1].index, 'card_agent_risk'] = risks
            if agent == self.cheque_agent:
                risks, indices = agent.calculate_risk(df.loc[df["is_cheque"] == 1])
                df.loc[df.loc[df["is_cheque"] == 1].index, 'cheque_agent_risk'] = risks
            if agent == self.eft_agent:
                risks, indices = agent.calculate_risk(df.loc[df["is_eft"] == 1])
                df.loc[df.loc[df["is_eft"] == 1].index, 'eft_agent_risk'] = risks
            if agent == self.emt_agent:
                risks, indices = agent.calculate_risk(df.loc[df["is_emt"] == 1])
                df.loc[df.loc[df["is_emt"] == 1].index, 'emt_agent_risk'] = risks
            if agent == self.wire_agent:
                risks, indices = agent.calculate_risk(df.loc[df["is_wire"] == 1])
                df.loc[df.loc[df["is_wire"] == 1].index, 'wire_agent_risk'] = risks
        # for agent in self.agents:
        #     agent.calculate_risk(df)

        # --- Aggregate Risk Scores ---

        # 1. Calculate Transaction-Level Aggregates (Correctly, per customer)
        transaction_cols = []
        count_cols = []  # List to store count columns
        print("Calculating transaction-level aggregates...")
        for txn_type in ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']:
            amount_col = f'{txn_type}_transaction_amount'
            if amount_col in df.columns:
                print(f"  Found transaction amount column: {amount_col}")
                transaction_cols.append(amount_col)
                # Count transactions:  Check for presence of a transaction ID or amount > 0
                count_col = f'{txn_type}_count'  # Look for pre-calculated count
                if count_col in df.columns:
                    print(f"    Using existing count column: {count_col}")
                    count_cols.append(count_col)
                else:
                    #If no count column, create a temporary one.
                    print(f"    Creating temporary count column: {txn_type}_present")
                    df.loc[:,f'{txn_type}_present'] = (df[amount_col].notna() & (df[amount_col] >0 )).astype(int)
                    count_cols.append(f'{txn_type}_present')
            else:
                print(f"  Transaction amount column not found: {amount_col}")

        # Sum across the found transaction amount and count columns.
        print("  Summing transaction amounts...")
        df.loc[:, 'total_transaction_amount'] = df[transaction_cols].sum(axis=1, skipna=True)
        print("  Summing transaction counts...")
        df.loc[:, 'transaction_count'] = df[count_cols].sum(axis=1, skipna=True)


        # Clean up temp columns
        for txn_type in ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']:
            temp_count_col = f'{txn_type}_present'
            if temp_count_col in df.columns:
                del df[temp_count_col]

        # Calculate derived columns, with handling for missing columns
        print("  Calculating average transaction amount...")
        df.loc[:, 'avg_transaction_amount'] = df['total_transaction_amount'] / (df['transaction_count'] + 1e-9)


        max_cols = [f'{txn_type}_transaction_amount' for txn_type in ['abm', 'card', 'cheque', 'eft', 'emt', 'wire'] if f'{txn_type}_transaction_amount' in df.columns]
        if max_cols:
            print("  Calculating max transaction amount...")
            df.loc[:, 'max_transaction_amount'] = df[max_cols].max(axis=1, skipna=True) #find the max between all those columns
        else:
            print("  No transaction amount columns found. Setting max_transaction_amount to 0.")
            df.loc[:, 'max_transaction_amount'] = 0


        # 2. Base Risk Score
        print("Calculating base risk score...")
        numeric_thresholds = {k: float(v) if isinstance(v, (int, float)) else 0.0 for k, v in self.thresholds.items()}
        df.loc[:, 'risk_score'] = 0.0  # Initialize

        df.loc[:, 'risk_score'] = (
                (df['total_transaction_amount'] / (numeric_thresholds.get('total_transaction_amount', 1) + 1e-9)) +
                (df['avg_transaction_amount'] / (numeric_thresholds.get('avg_transaction_amount', 1) + 1e-9)) +
                (df['max_transaction_amount'] / (numeric_thresholds.get('max_transaction_amount', 1) + 1e-9)) +
                (df['transaction_count'] / (numeric_thresholds.get('transaction_count', 1) + 1e-9))
            )
        df.loc[:, 'risk_score'] = df['risk_score'] / (df['risk_score'].max(skipna=True) + 1e-9)


        # 3. Industry Risk Probability
        print("Calculating industry risk probability...")
        df.loc[:, 'industry_risk_prob'] = 0.0  # Initialize
        for txn_type in ['abm', 'card', 'cheque', 'eft', 'emt', 'wire']:
            risk_col = f'{txn_type}_industry_risk_score_dc'
            if risk_col in df.columns:
                df.loc[:, 'industry_risk_prob'] += df[risk_col].fillna(0)

        # 4. Recency Probability
        print("Calculating recency probability...")
        if 'days_since_last_abm' in df.columns:
            df.loc[:, 'recency_prob'] = 1.0 / (1.0 + df['days_since_last_abm'].fillna(0))
        else:
            df.loc[:, 'recency_prob'] = 0.0

        # 5. Combined Risk Score
        print("Calculating combined risk score...")
        df.loc[:, 'combined_risk_score'] = 0.0 #Pre-allocate
        df.loc[:, 'combined_risk_score'] = (
            0.3 * df.get('risk_score', 0) +
            0.2 * df.get('industry_risk_prob', 0) +
            0.1 * df.get('recency_prob', 0) +
            0.1 * df.get('abm_agent_risk', 0) +
            0.1 * df.get('card_agent_risk', 0) +
            0.05 * df.get('cheque_agent_risk', 0) +
            0.05 * df.get('eft_agent_risk', 0) +
            0.05 * df.get('emt_agent_risk', 0) +
            0.05 * df.get('wire_agent_risk', 0)
        )
        df.loc[:, 'combined_risk_score'] = df['combined_risk_score'] / (df['combined_risk_score'].max(skipna=True) + 1e-9) # avoid division by zero
        print("Risk score calculation complete.") #Final print
        return df

    def adjust_thresholds(self, action):
        """Adjusts thresholds based on the Q-learning agent's action."""
        adjustment_factor = 0.05  # 5% change
        threshold_keys = list(self.thresholds.keys())
        num_thresholds = len(threshold_keys)

        for i in range(num_thresholds):
            threshold_index = (action + i) % (num_thresholds * 3)  # Wrap around

            if threshold_index < num_thresholds:
                # Decrease threshold
                key = threshold_keys[threshold_index]
                self.thresholds[key] = max(0, self.thresholds[key] * (1 - adjustment_factor))
            elif threshold_index < 2 * num_thresholds:
                # Increase threshold
                key = threshold_keys[threshold_index - num_thresholds]
                self.thresholds[key] *= (1 + adjustment_factor)
            # Else: Do nothing (no-op)

        return self.thresholds  # Return updated thresholds

#Cell 3 (Modified Review Agent)
class ReviewAgent:
    def __init__(self, skill, bias, learning_rate=0.01, discount_factor=0.9):
        self.skill = skill
        self.bias = bias
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.q_table = {}

    def get_state(self, df):
      risk_score = df['combined_risk_score']
      industry_risk = df.filter(like='_industry_risk_score_dc').sum(axis=1)

      # Use qcut and handle potential errors robustly
      try:
          risk_levels = pd.qcut(risk_score, q=[0, 0.33, 0.66, 1.0], labels=['LOW', 'MEDIUM', 'HIGH'], duplicates='drop').astype(str)
      except ValueError:
          # Handle cases where qcut might fail (e.g., all values the same)
          risk_levels = pd.Series(['MEDIUM'] * len(df), index=df.index).astype(str)

      try:
          industry_risk_levels = pd.qcut(industry_risk, q=[0, 0.33, 0.66, 1.0], labels=['LOW', 'MEDIUM', 'HIGH'], duplicates='drop').astype(str)
      except ValueError:
           industry_risk_levels = pd.Series(['MEDIUM'] * len(df), index=df.index).astype(str)

      states = risk_levels.str.cat(industry_risk_levels, sep='_')

      return states

    def choose_action(self, states):
        actions = []
        for state in states:
            if self.skill == "novice":
                explore_prob = 0.4
            elif self.skill == "intermediate":
                explore_prob = 0.2
            elif self.skill == "expert":
                explore_prob = 0.1

            if random.random() < explore_prob:
                action = random.choice([0, 1])
                actions.append(action)
            else:
                if state not in self.q_table:
                    action = random.choice([0, 1])
                    actions.append(action)
                else:
                    action = max(self.q_table[state], key=self.q_table[state].get, default=random.choice([0, 1]))
                    actions.append(action)
        return np.array(actions)

    def review(self, df):
        if 'is_true_positive' not in df.columns:
                df.loc[:,'is_true_positive'] = 0  # Use .loc
        if 'agent_true_positive_prob' not in df.columns:
            df.loc[:,'agent_true_positive_prob'] = 0 # Use .loc

        states = self.get_state(df)
        actions = self.choose_action(states)
        probs = df['combined_risk_score'].copy()  # This is fine, it creates a copy
        if self.skill == "novice":
            probs *= 1.2  # Modify the copy directly
        elif self.skill == "expert":
            probs *= 0.8
        probs += self.bias
        probs = np.clip(probs, 0, 1)

        df.loc[:, 'is_true_positive'] = actions  # Use .loc
        df.loc[:, 'agent_true_positive_prob'] = probs # Use .loc
        return df

    def update_q_table(self, states, actions, rewards, next_states):
        for state, action, reward, next_state in zip(states, actions, rewards, next_states):
            if state not in self.q_table:
                self.q_table[state] = {0: 0, 1: 0}
            if next_state not in self.q_table:
                self.q_table[next_state] = {0: 0, 1: 0}
            old_value = self.q_table[state][action]
            next_max = max(self.q_table[next_state].values(), default=0)
            new_value = (1 - self.learning_rate) * old_value + \
                        self.learning_rate * (reward + self.discount_factor * next_max)
            self.q_table[state][action] = new_value

#Cell 3.1
class MetaReviewAgent:
    def __init__(self):
        self.agent_weights = {}

    def aggregate_reviews(self, df, review_agents):
        if not self.agent_weights:
            for agent in review_agents:
                self.agent_weights[agent] = 1.0

        df.loc[:,'aggregated_prediction'] = 0.0 # Use .loc
        total_weight = sum(self.agent_weights.values())

        for agent in review_agents:
            df.loc[:,'aggregated_prediction'] += self.agent_weights[agent] * df['agent_true_positive_prob'] # Use .loc

        df.loc[:,'aggregated_prediction'] /= (total_weight + 1e-9) # Use .loc
        return df

    def update_weights(self, df, review_agents, learning_rate=0.01):
        for agent in review_agents:
            error = abs(df['is_true_positive'].values - df['agent_true_positive_prob'].values).mean()
            self.agent_weights[agent] *= (1 - learning_rate * error)
            self.agent_weights[agent] = max(0.01, self.agent_weights[agent])

# Cell 4: Environment Class (Run this fourth)
class Environment:
    def __init__(self, data, initial_thresholds):
        self.data = data
        self.initial_thresholds = initial_thresholds
        self.risk_agent = RiskAssessmentAgent(initial_thresholds.copy())  # Initialize with a *copy*
        self.review_agent_novice = ReviewAgent(skill="novice", bias=0.1)
        self.review_agent_intermediate = ReviewAgent(skill="intermediate", bias=0.0)
        self.review_agent_expert = ReviewAgent(skill="expert", bias=-0.1)
        self.review_agents = [self.review_agent_novice, self.review_agent_intermediate, self.review_agent_expert]
        self.meta_review_agent = MetaReviewAgent()
        self.state = list(initial_thresholds.values())   # Initial state
        self.action_space = list(range(len(self.initial_thresholds) * 3)) # Modified action space
        self.current_batch = 0 #To iterate through the dataframe
        self.batch_size = 20 # Instance of batch size

    def reset(self):
        """Resets the environment to its initial state."""
        self.risk_agent.thresholds = self.initial_thresholds.copy()  # Reset thresholds
        self.state = list(self.initial_thresholds.values())  # Reset state
        self.current_batch = 0  # Reset batch counter
        return self.state  # Return the initial state

    def step(self, action):
      # 1. Adjust thresholds based on the action
        #print(f"AML_Env: Applying action {action}") #Removed to reduce output
        self.risk_agent.adjust_thresholds(action)  # Adjust thresholds
        self.state = list(self.risk_agent.thresholds.values())  # Update the environment's state
        #print(f"AML_Env: Updated thresholds: {self.risk_agent.thresholds}") #Removed to reduce output

        # 2. Run a *complete* simulation with the new thresholds
        #f1, self.data = run_simulation(self.num_records, self.risk_agent.thresholds.copy(), self.review_agents, self.meta_review_agent)  # Pass thresholds to run_simulation, get data
        f1, batch_data = self.run_simulation_batch(self.risk_agent.thresholds.copy(), self.review_agents, self.meta_review_agent)

        # 3. Calculate reward (e.g., based on F1-score)
        reward = f1

        # 4. Determine if the episode is done (e.g., fixed number of steps or end of data)
        done = True if self.current_batch >= (len(self.data)/self.batch_size) else False

        return self.state, reward, done, {}  # Return next_state, reward, done, info

    def run_simulation_batch(self, thresholds, review_agents, meta_review_agent):

        # Get a batch of data
        start_index = self.current_batch * self.batch_size
        end_index = min((self.current_batch + 1) * self.batch_size, len(self.data))
        batch_data = self.data.iloc[start_index:end_index]

        # If no data in batch, return default values (important to prevent errors)
        if len(batch_data) == 0:
            return 0.0, pd.DataFrame()  # Return 0 reward and empty DataFrame

        self.current_batch += 1

        # Initialize and run the risk assessment agent on the batch.
        risk_agent = RiskAssessmentAgent(thresholds)
        batch_data = risk_agent.calculate_risk_score(batch_data)

        # Review process
        for agent in review_agents:
            batch_data = agent.review(batch_data)

        batch_data = meta_review_agent.aggregate_reviews(batch_data, review_agents)
        meta_review_agent.update_weights(batch_data, review_agents)

        # Calculate F1-score for the batch (ground truth vs. aggregated prediction)
        # Use a threshold (e.g., 0.5) on the aggregated_prediction to get binary predictions.
        batch_data['predicted_fraud'] = (batch_data['aggregated_prediction'] > 0.5).astype(int)

        # Ensure 'is_fraud' exists and handle potential missing values
        if 'is_fraud' in batch_data.columns:
             # Calculate F1 score, handling potential errors
            try:
                f1 = f1_score(batch_data['is_fraud'], batch_data['predicted_fraud'])
            except ValueError as e:
                print(f"ValueError in f1_score calculation: {e}")
                f1 = 0.0  # Default value if F1 calculation fails

        else:
            f1 = 0.0
            print("Column 'is_fraud' not found in batch data. Setting F1-score to 0.")

        return f1, batch_data


    def get_current_data(self):
        return self.data

# Cell 5: Q-Learning and Simulation Setup

# --- Q-Learning ---
def get_env_state(env):
    discretized_state = []
    for threshold in env.state:
        discretized_state.append(round(threshold, 2))
    return tuple(discretized_state)

def run_q_learning(env, num_episodes=5):
    q_table = {}  # {state: {action: q_value}}
    initial_exploration_rate = 1.0
    exploration_decay = 0.995
    min_exploration_rate = 0.1
    exploration_rate = initial_exploration_rate
    learning_rate = 0.1
    discount_factor = 0.9

    for episode in range(num_episodes):
        state = env.reset()
        state = get_env_state(env)
        done = False

        while not done:
            if random.uniform(0, 1) < exploration_rate:
                action = random.choice(env.action_space)
            else:
                if state in q_table:
                    action = max(q_table[state], key=q_table[state].get, default=random.choice(env.action_space))
                else:
                    action = random.choice(env.action_space)

            next_state, reward, done, _ = env.step(action)
            next_state = get_env_state(env)

            if state not in q_table:
                q_table[state] = {}
                for a in env.action_space:
                    q_table[state][a] = 0

            old_value = q_table[state].get(action, 0)

            if next_state not in q_table:
                q_table[next_state] = {}
                for a in env.action_space:
                    q_table[next_state][a] = 0
            next_max = max(q_table[next_state].values(), default=0)

            new_value = (1 - learning_rate) * old_value + \
                        learning_rate * (reward + discount_factor * next_max)
            q_table[state][action] = new_value

            state = next_state

        exploration_rate = max(min_exploration_rate, exploration_rate * exploration_decay)
        print(f"Episode {episode}: Exploration Rate = {exploration_rate}, Reward: {reward}") #Added reward to print
    return q_table


In [None]:
# --- Data Loading ---  (This is now in its own cell)
def load_data(file_path):
    df = first_rows_df.loc[first_rows_df['customer_id'].isin(unusual_cust.index)]
    df = df.reset_index(drop=True)
    #df = df.iloc[0:10000]
    return df

# --- Main Execution ---
if __name__ == "__main__":
    # 1. Load your data
    data = merged_df.loc[merged_df['customer_id'].isin(unusual_cust.index)]
    data=data.reset_index(drop=True)

    # data = data.iloc[0:10000]
    # data = data.iloc[1:10000]

    # 2. Define initial thresholds
    initial_thresholds = {
        'total_transaction_amount': 5000,
        'avg_transaction_amount': 1000,
        'max_transaction_amount': 15000,
        'transaction_count': 10
    }

    # 3. Create the Environment
    env = Environment(data, initial_thresholds)

    # 4. Run Q-learning to train the agent
    q_table = run_q_learning(env)

    print("\nFinal Q-table:")
    print(q_table)

    # 5. Test the trained agent.
    print("\n--- Testing Phase ---")
    state = env.reset()
    state = get_env_state(env)
    done = False

    # Get the final thresholds selected by the agent
    final_thresholds = env.risk_agent.thresholds
    print(f"\nFinal Thresholds Selected by Agent: {final_thresholds}")

    # Use those thresholds to run the simulation ONCE on the ENTIRE dataset
    test_risk_agent = RiskAssessmentAgent(final_thresholds)
    test_review_agents = [ReviewAgent(skill="novice", bias=0.1),
                         ReviewAgent(skill="intermediate", bias=0.0),
                         ReviewAgent(skill="expert", bias=-0.1)]

    test_meta_review_agent = MetaReviewAgent()

    # Calculate risk scores using the final thresholds
    data = test_risk_agent.calculate_risk_score(data)

    for agent in test_review_agents:
      data = agent.review(data)
    data = test_meta_review_agent.aggregate_reviews(data,test_review_agents)


     # Calculate F1-score for the entire dataset
    if 'is_fraud' in data.columns:
      data['predicted_fraud'] = (data['aggregated_prediction'] > 0.5).astype(int)
      final_f1 = f1_score(data['is_fraud'], data['predicted_fraud'])
      print(f"\nFinal F1-score on the entire dataset: {final_f1}")
    else:
        print("\n'is_fraud' column not found. Cannot calculate final F1-score.")


In [None]:

# 5. Get best parameters and flagged companies (INLINE FUNCTION)
def get_best_parameters_and_flagged(df, q_table, initial_thresholds):
    """
    Finds best thresholds from Q-table, applies them, and flags companies.
    """
    env = Environment(df, initial_thresholds)  # Create Environment
    # Initialize review agents *inside* the Environment (important for consistency)
    env.review_agents = [
        ReviewAgent(skill="novice", bias=0.1),
        ReviewAgent(skill="intermediate", bias=0.0),
        ReviewAgent(skill="expert", bias=-0.1)
    ]
    env.meta_review_agent = MetaReviewAgent()

    best_state = None
    best_state_reward = float('-inf')

    for state, actions in q_table.items():
        total_reward_for_state = sum(actions.values())
        if total_reward_for_state > best_state_reward:
            best_state_reward = total_reward_for_state
            best_state = state

    if best_state is None:
        print("Warning: Q-table empty. Using initial thresholds.")
        best_thresholds = initial_thresholds
    else:
        # Correctly extract thresholds from the state tuple
        threshold_keys = list(initial_thresholds.keys())
        best_thresholds = {key: best_state[i] for i, key in enumerate(threshold_keys)}
        print(f"Best state found: {best_state}")


    # --- Use the best thresholds for risk assessment and flagging ---
    risk_agent = RiskAssessmentAgent(best_thresholds)  # Create NEW RiskAssessmentAgent
    data_with_risk_scores = risk_agent.calculate_risk_score(df.copy())  # Use a COPY

    # Run the review process.
    for agent in env.review_agents:
        data_with_risk_scores = agent.review(data_with_risk_scores)
    data_with_risk_scores = env.meta_review_agent.aggregate_reviews(data_with_risk_scores, env.review_agents)

    # Flag companies based on combined_risk_score
    best_flagged_companies = data_with_risk_scores[data_with_risk_scores['combined_risk_score'] > 0.7]
    return best_thresholds, best_flagged_companies


best_thresholds, best_flagged_companies = get_best_parameters_and_flagged(
    data.copy(), q_table, initial_thresholds
)

print("\n--- Best Thresholds and Flagged Companies ---")
print("Best Thresholds:", best_thresholds)
print("\nFlagged Companies with Best Thresholds:")
if not best_flagged_companies.empty:
    print(best_flagged_companies[[
        'customer_id', 'total_transaction_amount', 'avg_transaction_amount',
        'max_transaction_amount', 'transaction_count', 'combined_risk_score'
    ]])
else:
    print("No companies were flagged with the best thresholds.")

values_counts = best_flagged_companies['customer_id'].value_counts()
values_counts

frequency_of_counts = values_counts.value_counts()
sorted_frequency_of_counts = frequency_of_counts.sort_index(ascending=False)

# This will display the frequency of counts sorted by the count values (i.e., how many times each frequency appears, sorted by the frequency itself)
print(sorted_frequency_of_counts)

filtered_value_counts = values_counts[values_counts > 1]
filtered_value_counts

len(filtered_value_counts)

df = first_rows_df.loc[first_rows_df['customer_id'].isin(unusual_cust.index)]
# df = df.iloc[1:10000]
len(df['customer_id'].unique())

In [None]:
values_counts = best_flagged_companies['customer_id'].value_counts()
values_counts

In [None]:
# prompt: best_flagged_companies['customer_id'] should be saved as txt

import os
import pandas as pd



# Determine output directory
OUTPUT_DIR = '/mnt/output' if os.path.exists('/mnt/output') else '.'
print(f"Using output directory: {OUTPUT_DIR}")

# Create the output directory if it doesn't exist (important for local runs)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Construct the full output file path
output_path = os.path.join(OUTPUT_DIR, 'task_1_RL_ids.txt')

# Save customer IDs to the text file
with open(output_path, 'w') as f:
    for customer_id in best_flagged_companies['customer_id']:
        f.write(str(customer_id) + '\n')

print(f"Saved customer IDs to {output_path}")


In [None]:

frequency_of_counts = values_counts.value_counts()
sorted_frequency_of_counts = frequency_of_counts.sort_index(ascending=False)

# This will display the frequency of counts sorted by the count values (i.e., how many times each frequency appears, sorted by the frequency itself)
print(sorted_frequency_of_counts)

In [None]:
filtered_value_counts = values_counts[values_counts > 0]
filtered_value_counts

In [None]:
len(filtered_value_counts)

In [None]:
df = merged_df.loc[merged_df['customer_id'].isin(unusual_cust.index)]
# df = df.iloc[1:10000]
len(df['customer_id'].unique())