In [None]:
import pandas as pd

# Read the two input tables from their CSV files
dim_data = pd.read_csv('/mnt/data/dim.csv')

# Parse TXN_TM into datetimes right after loading so later date arithmetic is straightforward
fact_data = pd.read_csv('/mnt/data/fact.csv').assign(
    TXN_TM=lambda frame: pd.to_datetime(frame['TXN_TM'])
)

# Chronologically ordered copy of the transactions, used by the steps that follow
fact_data_sorted = fact_data.sort_values('TXN_TM')

# Helper: derive the statement window associated with a transaction timestamp
def get_statement_period(txn_date):
    """Return the ``(start, end)`` statement window for ``txn_date``.

    The window opens on the 22nd of the month before ``txn_date``'s month and
    closes on the 21st of ``txn_date``'s own month. Only the day and month are
    shifted, so both bounds keep ``txn_date``'s time of day.

    NOTE(review): for a ``txn_date`` on the 22nd or later the returned window
    ends before ``txn_date`` itself -- confirm this is the intended rule.
    """
    previous_month = pd.DateOffset(months=1)
    return (
        txn_date.replace(day=22) - previous_month,
        txn_date.replace(day=21),
    )

# Helper: derive the payment due date associated with a transaction timestamp
def get_payment_due_date(txn_date):
    """Return the 5th of the month after ``txn_date``'s month.

    The time of day of ``txn_date`` is carried over unchanged.
    """
    fifth_of_txn_month = txn_date.replace(day=5)
    return fifth_of_txn_month + pd.DateOffset(months=1)

# Build the payments frame and attach each payment's statement window and due date.

# Transaction-type code that marks a payment row. It has to exist before the filter below runs;
# the value is the same one the later cells assign to this name.
PAYMENT_TYPE_CODE = 'OPTP0028'

# .copy() so the new columns are added to an independent frame, not to a slice of fact_data_sorted
payments = fact_data_sorted[fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE].copy()

# Compute all (start, end) pairs in one pass and build a single two-column frame from them,
# instead of constructing a throwaway pd.Series for every row.
statement_periods = pd.DataFrame(
    [get_statement_period(txn_tm) for txn_tm in payments['TXN_TM']],
    columns=['STATEMENT_START', 'STATEMENT_END'],
    index=payments.index,
)
payments['STATEMENT_START'] = statement_periods['STATEMENT_START']
payments['STATEMENT_END'] = statement_periods['STATEMENT_END']

payments['PAYMENT_DUE_DATE'] = payments['TXN_TM'].apply(get_payment_due_date)

# TODO: the analysis that identifies the suspicious customers is not implemented in this cell.
# ...

# The remaining analysis code belongs here; it still has to be written and executed to produce the final results.


In [None]:
# Continuing from where we left off...

# Decide whether the spending right after a payment is large enough to flag that payment
def check_suspicious_activity(payment_row, spendings):
    """Flag a payment whose immediate follow-up spending nearly offsets it.

    Parameters
    ----------
    payment_row : row-like (e.g. a pandas Series)
        Must provide 'CIF hash', 'CARD_NBR hash', 'TXN_TM',
        'PAYMENT_DUE_DATE' and 'TOTAL_PAYMENTS'.
    spendings : pandas.DataFrame
        Must provide 'CIF hash', 'CARD_NBR hash', 'TXN_TM' and
        'NET_CASH_FLOW_AMT_LCY'.

    Returns
    -------
    bool-like
        True when the summed 'NET_CASH_FLOW_AMT_LCY' of the matching
        spendings is at least 90% of the payment's 'TOTAL_PAYMENTS'.

    A spending matches when it belongs to the same customer and card, happens
    strictly after the payment, at most 30 minutes after it, and no later
    than the payment's due date.
    """
    paid_at = payment_row['TXN_TM']
    window_close = paid_at + pd.DateOffset(minutes=30)
    spent_at = spendings['TXN_TM']

    # One named mask per criterion; they are AND-ed together below
    same_customer = spendings['CIF hash'] == payment_row['CIF hash']
    same_card = spendings['CARD_NBR hash'] == payment_row['CARD_NBR hash']
    inside_window = (spent_at > paid_at) & (spent_at <= window_close)
    before_due = spent_at <= payment_row['PAYMENT_DUE_DATE']

    matching = same_customer & same_card & inside_window & before_due
    spent_total = spendings.loc[matching, 'NET_CASH_FLOW_AMT_LCY'].sum()
    return spent_total >= 0.9 * payment_row['TOTAL_PAYMENTS']

# Flag every high-utilization payment that is followed by suspicious spending.
# Work on an explicit copy: the frame may be a filtered slice of another frame, and adding a
# column to such a slice triggers pandas' SettingWithCopyWarning.
high_utilization_payments = high_utilization_payments.copy()

# Build the flags with a plain loop instead of DataFrame.apply(axis=1): on an empty frame apply()
# can hand back an empty DataFrame rather than a Series, which cannot be stored in one column.
suspicious_flags = [
    bool(check_suspicious_activity(payment_row, spendings))
    for _, payment_row in high_utilization_payments.iterrows()
]
# Explicit bool dtype keeps the column usable as a mask even when there are no rows at all
high_utilization_payments['SUSPICIOUS_ACTIVITY'] = pd.Series(
    suspicious_flags, index=high_utilization_payments.index, dtype=bool
)

# Keep only the flagged payments
suspicious_activities = high_utilization_payments.loc[
    high_utilization_payments['SUSPICIOUS_ACTIVITY']
]

# The output lists customers/cards, so several flagged payments on the same card collapse to one row
output_data = suspicious_activities[['CIF hash', 'CARD_NBR hash']].drop_duplicates()
output_data['LABEL'] = 'Suspicious'

# Save the output to a CSV file
output_file_path = '/mnt/data/suspicious_customers_updated.csv'
output_data.to_csv(output_file_path, index=False)

# Last expression of the cell: show where the file was written
output_file_path


In [None]:
import pandas as pd
from datetime import timedelta

# Load the dim and fact tables from their CSV files
dim_data = pd.read_csv('/mnt/data/dim.csv')
fact_data = pd.read_csv('/mnt/data/fact.csv')

# TXN_TM comes out of read_csv unparsed. Convert it so that the sort below is chronological
# rather than lexicographic, and so that the .day / .replace() / date-offset arithmetic applied
# to TXN_TM further down operates on real timestamps instead of failing on strings.
fact_data['TXN_TM'] = pd.to_datetime(fact_data['TXN_TM'])

# Sort the fact data by transaction time
fact_data_sorted = fact_data.sort_values(by='TXN_TM')

# Transaction-type codes found in the TML_WEB_AP_NM column
PAYMENT_TYPE_CODE = 'OPTP0028'  # Code for payment transactions
SPENDING_TYPE_CODE = 'OPTP0000'  # Code for spending transactions

# Split the transactions by type; .copy() gives independent frames that can safely get new columns
payments = fact_data_sorted[fact_data_sorted['TML_WEB_AP_NM'] == PAYMENT_TYPE_CODE].copy()
spendings = fact_data_sorted[fact_data_sorted['TML_WEB_AP_NM'] == SPENDING_TYPE_CODE].copy()

# Attach to every payment a statement window (22nd -> 21st) and that window's payment due date.
#
# STATEMENT_END is the latest 21st strictly before the payment's day: the 21st of the payment's
# own month when the payment falls after the 21st, otherwise the 21st of the previous month.
# STATEMENT_START is the 22nd of the month before STATEMENT_END, so start < end always holds.
# (Taking the 22nd of the payment's own month for payments on or before the 21st would put the
# start after both the payment and the end of the window.)
# NOTE(review): this treats the window as the last statement closed before the payment --
# confirm that is the intended business rule.
one_month = pd.DateOffset(months=1)

# No-op when TXN_TM is already datetime; guards against the column still being text from the CSV
payment_times = pd.to_datetime(payments['TXN_TM'])

statement_end = payment_times.apply(
    lambda txn_tm: txn_tm.replace(day=21) if txn_tm.day > 21
    else (txn_tm - one_month).replace(day=21)
)

payments['STATEMENT_START'] = statement_end.apply(
    lambda end: (end - one_month).replace(day=22)
)
payments['STATEMENT_END'] = statement_end

# Payment due date: the 5th of the month following the statement end
payments['PAYMENT_DUE_DATE'] = statement_end.apply(
    lambda end: (end + one_month).replace(day=5)
)

# Attach the per-card attributes from dim_data (including CREDIT_LIMIT) to every payment.
# The default inner join drops payments whose customer/card pair is missing from dim_data.
payments_with_limit = payments.merge(dim_data, on=['CIF hash', 'CARD_NBR hash'])

# A payment counts as high utilization when it is at least 90% of the card's credit limit.
# Column-wise comparison instead of a row-wise apply: same result per row, but it also yields
# a proper (empty) boolean column when there are no payments at all.
payments_with_limit['HIGH_UTILIZATION'] = (
    payments_with_limit['NET_CASH_FLOW_AMT_LCY'] >= 0.9 * payments_with_limit['CREDIT_LIMIT']
)

# Keep only the high-utilization payments. .copy() makes this an independent frame so that
# columns can be added to it later without pandas' SettingWithCopyWarning.
high_utilization_payments = payments_with_limit[payments_with_limit['HIGH_UTILIZATION']].copy()

# Now for each high utilization payment, check for spending transactions within 30 minutes
# ... (rest of the code that checks for spending transactions)


In [None]:
import pandas as pd
from datetime import datetime, timedelta

# Load the dim and fact tables from their CSV files
dim_data = pd.read_csv('/mnt/data/dim.csv')
fact_data = pd.read_csv('/mnt/data/fact.csv')

# TXN_TM comes out of read_csv unparsed. Convert it so that the sort below is chronological
# rather than lexicographic, and so that the statement-period calculation and the time-window
# comparisons applied to TXN_TM further down work on real timestamps instead of strings.
fact_data['TXN_TM'] = pd.to_datetime(fact_data['TXN_TM'])

# Sort the fact_data by transaction time in ascending order
fact_data_sorted = fact_data.sort_values(by='TXN_TM')

# Transaction-type codes found in the TML_WEB_AP_NM column
PAYMENT_TYPE_CODE = 'OPTP0028'  # Code for payment transactions
SPENDING_TYPE_CODE = 'OPTP0000'  # Code for spending transactions

# Calculate the statement period for each transaction
def get_statement_period(txn_date):
    """Return ``(statement_start, statement_end)`` for a transaction timestamp.

    Statement cycles run from the 22nd of one month to the 21st of the next.
    A transaction dated on the 22nd or later is mapped to the cycle that
    started on the 22nd of the previous month; a transaction dated before the
    22nd is mapped to the cycle that started on the 22nd two months earlier.

    Month arithmetic goes through ``pd.DateOffset`` so that it rolls over year
    boundaries; plain ``month - 1`` / ``month - 2`` / ``month + 1`` arithmetic
    raises ``ValueError`` for January, February and December dates.

    Both bounds are normalized to midnight, so every transaction of the same
    cycle yields identical values and they can be used as group-by keys.

    Parameters
    ----------
    txn_date : pandas.Timestamp
        Transaction timestamp.

    Returns
    -------
    tuple of (pandas.Timestamp, pandas.Timestamp)
        Midnight on the 22nd that opens the cycle and midnight on the 21st
        that closes it.

    NOTE(review): the returned cycle ends before ``txn_date`` itself --
    confirm this is the intended rule for the callers that group and filter
    on it.
    NOTE(review): ``statement_end`` is midnight at the start of the 21st, so an
    inclusive ``<= statement_end`` comparison leaves out later transactions on
    the 21st -- confirm the intended closing boundary.
    NOTE(review): an earlier cell defines a function with the same name but a
    different rule; whichever cell runs last wins.
    """
    months_back = 1 if txn_date.day >= 22 else 2
    # Every month has a 22nd, so replace(day=22) is always valid after the month shift
    statement_start = (
        (txn_date - pd.DateOffset(months=months_back)).replace(day=22).normalize()
    )
    statement_end = (statement_start + pd.DateOffset(months=1)).replace(day=21)
    return statement_start, statement_end

# Tag every transaction with the statement period returned by get_statement_period
fact_data_sorted['STATEMENT_START'], fact_data_sorted['STATEMENT_END'] = zip(
    *fact_data_sorted['TXN_TM'].apply(get_statement_period)
)

# Statement balance per customer / card / statement period, joined with the per-card
# attributes from dim_data (including CREDIT_LIMIT). Built as one chain so the name
# statement_balances only ever refers to the finished DataFrame.
statement_balances = (
    fact_data_sorted
    .groupby(['CIF hash', 'CARD_NBR hash', 'STATEMENT_START', 'STATEMENT_END'])['NET_CASH_FLOW_AMT_LCY']
    .sum()
    .reset_index(name='STATEMENT_BALANCE')
    .merge(dim_data, on=['CIF hash', 'CARD_NBR hash'])
)

# A statement is high utilization when its balance is at least 90% of the credit limit
statement_balances['HIGH_UTILIZATION'] = (
    statement_balances['STATEMENT_BALANCE'] >= 0.9 * statement_balances['CREDIT_LIMIT']
)

# Keep only the high-utilization statements. .copy() makes this an independent frame so that
# a column can be added to it afterwards without pandas' SettingWithCopyWarning.
high_utilization_cases = statement_balances[statement_balances['HIGH_UTILIZATION']].copy()

# Now, for each high utilization case, find if there are any spendings within 30 minutes after paying off the debt
def find_suspicious_transactions(row, transactions, window_minutes=30, min_spend_ratio=0.9):
    """Tell whether any payment in a statement is quickly followed by matching spending.

    Parameters
    ----------
    row : row-like (e.g. a pandas Series)
        Must provide 'CIF hash', 'CARD_NBR hash', 'STATEMENT_START' and
        'STATEMENT_END'.
    transactions : pandas.DataFrame
        Must provide 'CIF hash', 'CARD_NBR hash', 'TXN_TM', 'TML_WEB_AP_NM'
        and 'NET_CASH_FLOW_AMT_LCY'.
    window_minutes : int or float, default 30
        How long after a payment a spending still counts as a follow-up.
    min_spend_ratio : float, default 0.9
        Minimum ratio of summed follow-up spending to the payment amount.

    Returns
    -------
    bool
        True as soon as one payment has follow-up spending totalling at least
        ``min_spend_ratio`` times the payment amount; False otherwise.

    Only transactions of the row's customer and card with TXN_TM inside
    [STATEMENT_START, STATEMENT_END] are considered. Payments and spendings are
    recognised through the module-level PAYMENT_TYPE_CODE and
    SPENDING_TYPE_CODE constants.

    NOTE(review): the comparison assumes payment and spending amounts in
    NET_CASH_FLOW_AMT_LCY carry the same sign -- confirm against the data.
    """
    # Restrict to this customer/card and to the statement period of the row
    in_scope = transactions[
        (transactions['CIF hash'] == row['CIF hash']) &
        (transactions['CARD_NBR hash'] == row['CARD_NBR hash']) &
        (transactions['TXN_TM'] >= row['STATEMENT_START']) &
        (transactions['TXN_TM'] <= row['STATEMENT_END'])
    ]

    txn_type = in_scope['TML_WEB_AP_NM']
    card_payments = in_scope[txn_type == PAYMENT_TYPE_CODE]
    # The spending filter does not depend on the payment, so it is computed once up front
    card_spendings = in_scope[txn_type == SPENDING_TYPE_CODE]
    window = timedelta(minutes=window_minutes)

    for paid_at, paid_amount in zip(card_payments['TXN_TM'], card_payments['NET_CASH_FLOW_AMT_LCY']):
        # Follow-ups: strictly after the payment, at most `window` later
        followups = card_spendings[
            (card_spendings['TXN_TM'] > paid_at) &
            (card_spendings['TXN_TM'] <= paid_at + window)
        ]
        if followups.empty:
            continue
        if followups['NET_CASH_FLOW_AMT_LCY'].sum() >= min_spend_ratio * paid_amount:
            return True
    return False

# Apply the function to find suspicious transactions
high_utilization_cases['SUSPICIOUS_ACTIVITY'] = high_util
