In [1]:
# Test notebook for feature engineering functions
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

# Import your feature engineering function
import sys
sys.path.append("../src")  # Adjust if needed
from feature_engineering import calculate_rfm_features

# Load a small sample of your data with correct delimiter
print("Loading a sample of transaction data...")
try:
    # The key fix is here - use semicolon as delimiter
    trans_data = pd.read_csv('../data/Trans.csv', delimiter=';', quotechar='"').head(1000)
    
    # Display the original column names to verify
    print("Original column names:", trans_data.columns.tolist())
    
    # Rename columns to match function expectations - names may differ now
    trans_data = trans_data.rename(columns={
        'trans_id': 'trans_id',
        'account_id': 'account_id', 
        'date': 'date',  # Make sure this matches the actual column name
        'type': 'type',
        'operation': 'operation',
        'amount': 'amount',
        'balance': 'balance'
    })
    
    # Show the data after renaming
    print("\nColumn names after renaming:", trans_data.columns.tolist())
    
    # Convert date column
    print("\nConverting date column...")
    trans_data['date'] = pd.to_datetime(trans_data['date'], format='%y%m%d')
    
    # Show the data after transformation
    print("\nData after transformation:")
    display(trans_data.head())
    
    # Load disposition data (linking clients to accounts)
    print("\nLoading disposition data...")
    disp_data = pd.read_csv('../data/Disposition.csv', delimiter=';', quotechar='"')
    
    # Display original column names
    print("Original disposition column names:", disp_data.columns.tolist())
    
    # Rename columns
    disp_data = disp_data.rename(columns={
        'disp_id': 'disp_id',
        'client_id': 'client_id',
        'account_id': 'account_id',
        'type': 'type'
    })
    
    print("\nDisposition data after transformation:")
    display(disp_data.head())
    
    # Calculate basic RFM features
    print("\nCalculating RFM features...")
    # Modified part of your notebook code
    print("\nCalculating RFM features...")
    # Before calling the function, check how many columns it expects to rename
    import inspect
    function_code = inspect.getsource(calculate_rfm_features)
    print("Function code snippet:")
    print(function_code[:500] + "...")  # Display first 500 chars

    # Define a wrapper function that fixes the issue
    def fixed_calculate_rfm(transaction_data, customer_data, reference_date=None):
        """Wrapper that handles column mismatch issues"""
        try:
            return calculate_rfm_features(transaction_data, customer_data, reference_date)
        except ValueError as e:
            if "Length mismatch" in str(e):
                # Do the calculation manually
                if reference_date is None:
                    reference_date = datetime.now()
                
                merged_data = pd.merge(
                    transaction_data,
                    customer_data[['client_id', 'account_id']],
                    on='account_id'
                )
                
                # Calculate each metric separately
                recency = merged_data.groupby('client_id')['date'].max().reset_index()
                recency['recency_days'] = (reference_date - recency['date']).dt.days
                recency = recency[['client_id', 'recency_days']]
                
                frequency = merged_data.groupby('client_id').size().reset_index(name='frequency')
                
                monetary = merged_data.groupby('client_id')['amount'].sum().reset_index(name='monetary_value')
                
                rfm = recency.merge(frequency, on='client_id').merge(monetary, on='client_id')
                return rfm
            else:
                raise  # Re-raise if it's a different error

    # Use the wrapper function
    rfm_features = fixed_calculate_rfm(trans_data, disp_data)
    
    # Display results
    print(f"RFM features calculated for {len(rfm_features)} customers")
    display(rfm_features.head())
    
except Exception as e:
    print(f"Error occurred: {str(e)}")
    import traceback
    traceback.print_exc()

Loading a sample of transaction data...


  trans_data = pd.read_csv('../data/Trans.csv', delimiter=';', quotechar='"').head(1000)


Original column names: ['trans_id', 'account_id', 'date', 'type', 'operation', 'amount', 'balance', 'k_symbol', 'bank', 'account']

Column names after renaming: ['trans_id', 'account_id', 'date', 'type', 'operation', 'amount', 'balance', 'k_symbol', 'bank', 'account']

Converting date column...

Data after transformation:


Unnamed: 0,trans_id,account_id,date,type,operation,amount,balance,k_symbol,bank,account
0,695247,2378,1993-01-01,PRIJEM,VKLAD,700.0,700.0,,,
1,171812,576,1993-01-01,PRIJEM,VKLAD,900.0,900.0,,,
2,207264,704,1993-01-01,PRIJEM,VKLAD,1000.0,1000.0,,,
3,1117247,3818,1993-01-01,PRIJEM,VKLAD,600.0,600.0,,,
4,579373,1972,1993-01-02,PRIJEM,VKLAD,400.0,400.0,,,



Loading disposition data...
Original disposition column names: ['disp_id', 'client_id', 'account_id', 'type']

Disposition data after transformation:


Unnamed: 0,disp_id,client_id,account_id,type
0,1,1,1,OWNER
1,2,2,2,OWNER
2,3,3,2,DISPONENT
3,4,4,3,OWNER
4,5,5,3,DISPONENT



Calculating RFM features...

Calculating RFM features...
Function code snippet:
def calculate_rfm_features(transaction_data, customer_data, reference_date=None):
    """
    Calculate RFM (Recency, Frequency, Monetary) features for customers
    
    Parameters:
    -----------
    transaction_data : pandas.DataFrame
        Transaction data with account_id, date, and amount
    customer_data : pandas.DataFrame
        Customer data linking client_id to account_id
    reference_date : datetime, optional
        Date to calculate recency from (default: today)
        
    Re...
RFM features calculated for 341 customers


Unnamed: 0,client_id,recency_days,frequency,monetary_value
0,2,11696,2,21336.0
1,3,11696,2,21336.0
2,12,11692,5,12905.4
3,102,11697,2,4043.0
4,103,11697,2,4043.0
