In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set a style for plots
sns.set_style("whitegrid")

print("Loading the generated datasets...")

# Load all the CSV files into pandas DataFrames.
# Only the file reads are inside the `try`, so the handler below is reserved
# for the one failure it knows how to explain: a missing input file.
try:
    customers_df = pd.read_csv('customers.csv')
    merchants_df = pd.read_csv('merchants.csv')
    transactions_df = pd.read_csv('transactions.csv')
    relationships_df = pd.read_csv('relationships.csv')
    ground_truth_df = pd.read_csv('stressed_customers_ground_truth.csv')
except FileNotFoundError as e:
    print("--- ERROR ---")
    print(f"File not found: {e.filename}")
    print("Please make sure you have run the 'generate_data.py' script successfully in the same folder.")
    # Re-raise after the friendly message: every later cell needs these
    # DataFrames, so continuing would only surface as a confusing NameError
    # (or, worse, silently reuse stale variables left in the kernel).
    raise

# --- Crucial Data Type Conversion ---
# Convert the 'timestamp' column from a string to a datetime object.
# This is essential for any time-based feature engineering (e.g. the `.dt`
# accessor used later on this column).
transactions_df['timestamp'] = pd.to_datetime(transactions_df['timestamp'])

print("\n--- All files loaded successfully! ---")
print("\nHere's a preview of the transactions data:")

# Display the first few rows and info to confirm it's loaded correctly
print(transactions_df.head())
print("\nData types for transactions_df:")
transactions_df.info()



Loading the generated datasets...

--- All files loaded successfully! ---

Here's a preview of the transactions data:
                   timestamp customer_id   merchant_id  amount   type
0 2025-02-14 14:57:32.041373       C0000         M0017   39.46  DEBIT
1 2025-02-14 18:57:32.041373       C0001  LENDINGAPP01  164.86  DEBIT
2 2025-02-14 21:57:32.041373       C0001        SALARY  139.39  DEBIT
3 2025-02-14 18:57:32.041373       C0002         M0012  120.67  DEBIT
4 2025-02-14 09:57:32.041373       C0003         M0003   56.75  DEBIT

Data types for transactions_df:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279296 entries, 0 to 279295
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   timestamp    279296 non-null  datetime64[ns]
 1   customer_id  279296 non-null  object        
 2   merchant_id  279296 non-null  object        
 3   amount       279296 non-null  float64       
 4   type         2

In [3]:
# Build the binary target column 'is_stressed'.
# A customer is labelled 1 when their customer_id appears in the ground-truth
# table and 0 otherwise; this is the label the model will learn from.
stressed_mask = customers_df['customer_id'].isin(ground_truth_df['customer_id'])
customers_df['is_stressed'] = stressed_mask.astype(int)

print("Target variable 'is_stressed' added to customers_df.")
# Class balance check: how many customers fall into each label.
print(customers_df['is_stressed'].value_counts())


Target variable 'is_stressed' added to customers_df.
is_stressed
0    400
1    100
Name: count, dtype: int64


In [4]:
# Step 1 - Salary rows: keep only transactions whose merchant_id is 'SALARY'.
# A copy is taken so the helper columns added below do not touch transactions_df.
# NOTE(review): the saved preview of transactions_df shows a 'SALARY' row with
# type DEBIT - confirm whether this filter should also restrict on `type`.
is_salary_row = transactions_df['merchant_id'].eq('SALARY')
salary_df = transactions_df.loc[is_salary_row].copy()

# Step 2 - Baseline: each customer's typical salary day is taken to be the
# median day-of-month across all of their salary rows.
salary_df['day'] = salary_df['timestamp'].dt.day
median_day_by_customer = salary_df.groupby('customer_id')['day'].median()
baseline_salary_day = median_day_by_customer.to_dict()

# 3. Calculate Deviation: Find transactions that are later than the baseline
def is_late(row, grace_days=2):
    """Flag a salary row that arrived later than the customer's baseline day.

    Parameters
    ----------
    row : mapping-like
        Must support ``row['customer_id']`` and ``row['day']`` (e.g. a row of
        ``salary_df`` as passed by ``DataFrame.apply(..., axis=1)``).
    grace_days : int or float, default 2
        Number of days past the baseline that are still considered on time.
        The default reproduces the original "more than 2 days after the
        median day" rule.

    Returns
    -------
    int
        1 if ``row['day']`` is strictly greater than the customer's baseline
        day plus ``grace_days``; 0 otherwise, including when the customer has
        no entry in ``baseline_salary_day``.

    Notes
    -----
    Reads the module-level ``baseline_salary_day`` dict. The comparison uses
    only the ``day`` value, which the notebook sets to the day of the month,
    so a payment that slips into the start of the following month has a small
    ``day`` and is not flagged as late.
    """
    customer_baseline = baseline_salary_day.get(row['customer_id'])
    # No baseline for this customer: nothing to compare against, so not late.
    if customer_baseline is None:
        return 0
    return 1 if row['day'] > customer_baseline + grace_days else 0

# Flag each salary row as late (1) or on time (0) using is_late.
salary_df['is_late_salary'] = salary_df.apply(is_late, axis=1)

# 4. Feature Creation: Count late salaries for each customer
late_salary_counts = (
    salary_df.groupby('customer_id')['is_late_salary']
    .sum()
    .rename('feature_late_salary_count')
    .reset_index()
)

# Merge this new feature back into our main customers dataframe.
# Any copy of the column left by a previous run of this cell is dropped first,
# so re-running the cell does not produce '_x'/'_y' suffixed duplicates.
customers_df = (
    customers_df
    .drop(columns=['feature_late_salary_count'], errors='ignore')
    .merge(late_salary_counts, on='customer_id', how='left')
)
# Customers with no salary rows get NaN from the left merge; treat that as a
# count of 0. Only the new feature column is filled, so missing values in the
# other customer columns are left untouched.
customers_df['feature_late_salary_count'] = (
    customers_df['feature_late_salary_count'].fillna(0).astype(int)
)

print("Feature 'feature_late_salary_count' created.")
print(customers_df[['customer_id', 'is_stressed', 'feature_late_salary_count']].head())

# Let's see if the feature is more prominent for stressed customers
print("\nAverage late salary count:")
print(customers_df.groupby('is_stressed')['feature_late_salary_count'].mean())


Feature 'feature_late_salary_count' created.
  customer_id  is_stressed  feature_late_salary_count
0       C0000            0                         12
1       C0001            0                         20
2       C0002            0                         16
3       C0003            0                         14
4       C0004            0                         15

Average late salary count:
is_stressed
0    12.84
1    11.17
Name: feature_late_salary_count, dtype: float64


In [5]:
# 1. Identify Lending App merchants: merchant_ids whose category is 'LendingApp'
is_lending_app = merchants_df['category'] == 'LendingApp'
lending_app_ids = merchants_df.loc[is_lending_app, 'merchant_id'].tolist()

# 2. Filter for transactions to these apps
lending_transactions = transactions_df[transactions_df['merchant_id'].isin(lending_app_ids)]

# 3. Feature Creation: Count these transactions for each customer
lending_tx_counts = (
    lending_transactions.groupby('customer_id')
    .size()
    .reset_index(name='feature_lending_app_tx_count')
)

# Merge this new feature back into our main customers dataframe.
# Any copy of the column left by a previous run of this cell is dropped first,
# so re-running the cell does not produce '_x'/'_y' suffixed duplicates.
customers_df = (
    customers_df
    .drop(columns=['feature_lending_app_tx_count'], errors='ignore')
    .merge(lending_tx_counts, on='customer_id', how='left')
)
# Customers with no lending-app transactions get NaN from the left merge;
# treat that as a count of 0. Only the new feature column is filled, so
# missing values in the other customer columns are left untouched.
customers_df['feature_lending_app_tx_count'] = (
    customers_df['feature_lending_app_tx_count'].fillna(0).astype(int)
)

print("\nFeature 'feature_lending_app_tx_count' created.")
# Let's see if this feature is also more prominent for stressed customers
print("\nAverage lending app transaction count:")
print(customers_df.groupby('is_stressed')['feature_lending_app_tx_count'].mean())



Feature 'feature_lending_app_tx_count' created.

Average lending app transaction count:
is_stressed
0    23.81
1    76.68
Name: feature_lending_app_tx_count, dtype: float64
