## 1. Setup and Data Loading

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
from datetime import datetime

# Make the project's src/ directory importable so the notebook can use the
# local data_processing module. The path is relative, so it is resolved
# against the notebook's working directory.
# The membership check keeps re-runs of this cell from appending the same
# entry to sys.path over and over.
SRC_DIR = '../src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

from data_processing import (
    load_data,
    preprocess_data,
    generate_rfms_features,
    create_risk_proxy,
    get_feature_correlations,
)

# Notebook-wide plotting defaults: seaborn's white-grid style and a wide
# default figure size (width, height in inches).
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

: 

In [None]:

# Load the raw transactions file and run it through the project's
# preprocessing step. Both helpers come from the local data_processing
# module; their exact behaviour is not visible in this notebook.
# NOTE(review): later cells read df['TransactionStartTime'], 'CustomerId'
# and 'FraudResult', so preprocess_data is presumably expected to keep
# those columns (and parse the timestamp) — confirm in data_processing.
DATA_PATH = '../data/raw/Xente_Train.csv'  # Relative to the notebook's working directory; adjust as needed
raw_df = load_data(DATA_PATH)  # Unprocessed data, kept separately from the cleaned frame
df = preprocess_data(raw_df)  # Working DataFrame used by the rest of the notebook

: 

# Cell 5 - Markdown
## 2. Overview of the Data
# 
# Inspect the structure of the dataset: its shape (rows and columns), the data type of each column, and the percentage of missing values per column.

In [None]:

# Dimensions of the preprocessed frame as (rows, columns).
print(f"Dataset shape: {df.shape}")

# DataFrame.info() writes its summary directly to stdout and returns None,
# so it must be called on its own: wrapping it in print() would emit a
# stray "None" line after the summary.
print("\nData Types:")
df.info()

# Share of missing values per column, expressed as a percentage of all rows.
print("\nMissing Values (%):")
print(df.isnull().sum() / len(df) * 100)

In [None]:
## 3. Feature Engineering: RFMS + Proxy Variable
# 
# Now we'll use our processing functions to engineer the behavioral features and create the risk proxy. This is the core of our analysis.

In [None]:
# Reference point for the recency calculation: one day past the most recent
# transaction timestamp in the data.
last_transaction_time = df['TransactionStartTime'].max()
snapshot_date = last_transaction_time + pd.Timedelta(days=1)

# Behavioural (RFMS) features, produced by the project helper from the
# transactions and the snapshot date.
rfms_features = generate_rfms_features(df, snapshot_date)

# The risk proxy is built from the RFMS features together with the fraud
# flag, which is taken from the transaction-level frame.
fraud_columns = ['CustomerId', 'FraudResult']
fraud_data = df[fraud_columns]
final_features_df = create_risk_proxy(rfms_features, fraud_data)

# Preview the first rows of the result (rendered as the cell output).
print("Generated Features and Proxy Variable DataFrame:")
final_features_df.head()

## 4. Distribution of Engineered Features
# 
# Let's visualize the distributions of our new `Recency`, `Frequency`, `Monetary`, and `Std_Dev_Amount` features.

In [None]:

# 2x2 grid with one histogram (plus KDE overlay) per engineered feature.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Distribution of Engineered RFMS Features', fontsize=16)

# One entry per panel: (feature column, grid position, panel title,
# whether the x-axis is switched to a log scale after plotting).
# Frequency and Monetary get a log x-axis because such features are
# often highly skewed.
histogram_panels = [
    ('Recency', (0, 0), 'Recency Distribution', False),
    ('Frequency', (0, 1), 'Frequency Distribution (Log Scale)', True),
    ('Monetary', (1, 0), 'Monetary Distribution (Log Scale)', True),
    ('Std_Dev_Amount', (1, 1), 'Spending Volatility (Std. Dev.)', False),
]

for feature_name, grid_pos, panel_title, use_log_x in histogram_panels:
    panel_ax = axes[grid_pos]
    sns.histplot(final_features_df[feature_name], ax=panel_ax, kde=True, bins=50)
    panel_ax.set_title(panel_title)
    if use_log_x:
        panel_ax.set_xscale('log')

# Leave headroom at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


### Box Plots for Outlier Detection

In [None]:

# One vertical box plot per engineered feature, laid out side by side.
boxplot_features = ['Recency', 'Frequency', 'Monetary', 'Std_Dev_Amount']

fig, axes = plt.subplots(1, len(boxplot_features), figsize=(20, 5))
fig.suptitle('Outlier Analysis using Box Plots', fontsize=16)

for box_ax, feature_name in zip(axes, boxplot_features):
    sns.boxplot(y=final_features_df[feature_name], ax=box_ax)

# Leave headroom at the top of the figure for the suptitle.
plt.tight_layout(rect=[0, 0.03, 1, 0.95])
plt.show()


## 5. Feature Correlation with Risk Proxy
# 
# This is a critical step to validate our feature engineering. We need to see if the features we created have a strong relationship with the `high_risk` proxy variable. This will inform our feature selection for the model.

In [None]:
# Correlate the engineered features with the risk proxy. The project helper
# receives only the RFMS columns plus the target, and the target's name.
target_column = 'high_risk'
correlation_inputs = ['Recency', 'Frequency', 'Monetary', 'Std_Dev_Amount', target_column]
feature_corr = get_feature_correlations(final_features_df[correlation_inputs], target_column)

# Annotated heatmap of the helper's result, two decimals per cell.
plt.figure(figsize=(8, 6))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation of RFMS Features with High-Risk Proxy')
plt.show()

# Also print the raw numbers so they can be read without the figure.
print("Correlation values:")
print(feature_corr)