In [4]:
import sys
import os

# Add the project root (one level up from src) to sys.path
sys.path.append(os.path.abspath(".."))
import pandas as pd

In [5]:
# Load the dataset (adjust the path if needed)
df = pd.read_csv("../data/raw/data.csv")

In [6]:
from src.data_processing import CustomerAggregateFeatures
agg = CustomerAggregateFeatures()
agg_df = agg.fit_transform(df)
agg_df.head()

Unnamed: 0,CustomerId,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount
0,CustomerId_1,-10000.0,-10000.0,1,
1,CustomerId_10,-10000.0,-10000.0,1,
2,CustomerId_1001,20000.0,4000.0,5,6558.963333
3,CustomerId_1002,4225.0,384.090909,11,560.498966
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146


In [7]:
from src.data_processing import TimeFeaturesExtractor

time_features = TimeFeaturesExtractor()
time_df = time_features.fit_transform(df)
time_df.head()

Unnamed: 0,CustomerId,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,CustomerId_4406,2,15,11,2018
1,CustomerId_4406,2,15,11,2018
2,CustomerId_4683,2,15,11,2018
3,CustomerId_988,3,15,11,2018
4,CustomerId_988,3,15,11,2018


In [8]:
# First, aggregate time features per customer
time_agg = time_df.groupby('CustomerId').agg(
    MostCommonHour=('TransactionHour', lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0]),
    MostCommonDay=('TransactionDay', lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0]),
    MostCommonMonth=('TransactionMonth', lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0]),
    MostCommonYear=('TransactionYear', lambda x: x.mode()[0] if not x.mode().empty else x.iloc[0])
).reset_index()

# Now merge with agg_df
customer_df = pd.merge(agg_df, time_agg, on='CustomerId', how='left')

# Preview the combined customer-level dataset
customer_df.head()


Unnamed: 0,CustomerId,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,MostCommonHour,MostCommonDay,MostCommonMonth,MostCommonYear
0,CustomerId_1,-10000.0,-10000.0,1,,16,21,11,2018
1,CustomerId_10,-10000.0,-10000.0,1,,16,21,11,2018
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,8,16,11,2018
3,CustomerId_1002,4225.0,384.090909,11,560.498966,14,9,11,2018
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,14,1,2,2019


In [9]:
from src.data_processing import CategoricalEncoder

# Choose columns to encode from the original df (still at transaction level)
categorical_cols = ['CurrencyCode', 'ProductCategory', 'ChannelId']

# Run encoder
encoder = CategoricalEncoder(columns=categorical_cols)
encoded_df = encoder.fit_transform(df)

# Aggregate to customer level (use mean — works since 1/0 values)
encoded_grouped = encoded_df.groupby('CustomerId').mean().reset_index()

# Merge with customer_df
customer_df = pd.merge(customer_df, encoded_grouped, on='CustomerId', how='left')

# Preview the final DataFrame
customer_df.head()


Unnamed: 0,CustomerId,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,MostCommonHour,MostCommonDay,MostCommonMonth,MostCommonYear,CurrencyCode_UGX,...,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
0,CustomerId_1,-10000.0,-10000.0,1,,16,21,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,CustomerId_10,-10000.0,-10000.0,1,,16,21,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,8,16,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.6,0.0
3,CustomerId_1002,4225.0,384.090909,11,560.498966,14,9,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.545455,0.454545,0.0
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,14,1,2,2019,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.666667,0.0


In [10]:
# Aggregate the label per customer: 1 if customer had any fraud, else 0
fraud_df = df.groupby('CustomerId')['FraudResult'].max().reset_index()
fraud_df.rename(columns={'FraudResult': 'AnyFraud'}, inplace=True)

# Merge into customer_df
customer_df = pd.merge(customer_df, fraud_df, on='CustomerId', how='left')
customer_df.head()

Unnamed: 0,CustomerId,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,MostCommonHour,MostCommonDay,MostCommonMonth,MostCommonYear,CurrencyCode_UGX,...,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,AnyFraud
0,CustomerId_1,-10000.0,-10000.0,1,,16,21,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,CustomerId_10,-10000.0,-10000.0,1,,16,21,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,CustomerId_1001,20000.0,4000.0,5,6558.963333,8,16,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.4,0.6,0.0,0
3,CustomerId_1002,4225.0,384.090909,11,560.498966,14,9,11,2018,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.545455,0.454545,0.0,0
4,CustomerId_1003,20000.0,3333.333333,6,6030.478146,14,1,2,2019,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.666667,0.0,0


In [11]:
from src.data_processing import build_numeric_pipeline

# Identify numeric columns (excluding CustomerId and target)
numeric_columns = customer_df.select_dtypes(include=['float64', 'int64']).columns.tolist()
numeric_columns = [col for col in numeric_columns if col not in ['CustomerId', 'AnyFraud']]

# Build and fit-transform pipeline
numeric_pipeline = build_numeric_pipeline(numeric_columns)
X_scaled = numeric_pipeline.fit_transform(customer_df)

# Convert back to DataFrame
scaled_df = pd.DataFrame(X_scaled, columns=numeric_columns)
scaled_df['CustomerId'] = customer_df['CustomerId'].values
scaled_df['AnyFraud'] = customer_df['AnyFraud'].values  # re-attach label

scaled_df.head()


Unnamed: 0,TotalTransactionAmount,AvgTransactionAmount,TransactionCount,StdTransactionAmount,CurrencyCode_UGX,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5,CustomerId,AnyFraud
0,-0.066891,-0.153364,-0.253459,0.0,0.0,2.073016,-0.228416,-1.904511,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,3.137802,-2.80444,-0.147303,CustomerId_1,0
1,-0.066891,-0.153364,-0.253459,0.0,0.0,2.073016,-0.228416,-1.904511,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,3.137802,-2.80444,-0.147303,CustomerId_10,0
2,-0.055849,-0.06987,-0.212186,-0.105976,0.0,-0.039359,-0.228416,0.268985,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,0.480118,-0.342663,-0.147303,CustomerId_1001,0
3,-0.061655,-0.091435,-0.150278,-0.168036,0.0,0.152675,-0.228416,0.071395,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,1.124405,-0.939458,-0.147303,CustomerId_1002,0
4,-0.055849,-0.073846,-0.201868,-0.111444,0.0,0.312703,-0.228416,-0.093264,-0.099954,-0.01635,-0.106712,-0.052922,-0.172187,-0.230127,-0.12953,0.18482,-0.069133,-0.147303,CustomerId_1003,0


In [12]:
scaled_df.to_csv("../data/raw/featured.csv", index=False)