# Load & Prepare Dataset

In [30]:
import pandas as pd

# Load raw data
df = pd.read_csv("../data/raw/data.csv", parse_dates=["TransactionStartTime"])

# Preview
df.head()


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15 02:18:49+00:00,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15 02:19:08+00:00,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15 02:44:21+00:00,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15 03:32:55+00:00,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15 03:34:21+00:00,2,0


# Create Aggregate Features (Per Customer)

In [31]:
# Group by CustomerId
agg_features = df.groupby("CustomerId").agg({
    "Amount": ["sum", "mean", "std", "count"],
    "Value": ["mean"]
}).reset_index()

# Rename columns
agg_features.columns = ['CustomerId', 'TotalAmount', 'AvgAmount', 'StdAmount', 'TransactionCount', 'AvgValue']


# Extract Time-based Features

In [32]:
df["TransactionHour"] = df["TransactionStartTime"].dt.hour
df["TransactionDay"] = df["TransactionStartTime"].dt.day
df["TransactionMonth"] = df["TransactionStartTime"].dt.month
df["TransactionYear"] = df["TransactionStartTime"].dt.year

time_features = df.groupby("CustomerId").agg({
    "TransactionHour": "mean",
    "TransactionDay": "nunique",
    "TransactionMonth": "nunique"
}).reset_index()

time_features.columns = ["CustomerId", "AvgHour", "ActiveDays", "ActiveMonths"]



# Encode Categorical Variables

In [33]:
#One-Hot Encoding
categorical_cols = ["ProductCategory", "ChannelId", "CurrencyCode", "PricingStrategy"]

available_categorical_cols = [col for col in categorical_cols if col in df.columns and not df[col].isnull().all()]

df_encoded = pd.get_dummies(df[available_categorical_cols])

# df = df.drop(columns=available_categorical_cols)
# df = pd.concat([df, df_encoded], axis=1)


#Label Encoding (for tree models)
from sklearn.preprocessing import LabelEncoder

label_cols = [col for col in categorical_cols if col in df.columns and not df[col].isnull().all()]
label_encoders = {}

for col in label_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))
    label_encoders[col] = le


# Handle Missing Values

In [34]:
df.isnull().sum()

# Impute numerical missing
df["Amount"].fillna(df["Amount"].median(), inplace=True)

# Impute categorical with mode
df["ProductCategory"].fillna(df["ProductCategory"].mode()[0], inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Amount"].fillna(df["Amount"].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["ProductCategory"].fillna(df["ProductCategory"].mode()[0], inplace=True)


# Normalize/Standardize Numerical Features

In [35]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler

scaler = StandardScaler()  # or MinMaxScaler()
num_cols = ["TotalAmount", "AvgAmount", "StdAmount", "TransactionCount", "AvgValue"]

scaled_values = scaler.fit_transform(agg_features[num_cols])
agg_features[num_cols] = scaled_values


# Weight of Evidence (WoE) Encoding

In [38]:
import category_encoders as ce

# Select only categorical columns for WoE
woe_cols = ["ProductCategory", "ChannelId", "CurrencyCode", "PricingStrategy"]

# Create WoE encoder
encoder = ce.WOEEncoder(cols=woe_cols)

# Fit and transform
df_merged["label"] = df_merged["FraudResult"]
df_woe = df_merged.copy()
df_woe[woe_cols] = encoder.fit_transform(df_merged[woe_cols], df_merged["label"])


# Combine All Features into Final Dataset

In [39]:
# Merge all customer-level features
final_df = agg_features.merge(time_features, on="CustomerId")

# Optional: merge with label
final_df = final_df.merge(df[["CustomerId", "FraudResult"]].drop_duplicates(), on="CustomerId")
