# Feature Engineering Pipeline

In [1]:
# Import required libraries and feature processing function
import pandas as pd
import os
import sys
# Make the parent directory importable so the `src` package import below resolves.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
from src.data_processing import process_features


In [2]:
# Read the raw transaction CSV (path is relative to the notebook's working directory)
df = pd.read_csv(filepath_or_buffer='../data/raw/data.csv')

In [3]:
# Preview the first rows of the raw frame (head() defaults to 5 rows)
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


In [4]:
# Summarize the raw frame: column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   TransactionId         95662 non-null  object 
 1   BatchId               95662 non-null  object 
 2   AccountId             95662 non-null  object 
 3   SubscriptionId        95662 non-null  object 
 4   CustomerId            95662 non-null  object 
 5   CurrencyCode          95662 non-null  object 
 6   CountryCode           95662 non-null  int64  
 7   ProviderId            95662 non-null  object 
 8   ProductId             95662 non-null  object 
 9   ProductCategory       95662 non-null  object 
 10  ChannelId             95662 non-null  object 
 11  Amount                95662 non-null  float64
 12  Value                 95662 non-null  int64  
 13  TransactionStartTime  95662 non-null  object 
 14  PricingStrategy       95662 non-null  int64  
 15  FraudResult        

In [5]:
# Strip the literal "<ColumnName>_" text from every object-typed column whose name
# ends in 'Id' and cast the remainder to int (e.g. "AccountId_3957" -> 3957).
#
# Building the replacements in a dict comprehension keeps helper names out of the
# notebook namespace, so no `del` is needed afterwards (the previous `del ..., col`
# raised NameError when no column matched, because the loop never bound `col`).
# `regex=False` makes the literal match explicit; the default for `regex` differs
# between pandas versions.
# Columns already converted to a numeric dtype are skipped, so the cell can be re-run.
df = df.assign(**{
    col: df[col].str.replace(f'{col}_', '', regex=False).astype(int)
    for col in df.columns
    if col.endswith('Id') and df[col].dtype == 'object'
})

In [6]:
# Verify whether every 'Value' equals the magnitude of 'Amount';
# when it does not, print a sample of the rows that disagree.
if not {'Value', 'Amount'}.issubset(df.columns):
    print("'Value' or 'Amount' column not found in DataFrame.")
else:
    is_abs = df['Value'].eq(df['Amount'].abs()).all()
    print(f"All 'Value' are absolute of 'Amount': {is_abs}")
    if not is_abs:
        # Show mismatches if any
        print(df.loc[df['Value'].ne(df['Amount'].abs()), ['Value', 'Amount']].head())

All 'Value' are absolute of 'Amount': False
    Value    Amount
3   21800  20000.00
34     68    -67.25
41  11200  10000.00
48   5750   5000.00
67   5750   5000.00


In [7]:
# Overwrite 'Value' with the integer-cast magnitude of 'Amount'.
# NOTE(review): astype(int) truncates fractional amounts (e.g. 67.25 -> 67), and the
# check in the previous cell printed rows where 'Value' differs from abs('Amount');
# confirm that discarding the original 'Value' entries is intended.
if not {'Value', 'Amount'}.issubset(df.columns):
    print("'Value' or 'Amount' column not found in DataFrame.")
else:
    df['Value'] = df['Amount'].abs().astype(int)
    print("'Value' column replaced with abs('Amount').")

'Value' column replaced with abs('Amount').


In [8]:
print("The count of unique values in the columns:")
drop_columns = ['TransactionId', 'CurrencyCode', 'CountryCode']
print(df.shape)
# filter(items=...) keeps only the listed columns that are still present, so this
# cell can be re-run after the drop below without raising KeyError.
print(df.filter(items=drop_columns).nunique())

# Drop columns that are not useful for modeling: TransactionId is unique for every
# row, while CurrencyCode and CountryCode each hold a single value.
# errors='ignore' makes the drop a no-op for columns that are already gone.
df = df.drop(columns=drop_columns, errors='ignore')

The count of unique values in the columns:
(95662, 16)
TransactionId    95662
CurrencyCode         1
CountryCode          1
dtype: int64


In [9]:
# Process features using the pipeline.
# `process_features` is imported from src.data_processing; the transformations it
# applies are defined in that module, not in this notebook.
features = process_features(df)
# Preview the first rows of the processed frame
features.head()

Unnamed: 0,BatchId,AccountId,SubscriptionId,CustomerId,Amount,Value,PricingStrategy,FraudResult,transaction_hour,transaction_day,...,ProductCategory_airtime,ProductCategory_data_bundles,ProductCategory_financial_services,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ProductId_label
0,36123,3957,887,4406,-0.046371,-0.071953,2,0,2,15,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,15642,4841,3829,4406,-0.054643,-0.079915,2,0,2,15,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19
2,53941,4229,222,4683,-0.050426,-0.076015,2,0,2,15,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,102363,648,2185,988,0.107717,0.082399,2,0,3,15,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,11
4,38780,4841,3829,988,-0.059704,-0.074846,2,0,3,15,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,19


In [10]:
# Summarize the processed frame: column dtypes and non-null counts
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95662 entries, 0 to 95661
Data columns (total 36 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   BatchId                             95662 non-null  int64  
 1   AccountId                           95662 non-null  int64  
 2   SubscriptionId                      95662 non-null  int64  
 3   CustomerId                          95662 non-null  int64  
 4   Amount                              95662 non-null  float64
 5   Value                               95662 non-null  float64
 6   PricingStrategy                     95662 non-null  int64  
 7   FraudResult                         95662 non-null  int64  
 8   transaction_hour                    95662 non-null  int32  
 9   transaction_day                     95662 non-null  int32  
 10  transaction_month                   95662 non-null  int32  
 11  transaction_year                    95662

In [12]:
# Count missing values in each column
missing_values = features.isnull().sum()

# Total missing cells across the whole frame. The per-column counts are summed
# directly: calling .isnull() on the counts would only test whether the counts
# themselves are NaN, which they never are, so the result would always be 0.
total_missing_values = missing_values.sum()

print(f"\nMissing values in the dataset:\n{total_missing_values}")


Missing values in the dataset:
0


In [13]:
# Save the processed features to a CSV file
output_path = '../data/processed/features.csv'
# to_csv does not create missing parent directories, so make sure the target
# folder exists (for example on a fresh checkout) before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
features.to_csv(output_path, index=False)
print(f'Features saved to {output_path}')

Features saved to ../data/processed/features.csv
