# Feature Engineering

In [47]:
import sys
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '../src')))

from F_data_processing import *


In [5]:
df = pd.read_csv('../data/data.csv')
df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0


# Aggregate Features

In [10]:
agg_features = AggregateFeatures(df)
#Sum of all transaction amounts for each customer.
agg_features.sum_all_transactions()
# Average Transaction Amount: 
agg_features.average_transaction_amount()
# Transaction Count:
agg_features.transaction_count()
# Std of Transaction Amounts:
agg_features.standard_deviation_amount()


# Final  after Aggregation 

In [11]:
final_df = agg_features.get_dataframe()
final_df.head()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult,TotalTransactionAmount,AverageTransactionAmount,TotalTransactions,StdTransactionAmount
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000,2018-11-15T02:18:49Z,2,0,109921.75,923.712185,119,3042.294251
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20,2018-11-15T02:19:08Z,2,0,109921.75,923.712185,119,3042.294251
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,ChannelId_3,500.0,500,2018-11-15T02:44:21Z,2,0,1000.0,500.0,2,0.0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,21800,2018-11-15T03:32:55Z,2,0,228727.2,6019.136842,38,17169.24161
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644,2018-11-15T03:34:21Z,2,0,228727.2,6019.136842,38,17169.24161


## Deriving Features


In [14]:
extracted_features = ExtractingFeatures(final_df)

In [15]:
# Transaction Hour:
extracted_features.transaction_hour()
# Transaction Day of Week:
extracted_features.transaction_day()
# Transaction Month:
extracted_features.transaction_month()
# Transaction Year:
extracted_features.transaction_year()


# Final for the Extraction of features 

In [16]:
extracted_features.get_dataframe()

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,...,PricingStrategy,FraudResult,TotalTransactionAmount,AverageTransactionAmount,TotalTransactions,StdTransactionAmount,TransactionHour,TransactionDay,TransactionMonth,TransactionYear
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256,ProviderId_6,ProductId_10,airtime,...,2,0,109921.75,923.712185,119,3042.294251,2,15,11,2018
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,109921.75,923.712185,119,3042.294251,2,15,11,2018
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256,ProviderId_6,ProductId_1,airtime,...,2,0,1000.00,500.000000,2,0.000000,2,15,11,2018
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256,ProviderId_1,ProductId_21,utility_bill,...,2,0,228727.20,6019.136842,38,17169.241610,3,15,11,2018
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,228727.20,6019.136842,38,17169.241610,3,15,11,2018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,2438140.00,4255.043630,573,22554.029939,9,13,2,2019
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256,ProviderId_6,ProductId_10,airtime,...,2,0,58499.60,1360.455814,43,2274.756582,9,13,2,2019
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256,ProviderId_4,ProductId_6,financial_services,...,2,0,58499.60,1360.455814,43,2274.756582,9,13,2,2019
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256,ProviderId_6,ProductId_19,tv,...,2,0,851985.00,1625.925573,524,3207.920536,10,13,2,2019


 
# Encode Categorical Variables

In [None]:
#Determine columns for exclusion and identify categorical variables for encoding.
cols_to_drop = ['ProductId', 'BatchId',	'AccountId','ProviderId', 'SubscriptionId', 'Value','CountryCode','CurrencyCode']
# Categorical features
cat_features = ['ProductCategory', 'ChannelId']

final_df.drop(columns=cols_to_drop, inplace=True)

In [18]:
#Create a copy of df to preserve the original data.
df_encoded = final_df.copy().reset_index()

In [19]:
# one - Hot Encoding
df_encoded = pd.get_dummies(df_encoded, columns=cat_features, dtype=int)

In [29]:
# Remove the TransactionStartTime column from the dataset.
df_encoded.drop(columns=['TransactionStartTime'], inplace=True, errors='ignore')


In [30]:
df_encoded.head()

Unnamed: 0,index,TransactionId,CustomerId,Amount,PricingStrategy,FraudResult,TotalTransactionAmount,AverageTransactionAmount,TotalTransactions,StdTransactionAmount,...,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
0,0,TransactionId_76871,CustomerId_4406,1000.0,2,0,109921.75,923.712185,119,3042.294251,...,0,0,0,0,0,0,0,0,1,0
1,1,TransactionId_73770,CustomerId_4406,-20.0,2,0,109921.75,923.712185,119,3042.294251,...,0,0,0,0,0,0,0,1,0,0
2,2,TransactionId_26203,CustomerId_4683,500.0,2,0,1000.0,500.0,2,0.0,...,0,0,0,0,0,0,0,0,1,0
3,3,TransactionId_380,CustomerId_988,20000.0,2,0,228727.2,6019.136842,38,17169.24161,...,0,0,0,0,0,1,0,0,1,0
4,4,TransactionId_28195,CustomerId_988,-644.0,2,0,228727.2,6019.136842,38,17169.24161,...,0,0,0,0,0,0,0,1,0,0


In [31]:
#Managing Null or Missing Data
df_encoded.isnull().sum()

index                                   0
TransactionId                           0
CustomerId                              0
Amount                                  0
PricingStrategy                         0
FraudResult                             0
TotalTransactionAmount                  0
AverageTransactionAmount                0
TotalTransactions                       0
StdTransactionAmount                  712
TransactionHour                         0
TransactionDay                          0
TransactionMonth                        0
TransactionYear                         0
ProductCategory_airtime                 0
ProductCategory_data_bundles            0
ProductCategory_financial_services      0
ProductCategory_movies                  0
ProductCategory_other                   0
ProductCategory_ticket                  0
ProductCategory_transport               0
ProductCategory_tv                      0
ProductCategory_utility_bill            0
ChannelId_ChannelId_1             

In [34]:
#Imputing missing StdTransactionAmount values using the median.
df_encoded['StdTransactionAmount'] = df_encoded['StdTransactionAmount'].fillna(df_encoded['StdTransactionAmount'].median())


# Scaling Numerical Features

In [35]:
#Normalization &  Standardization
columns_to_normalize= ['TotalTransactionAmount', 'AverageTransactionAmount']
columns_to_standardize = ['TotalTransactions', 'StdTransactionAmount']

In [36]:
#Normalization
normalize_columns(df_encoded, columns_to_standardize)

Unnamed: 0,index,TransactionId,CustomerId,Amount,PricingStrategy,FraudResult,TotalTransactionAmount,AverageTransactionAmount,TotalTransactions,StdTransactionAmount,...,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
0,0,TransactionId_76871,CustomerId_4406,1000.0,2,0,109921.75,923.712185,0.028851,0.000919,...,0,0,0,0,0,0,0,0,1,0
1,1,TransactionId_73770,CustomerId_4406,-20.0,2,0,109921.75,923.712185,0.028851,0.000919,...,0,0,0,0,0,0,0,1,0,0
2,2,TransactionId_26203,CustomerId_4683,500.0,2,0,1000.00,500.000000,0.000244,0.000000,...,0,0,0,0,0,0,0,0,1,0
3,3,TransactionId_380,CustomerId_988,20000.0,2,0,228727.20,6019.136842,0.009046,0.005187,...,0,0,0,0,0,1,0,0,1,0
4,4,TransactionId_28195,CustomerId_988,-644.0,2,0,228727.20,6019.136842,0.009046,0.005187,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,95657,TransactionId_89881,CustomerId_3078,-1000.0,2,0,2438140.00,4255.043630,0.139853,0.006814,...,0,0,0,0,0,0,0,1,0,0
95658,95658,TransactionId_91597,CustomerId_3874,1000.0,2,0,58499.60,1360.455814,0.010269,0.000687,...,0,0,0,0,0,0,0,0,1,0
95659,95659,TransactionId_82501,CustomerId_3874,-20.0,2,0,58499.60,1360.455814,0.010269,0.000687,...,0,0,0,0,0,0,0,1,0,0
95660,95660,TransactionId_136354,CustomerId_1709,3000.0,2,0,851985.00,1625.925573,0.127873,0.000969,...,0,0,0,0,1,0,0,0,1,0


In [38]:
#Standardization
print(df_encoded.columns)

Index(['index', 'TransactionId', 'CustomerId', 'Amount', 'PricingStrategy',
       'FraudResult', 'TotalTransactionAmount', 'AverageTransactionAmount',
       'TotalTransactions', 'StdTransactionAmount', 'TransactionHour',
       'TransactionDay', 'TransactionMonth', 'TransactionYear',
       'ProductCategory_airtime', 'ProductCategory_data_bundles',
       'ProductCategory_financial_services', 'ProductCategory_movies',
       'ProductCategory_other', 'ProductCategory_ticket',
       'ProductCategory_transport', 'ProductCategory_tv',
       'ProductCategory_utility_bill', 'ChannelId_ChannelId_1',
       'ChannelId_ChannelId_2', 'ChannelId_ChannelId_3',
       'ChannelId_ChannelId_5'],
      dtype='object')


In [41]:
def standardize_columns(df, columns):
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

df_encoded = standardize_columns(df_encoded, columns_to_standardize)

In [42]:
df_encoded.head()

Unnamed: 0,index,TransactionId,CustomerId,Amount,PricingStrategy,FraudResult,TotalTransactionAmount,AverageTransactionAmount,TotalTransactions,StdTransactionAmount,...,ProductCategory_movies,ProductCategory_other,ProductCategory_ticket,ProductCategory_transport,ProductCategory_tv,ProductCategory_utility_bill,ChannelId_ChannelId_1,ChannelId_ChannelId_2,ChannelId_ChannelId_3,ChannelId_ChannelId_5
0,0,TransactionId_76871,CustomerId_4406,1000.0,2,0,109921.75,923.712185,-0.311831,-0.167524,...,0,0,0,0,0,0,0,0,1,0
1,1,TransactionId_73770,CustomerId_4406,-20.0,2,0,109921.75,923.712185,-0.311831,-0.167524,...,0,0,0,0,0,0,0,1,0,0
2,2,TransactionId_26203,CustomerId_4683,500.0,2,0,1000.0,500.0,-0.444993,-0.201719,...,0,0,0,0,0,0,0,0,1,0
3,3,TransactionId_380,CustomerId_988,20000.0,2,0,228727.2,6019.136842,-0.40402,-0.008737,...,0,0,0,0,0,1,0,0,1,0
4,4,TransactionId_28195,CustomerId_988,-644.0,2,0,228727.2,6019.136842,-0.40402,-0.008737,...,0,0,0,0,0,0,0,1,0,0


# Save the processed data to a new CSV file.


In [44]:
df_encoded.to_csv('../data/processed_data.csv', index=False)