In [1]:
# Notebook setup: enable the autoreload extension in mode 2 so edits to the
# project modules imported below are picked up without restarting the kernel,
# and render matplotlib figures inline.
%load_ext autoreload

%autoreload 2
%matplotlib inline

In [2]:
import os
import sys
sys.path.append('..')

from scripts.data_utils.loaders import *
from scripts.data_utils.cleaner import *
from scripts.utils.visualization import *
from scripts.data_utils.preprocess import *
from scripts.data_utils.feature_engineering import *

# import matplotlib
# matplotlib.use('TkAgg')

In [3]:
# Project-relative locations. All paths are built relative to the parent of the
# current working directory ('..').
RESOURCEPATH = os.path.join('..', 'resources')
DATAPATH = os.path.join(RESOURCEPATH, 'data')
preprocessed_output_dir = os.path.join(DATAPATH, 'preprocessed')
raw_data_path = os.path.join(DATAPATH, 'raw')
plot_output_dir = os.path.join('..', 'screenshots', 'plots')

# Ensure every directory this notebook writes into exists before any save call.
# `exist_ok=True` keeps the cell idempotent on re-runs.
os.makedirs(preprocessed_output_dir, exist_ok=True)
os.makedirs(plot_output_dir, exist_ok=True)

In [4]:
# Load the CSV file from the raw-data folder and show its (rows, columns) shape.
# NOTE(review): `data_csv` is not referenced again in the cells visible in this
# notebook (the pipeline below starts from `data_xlsx`) — confirm whether this
# load is still needed.
filename = 'data.csv'
file_path = os.path.join(raw_data_path, filename)
data_csv = load_data(file_path)
data_csv.shape

2025-01-29 04:11:09 - INFO - Loading data from ..\resources\data\raw\data.csv
2025-01-29 04:11:09 - INFO - Loading data from ..\resources\data\raw\data.csv
2025-01-29 04:11:09 - INFO - Loading data from ..\resources\data\raw\data.csv
2025-01-29 04:11:09 - INFO - Loading data from ..\resources\data\raw\data.csv
2025-01-29 04:11:09 - INFO - Loading data from ..\resources\data\raw\data.csv
2025-01-29 04:11:13 - INFO - Successfully loaded data from ..\resources\data\raw\data.csv
2025-01-29 04:11:13 - INFO - Successfully loaded data from ..\resources\data\raw\data.csv
2025-01-29 04:11:13 - INFO - Successfully loaded data from ..\resources\data\raw\data.csv
2025-01-29 04:11:13 - INFO - Successfully loaded data from ..\resources\data\raw\data.csv
2025-01-29 04:11:13 - INFO - Successfully loaded data from ..\resources\data\raw\data.csv


(95662, 16)

In [5]:
# Load the sheet named 'data' from the Excel workbook in the raw-data folder
# and show its (rows, columns) shape.
# NOTE(review): in the recorded run this load took far longer than the CSV load
# and returned two more columns than the CSV; if both files hold the same rows,
# the CSV may be the cheaper pipeline input — confirm before switching.
filename2 = 'data.xlsx'
file_path2 = os.path.join(raw_data_path, filename2)
data_xlsx = load_data(file_path2, sheet_name='data')
data_xlsx.shape

2025-01-29 04:11:13 - INFO - Loading data from ..\resources\data\raw\data.xlsx
2025-01-29 04:11:13 - INFO - Loading data from ..\resources\data\raw\data.xlsx
2025-01-29 04:11:13 - INFO - Loading data from ..\resources\data\raw\data.xlsx
2025-01-29 04:11:13 - INFO - Loading data from ..\resources\data\raw\data.xlsx
2025-01-29 04:11:13 - INFO - Loading data from ..\resources\data\raw\data.xlsx
2025-01-29 04:13:02 - INFO - Successfully loaded data from ..\resources\data\raw\data.xlsx
2025-01-29 04:13:02 - INFO - Successfully loaded data from ..\resources\data\raw\data.xlsx
2025-01-29 04:13:02 - INFO - Successfully loaded data from ..\resources\data\raw\data.xlsx
2025-01-29 04:13:02 - INFO - Successfully loaded data from ..\resources\data\raw\data.xlsx
2025-01-29 04:13:02 - INFO - Successfully loaded data from ..\resources\data\raw\data.xlsx


(95662, 18)

In [7]:
# --- Preprocessing configuration ---------------------------------------------

# Base path of the saved result, without extension; the file extension is
# appended by the cell that calls `save_data`.
output_file = os.path.join(preprocessed_output_dir, "data_preprocessed")

# Columns removed before any cleaning. The 'Unnamed: N' names are presumably
# header-less columns picked up from the Excel sheet — TODO confirm.
irrelevant_columns = ['Unnamed: 16', 'Unnamed: 17']
numerical_columns = ['Amount', 'Value', 'PricingStrategy']
# Columns whose text values are normalised by `standardize_categorical_columns`.
categorical_columns = ["CurrencyCode", "ProductCategory", "ChannelId", "ProviderId"]
date_column = "TransactionStartTime"

# Column -> imputation strategy, consumed by `handle_missing_values`.
# NOTE(review): "AccountId" is an identifier; filling gaps with the most
# frequent value attributes those rows to one account — confirm this is intended.
missing_value_strategies = {
    "CountryCode": "most_frequent",
    "AccountId": "most_frequent",
    "ProviderId": "most_frequent",
    "PricingStrategy": "median",
    "Value": "mean",
}

# Column -> target dtype, consumed by `convert_data_types`.
# A dict literal keeps only the last value of a repeated key, so a second
# "CountryCode" entry would silently replace the first; each column must appear
# exactly once. "str" is the value that was effectively in use.
# NOTE(review): casting a float column straight to str produces values such as
# "256.0"; if "256" is wanted, the integer cast has to happen before the string
# cast inside the helper — confirm the desired format.
dtype_conversions = {
    "CountryCode": "str",
}

# Work on a copy so the frame loaded from Excel stays untouched and the
# cleaning cells below can be re-run from this point.
data = data_xlsx.copy()

# One-call alternative to the step-by-step cells below (disabled, kept for reference):
# data_preprocessed = preprocess_data(data, irrelevant_columns, categorical_columns, numerical_columns,
#                                     missing_value_strategies, date_column, dtype_conversions, preprocessed_output_dir)
# data_preprocessed

In [8]:
# Step 1: remove the columns listed in `irrelevant_columns`, then display the frame.
data = drop_irrelevant_columns(data, irrelevant_columns)
data

2025-01-29 04:13:03 - INFO - Dropped columns: {'Unnamed: 16', 'Unnamed: 17'}. Remaining columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']
2025-01-29 04:13:03 - INFO - Dropped columns: {'Unnamed: 16', 'Unnamed: 17'}. Remaining columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', 'FraudResult']
2025-01-29 04:13:03 - INFO - Dropped columns: {'Unnamed: 16', 'Unnamed: 17'}. Remaining columns: ['TransactionId', 'BatchId', 'AccountId', 'SubscriptionId', 'CustomerId', 'CurrencyCode', 'CountryCode', 'ProviderId', 'ProductId', 'ProductCategory', 'ChannelId', 'Amount', 'Value', 'TransactionStartTime', 'PricingStrategy', '

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.0,2018-11-15T02:18:49Z,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,,ProviderId_4,ProductId_6,financial_services,ChannelId_2,3679.0,20.0,2018-11-15T02:19:08Z,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,,ProductId_1,airtime,ChannelId_3,500.0,500.0,2018-11-15T02:44:21Z,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,,2018-11-15T03:32:55Z,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644.0,2018-11-15T03:34:21Z,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000.0,2019-02-13T09:54:09Z,2.0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.0,2019-02-13T09:54:25Z,2.0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20.0,2019-02-13T09:54:35Z,2.0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256.0,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000.0,2019-02-13T10:01:10Z,2.0,0


In [9]:
# Step 2: impute missing values column by column, using the strategy configured
# for each column in `missing_value_strategies`, then display the frame.
data = handle_missing_values(data, missing_value_strategies)
data

2025-01-29 04:13:04 - INFO - Imputed missing values in CountryCode using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in CountryCode using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in CountryCode using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in CountryCode using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in CountryCode using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in AccountId using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in AccountId using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in AccountId using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in AccountId using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed missing values in AccountId using strategy: most_frequent
2025-01-29 04:13:04 - INFO - Imputed mis

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2018-11-15T02:18:49Z,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,3679.0,20.00000,2018-11-15T02:19:08Z,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,ProviderId_4,ProductId_1,airtime,ChannelId_3,500.0,500.00000,2018-11-15T02:44:21Z,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,9900.64041,2018-11-15T03:32:55Z,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644.00000,2018-11-15T03:34:21Z,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000.00000,2019-02-13T09:54:09Z,2.0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2019-02-13T09:54:25Z,2.0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20.00000,2019-02-13T09:54:35Z,2.0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256.0,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000.00000,2019-02-13T10:01:10Z,2.0,0


In [10]:
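# Optional step (currently disabled): presumably drops rows that have missing
# values in the given columns.
# NOTE: `columns` is not defined in the cells shown in this notebook; define it
# before re-enabling the two lines below.
# data = drop_missing(data, columns)
# data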

In [11]:
# Step 3: remove duplicate rows (the helper logs how many were removed),
# then display the frame.
data = remove_duplicates(data)
data

2025-01-29 04:13:06 - INFO - Removed 0 duplicate rows.
2025-01-29 04:13:06 - INFO - Removed 0 duplicate rows.
2025-01-29 04:13:06 - INFO - Removed 0 duplicate rows.
2025-01-29 04:13:06 - INFO - Removed 0 duplicate rows.
2025-01-29 04:13:06 - INFO - Removed 0 duplicate rows.


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2018-11-15T02:18:49Z,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,3679.0,20.00000,2018-11-15T02:19:08Z,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,ProviderId_4,ProductId_1,airtime,ChannelId_3,500.0,500.00000,2018-11-15T02:44:21Z,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,9900.64041,2018-11-15T03:32:55Z,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644.00000,2018-11-15T03:34:21Z,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000.00000,2019-02-13T09:54:09Z,2.0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2019-02-13T09:54:25Z,2.0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20.00000,2019-02-13T09:54:35Z,2.0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256.0,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000.00000,2019-02-13T10:01:10Z,2.0,0


In [12]:
# Step 4: parse `date_column` into datetimes, then display the frame.
# The helper assigns into the frame it receives (`data[date_column] = ...`).
# When that frame is still flagged by pandas as derived from another frame, the
# assignment raises SettingWithCopyWarning. Handing the helper an explicit,
# independent copy makes the write target unambiguous.
# NOTE(review): if the warning persists, the slice is created inside the helper
# itself and must be fixed there (e.g. with `.copy()` after filtering).
data = validate_convert_date_column(data.copy(), date_column)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[date_column] = pd.to_datetime((


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2018-11-15 05:18:49+03:00,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,3679.0,20.00000,2018-11-15 05:19:08+03:00,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,ProviderId_4,ProductId_1,airtime,ChannelId_3,500.0,500.00000,2018-11-15 05:44:21+03:00,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,9900.64041,2018-11-15 06:32:55+03:00,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644.00000,2018-11-15 06:34:21+03:00,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000.00000,2019-02-13 12:54:09+03:00,2.0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2019-02-13 12:54:25+03:00,2.0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20.00000,2019-02-13 12:54:35+03:00,2.0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256.0,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000.00000,2019-02-13 13:01:10+03:00,2.0,0


In [13]:
# Step 5: cast the columns named in `dtype_conversions` to their configured
# dtypes, then display the frame.
# NOTE(review): the helper appears to call `astype(..., errors="ignore")`, in
# which case a failed cast would leave the column unchanged without raising —
# verify the resulting dtypes if they matter downstream.
data = convert_data_types(data, dtype_conversions)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].astype(dtype, errors="ignore")
2025-01-29 04:13:12 - INFO - Converted column CountryCode to str
2025-01-29 04:13:12 - INFO - Converted column CountryCode to str
2025-01-29 04:13:12 - INFO - Converted column CountryCode to str
2025-01-29 04:13:12 - INFO - Converted column CountryCode to str
2025-01-29 04:13:12 - INFO - Converted column CountryCode to str


Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2018-11-15 05:18:49+03:00,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,3679.0,20.00000,2018-11-15 05:19:08+03:00,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,ProviderId_4,ProductId_1,airtime,ChannelId_3,500.0,500.00000,2018-11-15 05:44:21+03:00,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,ProviderId_1,ProductId_21,utility_bill,ChannelId_3,20000.0,9900.64041,2018-11-15 06:32:55+03:00,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-644.0,644.00000,2018-11-15 06:34:21+03:00,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-1000.0,1000.00000,2019-02-13 12:54:09+03:00,2.0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256.0,ProviderId_6,ProductId_10,airtime,ChannelId_3,1000.0,1000.00000,2019-02-13 12:54:25+03:00,2.0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256.0,ProviderId_4,ProductId_6,financial_services,ChannelId_2,-20.0,20.00000,2019-02-13 12:54:35+03:00,2.0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256.0,ProviderId_6,ProductId_19,tv,ChannelId_3,3000.0,3000.00000,2019-02-13 13:01:10+03:00,2.0,0


In [14]:
# Step 6: normalise the text in `categorical_columns` (the helper strips
# surrounding whitespace and upper-cases the values), then display the frame.
# NOTE(review): this upper-cases "ProviderId" and "ChannelId" while the other
# identifier columns keep their original casing — confirm that is intended.
data = standardize_categorical_columns(data, categorical_columns)
data

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[col] = data[col].str.strip().str.upper()
2025-01-29 04:13:13 - INFO - Standardized categorical column: CurrencyCode
2025-01-29 04:13:13 - INFO - Standardized categorical column: CurrencyCode
2025-01-29 04:13:13 - INFO - Standardized categorical column: CurrencyCode
2025-01-29 04:13:13 - INFO - Standardized categorical column: CurrencyCode
2025-01-29 04:13:13 - INFO - Standardized categorical column: CurrencyCode
2025-01-29 04:13:13 - INFO - Standardized categorical column: ProductCategory
2025-01-29 04:13:13 - INFO - Standardized categorical column: ProductCategory
2025-01-29 04:13:13 - INFO - Standardized categorical column: ProductCategory
2025-01-29 04:13:13 - INFO - Standardized categorical column: ProductCategory
20

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,PROVIDERID_6,ProductId_10,AIRTIME,CHANNELID_3,1000.0,1000.00000,2018-11-15 05:18:49+03:00,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,3679.0,20.00000,2018-11-15 05:19:08+03:00,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,PROVIDERID_4,ProductId_1,AIRTIME,CHANNELID_3,500.0,500.00000,2018-11-15 05:44:21+03:00,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,PROVIDERID_1,ProductId_21,UTILITY_BILL,CHANNELID_3,20000.0,9900.64041,2018-11-15 06:32:55+03:00,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,-644.0,644.00000,2018-11-15 06:34:21+03:00,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95657,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,-1000.0,1000.00000,2019-02-13 12:54:09+03:00,2.0,0
95658,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256.0,PROVIDERID_6,ProductId_10,AIRTIME,CHANNELID_3,1000.0,1000.00000,2019-02-13 12:54:25+03:00,2.0,0
95659,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,-20.0,20.00000,2019-02-13 12:54:35+03:00,2.0,0
95660,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256.0,PROVIDERID_6,ProductId_19,TV,CHANNELID_3,3000.0,3000.00000,2019-02-13 13:01:10+03:00,2.0,0


In [15]:
# Step 7: renumber the rows 0..n-1, discarding the previous index
# (`drop=True` keeps it from being added back as a column), then display the result.
# NOTE(review): the name `preprocessedta` looks like a typo; it is also used by
# the save cell, so rename both together.
preprocessedta = data.reset_index(drop=True)
preprocessedta

Unnamed: 0,TransactionId,BatchId,AccountId,SubscriptionId,CustomerId,CurrencyCode,CountryCode,ProviderId,ProductId,ProductCategory,ChannelId,Amount,Value,TransactionStartTime,PricingStrategy,FraudResult
0,TransactionId_76871,BatchId_36123,AccountId_3957,SubscriptionId_887,CustomerId_4406,UGX,256.0,PROVIDERID_6,ProductId_10,AIRTIME,CHANNELID_3,1000.0,1000.00000,2018-11-15 05:18:49+03:00,2.0,0
1,TransactionId_73770,BatchId_15642,AccountId_4841,SubscriptionId_3829,CustomerId_4406,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,3679.0,20.00000,2018-11-15 05:19:08+03:00,2.0,0
2,TransactionId_26203,BatchId_53941,AccountId_4229,SubscriptionId_222,CustomerId_4683,UGX,256.0,PROVIDERID_4,ProductId_1,AIRTIME,CHANNELID_3,500.0,500.00000,2018-11-15 05:44:21+03:00,2.0,0
3,TransactionId_380,BatchId_102363,AccountId_648,SubscriptionId_2185,CustomerId_988,UGX,256.0,PROVIDERID_1,ProductId_21,UTILITY_BILL,CHANNELID_3,20000.0,9900.64041,2018-11-15 06:32:55+03:00,2.0,0
4,TransactionId_28195,BatchId_38780,AccountId_4841,SubscriptionId_3829,CustomerId_988,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,-644.0,644.00000,2018-11-15 06:34:21+03:00,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95656,TransactionId_89881,BatchId_96668,AccountId_4841,SubscriptionId_3829,CustomerId_3078,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,-1000.0,1000.00000,2019-02-13 12:54:09+03:00,2.0,0
95657,TransactionId_91597,BatchId_3503,AccountId_3439,SubscriptionId_2643,CustomerId_3874,UGX,256.0,PROVIDERID_6,ProductId_10,AIRTIME,CHANNELID_3,1000.0,1000.00000,2019-02-13 12:54:25+03:00,2.0,0
95658,TransactionId_82501,BatchId_118602,AccountId_4841,SubscriptionId_3829,CustomerId_3874,UGX,256.0,PROVIDERID_4,ProductId_6,FINANCIAL_SERVICES,CHANNELID_2,-20.0,20.00000,2019-02-13 12:54:35+03:00,2.0,0
95659,TransactionId_136354,BatchId_70924,AccountId_1346,SubscriptionId_652,CustomerId_1709,UGX,256.0,PROVIDERID_6,ProductId_19,TV,CHANNELID_3,3000.0,3000.00000,2019-02-13 13:01:10+03:00,2.0,0


In [16]:
# Step 8: persist the preprocessed frame twice, as CSV and as JSON, using
# `output_file` as the shared base path and the extension to pick the target file.
save_data(preprocessedta, output_file + ".csv")
save_data(preprocessedta, output_file + ".json")

2025-01-29 04:13:25 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.csv
2025-01-29 04:13:25 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.csv
2025-01-29 04:13:25 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.csv
2025-01-29 04:13:25 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.csv
2025-01-29 04:13:25 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.csv
2025-01-29 04:13:33 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.json
2025-01-29 04:13:33 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.json
2025-01-29 04:13:33 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.json
2025-01-29 04:13:33 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.json
2025-01-29 04:13:33 - INFO - Data saved to ..\resources\data\processed\data_preprocessed.json
