# Data Preprocessing and Feature Engineering for Fraud Detection

This notebook performs data preprocessing and feature engineering on the fraud detection datasets:
1. Fraud_Data.csv - E-commerce transaction data
2. IpAddress_to_Country.csv - IP to country mapping
3. creditcard.csv - Bank transaction data

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

# Add the src directory to the path
if os.path.abspath('..') not in sys.path:
    sys.path.append(os.path.abspath('..'))

# Import custom modules
from src.load_data import load_data
from src.preprocessing import (
    handle_missing_values, clean_data, encode_categorical_features,
    scale_features, handle_class_imbalance, convert_ip_to_int
)
from src.feature_engineering import (
    add_time_features,
    add_transaction_features,
    add_amount_features,
)

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('viridis')

# Display all columns
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:.0f}'.format)

%load_ext autoreload
%autoreload 2

## **1. Load the Datasets**

In [11]:
# Load the datasets
fraud_data = load_data('../data/raw/Fraud_Data.csv')
ip_country = load_data('../data/raw/IpAddress_to_Country.csv')
creditcard = load_data('../data/raw/creditcard.csv')

## 2. Preprocess Fraud_Data.csv

In [12]:
# Clean data
fraud_data = clean_data(fraud_data)

Number of duplicates: 0


In [13]:
# Handle missing values
fraud_data = handle_missing_values(fraud_data)

Missing values before imputation:
Series([], dtype: int64)
Missing values after imputation:
Series([], dtype: int64)


In [14]:
fraud_data['ip_address']

0         732758369
1         350311388
2        2621473820
3        3840542444
4         415583117
            ...    
151107   3451154527
151108   2439047221
151109   2748470524
151110   3601174708
151111   4103824511
Name: ip_address, Length: 151112, dtype: float64

In [15]:
ip_country['lower_bound_ip_address']

0          16777216
1          16777472
2          16777728
3          16778240
4          16779264
            ...    
138841   3758092288
138842   3758093312
138843   3758095360
138844   3758095872
138845   3758096128
Name: lower_bound_ip_address, Length: 138846, dtype: float64

In [28]:
# Merge with IP-to-country data

def find_country(ip_int):
    # Binary search would be faster, but linear search is okay for small data
    row = ip_country[(ip_country['lower_bound_ip_address'] <= ip_int) & 
                        (ip_country['upper_bound_ip_address'] >= ip_int)]
    if not row.empty:
        return row.iloc[0]['country']
    return 'Unknown'


# Map countries
fraud_data["country"] = fraud_data["ip_address"].apply(find_country)
# Check the merged data


In [39]:
fraud_data.to_csv('../data/processed/fraud_data_with_country.csv', index = False)

In [41]:
fraud_data.device_id.nunique()

137956

In [29]:
fraud_copy = fraud_data.copy()

In [30]:
# Checking country values after mapping
fraud_copy

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758369,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311388,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621473820,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542444,0,Unknown
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583117,0,United States
...,...,...,...,...,...,...,...,...,...,...,...,...
151107,345170,2015-01-27 03:03:34,2015-03-29 00:30:47,43,XPSKTWGPWINLR,SEO,Chrome,M,28,3451154527,1,United States
151108,274471,2015-05-15 17:43:29,2015-05-26 12:24:39,35,LYSFABUCPCGBA,SEO,Safari,M,32,2439047221,0,Netherlands
151109,368416,2015-03-03 23:07:31,2015-05-20 07:07:47,40,MEQHCSJUBRBFE,SEO,IE,F,26,2748470524,0,Japan
151110,207709,2015-07-09 20:06:07,2015-09-07 09:34:46,46,CMCXFGRHYSTVJ,SEO,Chrome,M,37,3601174708,0,United States


In [31]:
# Add time-based features
fraud_copy = add_time_features(fraud_copy)

# Add transaction features
fraud_copy = add_transaction_features(fraud_copy)

# Check the new features
fraud_copy.head()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.




Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,hour_of_day,day_of_week,time_since_signup,user_transaction_count,avg_user_purchase_value,purchase_value_ratio
116708,2,2015-01-11 03:47:13,2015-02-21 10:03:37,54,FGBQNDNBETFJJ,SEO,Chrome,F,25,880217485,0,United States,10,5,990,1,54,1
15108,4,2015-06-02 16:40:57,2015-09-26 21:32:16,41,MKFUIVOHLJBYN,Direct,Safari,F,38,2785906107,0,Switzerland,21,5,2789,1,41,1
46047,8,2015-05-28 07:53:06,2015-08-13 11:53:07,47,SCQGQALXBUQZJ,SEO,Chrome,M,25,356056737,0,United States,11,3,1852,1,47,1
67650,9,2015-05-16 15:58:32,2015-05-20 23:06:42,62,IEZOHXPZBIRTE,SEO,FireFox,M,21,759104706,0,Unknown,23,2,103,1,62,1
109067,12,2015-01-10 06:25:12,2015-03-04 20:56:37,35,MSNWCFEHKTIOY,Ads,Safari,M,19,2985180353,0,Mexico,20,2,1287,1,35,1


In [32]:
for i in fraud_copy.columns:
    print(i)

user_id
signup_time
purchase_time
purchase_value
device_id
source
browser
sex
age
ip_address
class
country
hour_of_day
day_of_week
time_since_signup
user_transaction_count
avg_user_purchase_value
purchase_value_ratio


In [33]:
# Checking missing values after feature engineering
import sidetable as stb
fraud_copy.stb.missing()

Unnamed: 0,missing,total,percent
user_id,0,151112,0
signup_time,0,151112,0
purchase_time,0,151112,0
purchase_value,0,151112,0
device_id,0,151112,0
source,0,151112,0
browser,0,151112,0
sex,0,151112,0
age,0,151112,0
ip_address,0,151112,0


In [34]:
fraud_copy.info()

<class 'pandas.core.frame.DataFrame'>
Index: 151112 entries, 116708 to 109602
Data columns (total 18 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   user_id                  151112 non-null  float64       
 1   signup_time              151112 non-null  datetime64[ns]
 2   purchase_time            151112 non-null  datetime64[ns]
 3   purchase_value           151112 non-null  float64       
 4   device_id                151112 non-null  object        
 5   source                   151112 non-null  object        
 6   browser                  151112 non-null  object        
 7   sex                      151112 non-null  object        
 8   age                      151112 non-null  float64       
 9   ip_address               151112 non-null  float64       
 10  class                    151112 non-null  float64       
 11  country                  151112 non-null  object        
 12  hour_of_day     

In [35]:
# Encode categorical features
categorical_cols = ['source', 'browser', 'sex', 'country']
fraud_copy = encode_categorical_features(fraud_copy, categorical_cols)

# Check the encoded features
fraud_copy.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,age,ip_address,class,hour_of_day,day_of_week,time_since_signup,user_transaction_count,avg_user_purchase_value,purchase_value_ratio,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_F,sex_M,country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Antigua and Barbuda,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,country_Bahamas,country_Bahrain,country_Bangladesh,country_Barbados,country_Belarus,country_Belgium,country_Belize,country_Benin,country_Bermuda,country_Bhutan,country_Bolivia,country_Bonaire; Sint Eustatius; Saba,country_Bosnia and Herzegowina,country_Botswana,country_Brazil,country_British Indian Ocean Territory,country_Brunei Darussalam,country_Bulgaria,country_Burkina Faso,country_Burundi,country_Cambodia,country_Cameroon,country_Canada,country_Cape Verde,country_Cayman Islands,country_Chile,country_China,country_Colombia,country_Congo,country_Congo The Democratic Republic of The,country_Costa Rica,country_Cote D'ivoire,country_Croatia (LOCAL Name: Hrvatska),country_Cuba,country_Curacao,country_Cyprus,country_Czech Republic,country_Denmark,country_Djibouti,country_Dominica,country_Dominican Republic,country_Ecuador,country_Egypt,country_El Salvador,country_Estonia,country_Ethiopia,country_European Union,country_Faroe Islands,country_Fiji,country_Finland,country_France,country_Gabon,country_Gambia,country_Georgia,country_Germany,country_Ghana,country_Gibraltar,country_Greece,country_Guadeloupe,country_Guam,country_Guatemala,country_Haiti,country_Honduras,country_Hong Kong,country_Hungary,country_Iceland,country_India,country_Indonesia,country_Iran (ISLAMIC Republic Of),country_Iraq,country_Ireland,country_Israel,country_Italy,country_Jamaica,country_Japan,country_Jordan,country_Kazakhstan,country_Kenya,country_Korea Republic of,country_Kuwait,country_Kyrgyzstan,country_Lao People's Democratic Republic,country_Latvia,country_Lebanon,country_Lesotho,country_Libyan Arab Jamahiriya,country_Liechtenstein,country_Lithuania,country_Luxembourg,country_Macau,country_Macedonia,country_Madagascar,country_Malawi,country_Malaysia,country_Maldives,country_Malta,country_Mauritius,country_Mexico,country_Moldova Republic of,country_Monaco,country_Mongolia,country_Montenegro,country_Morocco,country_Mozambique,country_Myanmar,country_Namibia,country_Nauru,country_Nepal,country_Netherlands,country_New Caledonia,country_New Zealand,country_Nicaragua,country_Niger,country_Nigeria,country_Norway,country_Oman,country_Pakistan,country_Palestinian Territory Occupied,country_Panama,country_Papua New Guinea,country_Paraguay,country_Peru,country_Philippines,country_Poland,country_Portugal,country_Puerto Rico,country_Qatar,country_Reunion,country_Romania,country_Russian Federation,country_Rwanda,country_Saint Kitts and Nevis,country_Saint Martin,country_San Marino,country_Saudi Arabia,country_Senegal,country_Serbia,country_Seychelles,country_Singapore,country_Slovakia (SLOVAK Republic),country_Slovenia,country_South Africa,country_South Sudan,country_Spain,country_Sri Lanka,country_Sudan,country_Sweden,country_Switzerland,country_Syrian Arab Republic,country_Taiwan; Republic of China (ROC),country_Tajikistan,country_Tanzania United Republic of,country_Thailand,country_Trinidad and Tobago,country_Tunisia,country_Turkey,country_Turkmenistan,country_Uganda,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Unknown,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
116708,2,2015-01-11 03:47:13,2015-02-21 10:03:37,54,FGBQNDNBETFJJ,25,880217485,0,10,5,990,1,54,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
15108,4,2015-06-02 16:40:57,2015-09-26 21:32:16,41,MKFUIVOHLJBYN,38,2785906107,0,21,5,2789,1,41,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
46047,8,2015-05-28 07:53:06,2015-08-13 11:53:07,47,SCQGQALXBUQZJ,25,356056737,0,11,3,1852,1,47,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
67650,9,2015-05-16 15:58:32,2015-05-20 23:06:42,62,IEZOHXPZBIRTE,21,759104706,0,23,2,103,1,62,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
109067,12,2015-01-10 06:25:12,2015-03-04 20:56:37,35,MSNWCFEHKTIOY,19,2985180353,0,20,2,1287,1,35,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [36]:
# Drop unnecessary columns
cols_to_drop = ['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time']
fraud_copy = fraud_copy.drop(cols_to_drop, axis=1)

# Check the final dataset
fraud_copy.head()

Unnamed: 0,purchase_value,age,class,hour_of_day,day_of_week,time_since_signup,user_transaction_count,avg_user_purchase_value,purchase_value_ratio,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_F,sex_M,country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Antigua and Barbuda,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,country_Bahamas,country_Bahrain,country_Bangladesh,country_Barbados,country_Belarus,country_Belgium,country_Belize,country_Benin,country_Bermuda,country_Bhutan,country_Bolivia,country_Bonaire; Sint Eustatius; Saba,country_Bosnia and Herzegowina,country_Botswana,country_Brazil,country_British Indian Ocean Territory,country_Brunei Darussalam,country_Bulgaria,country_Burkina Faso,country_Burundi,country_Cambodia,country_Cameroon,country_Canada,country_Cape Verde,country_Cayman Islands,country_Chile,country_China,country_Colombia,country_Congo,country_Congo The Democratic Republic of The,country_Costa Rica,country_Cote D'ivoire,country_Croatia (LOCAL Name: Hrvatska),country_Cuba,country_Curacao,country_Cyprus,country_Czech Republic,country_Denmark,country_Djibouti,country_Dominica,country_Dominican Republic,country_Ecuador,country_Egypt,country_El Salvador,country_Estonia,country_Ethiopia,country_European Union,country_Faroe Islands,country_Fiji,country_Finland,country_France,country_Gabon,country_Gambia,country_Georgia,country_Germany,country_Ghana,country_Gibraltar,country_Greece,country_Guadeloupe,country_Guam,country_Guatemala,country_Haiti,country_Honduras,country_Hong Kong,country_Hungary,country_Iceland,country_India,country_Indonesia,country_Iran (ISLAMIC Republic Of),country_Iraq,country_Ireland,country_Israel,country_Italy,country_Jamaica,country_Japan,country_Jordan,country_Kazakhstan,country_Kenya,country_Korea Republic of,country_Kuwait,country_Kyrgyzstan,country_Lao People's Democratic Republic,country_Latvia,country_Lebanon,country_Lesotho,country_Libyan Arab Jamahiriya,country_Liechtenstein,country_Lithuania,country_Luxembourg,country_Macau,country_Macedonia,country_Madagascar,country_Malawi,country_Malaysia,country_Maldives,country_Malta,country_Mauritius,country_Mexico,country_Moldova Republic of,country_Monaco,country_Mongolia,country_Montenegro,country_Morocco,country_Mozambique,country_Myanmar,country_Namibia,country_Nauru,country_Nepal,country_Netherlands,country_New Caledonia,country_New Zealand,country_Nicaragua,country_Niger,country_Nigeria,country_Norway,country_Oman,country_Pakistan,country_Palestinian Territory Occupied,country_Panama,country_Papua New Guinea,country_Paraguay,country_Peru,country_Philippines,country_Poland,country_Portugal,country_Puerto Rico,country_Qatar,country_Reunion,country_Romania,country_Russian Federation,country_Rwanda,country_Saint Kitts and Nevis,country_Saint Martin,country_San Marino,country_Saudi Arabia,country_Senegal,country_Serbia,country_Seychelles,country_Singapore,country_Slovakia (SLOVAK Republic),country_Slovenia,country_South Africa,country_South Sudan,country_Spain,country_Sri Lanka,country_Sudan,country_Sweden,country_Switzerland,country_Syrian Arab Republic,country_Taiwan; Republic of China (ROC),country_Tajikistan,country_Tanzania United Republic of,country_Thailand,country_Trinidad and Tobago,country_Tunisia,country_Turkey,country_Turkmenistan,country_Uganda,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Unknown,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
116708,54,25,0,10,5,990,1,54,1,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
15108,41,38,0,21,5,2789,1,41,1,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
46047,47,25,0,11,3,1852,1,47,1,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
67650,62,21,0,23,2,103,1,62,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
109067,35,19,0,20,2,1287,1,35,1,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [55]:
fraud_copy.columns

Index(['purchase_value', 'age', 'class', 'hour_of_day', 'day_of_week',
       'time_since_signup', 'user_transaction_count',
       'avg_user_purchase_value', 'purchase_value_ratio', 'source_Ads',
       ...
       'country_Unknown', 'country_Uruguay', 'country_Uzbekistan',
       'country_Vanuatu', 'country_Venezuela', 'country_Viet Nam',
       'country_Virgin Islands (U.S.)', 'country_Yemen', 'country_Zambia',
       'country_Zimbabwe'],
      dtype='object', length=201)

In [56]:
# Step 2: Mutual Information
from sklearn.feature_selection import mutual_info_classif
X = fraud_copy.drop(["class"], axis=1)
y = fraud_copy["class"]
mi_scores = mutual_info_classif(X, y)
# Keep features with high MI scores

In [58]:
# Find the top 30 features with the highest mutual information scores
import numpy as np
mi_scores_series = pd.Series(mi_scores, index=X.columns)
top30_mi_features = mi_scores_series.sort_values(ascending=False).head(30)
top30_mi_features


time_since_signup           0
purchase_value_ratio        0
user_transaction_count      0
sex_M                       0
sex_F                       0
source_SEO                  0
source_Ads                  0
country_United States       0
browser_Chrome              0
browser_IE                  0
browser_Safari              0
day_of_week                 0
source_Direct               0
age                         0
purchase_value              0
country_Bahamas             0
browser_FireFox             0
country_Unknown             0
country_Greece              0
country_Qatar               0
country_Belize              0
country_Korea Republic of   0
country_Japan               0
country_Egypt               0
country_Cambodia            0
country_Cape Verde          0
country_Honduras            0
country_Algeria             0
country_Nigeria             0
country_Thailand            0
dtype: float64

In [None]:
# Compute correlation of all features with 'class' and filter top 10 by absolute correlation (excluding 'class' itself)
corr_with_class = fraud_copy.corr()['class'].drop('class').abs().sort_values(ascending=False)
top10_corr_features = corr_with_class.head(10)

In [52]:
corr_with_class.head(30)

time_since_signup      0
country_Luxembourg     0
source_Direct          0
country_New Zealand    0
day_of_week            0
country_Ireland        0
country_Tunisia        0
country_Peru           0
country_Sri Lanka      0
country_Ecuador        0
browser_Chrome         0
country_Namibia        0
country_Saudi Arabia   0
browser_IE             0
country_Denmark        0
source_SEO             0
country_Germany        0
country_Kuwait         0
country_Canada         0
country_Unknown        0
country_Chile          0
country_Mexico         0
country_Bolivia        0
country_Poland         0
country_Bulgaria       0
country_Romania        0
country_Lithuania      0
country_Turkmenistan   0
country_Norway         0
country_Sweden         0
Name: class, dtype: float64

In [47]:
drop_cols

Unnamed: 0,purchase_value,age,class,hour_of_day,day_of_week,time_since_signup,user_transaction_count,avg_user_purchase_value,purchase_value_ratio,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,sex_F,sex_M,country_Afghanistan,country_Albania,country_Algeria,country_Angola,country_Antigua and Barbuda,country_Argentina,country_Armenia,country_Australia,country_Austria,country_Azerbaijan,country_Bahamas,country_Bahrain,country_Bangladesh,country_Barbados,country_Belarus,country_Belgium,country_Belize,country_Benin,country_Bermuda,country_Bhutan,country_Bolivia,country_Bonaire; Sint Eustatius; Saba,country_Bosnia and Herzegowina,country_Botswana,country_Brazil,country_British Indian Ocean Territory,country_Brunei Darussalam,country_Bulgaria,country_Burkina Faso,country_Burundi,country_Cambodia,country_Cameroon,country_Canada,country_Cape Verde,country_Cayman Islands,country_Chile,country_China,country_Colombia,country_Congo,country_Congo The Democratic Republic of The,country_Costa Rica,country_Cote D'ivoire,country_Croatia (LOCAL Name: Hrvatska),country_Cuba,country_Curacao,country_Cyprus,country_Czech Republic,country_Denmark,country_Djibouti,country_Dominica,country_Dominican Republic,country_Ecuador,country_Egypt,country_El Salvador,country_Estonia,country_Ethiopia,country_European Union,country_Faroe Islands,country_Fiji,country_Finland,country_France,country_Gabon,country_Gambia,country_Georgia,country_Germany,country_Ghana,country_Gibraltar,country_Greece,country_Guadeloupe,country_Guam,country_Guatemala,country_Haiti,country_Honduras,country_Hong Kong,country_Hungary,country_Iceland,country_India,country_Indonesia,country_Iran (ISLAMIC Republic Of),country_Iraq,country_Ireland,country_Israel,country_Italy,country_Jamaica,country_Japan,country_Jordan,country_Kazakhstan,country_Kenya,country_Korea Republic of,country_Kuwait,country_Kyrgyzstan,country_Lao People's Democratic Republic,country_Latvia,country_Lebanon,country_Lesotho,country_Libyan Arab Jamahiriya,country_Liechtenstein,country_Lithuania,country_Luxembourg,country_Macau,country_Macedonia,country_Madagascar,country_Malawi,country_Malaysia,country_Maldives,country_Malta,country_Mauritius,country_Mexico,country_Moldova Republic of,country_Monaco,country_Mongolia,country_Montenegro,country_Morocco,country_Mozambique,country_Myanmar,country_Namibia,country_Nauru,country_Nepal,country_Netherlands,country_New Caledonia,country_New Zealand,country_Nicaragua,country_Niger,country_Nigeria,country_Norway,country_Oman,country_Pakistan,country_Palestinian Territory Occupied,country_Panama,country_Papua New Guinea,country_Paraguay,country_Peru,country_Philippines,country_Poland,country_Portugal,country_Puerto Rico,country_Qatar,country_Reunion,country_Romania,country_Russian Federation,country_Rwanda,country_Saint Kitts and Nevis,country_Saint Martin,country_San Marino,country_Saudi Arabia,country_Senegal,country_Serbia,country_Seychelles,country_Singapore,country_Slovakia (SLOVAK Republic),country_Slovenia,country_South Africa,country_South Sudan,country_Spain,country_Sri Lanka,country_Sudan,country_Sweden,country_Switzerland,country_Syrian Arab Republic,country_Taiwan; Republic of China (ROC),country_Tajikistan,country_Tanzania United Republic of,country_Thailand,country_Trinidad and Tobago,country_Tunisia,country_Turkey,country_Turkmenistan,country_Uganda,country_Ukraine,country_United Arab Emirates,country_United Kingdom,country_United States,country_Unknown,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
purchase_value,1,0,0,0,-0,0,,1,,-0,0,0,0,-0,-0,-0,0,-0,0,-0,-0,-0,-0,-0,-0,0,-0,-0,0,0,-0,0,-0,-0,-0,0,-0,-0,-0,0,-0,0,-0,0,-0,0,-0,-0,-0,0,-0,0,-0,-0,0,-0,0,0,-0,-0,-0,0,-0,-0,0,0,-0,-0,0,-0,0,0,0,0,0,-0,-0,0,-0,-0,-0,-0,-0,0,-0,0,-0,-0,0,-0,-0,0,-0,0,-0,0,-0,-0,-0,0,0,0,0,-0,-0,-0,0,0,-0,-0,-0,-0,0,-0,0,0,0,-0,0,-0,-0,-0,-0,0,-0,-0,0,-0,-0,-0,-0,0,0,-0,0,0,0,-0,0,0,-0,0,-0,0,0,-0,-0,-0,-0,-0,-0,-0,0,-0,0,-0,0,-0,-0,0,0,0,-0,-0,-0,0,-0,0,-0,0,-0,0,-0,0,-0,-0,-0,0,0,-0,-0,0,0,-0,0,0,-0,-0,-0,-0,-0,0,-0,0,-0,-0,0,0,-0,-0,0
age,0,1,0,-0,-0,-0,,0,,-0,0,-0,-0,-0,0,-0,0,-0,0,-0,-0,0,0,0,-0,-0,-0,0,-0,0,0,-0,-0,-0,0,-0,0,0,0,0,-0,-0,-0,0,-0,-0,-0,-0,-0,-0,-0,-0,0,0,-0,-0,0,-0,-0,-0,0,-0,-0,0,0,-0,0,0,0,0,0,0,0,-0,-0,-0,-0,-0,-0,-0,0,0,-0,-0,-0,-0,0,-0,0,-0,-0,-0,-0,-0,-0,0,0,0,0,0,-0,-0,0,-0,-0,0,0,-0,0,-0,-0,-0,0,0,-0,-0,-0,0,-0,0,0,-0,-0,0,-0,0,0,0,0,-0,-0,-0,-0,0,-0,-0,0,-0,-0,0,-0,0,0,0,0,-0,0,0,0,0,0,-0,0,0,-0,0,0,-0,-0,0,0,-0,0,0,-0,-0,-0,0,-0,0,-0,-0,-0,-0,0,0,-0,-0,-0,-0,0,0,-0,-0,-0,-0,0,-0,0,-0,0,0,0,0,-0,-0,0,-0,-0,-0,-0
class,0,0,1,0,0,-0,,0,,-0,0,-0,0,0,-0,-0,-0,-0,0,0,-0,0,-0,-0,0,0,-0,-0,-0,-0,-0,-0,-0,-0,0,-0,-0,-0,-0,0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,0,-0,-0,0,-0,-0,-0,-0,0,0,-0,-0,-0,-0,0,0,-0,-0,-0,0,0,-0,-0,-0,-0,-0,-0,0,0,-0,-0,-0,-0,-0,-0,0,-0,-0,-0,-0,0,0,-0,0,0,-0,0,-0,0,-0,-0,-0,0,-0,-0,-0,-0,0,-0,-0,0,0,-0,-0,-0,0,0,-0,-0,-0,0,-0,-0,0,-0,0,-0,-0,-0,0,-0,-0,-0,0,-0,-0,-0,-0,0,-0,-0,-0,0,-0,-0,-0,-0,-0,-0,0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,0,0,-0,0,-0,-0,-0,-0,-0,-0,0,-0,0,-0,-0,-0,-0,-0,-0,-0,0,-0,0,-0,0,0,0,0,-0,-0,0,-0,0,-0,0,-0,-0,-0
hour_of_day,0,-0,0,1,-0,0,,0,,-0,0,0,-0,0,-0,0,0,0,-0,0,0,0,-0,0,-0,-0,-0,-0,0,-0,-0,0,0,0,-0,0,0,0,-0,0,-0,-0,0,-0,0,0,0,0,-0,-0,0,-0,0,0,-0,0,0,0,-0,0,-0,0,-0,-0,0,-0,0,-0,0,-0,-0,0,0,-0,0,0,0,0,-0,0,-0,0,0,-0,-0,0,0,-0,0,0,0,0,-0,-0,-0,-0,0,-0,-0,-0,-0,-0,0,-0,-0,0,-0,0,0,-0,-0,-0,0,-0,0,0,-0,0,-0,0,-0,0,0,-0,-0,-0,0,-0,0,0,0,-0,0,-0,0,0,0,0,-0,0,0,-0,-0,0,-0,0,-0,-0,-0,-0,0,-0,0,0,-0,-0,0,0,0,0,0,-0,-0,-0,0,0,0,0,-0,-0,0,0,0,0,-0,-0,0,0,0,-0,0,0,0,-0,-0,0,0,-0,-0,-0,-0,0,-0,0,0,0,-0,0,-0,-0,-0
day_of_week,-0,-0,0,-0,1,-0,,-0,,0,-0,0,0,-0,0,-0,0,0,-0,-0,0,-0,-0,-0,0,-0,0,-0,-0,0,0,0,-0,0,-0,0,-0,-0,0,0,-0,0,-0,-0,-0,-0,0,-0,-0,-0,0,0,-0,-0,0,-0,-0,0,-0,0,-0,0,-0,0,0,0,0,0,-0,0,0,-0,-0,0,0,0,-0,-0,0,-0,-0,-0,0,-0,0,-0,0,-0,0,-0,0,0,-0,0,-0,0,0,-0,-0,-0,-0,0,0,0,-0,-0,0,-0,-0,0,-0,0,0,-0,-0,0,0,-0,0,0,-0,0,0,-0,-0,-0,0,-0,-0,0,0,-0,0,0,-0,-0,-0,0,-0,0,-0,0,0,0,-0,-0,-0,0,0,-0,0,0,0,0,-0,-0,-0,-0,0,0,0,-0,-0,0,-0,0,0,0,-0,0,-0,0,-0,0,-0,0,0,0,0,-0,-0,0,0,0,0,-0,0,-0,0,-0,0,-0,-0,0,0,-0,-0,-0,0,-0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
country_Viet Nam,0,0,-0,-0,-0,0,,0,,-0,0,-0,0,-0,0,-0,-0,-0,0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,1,-0,-0,-0,-0
country_Virgin Islands (U.S.),0,-0,0,0,-0,-0,,0,,-0,0,0,0,-0,0,-0,-0,0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,1,-0,-0,-0
country_Yemen,-0,-0,-0,-0,0,-0,,-0,,-0,-0,0,-0,-0,0,-0,-0,0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,1,-0,-0
country_Zambia,-0,-0,-0,-0,-0,0,,-0,,0,-0,-0,-0,-0,0,0,0,-0,0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,-0,1,-0


In [None]:
# Save the preprocessed data
fraud_copy.to_csv('../data/processed/fraud_data_preprocessed.csv', index=False)

In [None]:
# Scale numerical features
fraud_data, scaler_fraud = scale_features(fraud_data)

# Check the scaled features
fraud_data.describe()

## 3. Preprocess creditcard.csv

In [13]:
# Clean data
creditcard = clean_data(creditcard)

# Handle missing values
creditcard = handle_missing_values(creditcard)

Number of duplicates: 1081
Missing values before imputation:
Series([], dtype: int64)
Missing values after imputation:
Series([], dtype: int64)


In [14]:
# Add time-based features
creditcard = add_time_features(creditcard)

# Add amount-based features
creditcard = add_amount_features(creditcard)

# Check the new features
creditcard.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class,time_hours,time_of_day,hour_of_day,hour_sin,hour_cos,log_amount,amount_bin
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0,0.0,0.0,0,0.0,1.0,5.01476,8
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0,0.0,0.0,0,0.0,1.0,1.305626,1
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0,0.000278,1.0,0,0.0,1.0,5.939276,9
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0,0.000278,1.0,0,0.0,1.0,4.824306,8
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0,0.000556,2.0,0,0.0,1.0,4.262539,7


In [15]:
# Scale numerical features
creditcard, scaler_cc = scale_features(creditcard)

# Check the scaled features
creditcard.describe()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class,time_hours,time_of_day,hour_of_day,hour_sin,hour_cos,log_amount,amount_bin
count,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0,283726.0
mean,1.218105e-16,-5.659779e-17,-2.404154e-18,-1.121939e-16,1.422458e-17,3.3658160000000006e-17,4.808308e-18,-1.883254e-17,-1.1419730000000002e-17,-3.1905130000000005e-17,9.215924e-18,2.724708e-17,-3.2055390000000002e-18,-1.552683e-17,1.2020770000000001e-17,1.362354e-17,-1.903289e-18,1.2621810000000001e-17,-1.5226310000000002e-17,1.2546680000000002e-17,-5.810039e-18,-9.215924e-18,1.923323e-17,2.6045e-18,1.412441e-17,4.0069240000000005e-17,-1.207086e-17,-5.860126e-18,-1.502596e-19,-5.4093470000000005e-17,0.001667,1.218105e-16,1.186049e-16,-5.1288620000000006e-17,-1.050816e-16,-5.769970000000001e-17,1.005738e-16,1.30225e-16
std,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,0.040796,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002,1.000002
min,-1.996823,-28.95933,-44.15594,-32.03279,-4.016602,-82.60323,-19.64022,-35.48131,-62.09721,-12.2616,-22.8416,-4.709521,-18.78306,-5.81909,-20.17886,-4.9186,-16.17386,-29.86682,-11.34527,-8.868289,-70.77808,-48.11386,-15.08956,-71.84198,-4.68415,-19.75208,-5.403358,-57.02547,-47.04087,-0.3533268,0.0,-1.996823,-2.486646,-2.407217,-1.161524,-1.183391,-1.903207,-1.558191
25%,-0.8552128,-0.4732329,-0.3620488,-0.5907784,-0.5990519,-0.5022907,-0.576526,-0.451517,-0.1763908,-0.586609,-0.4962232,-0.7478525,-0.4076546,-0.6514439,-0.4473627,-0.6366813,-0.5356809,-0.5745927,-0.5965414,-0.5606559,-0.2748846,-0.3148664,-0.7489963,-0.259582,-0.5856219,-0.6086746,-0.6781691,-0.1829567,-0.1626863,-0.3309625,0.0,-0.8552128,-0.6738937,-0.6933643,-0.9556001,-0.9913246,-0.7644127,-0.8640676
50%,-0.2131081,0.007426456,0.04134568,0.1182157,-0.01363454,-0.04015609,-0.2057379,0.03181559,0.01929719,-0.04655399,-0.08527996,-0.03191039,0.1405355,-0.01359251,0.05246341,0.05274504,0.07549218,-0.07838176,-0.004367121,0.00446453,-0.08122341,-0.04015703,0.009233221,-0.01820863,0.06737087,0.03167716,-0.1085382,-0.0007187408,0.0327422,-0.2654671,0.0,-0.2131081,0.08069497,0.1635622,-0.3930054,-0.1208329,-0.01102319,-0.1699443
75%,0.9369423,0.6725541,0.4885028,0.6796321,0.5251188,0.4432736,0.2987636,0.4632165,0.2769666,0.5454844,0.4227584,0.7257918,0.6209995,0.6656182,0.5167783,0.7094403,0.5978627,0.4733517,0.5976293,0.5640338,0.172757,0.2577181,0.7290881,0.236572,0.7257359,0.6732285,0.4981036,0.2260181,0.2369592,-0.04378088,0.0,0.9369423,0.8196987,0.8491034,0.7733279,0.967011,0.7298789,0.8712406
max,1.642362,1.257179,13.39762,6.217985,11.93504,25.27211,55.03507,98.2255,16.96961,14.23708,22.061,11.79787,7.891145,7.159011,11.05478,9.702455,19.81693,10.98314,6.018269,6.875329,51.19687,37.57826,14.49606,36.1202,7.569585,14.42736,7.296299,79.87613,103.1847,102.2476,1.0,1.642362,1.618571,1.534645,1.912551,1.683812,4.224415,1.565364


In [16]:
# Save the preprocessed data
creditcard.to_csv('../data/processed/creditcard_preprocessed.csv', index=False)

## 4. Prepare Data for Modeling

In [17]:
# Prepare Fraud_Data for modeling
X_fraud = fraud_data.drop('class', axis=1)
y_fraud = fraud_data['class']

# Prepare creditcard for modeling
X_cc = creditcard.drop('Class', axis=1)
y_cc = creditcard['Class']

# Print shapes
print(f"Fraud_Data: X shape = {X_fraud.shape}, y shape = {y_fraud.shape}")
print(f"Creditcard: X shape = {X_cc.shape}, y shape = {y_cc.shape}")

Fraud_Data: X shape = (151112, 18), y shape = (151112,)
Creditcard: X shape = (283726, 37), y shape = (283726,)


In [18]:
# Split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Fraud_Data
X_train_fraud, X_test_fraud, y_train_fraud, y_test_fraud = train_test_split(
    X_fraud, y_fraud, test_size=0.2, random_state=42, stratify=y_fraud
)

# Creditcard
X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(
    X_cc, y_cc, test_size=0.2, random_state=42, stratify=y_cc
)

# Print shapes
print(f"Fraud_Data: X_train shape = {X_train_fraud.shape}, X_test shape = {X_test_fraud.shape}")
print(f"Creditcard: X_train shape = {X_train_cc.shape}, X_test shape = {X_test_cc.shape}")

Fraud_Data: X_train shape = (120889, 18), X_test shape = (30223, 18)
Creditcard: X_train shape = (226980, 37), X_test shape = (56746, 37)


In [None]:
# Handle class imbalance for Fraud_Data
X_train_fraud_resampled, y_train_fraud_resampled = handle_class_imbalance(
    X_train_fraud, y_train_fraud, method='smote', sampling_strategy=0.1
)

# Handle class imbalance for Creditcard
X_train_cc_resampled, y_train_cc_resampled = handle_class_imbalance(
    X_train_cc, y_train_cc, method='smote', sampling_strategy=0.1
)

In [None]:
# Save the train-test split data
import joblib

# Create directory if it doesn't exist
os.makedirs('../data/processed', exist_ok=True)

# Save Fraud_Data splits
joblib.dump(X_train_fraud, '../data/processed/X_train_fraud.pkl')
joblib.dump(X_test_fraud, '../data/processed/X_test_fraud.pkl')
joblib.dump(y_train_fraud, '../data/processed/y_train_fraud.pkl')
joblib.dump(y_test_fraud, '../data/processed/y_test_fraud.pkl')
joblib.dump(X_train_fraud_resampled, '../data/processed/X_train_fraud_resampled.pkl')
joblib.dump(y_train_fraud_resampled, '../data/processed/y_train_fraud_resampled.pkl')

# Save Creditcard splits
joblib.dump(X_train_cc, '../data/processed/X_train_cc.pkl')
joblib.dump(X_test_cc, '../data/processed/X_test_cc.pkl')
joblib.dump(y_train_cc, '../data/processed/y_train_cc.pkl')
joblib.dump(y_test_cc, '../data/processed/y_test_cc.pkl')
joblib.dump(X_train_cc_resampled, '../data/processed/X_train_cc_resampled.pkl')
joblib.dump(y_train_cc_resampled, '../data/processed/y_train_cc_resampled.pkl')

# Save scalers
joblib.dump(scaler_fraud, '../data/processed/scaler_fraud.pkl')
joblib.dump(scaler_cc, '../data/processed/scaler_cc.pkl')

print("Data splits saved successfully.")

## 5. Summary of Preprocessing Steps

### Fraud_Data.csv
1. Cleaned data by removing duplicates and correcting data types
2. Handled missing values using imputation
3. Merged with IP-to-country data for geolocation analysis
4. Added time-based features (hour_of_day, day_of_week, time_since_signup)
5. Added transaction features (user_transaction_count, time_since_last_transaction, etc.)
6. Encoded categorical features (source, browser, sex, country)
7. Dropped unnecessary columns (user_id, device_id, ip_address, signup_time, purchase_time)
8. Scaled numerical features
9. Split data into training and testing sets
10. Handled class imbalance using SMOTE

### creditcard.csv
1. Cleaned data by removing duplicates and correcting data types
2. Handled missing values using imputation
3. Added time-based features
4. Added amount-based features
5. Scaled numerical features
6. Split data into training and testing sets
7. Handled class imbalance using SMOTE