 Task 1: Data Analysis and Preprocessing
 
 In this notebook, we will:
 - Load the raw datasets.
 - Clean the data using functions from `data_preprocessing.py`.
 - Perform exploratory data analysis (EDA).
 - Engineer new features using functions from `feature_engineering.py`.

In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import sys

# Make the parent of the current working directory importable so that the
# `scripts` package used below can be resolved. The membership check keeps
# the cell idempotent: re-running it in a live kernel no longer appends a
# duplicate entry to sys.path each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)



Import custom functions

In [None]:

from scripts.data_preprocessing import clean_fraud_data, merge_ip_data
from scripts.feature_engineering import add_time_features, add_transaction_frequency, add_transaction_velocity


Load datasets

In [None]:


# Locations of the three raw CSV inputs, relative to the working directory.
fraud_csv_path = "../data/Fraud_Data.csv"
ip_mapping_csv_path = "../data/IpAddress_to_Country.csv"
creditcard_csv_path = "../data/creditcard.csv"

# Read each file into its own DataFrame (same order as the paths above).
fraud_df = pd.read_csv(fraud_csv_path)
ip_mapping_df = pd.read_csv(ip_mapping_csv_path)
creditcard_df = pd.read_csv(creditcard_csv_path)



Clean Fraud Data

In [None]:


# Run the raw fraud frame through the project's cleaning helper, then hand the
# result to the IP-merge helper together with the IP mapping frame.
fraud_df = merge_ip_data(clean_fraud_data(fraud_df), ip_mapping_df)

# Show the first rows of the result for a quick visual check.
print("Cleaned Fraud Data:")
cleaned_preview = fraud_df.head()
display(cleaned_preview)




EDA: Distribution of Purchase Value

In [None]:

# Histogram (30 bins) of purchase values with a KDE curve overlaid,
# drawn on an explicitly created 10x6 figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(fraud_df['purchase_value'], bins=30, kde=True, ax=ax)
ax.set_title("Distribution of Purchase Value")
ax.set_xlabel("Purchase Value ($)")
ax.set_ylabel("Frequency")
plt.show()



Feature Engineering

In [None]:


# Parse purchase timestamps to datetime before the feature helpers run.
fraud_df['purchase_time'] = pd.to_datetime(fraud_df['purchase_time'])

# Apply the project's feature-engineering helpers one after another; each one
# receives the frame returned by the previous step.
feature_steps = (
    add_time_features,
    add_transaction_frequency,
    add_transaction_velocity,
)
for apply_step in feature_steps:
    fraud_df = apply_step(fraud_df)

# Preview the timestamp column next to the engineered columns.
preview_columns = [
    'purchase_time',
    'hour_of_day',
    'day_of_week',
    'transaction_count',
    'time_diff',
]
print("Data with Engineered Features:")
display(fraud_df[preview_columns].head())



Save the preprocessed data

In [None]:

# Write the engineered frame to CSV, omitting the index column.
featured_csv_path = "../data/Fraud_Data_Featured.csv"
fraud_df.to_csv(featured_csv_path, index=False)
print("Preprocessed data saved as Fraud_Data_Featured.csv")
