In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ipaddress
import os

from datetime import datetime
%matplotlib inline


In [36]:
# Read the three raw CSV inputs (e-commerce fraud data, IP-range-to-country
# table, credit-card data) in a fixed order and bind each to its own frame.
fraud_df, ip_df, credit_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/Fraud_Data.csv',
        '../data/IpAddress_to_Country.csv',
        '../data/creditcard.csv',
    )
)

# Last expression of the cell: preview the first rows of the fraud frame.
fraud_df.head()


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [37]:
# Remove exact duplicate rows from both transaction frames (in place).
for frame in (fraud_df, credit_df):
    frame.drop_duplicates(inplace=True)

# Parse the two timestamp columns of the fraud frame into datetimes.
for time_col in ('signup_time', 'purchase_time'):
    fraud_df[time_col] = pd.to_datetime(fraud_df[time_col])


In [38]:
# Features derived from the purchase timestamp.
purchase_ts = fraud_df['purchase_time']
signup_to_purchase = purchase_ts - fraud_df['signup_time']
fraud_df['time_since_signup'] = signup_to_purchase.dt.total_seconds()  # seconds between signup and purchase
fraud_df['hour_of_day'] = purchase_ts.dt.hour
fraud_df['day_of_week'] = purchase_ts.dt.dayofweek

# Number of non-null purchase timestamps per user, attached to every row of
# that user through a left join on user_id.
user_freq = (
    fraud_df.groupby('user_id')['purchase_time']
    .count()
    .rename('transaction_count')
    .reset_index()
)
fraud_df = pd.merge(fraud_df, user_freq, on='user_id', how='left')


In [39]:
# Convert IPs to integer
def ip_to_int(ip_str):
    """Convert an IPv4 address to its integer value.

    Parameters
    ----------
    ip_str : str, int, float or NumPy scalar
        Either a dotted-quad string such as ``"192.168.0.1"`` or a number that
        already holds the integer form of the address (for example
        ``732758400.0`` or ``np.int64(16777471)``). Fractional parts of
        numeric inputs are truncated.

    Returns
    -------
    int or float
        The address as a Python ``int``, or ``np.nan`` when the input cannot
        be interpreted as an IPv4 address (malformed string, NaN/inf,
        negative or larger than 32 bits, unsupported type).
    """
    try:
        value = ip_str
        if isinstance(value, (int, float, np.integer, np.floating)):
            # ipaddress.IPv4Address only treats built-in ints as numeric;
            # floats and NumPy integers would be stringified and rejected as
            # malformed dotted quads. int() raises ValueError on NaN and
            # OverflowError on +/-inf, both handled below.
            value = int(value)
        return int(ipaddress.IPv4Address(value))
    except (ValueError, TypeError, OverflowError):
        # AddressValueError is a ValueError subclass; anything else (e.g.
        # KeyboardInterrupt) is deliberately allowed to propagate.
        return np.nan

# Integer form of each transaction's IP address, stored in a new column.
fraud_df['ip_int'] = fraud_df['ip_address'].apply(ip_to_int)

# Run both range-bound columns of the lookup table through the same
# converter, overwriting them in place.
for bound_col in ('lower_bound_ip_address', 'upper_bound_ip_address'):
    ip_df[bound_col] = ip_df[bound_col].apply(ip_to_int)

# Map IP to country
def map_country(ip_val):
    """Return the country of the first ``ip_df`` range that contains ``ip_val``.

    A range contains the value when its lower bound is <= ``ip_val`` and its
    upper bound is >= ``ip_val`` (both ends inclusive). Every row of ``ip_df``
    is compared on each call. When no range contains the value, the string
    'Unknown' is returned.
    """
    above_lower = ip_df['lower_bound_ip_address'].le(ip_val)
    below_upper = ip_df['upper_bound_ip_address'].ge(ip_val)
    candidates = ip_df[above_lower & below_upper]
    if candidates.empty:
        return 'Unknown'
    return candidates['country'].values[0]

# Look up the country for every transaction's integer IP.
fraud_df['country'] = fraud_df['ip_int'].apply(map_country)


In [40]:
# Share of each label in the two datasets (value counts normalised to proportions).
ecommerce_share = fraud_df['class'].value_counts(normalize=True)
print("Fraud class distribution (e-commerce):\n", ecommerce_share)

credit_share = credit_df['Class'].value_counts(normalize=True)
print("Fraud class distribution (credit):\n", credit_share)


Fraud class distribution (e-commerce):
 class
0    0.906354
1    0.093646
Name: proportion, dtype: float64
Fraud class distribution (credit):
 Class
0    0.998333
1    0.001667
Name: proportion, dtype: float64


In [41]:
# Make sure the folder that receives the figures exists; no error if it already does.
os.makedirs("../outputs", exist_ok=True)

def save_plot(fig, filename):
    """Write ``fig`` to ``../outputs/<filename>`` and then close it.

    Parameters
    ----------
    fig : object
        Figure to save; it is passed to ``plt.close`` after ``fig.savefig``.
    filename : str
        File name (including extension) placed under ``../outputs/``.
    """
    destination = f"../outputs/{filename}"
    fig.savefig(destination)
    plt.close(fig)

# EDA figures. Each entry describes one chart as:
# (figure size, seaborn plotting function, its keyword arguments,
#  title, output file name, x tick label rotation or None).
eda_charts = [
    # 1. Class distribution
    ((6, 4), sns.countplot, {'x': 'class'},
     "Fraudulent vs Non-Fraudulent Transactions", "fraud_distribution.png", None),
    # 2. Purchase value
    ((6, 4), sns.boxplot, {'x': 'class', 'y': 'purchase_value'},
     "Purchase Value by Fraud Class", "purchase_value_by_class.png", None),
    # 3. Hour of Day
    ((8, 4), sns.histplot, {'x': 'hour_of_day', 'hue': 'class', 'multiple': 'stack', 'bins': 24},
     "Fraud by Hour of Day", "fraud_by_hour.png", None),
    # 4. Day of Week
    ((8, 4), sns.countplot, {'x': 'day_of_week', 'hue': 'class'},
     "Fraud by Day of Week", "fraud_by_day.png", None),
    # 5. Source
    ((6, 4), sns.countplot, {'x': 'source', 'hue': 'class'},
     "Fraud by Traffic Source", "fraud_by_source.png", None),
    # 6. Browser
    ((6, 4), sns.countplot, {'x': 'browser', 'hue': 'class'},
     "Fraud by Browser", "fraud_by_browser.png", None),
    # 7. Country
    ((6, 4), sns.countplot, {'x': 'country', 'hue': 'class'},
     "Fraud by Country", "fraud_by_country.png", 45),
]

# Draw, title, optionally rotate tick labels, then save and close each figure in order.
for size, draw, draw_kwargs, chart_title, out_name, tick_rotation in eda_charts:
    fig = plt.figure(figsize=size)
    draw(data=fraud_df, **draw_kwargs)
    plt.title(chart_title)
    if tick_rotation is not None:
        plt.xticks(rotation=tick_rotation)
    save_plot(fig, out_name)


In [42]:
# Write both processed frames to CSV, leaving the DataFrame index out of the files.
cleaned_outputs = (
    (fraud_df, '../data/fraud_data_cleaned.csv'),
    (credit_df, '../data/creditcard_cleaned.csv'),
)
for frame, csv_path in cleaned_outputs:
    frame.to_csv(csv_path, index=False)

print("✅ Cleaned data and EDA plots saved.")


✅ Cleaned data and EDA plots saved.
