In [None]:
# Importing required libraries
import os

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno

# Set visualization style (the matplotlib "ggplot" style is applied after the seaborn settings)
sns.set(style="whitegrid")
plt.style.use("ggplot")

# Directory that holds the processed CSV files.
# Set the FRAUD_DATA_DIR environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
DATA_DIR = os.environ.get(
    "FRAUD_DATA_DIR",
    r"C:\Users\hayyu.ragea\AppData\Local\Programs\Python\Python312\fraud_detection_project\data\processed",
)

# Data file paths (kept as plain strings, all derived from DATA_DIR)
fraud_data_path = os.path.join(DATA_DIR, "fraud_cleaned.csv")
creditcard_data_path = os.path.join(DATA_DIR, "creditcard_cleaned.csv")
ip_country_data_path = os.path.join(DATA_DIR, "ip_country_cleaned.csv")

# Load datasets
fraud_data = pd.read_csv(fraud_data_path)
creditcard_data = pd.read_csv(creditcard_data_path)
ip_country_data = pd.read_csv(ip_country_data_path)

# Display the first few rows of the fraud dataset
fraud_data.head()


In [None]:
# Fraud Data
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
print("Fraud Data Overview:")
fraud_data.info()
print(fraud_data.describe())

# Credit Card Data
print("\nCredit Card Data Overview:")
creditcard_data.info()
print(creditcard_data.describe())


In [None]:
# Univariate Analysis - Fraud Data
# Every chart is drawn on its own 10x6 figure and shown before the next one starts.

# Histogram of 'purchase_value' with a KDE curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(fraud_data['purchase_value'], kde=True, color='blue')
plt.title('Distribution of Purchase Value')
plt.show()

# Count plots: (column, palette, title) for 'source', 'browser' and
# 'class' (fraud vs non-fraud), rendered in that order
_count_plot_specs = (
    ('source', 'Set2', 'Source of Users'),
    ('browser', 'Set3', 'Browser Usage in Transactions'),
    ('class', 'Set1', 'Fraud vs Non-Fraud Transactions'),
)
for _column, _palette, _title in _count_plot_specs:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=fraud_data, x=_column, palette=_palette)
    plt.title(_title)
    plt.show()


In [None]:
# Univariate Analysis - Credit Card Data

# Histogram of 'Amount' with a KDE curve overlaid
_, amount_ax = plt.subplots(figsize=(10, 6))
sns.histplot(creditcard_data['Amount'], kde=True, color='purple', ax=amount_ax)
amount_ax.set_title('Distribution of Transaction Amount')
plt.show()

# Number of rows per 'Class' value
_, class_ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=creditcard_data, x='Class', palette='Set1', ax=class_ax)
class_ax.set_title('Fraud vs Non-Fraud Transactions (Credit Card Data)')
plt.show()


In [None]:
# Bivariate Analysis - Fraud Data

# Box plot of 'purchase_value' for each 'class' value
plt.figure(figsize=(10, 6))
sns.boxplot(data=fraud_data, x='class', y='purchase_value', palette='coolwarm')
plt.title('Purchase Value vs Fraud/Non-Fraud Transactions')
plt.show()

# Count plots split by 'class': (column, palette, title) for 'source' then 'browser'
_hue_plot_specs = (
    ('source', 'Set2', 'Source vs Fraud/Non-Fraud Transactions'),
    ('browser', 'Set3', 'Browser vs Fraud/Non-Fraud Transactions'),
)
for _column, _palette, _title in _hue_plot_specs:
    plt.figure(figsize=(10, 6))
    sns.countplot(data=fraud_data, x=_column, hue='class', palette=_palette)
    plt.title(_title)
    plt.show()


In [None]:
# Bivariate Analysis - Credit Card Data

# Heatmap of the pairwise column correlations, each cell annotated to two decimals
_, corr_ax = plt.subplots(figsize=(12, 8))
corr_matrix = creditcard_data.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=corr_ax)
corr_ax.set_title('Correlation Matrix (Credit Card Data)')
plt.show()

# Box plot of 'Amount' for each 'Class' value
_, amount_ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=creditcard_data, x='Class', y='Amount', palette='coolwarm', ax=amount_ax)
amount_ax.set_title('Transaction Amount vs Fraud/Non-Fraud')
plt.show()


In [None]:
# Merge fraud_data with ip_country_data based on IP address ranges
# Cast explicitly to 64-bit. A bare `int` resolves to the platform default
# integer, which is 32-bit on Windows with NumPy < 2; addresses above
# 2**31 - 1 would then be silently corrupted before the range lookup.
fraud_data['ip_address'] = fraud_data['ip_address'].astype('int64')

# Look up the country for a single IP address by range containment
def map_ip_to_country(ip, ip_country_df, default='Unknown'):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : number
        Address to look up, comparable with the bound columns.
    ip_country_df : pandas.DataFrame
        Must provide the columns 'lower_bound_ip_address',
        'upper_bound_ip_address' and 'country'. Both bounds are inclusive.
    default : str, optional
        Value returned when no range contains ``ip`` (``'Unknown'`` by default).

    Returns
    -------
    The 'country' value of the first matching row (in frame order), or
    ``default`` when no row matches.
    """
    in_range = (ip_country_df['lower_bound_ip_address'] <= ip) & (ip_country_df['upper_bound_ip_address'] >= ip)
    # Evaluate the masked selection once and reuse it for both the emptiness
    # check and the result.
    matches = ip_country_df.loc[in_range, 'country'].values
    return matches[0] if len(matches) > 0 else default

# Label each transaction with the country returned by map_ip_to_country
# NOTE(review): the lookup filters all of ip_country_data on every call, so
# this is one full scan per row of fraud_data; consider a sorted range join
# if it proves slow.
fraud_data['country'] = fraud_data['ip_address'].apply(map_ip_to_country, args=(ip_country_data,))

# Plot Fraud Cases by Country
_, country_ax = plt.subplots(figsize=(12, 6))
# Keep only rows flagged class == 1, then count occurrences per country
fraud_by_country = fraud_data.loc[fraud_data['class'] == 1, 'country'].value_counts()
sns.barplot(x=fraud_by_country.index, y=fraud_by_country.values, palette='magma', ax=country_ax)
plt.xticks(rotation=90)
country_ax.set_title('Fraud Cases by Country')
plt.show()
