# In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load dataset
# Read the fraud CSV into a DataFrame. The path is an absolute, machine-specific
# location; adjust it when running on another machine.
df = pd.read_csv('C:/Users/user/Desktop/10 Academy- Machine-Learning/10 Academy W8 & 9/fraud_detection_project/data/Fraud_Data.csv')

# Display the first few rows to understand the structure of the dataset.
# A bare `df.head()` expression is only rendered when it is the last statement of a
# notebook cell, so print it explicitly to make the preview actually appear.
print(df.head())

# Basic Exploratory Data Analysis (EDA)
# Summary statistics produced by DataFrame.describe()
print("Descriptive Statistics of the Dataset:")
summary_stats = df.describe()
print(summary_stats)

# Number of missing (null) entries in each column
print("Missing Values in Dataset:")
missing_values = df.isna().sum()
print(missing_values)

# Visualizing the distribution of the target variable ('Class' column: fraud or not fraud)
plt.figure(figsize=(8, 5))
sns.countplot(x='Class', data=df)
plt.title('Distribution of Fraudulent and Non-Fraudulent Transactions')
plt.xlabel('Class (0: Non-Fraudulent, 1: Fraudulent)')
plt.ylabel('Count')
plt.show()

# Preprocessing: Handling Missing Values and Duplicates
# Rows containing any missing value are dropped, then duplicate rows are removed.
# If dropping loses too much data, imputing (for example filling numeric columns
# with their median) is an alternative to the dropna() step.
# The explicit .copy() makes df_cleaned an independent DataFrame, so adding a
# column below is unambiguously a write to df_cleaned and never to a view of df.
df_cleaned = df.dropna().drop_duplicates().copy()

# Feature Engineering: Adding a 'Transaction Amount Range' feature
# Bucket the 'Amount' column into ordered, labelled ranges.
# - include_lowest=True keeps amounts of exactly 0 in the 'Low' bucket; with the
#   default right-closed bins, 0 would fall outside (0, 50] and become NaN.
# - The last edge is open-ended so amounts above 5000 land in 'Extremely High'
#   instead of silently becoming NaN.
# Negative amounts (if any) still fall outside every bin and map to NaN.
amount_bin_edges = [0, 50, 100, 500, 1000, float('inf')]
amount_bin_labels = ['Low', 'Medium', 'High', 'Very High', 'Extremely High']
df_cleaned['Amount_Range'] = pd.cut(
    df_cleaned['Amount'],
    bins=amount_bin_edges,
    labels=amount_bin_labels,
    include_lowest=True,
)

# Bar chart of how many cleaned rows fall into each 'Amount_Range' category
plt.figure(figsize=(8, 5))
range_ax = sns.countplot(x='Amount_Range', data=df_cleaned)
range_ax.set(
    title='Distribution of Transaction Amount Ranges',
    xlabel='Amount Range',
    ylabel='Count',
)
plt.show()

# Visualizing correlation between numerical features
# Restrict the frame to numeric columns before computing correlations: non-numeric
# columns such as the categorical 'Amount_Range' cannot be correlated, and recent
# pandas versions raise an error instead of silently skipping them.
numeric_columns = df_cleaned.select_dtypes(include='number')
correlation_matrix = numeric_columns.corr()

# Annotated heatmap of the pairwise correlation coefficients (two decimals per cell)
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap of Numerical Features')
plt.show()

# Persist the cleaned DataFrame as CSV (without the index column) so later
# processing or model-training steps can load it directly.
cleaned_csv_path = 'C:/Users/user/Desktop/10 Academy- Machine-Learning/10 Academy W8 & 9/fraud_detection_project/data/Fraud_Data_cleaned.csv'
df_cleaned.to_csv(cleaned_csv_path, index=False)

# Recap of the pipeline: a heading preceded by a blank line, then one line per step
summary_lines = (
    "\nData Preprocessing Summary:",
    "1. Loaded and inspected the dataset.",
    "2. Displayed descriptive statistics.",
    "3. Checked and handled missing values (dropped rows with missing values).",
    "4. Added a new feature 'Amount_Range' based on transaction amounts.",
    "5. Visualized the target class distribution and new feature distribution.",
    "6. Saved the cleaned data to a new CSV file.",
)
print("\n".join(summary_lines))

