In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [16]:
# Load the raw transaction data from the project-relative data directory.
df = pd.read_csv('./data/transaction_data.csv')

# Display basic information.
# DataFrame.info() writes its report straight to stdout and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line.
# The report and the preview are shown once (they were previously emitted twice).
df.info()
print(df.head())

# Convert the date/time column to datetime format if it exists.
# The df.info() output for this dataset lists the column as 'Timestamp'
# (dtype object); the original 'date' name is still honoured so that files
# using that header keep working.
for datetime_col in ('date', 'Timestamp'):
    if datetime_col in df.columns:
        df[datetime_col] = pd.to_datetime(df[datetime_col])

# Count null entries per column and report them
null_counts = df.isnull().sum()
print("\nMissing values:\n", null_counts)

# Report the table produced by DataFrame.describe() with default settings
stats_table = df.describe()
print("\nSummary statistics:\n", stats_table)

# Transaction types distribution
# The loaded CSV names this column 'Transaction Type' (see the df.info() output),
# so the plot was silently skipped when only 'transaction_type' was checked.
# The original name is tried first so files using that header behave as before.
type_col = next(
    (name for name in ('transaction_type', 'Transaction Type') if name in df.columns),
    None,
)
if type_col is not None:
    # value_counts() sorts descending, so the most frequent type is drawn first
    type_order = df[type_col].value_counts().index
    fig, ax = plt.subplots(figsize=(10, 5))
    sns.countplot(y=df[type_col], order=type_order, palette='coolwarm', ax=ax)
    ax.set_xlabel("Count")
    ax.set_ylabel("Transaction Type")
    ax.set_title("Transaction Type Distribution")
    plt.show()

# Top 10 accounts with highest transactions
# The loaded CSV has no 'account_id' column; its df.info() output lists
# 'Sender Account ID' and 'Receiver Account ID' instead, so this plot was
# silently skipped. The original name is tried first for compatibility.
# NOTE(review): ranking by sender is an assumption — use 'Receiver Account ID'
# if receiving accounts are the ones of interest.
account_col = next(
    (name for name in ('account_id', 'Sender Account ID') if name in df.columns),
    None,
)
if account_col is not None:
    top_accounts = df[account_col].value_counts().head(10)
    fig, ax = plt.subplots(figsize=(10, 5))
    # IDs are passed as strings with an explicit horizontal orientation so that
    # numeric-looking IDs are still treated as categories on the y axis.
    sns.barplot(
        x=top_accounts.values,
        y=top_accounts.index.astype(str),
        orient='h',
        palette='magma',
        ax=ax,
    )
    ax.set_xlabel("Number of Transactions")
    ax.set_ylabel("Account ID")
    ax.set_title("Top 10 Accounts by Number of Transactions")
    plt.show()

# Daily transaction trends if a date/time column exists
# The loaded CSV names this column 'Timestamp' (see the df.info() output), so
# the plot was silently skipped when only 'date' was checked. The original name
# is tried first so files using that header behave as before.
date_col = next(
    (name for name in ('date', 'Timestamp') if name in df.columns),
    None,
)
if date_col is not None:
    # Parse locally: the column may still hold strings, and to_datetime is a
    # no-op on values that are already datetimes.
    timestamps = pd.to_datetime(df[date_col])
    # Number of rows per calendar day
    daily_transactions = df.groupby(timestamps.dt.date).size()
    fig, ax = plt.subplots(figsize=(12, 5))
    ax.plot(daily_transactions.index, daily_transactions.values, marker='o', linestyle='-', color='b')
    ax.set_xlabel("Date")
    ax.set_ylabel("Number of Transactions")
    ax.set_title("Daily Transaction Trend")
    ax.tick_params(axis='x', rotation=45)
    ax.grid()
    plt.show()

# Transaction amount distribution if an amount column exists
# The loaded CSV names this column 'Transaction Amount' (float64 in the
# df.info() output), so the plot was silently skipped when only 'amount' was
# checked. The original name is tried first so files using that header behave
# as before.
amount_col = next(
    (name for name in ('amount', 'Transaction Amount') if name in df.columns),
    None,
)
if amount_col is not None:
    fig, ax = plt.subplots(figsize=(10, 5))
    # 30-bin histogram with a kernel density estimate overlaid
    sns.histplot(df[amount_col], bins=30, kde=True, color='g', ax=ax)
    ax.set_xlabel("Transaction Amount")
    ax.set_ylabel("Frequency")
    ax.set_title("Transaction Amount Distribution")
    plt.show()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Transaction ID                    1000 non-null   object 
 1   Sender Account ID                 1000 non-null   object 
 2   Receiver Account ID               1000 non-null   object 
 3   Transaction Amount                1000 non-null   float64
 4   Transaction Type                  1000 non-null   object 
 5   Timestamp                         1000 non-null   object 
 6   Transaction Status                1000 non-null   object 
 7   Fraud Flag                        1000 non-null   bool   
 8   Geolocation (Latitude/Longitude)  1000 non-null   object 
 9   Device Used                       1000 non-null   object 
 10  Network Slice ID                  1000 non-null   object 
 11  Latency (ms)                      1000 non-null   int64  
 12  Slice B