In [None]:
# Imports and plotting setup (the data itself is loaded in the next cell)
import glob
import pandas as pd
import pathlib
import seaborn as sns
import matplotlib.pyplot as plt

# Plot appearance: matplotlib's 'default' style first, then seaborn's "husl" palette.
plt.style.use("default")
sns.set_palette(palette="husl")

# Confirmation that every import and style call above ran without raising.
print("All packages imported successfully!")


All packages imported successfully!


In [None]:
# Load the most recent processed transactions snapshot into `df`.
# "Most recent" means the matching file whose name sorts last lexicographically;
# this equals chronological order only if the filename suffix sorts that way
# (e.g. ISO dates) — TODO confirm against the naming scheme of the processing step.
DATA_GLOB = "../data/processed/transactions_*.parquet"

matches = sorted(glob.glob(DATA_GLOB))
if not matches:
    # Indexing an empty list with [-1] used to surface as a bare
    # "IndexError: list index out of range"; fail with an actionable message instead.
    raise FileNotFoundError(
        f"No files match {DATA_GLOB!r} (relative to {pathlib.Path.cwd()}). "
        "Generate the processed data first, or start the notebook from the expected directory."
    )

latest = matches[-1]
df = pd.read_parquet(latest)

# Quick sanity check: (rows, cols) plus the covered date range.
df.shape, df['date'].min(), df['date'].max()

IndexError: list index out of range

In [None]:
# Cell 2 – Comprehensive manual EDA
# Text-only profile of `df`: shape, schema, missing values, summary statistics,
# per-column breakdowns for object and int64/float64 columns, and basic
# data-quality checks (duplicates, zero amounts).
print("=== TRANSACTION DATA EXPLORATORY DATA ANALYSIS ===")
print(f"Dataset shape: {df.shape}")
# Guarded the same way as the 'amount' check below, so a frame without a
# 'date' column still gets the rest of the report.
if 'date' in df.columns:
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")

print("\n=== DATASET OVERVIEW ===")
# DataFrame.info() writes its summary to stdout itself and returns None;
# wrapping it in print() appended a stray "None" line to the report.
df.info()

print("\n=== MISSING VALUES ===")
# Count nulls once and derive the percentage from the same Series.
missing_data = df.isnull().sum()
missing_percent = 100 * missing_data / len(df)
missing_table = pd.DataFrame({'Missing Count': missing_data, 'Missing Percentage': missing_percent})
columns_with_missing = missing_table[missing_table['Missing Count'] > 0]
if columns_with_missing.empty:
    # Avoids printing an "Empty DataFrame" repr when the data is complete.
    print("No missing values")
else:
    print(columns_with_missing)

print("\n=== BASIC STATISTICS ===")
print(df.describe())

print("\n=== CATEGORICAL COLUMNS ANALYSIS ===")
# Object-dtype columns only; show cardinality and the 10 most frequent values.
categorical_cols = df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    print(f"\n{col.upper()} - Unique values: {df[col].nunique()}")
    print(df[col].value_counts().head(10))

print("\n=== NUMERICAL COLUMNS ANALYSIS ===")
# Restricted to int64/float64 so the :.2f formatting below always applies.
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    series = df[col]
    print(f"\n{col.upper()}:")
    print(f"Mean: {series.mean():.2f}, Median: {series.median():.2f}")
    print(f"Min: {series.min():.2f}, Max: {series.max():.2f}")
    print(f"Std: {series.std():.2f}")

print("\n=== DATA QUALITY ISSUES ===")
# Fully identical rows (all columns equal to an earlier row)
duplicates = df.duplicated().sum()
print(f"Duplicate rows: {duplicates}")

# Transactions whose amount is exactly zero
if 'amount' in df.columns:
    zero_amounts = (df['amount'] == 0).sum()
    print(f"Zero amount transactions: {zero_amounts}")

print("\nManual EDA completed! See visualizations in next cells.")


In [None]:
# Cell 3 – Visualizations
# A 2x2 overview figure (amounts, volume over time, top categories, missing
# values) followed by a separate correlation heatmap for numeric columns.
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
fig.suptitle('Transaction Data Analysis', fontsize=16)
ax_amount, ax_time = axes[0, 0], axes[0, 1]
ax_cats, ax_missing = axes[1, 0], axes[1, 1]

# Top-left: histogram of transaction amounts (skipped if there is no 'amount' column)
if 'amount' in df.columns:
    ax_amount.hist(df['amount'], bins=50, alpha=0.7, color='skyblue')
    ax_amount.set(title='Amount Distribution', xlabel='Amount', ylabel='Frequency')

# Top-right: number of rows per distinct 'date' value
if 'date' in df.columns:
    df_time = df.groupby('date').size()
    ax_time.plot(df_time.index, df_time.values, color='green')
    ax_time.set(title='Transactions Over Time', xlabel='Date', ylabel='Number of Transactions')
    ax_time.tick_params(axis='x', rotation=45)

# Bottom-left: 10 most frequent values of the first object-dtype column, if any
cat_columns = df.select_dtypes(include=['object']).columns
if len(cat_columns) > 0:
    cat_col = cat_columns[0]
    top_cats = df[cat_col].value_counts().head(10)
    bar_positions = range(len(top_cats))
    ax_cats.barh(bar_positions, top_cats.values, color='coral')
    ax_cats.set_yticks(bar_positions)
    ax_cats.set_yticklabels(top_cats.index)
    ax_cats.set(title=f'Top 10 {cat_col.title()}', xlabel='Count')

# Bottom-right: bar chart of null counts per column, or a text note when the
# frame has no nulls at all
missing_df = df.isnull().sum()
has_missing = missing_df.sum() > 0
if not has_missing:
    ax_missing.text(0.5, 0.5, 'No Missing Values', ha='center', va='center',
                    transform=ax_missing.transAxes, fontsize=14)
    ax_missing.set_title('Missing Values Check')
else:
    column_positions = range(len(missing_df))
    ax_missing.bar(column_positions, missing_df.values, color='red', alpha=0.6)
    ax_missing.set_xticks(column_positions)
    ax_missing.set_xticklabels(missing_df.index, rotation=45)
    ax_missing.set(title='Missing Values by Column', ylabel='Missing Count')

plt.tight_layout()
plt.show()

# Pairwise correlations between int64/float64 columns; needs at least two of them
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
if len(numerical_cols) > 1:
    correlation_matrix = df[numerical_cols].corr()
    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix of Numerical Variables')
    plt.show()

print("Visualizations completed!")
