In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set plot style: apply seaborn's 'whitegrid' style globally, so it affects
# every figure drawn after this cell runs.
sns.set(style='whitegrid')

In [None]:
# 1) Load Data
# Load the first CSV in `data_dir` whose file name contains "data"
# (case-insensitive) into `df`, which every later cell reads.
data_dir = '../data/raw/'

# os.listdir returns entries in arbitrary order; sorting makes the choice of
# `csv_files[0]` reproducible when more than one file matches.
files = sorted(os.listdir(data_dir))
csv_files = [f for f in files if f.endswith('.csv') and 'data' in f.lower()]

if not csv_files:
    # Stop here with a clear error. Only printing a message would leave `df`
    # undefined (NameError in the next cell) or, worse, leave a stale `df`
    # from a previous run in the kernel and silently analyse old data.
    raise FileNotFoundError(
        f"No data file found in {data_dir!r} "
        "(expected a .csv file with 'data' in its name)"
    )

csv_file = csv_files[0]
file_path = os.path.join(data_dir, csv_file)
df = pd.read_csv(file_path)

print(f'Loaded: {file_path}')
print(f'Dataset Shape: {df.shape}')
display(df.head())

In [None]:
# 2) Data Structure
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()
# The dataset provides transaction details, including IDs, amounts, and categorical metadata.

In [None]:
# 3) Summary Statistics
summary_stats = df.describe()
display(summary_stats)

# Report the skewness of each monetary column, when that column exists
column_names = df.columns

if 'Amount' in column_names:
    amount_skew = df['Amount'].skew()
    print(f"Skewness of Amount: {amount_skew}")

if 'Value' in column_names:
    value_skew = df['Value'].skew()
    print(f"Skewness of Value: {value_skew}")

In [None]:
# 4) Missing Values
# Count nulls per column and keep only the columns that actually contain some.
missing = df.isnull().sum()
missing_counts = missing[missing > 0]

if missing_counts.empty:
    print('No missing values found.')
else:
    print(missing_counts)

    # The figure is created only on this branch: creating it before the check
    # leaves an empty figure open (and rendered) when there is nothing to plot.
    fig, ax = plt.subplots(figsize=(10, 6))
    missing_counts.plot(kind='bar', ax=ax)
    ax.set_title('Missing Values per Column')
    ax.set_xlabel('Column')
    ax.set_ylabel('Count')
    plt.show()

In [None]:
# 5) Numerical Distributions
# Histogram (with KDE overlay) of each monetary column that is present,
# one panel per column, counts shown on a log scale.
cols_to_plot = [name for name in ('Amount', 'Value') if name in df.columns]

if cols_to_plot:
    n_panels = len(cols_to_plot)
    # squeeze=False always returns a 2-D array of Axes, so the single-panel
    # case needs no special handling; flatten it to iterate panel by panel.
    fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 5), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, cols_to_plot):
        sns.histplot(df[col], bins=50, kde=True, ax=ax)
        ax.set_title(f'Distribution of {col}')
        ax.set_yscale('log')

    plt.tight_layout()
    plt.show()

In [None]:
# 6) Categorical Analysis
# Horizontal bar chart of the 10 most frequent values of each categorical
# column that is present, one panel per column.
cat_cols = ['ProductCategory', 'ChannelId']
present_cat_cols = [c for c in cat_cols if c in df.columns]

if present_cat_cols:
    n_panels = len(present_cat_cols)
    # squeeze=False always returns a 2-D array of Axes, so one panel and
    # several panels are handled by the same loop.
    fig, axes = plt.subplots(1, n_panels, figsize=(7 * n_panels, 6), squeeze=False)
    axes = axes.ravel()

    for ax, col in zip(axes, present_cat_cols):
        top_10 = df[col].value_counts().head(10)
        # Use string labels so numeric category codes are drawn as discrete
        # categories rather than as positions on a numeric axis.
        labels = top_10.index.astype(str).tolist()

        # `palette` without `hue` is deprecated in seaborn 0.13, so the
        # category is also passed as `hue`. dodge=False keeps the bars at full
        # width, and orient='h' fixes the orientation explicitly.
        sns.barplot(
            x=top_10.values,
            y=labels,
            hue=labels,
            dodge=False,
            orient='h',
            palette='viridis',
            ax=ax,
        )

        # The hue legend only repeats the y tick labels; remove it if one was drawn.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()

        ax.set_title(f'Top 10 {col}')
        ax.set_xlabel('Count')
        ax.set_ylabel(col)

    plt.tight_layout()
    plt.show()

In [None]:
# 7) Correlation Analysis
# Correlation heatmap of the numeric columns.
numeric_df = df.select_dtypes(include=[np.number])

# A column with at most one distinct non-null value has zero variance, so its
# correlation with anything is undefined (NaN) and would only add blank
# rows/columns to the heatmap. Exclude such columns and say which ones.
is_varying = numeric_df.nunique() > 1
constant_cols = numeric_df.loc[:, ~is_varying].columns.tolist()
if constant_cols:
    print(f'Excluded constant columns from correlation: {constant_cols}')

corr_input = numeric_df.loc[:, is_varying]

# A correlation matrix needs at least two columns to be informative.
if corr_input.shape[1] >= 2:
    corr_matrix = corr_input.corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', ax=ax)
    ax.set_title('Correlation Matrix')
    plt.show()
else:
    print('Not enough non-constant numeric columns for a correlation matrix.')

In [None]:
# 8) Outlier Detection
# Boxplot of Amount; points beyond the whiskers are drawn individually.
if 'Amount' in df.columns:
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.boxplot(x=df['Amount'], ax=ax)
    ax.set_title('Boxplot of Amount')
    ax.set_xlabel('Amount')
    plt.show()