# EDA Notebook

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
def print_table_meta(data):
    """Print the number of rows and the column index of a table.

    Args:
        data: Object supporting ``len()`` and exposing a ``columns``
            attribute (used here like a pandas DataFrame).

    Returns:
        None. The two summary lines are written to standard output.
    """
    row_total = len(data)
    print("Num rows in dataset: ", row_total)

    column_labels = data.columns
    print("Columns: ", column_labels)

def count_na(data):
    """Summarise missing values per column.

    Args:
        data: Table whose ``isnull().sum()`` yields one null count per
            column (used here like a pandas DataFrame).

    Returns:
        A DataFrame indexed by the column names of ``data`` with two columns:
        ``'Missing Values'`` (null count) and ``'Percentage (%)'`` (null count
        as a percentage of ``len(data)``), sorted by percentage, highest first.
    """
    null_counts = data.isnull().sum()
    null_share = (null_counts / len(data)) * 100

    summary = pd.DataFrame(
        {
            'Missing Values': null_counts,
            'Percentage (%)': null_share,
        }
    )
    return summary.sort_values(by='Percentage (%)', ascending=False)

def plots(col, name, dropna=False, continuous=True, topten=True, exclude_outliers=False):
    """Print summary statistics for one column and plot its distribution.

    Args:
        col: Values to analyse; converted with ``pd.Series(col)``.
        name: Label used in the printed messages, plot title and axis label.
        dropna: If True, missing values are removed before any processing.
            If False, the categorical view counts missing values as their own
            category; the continuous view always ignores them.
        continuous: If True, treat the values as numeric and draw a density
            histogram; otherwise draw a bar chart of value frequencies.
        topten: Categorical view only. If True, plot only the first ten
            entries of the frequency table (the most frequent values).
        exclude_outliers: Continuous view only. If True, drop values outside
            ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]`` before computing statistics.

    Returns:
        None. Results are printed and shown with matplotlib.
    """
    col = pd.Series(col)
    if dropna:
        col = col.dropna()

    if continuous:
        _plot_continuous(col, name, exclude_outliers)
    else:
        _plot_categorical(col, name, dropna, topten)


def _plot_continuous(col, name, exclude_outliers):
    """Print mean/count/max/min of a numeric series and plot its histogram."""
    # Drop missing values up front. The pandas reductions below skip them
    # anyway, and doing it here keeps them from being counted as "outliers":
    # the IQR mask is False for NaN, so they would otherwise be filtered out
    # and reported in the excluded total.
    data = col.dropna()

    if exclude_outliers and not data.empty:
        # Use IQR method to exclude outliers
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        kept = data[(data >= lower) & (data <= upper)]
        print(f"{name}: {len(data) - len(kept)} outliers excluded")
        data = kept

    if data.empty:
        # Nothing meaningful to summarise or bin; skip instead of plotting
        # an empty/undefined histogram.
        print(f"{name}: no non-null values to plot")
        return

    mean = data.mean()
    count = data.count()
    max_val = data.max()
    min_val = data.min()
    print(f"{name} mean: {mean:.2f} | count: {count}")
    print(f"{name} max: {max_val} | min: {min_val}")

    plt.hist(data, density=True, bins=30, edgecolor='black')
    plt.title(f"Distribution of {name}")
    plt.xlabel(name)
    plt.ylabel("Probability Density")
    plt.show()


def _plot_categorical(col, name, dropna, topten):
    """Print most/least common values of a series and plot a frequency bar chart."""
    counts = col.value_counts(dropna=dropna)
    if counts.empty:
        # np.argmax / np.argmin raise on an empty sequence.
        print(f"{name}: no values to plot")
        return

    labels = counts.index.to_list()
    totals = counts.to_list()

    # Compute the extremes on the FULL frequency table, before any top-ten
    # truncation; otherwise "least common" would really be the 10th most
    # common value.
    most_pos = int(np.argmax(totals))
    least_pos = int(np.argmin(totals))
    print(f"{name} most common: {labels[most_pos]}, count: {totals[most_pos]}")
    print(f"{name} least common: {labels[least_pos]}, count: {totals[least_pos]}")

    if topten:
        # Plain list slicing is always positional, unlike slicing a Series
        # whose index holds the (possibly numeric) category values.
        labels = labels[:10]
        totals = totals[:10]

    plt.bar([str(v) for v in labels], totals)
    plt.title(f"Top Categories in {name}")
    plt.xlabel("Values")
    plt.ylabel("Counts")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()
