In [None]:
# %pip install phik

In [None]:
import pandas as pd
import numpy as np
# from phik import phik_matrix
from scipy.stats import entropy


In [None]:
def datType(c):
    """Classify a column into a coarse variable type.

    Args:
        c: pandas Series to classify.

    Returns:
        One of ``"Continuous"``, ``"Categorical"``, ``"Text"``, ``"Date"``
        or ``"Other"``:

        * numeric dtype (int, unsigned, float, complex) with more than 20
          distinct values -> ``"Continuous"``
        * numeric dtype with 20 or fewer distinct values -> ``"Categorical"``
        * object or pandas string dtype -> ``"Text"``
        * any datetime64 dtype -> ``"Date"``
        * everything else (bool, category, timedelta, ...) -> ``"Other"``
    """
    if c.dtype.kind in "iufc":
        # A numeric column with exactly 20 distinct values used to match
        # neither branch and was reported as "Other"; it now counts as
        # categorical, since only "> 20" is treated as continuous.
        return "Continuous" if c.nunique() > 20 else "Categorical"
    if c.dtype == object or isinstance(c.dtype, pd.StringDtype):
        return "Text"
    if pd.api.types.is_datetime64_any_dtype(c):
        return "Date"
    return "Other"

def meta(c):
    """Collect basic storage and completeness facts about one column.

    Args:
        c: pandas Series to inspect.

    Returns:
        Tuple ``(dtype, mem, uniq, nulls, empty)`` where ``mem`` is the deep
        memory usage in MB rounded to two decimals, ``uniq`` is the number of
        distinct non-null values, ``nulls`` is the count of missing values and
        ``empty`` is the count of values whose string form is blank after
        stripping whitespace.
    """
    bytes_per_mb = 1024 * 1024
    size_mb = round(c.memory_usage(deep=True) / bytes_per_mb, 2)

    # Compare the stripped string form so whitespace-only cells count as empty.
    as_text = c.astype(str).str.strip()
    blank_count = (as_text == "").sum()

    return c.dtype, size_mb, c.nunique(), c.isna().sum(), blank_count

def stats(c):
    """Compute descriptive statistics for one column.

    Args:
        c: pandas Series to summarise.

    Returns:
        A 13-tuple ``(mode, entropy, max_entropy, mean, std, min, max, skew,
        kurtosis, q1, q2, q3, outliers)``.

        * ``mode`` is the first mode, or ``"N/A"`` when there is none.
        * ``entropy`` is the base-2 Shannon entropy of the value frequencies
          and ``max_entropy`` is ``log2`` of the distinct-value count (0 when
          there are no values); both are rounded to two decimals.
        * The remaining fields are ``"N/A"`` unless the dtype is numeric or
          datetime64. Outliers are counted with the 1.5 * IQR rule.
        * ``skew`` and ``kurtosis`` are only filled for numeric columns with
          more than 20 distinct values.
    """
    na = "N/A"

    modes = c.mode()
    mode_val = na if modes.empty else modes[0]

    frequencies = c.value_counts(normalize=True)
    ent = round(entropy(frequencies, base=2), 2)

    distinct = c.nunique()
    maxent = round(np.log2(distinct), 2) if distinct > 0 else 0

    kind = c.dtype.kind
    if kind not in "iufcM":
        # Non-numeric, non-datetime columns only get the frequency-based stats.
        return (mode_val, ent, maxent) + (na,) * 10

    mean = c.mean()
    std = c.std()
    min_val = c.min()
    max_val = c.max()
    q1, q2, q3 = c.quantile(0.25), c.quantile(0.5), c.quantile(0.75)

    # Tukey fences: values further than 1.5 * IQR outside the quartiles.
    iqr = q3 - q1
    too_low = c < q1 - 1.5 * iqr
    too_high = c > q3 + 1.5 * iqr
    outliers = (too_low | too_high).sum()

    skew = kurt = na
    # Shape statistics are skipped for datetimes and low-cardinality numerics.
    if kind != "M" and distinct > 20:
        skew = round(c.skew(), 2)
        kurt = round(c.kurtosis(), 2)

    return (
        mode_val, ent, maxent, mean, std, min_val, max_val,
        skew, kurt, q1, q2, q3, outliers,
    )

def DatasetMetadata(df, toPrint=True, Matrix=False):
    """Build a per-column profile of a DataFrame.

    For every column the results of ``meta``, ``stats`` and ``datType`` are
    combined into one summary row.

    Args:
        df: DataFrame to profile.
        toPrint: When true, print the shape, the number of duplicate rows,
            the total deep memory usage in MB and ``df.head()``.
        Matrix: When true, print the phik correlation matrix, treating the
            numeric columns as interval columns. This needs the
            ``phik_matrix`` DataFrame method, which is only available after
            ``import phik`` has been executed.

    Returns:
        DataFrame with one row per column of ``df`` and the columns
        ``CName, DType, UsageMB, VarType, Cardinality, Null*, Empty*, Mode,
        Mean, Std, Min, Median, Max, Skewness, Kurtosis, Q1, Q2/Median, Q3,
        Outliers, Entropy, Max Entropy``. ``Median`` and ``Q2/Median`` hold
        the same value; both are kept so existing consumers keep working.

    Raises:
        AttributeError: If ``Matrix`` is true and ``df`` has no
            ``phik_matrix`` method.
    """
    summary_data = []

    # Per-column memory comes from meta(); the previous frame-wide deep
    # memory scan was never read and has been removed.
    for c in df.columns:
        col = df[c]
        dtype, mem, uniq, nulls, empty = meta(col)
        (mode_val, ent, maxent, mean, std, min_val, max_val,
         skew, kurt, q1, q2, q3, outliers) = stats(col)
        vartype = datType(col)

        summary_data.append([
            c, str(dtype), mem, vartype, uniq, nulls, empty, mode_val,
            mean, std, min_val, q2, max_val, skew, kurt, q1, q2, q3,
            outliers, ent, maxent,
        ])

    SummaryDF = pd.DataFrame(summary_data, columns=[
        "CName", "DType", "UsageMB", "VarType", "Cardinality", "Null*", "Empty*", "Mode",
        "Mean", "Std", "Min", "Median", "Max", "Skewness", "Kurtosis", "Q1", "Q2/Median", "Q3",
        "Outliers", "Entropy", "Max Entropy"
    ])

    if toPrint:
        print(f"The dataset includes {df.shape[1]} columns and {format(df.shape[0],',')} rows")
        print(f"Dataset has {df.duplicated().sum()} duplicate rows")
        print(f"Memory Usage of the DataFrame: {df.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")
        print("--------------------------")
        print("First rows")
        print(df.head())
        print("--------------------------")
    if Matrix:
        # Without phik imported the call below fails with an unhelpful
        # "'DataFrame' object has no attribute 'phik_matrix'"; say why instead.
        if not hasattr(df, "phik_matrix"):
            raise AttributeError(
                "DataFrame has no 'phik_matrix' method; install phik and run "
                "'import phik' before calling DatasetMetadata(..., Matrix=True)"
            )
        print("Phik Correlation Matrix")
        numeric = [c for c in df.columns if df[c].dtype.kind in "iufc"]
        corr_matrix = df.phik_matrix(interval_cols=numeric)
        print(corr_matrix)

    return SummaryDF


In [None]:
def CheckGap(df, dateColumn, freq, showGaps=True):
    """Report how many expected timestamps are missing from a date column.

    The expected timestamps are every step of ``freq`` between the earliest
    and latest value of the column (inclusive).

    Args:
        df: DataFrame containing the date column.
        dateColumn: Name of the column holding the dates. Values are parsed
            with ``pd.to_datetime``, so date strings are accepted as well as
            datetime64 values.
        freq: Frequency passed to ``pd.date_range`` (for example ``"D"``).
        showGaps: When true, print the set of missing timestamps.

    Returns:
        The string ``"The <dateColumn> has gap = <count>"``.
    """
    # Coerce first: comparing raw strings with the Timestamps produced by
    # date_range would flag every expected date as missing.
    dates = pd.to_datetime(df[dateColumn])
    first, last = dates.min(), dates.max()

    if pd.isna(first):
        # Empty or all-missing column: there is no range to check, and
        # date_range would raise on NaT bounds.
        missing = set()
    else:
        expected = pd.date_range(first, last, freq=freq)
        missing = set(expected) - set(dates)

    if showGaps:
        print(missing)
    return f"The {dateColumn} has gap = {len(missing)}"
  