In [141]:
from typing import Any, Dict, Union, List
from dataclasses import dataclass
from datetime import datetime

import pandas as pd


@dataclass
class Metric:
    """Common parent of every metric.

    A metric is a callable object: applying it to a DataFrame yields a
    dictionary describing the measured quantity.
    """

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        # The base metric measures nothing, so its report is always empty.
        report: Dict[str, Any] = {}
        return report


@dataclass
class CountTotal(Metric):
    """Report how many rows the DataFrame contains."""

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        row_count = len(df)
        return dict(total=row_count)


@dataclass
class CountZeros(Metric):
    """Number of zeros in the chosen column.

    Attributes:
        column: Name of the column to inspect.
    """

    column: str

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Count the rows whose value in ``column`` equals zero.

        Returns:
            A dict with ``total`` (row count), ``count`` (rows equal to zero)
            and ``delta`` (``count / total``; ``0.0`` for an empty frame).
        """
        n = len(df)
        # Vectorised sum of the boolean mask; cast to a plain Python int.
        k = int((df[self.column] == 0).sum())
        # Guard the share so that an empty frame does not raise ZeroDivisionError.
        return {"total": n, "count": k, "delta": k / n if n else 0.0}


@dataclass
class CountNull(Metric):
    """Number of rows with empty values in the chosen columns.

    Attributes:
        columns: Names of the columns to inspect; must not be empty.
        aggregation: ``"any"`` counts a row when at least one of the columns
            is null; any other value (conventionally ``"all"``) counts a row
            only when every listed column is null.
    """

    columns: List[str]
    aggregation: str = "any"  # either "all", or "any"

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Count the rows that are null according to ``aggregation``.

        Returns:
            A dict with ``total`` (row count), ``count`` (matching rows) and
            ``delta`` (``count / total``; ``0.0`` for an empty frame).

        Raises:
            ValueError: If ``columns`` is empty.
        """
        if not self.columns:
            raise ValueError("CountNull requires at least one column")

        n = len(df)

        # One boolean frame for all inspected columns, reduced row-wise.
        null_flags = df[list(self.columns)].isna()
        if self.aggregation == "any":
            mask = null_flags.any(axis=1)
        else:
            mask = null_flags.all(axis=1)

        k = int(mask.sum())
        # Guard the share so that an empty frame does not raise ZeroDivisionError.
        return {"total": n, "count": k, "delta": k / n if n else 0.0}


@dataclass
class CountDuplicates(Metric):
    """Number of duplicated rows with respect to the chosen columns.

    Attributes:
        columns: Column names passed as ``subset`` to ``DataFrame.duplicated``.
    """

    columns: List[str]

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Count the rows flagged by ``df.duplicated(subset=columns)``.

        Returns:
            A dict with ``total`` (row count), ``count`` (flagged rows) and
            ``delta`` (``count / total``; ``0.0`` for an empty frame).
        """
        n = len(df)
        # Vectorised sum of the boolean mask; cast to a plain Python int.
        k = int(df.duplicated(subset=self.columns).sum())
        # Guard the share so that an empty frame does not raise ZeroDivisionError.
        return {"total": n, "count": k, "delta": k / n if n else 0.0}


@dataclass
class CountValue(Metric):
    """Number of occurrences of a given value in the chosen column.

    Attributes:
        column: Name of the column to inspect.
        value: Value compared against every cell with ``==``.
    """

    column: str
    value: Union[str, int, float]

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Count the rows whose value in ``column`` equals ``value``.

        Returns:
            A dict with ``total`` (row count), ``count`` (matching rows) and
            ``delta`` (``count / total``; ``0.0`` for an empty frame).
        """
        n = len(df)
        # Vectorised sum of the boolean mask; cast to a plain Python int.
        k = int((df[self.column] == self.value).sum())
        # Guard the share so that an empty frame does not raise ZeroDivisionError.
        return {"total": n, "count": k, "delta": k / n if n else 0.0}


@dataclass
class CountBelowValue(Metric):
    """Number of values below a threshold.

    Attributes:
        column: Name of the column to inspect.
        value: Threshold the column is compared with.
        strict: If True use ``<``, otherwise ``<=``.
    """

    column: str
    value: float
    strict: bool = False

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Count the rows whose value in ``column`` is below ``value``.

        Returns:
            A dict with ``total`` (row count), ``count`` (matching rows) and
            ``delta`` (``count / total``; ``0.0`` for an empty frame).
        """
        n = len(df)
        series = df[self.column]
        mask = series < self.value if self.strict else series <= self.value
        # Vectorised sum of the boolean mask; cast to a plain Python int.
        k = int(mask.sum())
        # Guard the share so that an empty frame does not raise ZeroDivisionError.
        return {"total": n, "count": k, "delta": k / n if n else 0.0}


@dataclass
class CountBelowColumn(Metric):
    """Count how often column X is below column Y.

    Attributes:
        column_x: Name of the left-hand column.
        column_y: Name of the right-hand column.
        strict: If True use ``<``, otherwise ``<=``.
    """

    column_x: str
    column_y: str
    strict: bool = False

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Count the rows where ``column_x`` is below ``column_y``.

        Returns:
            A dict with ``total`` (row count), ``count`` (matching rows) and
            ``delta`` (``count / total``; ``0.0`` for an empty frame).
        """
        n = len(df)
        x, y = df[self.column_x], df[self.column_y]
        mask = x < y if self.strict else x <= y
        # Vectorised sum of the boolean mask; cast to a plain Python int.
        k = int(mask.sum())
        # Guard the share so that an empty frame does not raise ZeroDivisionError.
        return {"total": n, "count": k, "delta": k / n if n else 0.0}


@dataclass
class CountRatioBelow(Metric):
    """Count how often X / Y is below Z.

    Attributes:
        column_x: Name of the numerator column.
        column_y: Name of the denominator column.
        column_z: Name of the column the ratio is compared with.
        strict: If True use ``<``, otherwise ``<=``.
    """

    column_x: str
    column_y: str
    column_z: str
    strict: bool = False

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Count the rows where ``column_x / column_y`` is below ``column_z``.

        Rows whose ratio or threshold is NaN never satisfy the comparison and
        are therefore not counted.

        Returns:
            A dict with ``total`` (row count), ``count`` (matching rows) and
            ``delta`` (``count / total``; ``0.0`` for an empty frame).
        """
        n = len(df)
        # Compute the ratio once instead of repeating it in both branches.
        ratio = df[self.column_x] / df[self.column_y]
        threshold = df[self.column_z]
        mask = ratio < threshold if self.strict else ratio <= threshold
        # Vectorised sum of the boolean mask; cast to a plain Python int.
        k = int(mask.sum())
        # Guard the share so that an empty frame does not raise ZeroDivisionError.
        return {"total": n, "count": k, "delta": k / n if n else 0.0}


@dataclass
class CountCB(Metric):
    """Lower/upper quantile bounds of a column for a central N%-interval.

    Attributes:
        column: Name of the column whose quantiles are taken.
        conf: Interval level; ``(1 - conf) / 2`` is cut off from each tail.
    """

    column: str
    conf: float = 0.95

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        # Probability mass left outside the interval on each side.
        tail = (1 - self.conf) / 2
        lower, upper = df[self.column].quantile([tail, 1 - tail])
        return dict(lcb=lower, ucb=upper)


@dataclass
class CountLag(Metric):
    """A lag between the latest date in a column and today.

    Attributes:
        column: Name of the column holding dates as strings.
        fmt: ``strptime``/``strftime`` format of those strings; also used to
            format the dates in the result.
    """

    column: str
    fmt: str = "%Y-%m-%d"

    def __call__(self, df: pd.DataFrame) -> Dict[str, Any]:
        """Compute the number of whole days between the latest date and now.

        Returns:
            A dict with ``today`` and ``last_day`` (both formatted with
            ``fmt``) and ``lag`` (the ``days`` part of their difference).

        Raises:
            ValueError: If the column has no non-null values, or a value does
                not match ``fmt``.
        """
        # Missing values cannot be parsed or compared with strings, so drop them.
        dates = df[self.column].dropna()
        if dates.empty:
            raise ValueError(f"column {self.column!r} contains no dates")

        a = datetime.today()
        # Parse before taking the maximum: comparing the raw strings is only
        # chronological for formats such as "%Y-%m-%d". Parsing the distinct
        # values keeps the work small when many rows share a date.
        b = max(datetime.strptime(value, self.fmt) for value in dates.unique())
        lag = a - b
        return {"today": a.strftime(self.fmt), "last_day": b.strftime(self.fmt), "lag": lag.days}


In [154]:
# Load the daily sales fixture and preview its first rows.
df_sales = pd.read_csv("ke_daily_sales.csv")
df_sales.head()

Unnamed: 0,day,item_id,qty,price,revenue
0,2022-10-24,100,5,120.0,500.0
1,2022-10-24,100,6,120.0,720.0
2,2022-10-24,200,2,200.0,400.0
3,2022-10-24,300,10,85.0,850.0
4,2022-10-23,100,3,110.0,330.0


In [155]:
# Load the visits fixture and print its column dtypes and non-null counts.
df_visits = pd.read_csv("ke_visits.csv")
df_visits.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   day       9 non-null      object
 1   item_id   9 non-null      int64 
 2   views     9 non-null      int64 
 3   clicks    9 non-null      int64 
 4   payments  9 non-null      int64 
dtypes: int64(4), object(1)
memory usage: 488.0+ bytes


In [153]:
# Sanity checks for the metrics: each result dict is compared for exact equality.
ct_dict = {"total": 9}
ct = CountTotal()
assert ct(df_visits) == ct_dict

cz_dict = {"total": 9, "count": 1, "delta": 1 / 9}
cz = CountZeros(column="views")
assert cz(df_visits) == cz_dict

# Inject the missing value into a copy, so that df_visits itself stays intact
# and this cell gives the same result every time it is re-run.
cn_dict = {"total": 9, "count": 1, "delta": 1 / 9}
df_visits_null = df_visits.copy()
df_visits_null.loc[2, "views"] = None
cn = CountNull(columns=["item_id", "views"], aggregation="any")
assert cn(df_visits_null) == cn_dict

# "today" and "lag" depend on the run date, so derive them from the clock
# instead of hard-coding a date that has to be edited by hand before each run.
cl_today = datetime.today()
cl_last_day = datetime(2022, 10, 24)  # latest day present in ke_daily_sales.csv
cl_dict = {
    "today": cl_today.strftime("%Y-%m-%d"),
    "last_day": cl_last_day.strftime("%Y-%m-%d"),
    "lag": (cl_today - cl_last_day).days,
}
cl = CountLag(column="day")
assert cl(df=df_sales) == cl_dict