# УРОВЕНЬ INTERN

## 2. SMAPE

* MAPE (Mean Absolute Percentage Error) — метрика ошибки, учитывающая средний процент отклонения предсказываемого значения от реального.

* sMAPE (от symmetric MAPE) — логическое продолжение MAPE, где в знаменателе стоит уже сумма модулей предсказания и факта, а не только факта

In [16]:
import numpy as np

def smape(y_true: np.array, y_pred: np.array) -> float:
    denominator = np.abs(y_true) + np.abs(y_pred)
    den = np.where(denominator > 0, 1, denominator)
    return np.mean(2 * np.abs(y_true - y_pred) / den )

In [17]:
y_true = np.array(0.5)
y_pred = np.array(50)
a = smape(y_true, y_pred)
a

99.0

## 3. VALID EMAILS

https://docs.python.org/3/library/re.html

Можно ускорить код засчет предварительной компиляции выражения

In [6]:
import re
from typing import List
import time
import numpy as np



def valid_emails(strings: List[str]) -> List[str]:
    """Take list of potential emails and returns only valid ones"""

    valid_email_regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"
    
    def is_valid_email(email: str) -> bool:
        return bool(re.fullmatch(valid_email_regex, email))
    
    start = time.perf_counter()
    emails = []
    for email in strings:
        if is_valid_email(email):
            emails.append(email)
            
    finish  = time.perf_counter() 
    
    print(f"Вычисление заняло {start - finish:0.4f} секунд")         

    return emails


In [7]:
def valid_emails_imrove(strings: List[str]) -> List[str]:
    """Take list of potential emails and returns only valid ones"""
    valid_email_regex = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z]{2,}$"
    valid = re.compile(valid_email_regex)
    
    def is_valid_email(email: str) -> bool:
        return bool(valid.fullmatch(email))
    
    start = time.perf_counter()
    emails = []
    for email in strings:
        if is_valid_email(email):
            emails.append(email)
    finish  = time.perf_counter()               
    print(f"Вычисление заняло {start - finish:0.4f} секунд")
    return emails

In [20]:
wrong = list(np.full(1000, 'sbsbfs@7hbhcw3'))
right = list(np.full(10, 'sfsd@mail.ru'))
emails = wrong+right

In [33]:
valid_emails(emails)

Вычисление заняло -0.0022 секунд


['sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru']

In [38]:
valid_emails_imrove(emails)

Вычисление заняло -0.0009 секунд


['sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru',
 'sfsd@mail.ru']

## 4. Cold start
Возьмем данные по количеству проданных товаров из задачи "SKU SALES" и заполним пропуски средним значением по имеющимся данным

In [5]:
import pandas as pd
import numpy as np

df = pd.read_excel('sku_sales.xlsx')
df.head()

Unnamed: 0,days,sku
0,2020-06-08,5
1,2020-06-10,5
2,2021-01-01,378
3,2021-01-02,1020
4,2021-01-03,2581


In [6]:
# Заменили все значения продаж в январе 2021 на na для проверки работоспособности функции
df.loc[df['days'].str.contains('2021-01'), 'sku'] = None

# Создаем столбец для дальнейшей группировки и подсчета среднего
df['day'] = df.days.apply(lambda x: x.split('-')[2])

In [7]:
import pandas as pd
import numpy as np


def fillna_with_mean(
    df: pd.DataFrame, target: str, group: str
) -> pd.DataFrame:
    df_new = df.copy() #важно не забывать делать копию, иначе изменим изначальный датафрейм
    group = df_new.groupby(group)[target].transform('mean')
    df_new.loc[df[target].isnull(), target] = group.apply(np.floor)
    return df_new


In [8]:
df_new = fillna_with_mean(df, 'sku', 'day')
df_new

Unnamed: 0,days,sku,day
0,2020-06-08,5.0,08
1,2020-06-10,5.0,10
2,2021-01-01,9343.0,01
3,2021-01-02,10509.0,02
4,2021-01-03,10308.0,03
...,...,...,...
361,2021-12-26,1081.0,26
362,2021-12-27,12879.0,27
363,2021-12-28,1588.0,28
364,2021-12-29,14686.0,29


In [9]:
df

Unnamed: 0,days,sku,day
0,2020-06-08,5.0,08
1,2020-06-10,5.0,10
2,2021-01-01,,01
3,2021-01-02,,02
4,2021-01-03,,03
...,...,...,...
361,2021-12-26,1081.0,26
362,2021-12-27,12879.0,27
363,2021-12-28,1588.0,28
364,2021-12-29,14686.0,29


## 5. Stocks

* SKU (Stock Keeping Unit), уникальный ID товара (тип int)
* GMV (Gross Merchandise Volume), аналог розничного товарооборота (тип float)
* stock – число единиц товара на складе (тип int)
* price – цена на товар (тип float)

**Notes:**
'/' - деление, '//' -целочисленное деление; '/' и округление работает быстрее

Ex.: 5 / 2 will return 2.5 and 5 // 2 will return 2

In [2]:
import pandas as pd
df_p = pd.read_excel('stocks.xlsx')
df_p

Unnamed: 0,sku,gmv,price,stock
0,100,400,100,3
1,200,350,70,10
2,300,500,120,5


In [3]:
def limit_gmv(df: pd.DataFrame):
    df.loc[df.price*df.stock < df.gmv, 'gmv'] = df.price*df.stock
    df.loc[df.gmv%df.price != 0, 'gmv'] = round(df.gmv/df.price)*df['price']     
    return df

In [4]:
limit_gmv(df_p)

Unnamed: 0,sku,gmv,price,stock
0,100,300,100,3
1,200,350,70,10
2,300,480,120,5


In [11]:
def limit_gmv(df: pd.DataFrame):
    """Принимает на вход датафрейм с предсказанием оборота,
        возвращает обработанные датафрейм"""
    df = df.copy()
    df['gmv'] = ((df.gmv/df.price).astype(int) * df['price']) \
                                .clip(upper = df.price*df.stock) 
    return df

In [12]:
limit_gmv(df_p)

Unnamed: 0,sku,gmv,price,stock
0,100,300,100,3
1,200,350,70,10
2,300,480,120,5


## 6. AVERAGE CHECK

https://clickhouse.com/docs/ru/sql-reference/aggregate-functions/reference/quantiles#quantiles 

 ```SQL
 select toStartOfMonth(toDate(buy_date)) as month,
        avg(check_amount) as avg_check,
        quantileExactExclusive(0.5)(check_amount) as median_check
from default.view_checks
group by month

 ```

## 7. DOCSTRING
http://daouzli.com/blog/docstring.html

**Pyment**
* https://github.com/dadadel/pyment
* http://daouzli.com/blog/pyment.html

In [20]:
#!pip install git+https://github.com/dadadel/pyment.git

In [27]:
#исходный файл (исправленный docstring.py)
def fix_params(params, space):
    params = dict(params)
    ...

    return params


def ranking(ser):
    """Make rank transformation.
    @param ser: Values for ranking. None interpreted as worst.
    @type ser: Series of float

    @return: Ranks (1: highest, N: lowest)
    @rtype: Series of int
    """
    ser = ser.fillna(ser.min())
    ...
    return rnk


def qround(x, a, b, q):
    '''Convert x to one of [a, a+q, a+2q, .., b]
    Parameters
    ----
    x : int or float
        Input value. x must be in [a, b].
        If x < a, x set to a.
        If x > b, x set to b.
    a : int or float
        Boundaries. b must be greater than a. Otherwize b set to a.
    b : int or float
        Boundaries. b must be greater than a. Otherwize b set to a.
    q : int or float
        Step value. If q and a are both integer, x set to integer too.
    Returns
    -------
    int or float
        Rounded value
    '''
    # Check if a <= x <= b
    b = max(a, b)
    x = min(max(x, a), b)
    ...
    return x

In [28]:
!pyment -o numpydoc -f false docstring.py

In [29]:
!patch -p1 < docstring.py.patch

patching file docstring.py


## 8. ASYMMETRIC METRICS
* https://alexanderdyakonov.files.wordpress.com/2018/10/book_08_metrics_12_blog1.pdf
* https://alexanderdyakonov.wordpress.com/2018/10/23/функции-ошибок-в-задачах-регрессии/

In [30]:
#Нужно определить сколько товара привезти на склад. Лучше привезти больше.
#Используем функцию, которая штрафует за недопрогноз

import numpy as np


def turnover_error(y_true: np.array, y_pred: np.array) -> float:
    res = np.sum(((y_pred - y_true)/y_pred)**2)
    return res

In [33]:
y_true = np.array([100])
y_pred = np.array([150])
#y_pred = np.array([50])
turnover_error(y_true, y_pred)

0.1111111111111111

In [62]:
#Прогнозируем LTV (LifeTime Value, сколько денег нам принесёт клиент за всё время, что будет пользоваться нашим сервисом)
#Лучше недооценить. Используем функцию, которая штрафует за перепрогноз
#Реализуем квантильную функцию потерь
import numpy as np

def ltv_error(y_true: np.array, y_pred: np.array) -> float:
    diff = y_true-y_pred
    diff = np.multiply(np.abs(diff), np.where(diff >= 0, 0.25, 0.75))
    error = np.sum(diff)
    return error

In [63]:
y_true = np.array([100])
y_pred = np.array([150])
#y_pred = np.array([50])
ltv_error(y_true, y_pred)

[-50]
[37.5]


37.5

## 9. DAU

```SQL
select toDate(timestamp) as day,
        uniqExact(user_id)
from default.churn_submits
group by day
```

## 10. TOP 10

TOP-10 брендов по количеству SKU 

```SQL
select brand,
        count(sku_type) as count_sku
from sku_dict_another_one
where brand is not null
group by brand
order by count_sku DESC
limit 10
```

TOP-10 SKU по количеству вендоров их продающих

```SQL
select sku_type,
        count(distinct vendor) as count_vendor
from sku_dict_another_one
group by sku_type
order by count_vendor DESC
limit 10
```

TOP-10 вендоров по количеству брендов, которые они продают
```SQL
select vendor,
       count(distinct brand) as brand
from sku_dict_another_one
where brand is not null
group by vendor
order by brand DESC
limit 10
```

TOP-10 вендоров по количеству SKU, которые они продают
```SQL
select vendor,
       count(sku_type) as sku
from sku_dict_another_one
where vendor is not null
group by vendor
order by sku DESC
limit 10
```

## 11. HELLO PYTEST

In [73]:
#!pip install -U pytest

In [65]:
#Пример теста
#Запускать тест командой pytest test_label.py
from sklearn.preprocessing import MultiLabelBinarizer


def test_multilabel_binarizer_empty_sample():
    mlb = MultiLabelBinarizer()
    y = [[1, 2], [1], []]
    Y = np.array([[1, 1], [1, 0], [0, 0]])
    assert_array_equal(mlb.fit_transform(y), Y)

In [66]:
#Напишем тесты для следующих функций
from typing import List


def profit(revenue: List[float], costs: List[float]) -> float:
    return sum(revenue) - sum(costs)


def margin(revenue: List[float], costs: List[float]) -> float:
    return (sum(revenue) - sum(costs)) / sum(revenue)


def markup(revenue: List[float], costs: List[float]) -> float:
    return (sum(revenue) - sum(costs)) / sum(costs)

In [70]:
#Тесты
def test_profit() -> None:
    assert profit([1, 2, 3], [1, 1, 1]) == 3
    assert profit([0, 0], [0, 0]) == 0
    assert profit([2.2, 0], [1, 1]) == 0.2
    assert profit([2, 0], [1, 3]) == -2
    assert profit([], []) == 0


def test_margin() -> None:
    assert margin([1, 2, 3], [1, 1, 1]) == 0.5
    assert margin([2, 0], [1, 1]) == 0
    assert margin([2, 0], [1, 3]) == -1


def test_markup() -> None:
    assert markup([1, 2, 3], [1, 1, 1]) == 1
    assert markup([2, 0], [1, 1]) == 0
    assert markup([2, 0], [1, 3]) == -0.5


In [76]:
!pytest hello_pytest.py

platform darwin -- Python 3.8.5, pytest-7.4.3, pluggy-1.3.0
rootdir: /Users/inessa/Desktop/git/ML_Simulator/INTERN/SQL&Pandas
plugins: anyio-3.7.1
collected 3 items                                                              [0m

hello_pytest.py [32m.[0m[32m.[0m[32m.[0m[32m                                                      [100%][0m



## 14. NEGATIVE TESTS

In [118]:
def ctr(clicks: int, views: int) -> float:
    """Click-through Rate."""

    # Check that the values are integers
    if not isinstance(clicks, int):
        raise TypeError("clicks must be an integer")

    if not isinstance(views, int):
        raise TypeError("views must be an integer")

    # Check that the values are positive
    if clicks < 0:
        raise ValueError("clicks must be positive")

    if views < 0:
        raise ValueError("views must be positive")

    # Check if clicks are greater than views
    if views < clicks:
        raise ValueError("clicks must be less than or equal to views")

    # Calculate the clickthrough rate
    if views:
        return clicks / views
    else:
        raise ZeroDivisionError("views must be greater than zero")


In [119]:
import metrics


def test_non_int_clicks():
    try:
        metrics.ctr(1.5, 2)
    except TypeError:
        pass
    else:
        raise AssertionError("Non int clicks not handled")


def test_non_int_views():
    try:
        metrics.ctr(1, 2.5)
    except TypeError:
        pass
    else:
        raise AssertionError("Non int views not handled")


def test_non_positive_clicks():
    try:
        metrics.ctr(-1, 2)
    except ValueError:
        pass
    else:
        raise AssertionError("Non positive clicks not handled")


def test_non_positive_views():
    try:
        metrics.ctr(1, -2)
    except ValueError:
        pass
    else:
        raise AssertionError("Non positive views not handled")


def test_clicks_greater_than_views():
    try:
        metrics.ctr(2, 1)
    except ValueError:
        pass
    else:
        raise AssertionError("Clicks greater than views not handled")


def test_zero_views():
    try:
        metrics.ctr(1, 0)
    except ZeroDivisionError:
        pass
    else:
        raise AssertionError("Zero views not handled")


## 15. RECALL @ K
Метрики классификации:
- habr https://habr.com/ru/companies/ods/articles/328372/
- Лекции Соколова https://github.com/esokolov/ml-course-msu/blob/master/ML15/lecture-notes/Sem05_metrics.pdf 

Про ROC-AUC
- https://alexanderdyakonov.wordpress.com/2017/07/28/auc-roc-площадь-под-кривой-ошибок/ 

In [109]:
from typing import List


def recall_at_k(labels: List[int], scores: List[float], k=5) -> float:
    """Compute recall at k.

    Args:
        y_true: list of true labels
        y_pred: list of predicted labels
        k: number of top labels to consider

    Returns:
        Recall at k
    """
    sl = sorted(zip(scores, labels), key=lambda t: t[0], reverse=True)
    label_sort = [x[1] for x in sl]
    tp = sum(label_sort[:k])
    fn = sum(label_sort[k:])
    
#Эталонное решение    
#     positive_class = np.argsort(scores)[::-1][:k]
#     negative_class = np.argsort(scores)[::-1][k:]
#     tp_rate = np.sum([labels[i] == 1 for i in positive_class])
#     fn_rate = np.sum([labels[i] == 1 for i in negative_class])

    return tp/(tp+fn)


def precision_at_k(labels: List[int], scores: List[float], k=5) -> float:
    """Compute precision at k.

    Args:
        y_true: list of true labels
        y_pred: list of predicted labels
        k: number of top labels to consider

    Returns:
        Precision at k
    """
    sl = sorted(zip(scores, labels), key=lambda t: t[0], reverse=True)
    label_sort = [x[1] for x in sl]
    tp = sum(label_sort[:k])
    fp = k-tp
    return tp/(fp+tp)


def specificity_at_k(labels: List[int], scores: List[float], k=5) -> float:
    """Compute specificity at k.

    Args:
        y_true: list of true labels
        y_pred: list of predicted labels
        k: number of top labels to consider

    Returns:
        Specificity at k
    """
    sl = sorted(zip(scores, labels), key=lambda t: t[0], reverse=True)
    label_sort = [x[1] for x in sl]
    tp = sum(label_sort[:k])
    fp = k-tp
    fn = sum(label_sort[k:])
    tn = len(label_sort) - k - fn
    if(fp+tn) == 0:
        return 0
    return tn/(fp+tn)


def f1_at_k(labels: List[int], scores: List[float], k=5) -> float:
    """Compute f1 score at k.

    Args:
        y_true: list of true labels
        y_pred: list of predicted labels
        k: number of top labels to consider

    Returns:
        F1 score at k
    """
    precision = precision_at_k(labels, scores, k)
    recall = recall_at_k(labels, scores, k)
    if(precision+recall) == 0:
        return 0
    return 2*precision*recall/(precision + recall)


In [110]:
scores = [0.85, 0.9, 0.78, 0.75, 0.68, 0.45, 0.23]
labels = [1, 0, 0, 1, 1, 0, 1]

In [114]:
print("Recall: ", recall_at_k(labels, scores))
print("Precision: ", precision_at_k(labels, scores))
print("Specificity: ", specificity_at_k(labels, scores))
print("F1 : ", f1_at_k(labels, scores))

Recall:  0.75
Precision:  0.6
Specificity:  0.3333333333333333
F1 :  0.6666666666666665
