In [1]:
import numpy as np
from typing import List, Dict, Tuple, Iterable, Any, Callable, Union, Optional
from math import log
from collections import Counter

## Gini Score

$$Gini = 1 - \sum p_{j}^{2}$$

In [2]:
def gini_score(values: Iterable) -> float:
    counts = Counter(values)
    total = sum(counts.values())
    return 1 - sum(
                    (cnt / total) ** 2 
                    for cnt
                    in counts.values()
                )

In [3]:
Counter(["A", "A", "A", "A", "B", "A", "A", "C", "D"])

Counter({'A': 6, 'B': 1, 'C': 1, 'D': 1})

In [4]:
gini_score(["A", "A", "A", "A", "B", "A", "A", "C", "D"]), gini_score([1, 2, 1, 2, 0, 1])

(0.5185185185185186, 0.6111111111111112)

## Entropy score

$$Entropy = - \sum p_{j}\log p_{j}$$

In [5]:
def entropy_score(values: Iterable) -> float:
    total  = len(values)
    return - sum((
                    (cnt / total) * log(cnt / total)
                    for cnt
                    in  Counter(values).values()
                ))

In [6]:
entropy_score(list(range(5_000)))

8.517193191415725

## MSE Score

$$MSE = \sum_{i=0}^{n} \frac{(\hat{y}_{i} - y_{i})^{2}}{n}$$

In [7]:
values = np.random.randint(0, 100, size=1_000_000)

In [8]:
def mse_score(values: Iterable) -> float:
    total = len(values)
    mean  = sum(values) / total
    return sum(
                (val - mean)**2
                for val
                in values
            ) / total

In [9]:
%%timeit

mse_score(values=values)

324 ms ± 3.85 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
def mse_score(values: Iterable) -> float:
    counts = Counter(values)
    total = sum(counts.values())
    mean  = sum(values) / total
    return sum(
                val_count * (val - mean)**2
                for val, val_count
                in counts.items()
            ) / total

In [11]:
%%timeit

mse_score(values=values)

95.7 ms ± 2.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [12]:
mse_score([-0.1, -2.0, 1.0, 1.8, 0.8])

1.688

## RMSE Score

$$RMSE = \sqrt{\sum_{i=0}^{n} \frac{(\hat{y}_{i} - y_{i})^{2}}{n}}$$

In [13]:
from math import sqrt


def rmse_score(values: Iterable) -> float:
    return sqrt(mse_score(values))

In [14]:
rmse_score([-0.1, -2.0, 1.0, 1.8, 0.8])

1.2992305415129373

## MAPE score

$$MAPE = \frac{100}{n} * \sum{\mid \frac{y^{'}_{i} - y_{i}}{y_{i}} \mid}$$

In [15]:
def mape_score(values: Iterable) -> float:
    counts = Counter(values)
    counts.pop(0)  # Выбрасываем нулевое значение
    total = sum(counts.values())
    mean = sum(values) / total
    return 100 / total * sum(
                                val_count * abs((val - mean) / val)
                                for val, val_count
                                in counts.items()
                            )

In [16]:
mape_score([-0.1, -2.0, 1.0, 0, 1.8, 0.8, 0, 0])

146.16666666666669

## Взвешенная оценка

$$Gini_{total} = \frac{N_{A}}{N_{A} + N_{B}}Gini_{A} + \frac{N_{B}}{N_{A} + N_{B}}Gini_{B} = \frac{6}{9}*0.611 + \frac{3}{9}*0.444 = 0.556$$

In [17]:
def weighted_metric_score(metric: Callable, *items: Iterable) -> float:
    total = sum(map(len, items))
    return sum(
                metric(vals) * len(vals)
                for vals
                in items
            ) / total

In [18]:
weighted_metric_score(gini_score, [1, 1, 2, 1, 3, 2], [3, 3, 2])

0.5555555555555556

## Разделенение группы

In [21]:
def split_target_by_feature(targets: Iterable, features: Iterable, split_value: Union[int, float]) -> Tuple[Iterable, Iterable]:
    return (
            [val for ind, val in zip(features, targets) if ind <= split_value],
            [val for ind, val in zip(features, targets) if ind >  split_value],
        )

In [22]:
def split_target_by_feature(targets: Iterable, features: Iterable, split_value: Union[int, float]) -> Tuple[Iterable, Iterable]:
    left  = []
    right = []
    for ind, val in zip(features, targets):
        if ind <= split_value:
            left.append(val)
        else:
            right.append(val)
    return left, right

In [23]:
split_target_by_feature(targets=[1,1,2,2,1,1,2,1,3,1,3,4,1], features=[3,1,9,9,1,5,2,1,7,1,3,4,9], split_value=5)

([1, 1, 1, 1, 2, 1, 1, 3, 4], [2, 2, 3, 1])

### Поиск оптимального "делителя"

In [27]:
from typing import NamedTuple

In [28]:
class SplitValueScore(NamedTuple):
    value: Union[int, float, str]
    score: float

In [43]:
from dataclasses import dataclass

In [54]:
@dataclass(repr=False)
class SplitValueScore:
    value: Union[int, float, str]
    score: float

In [52]:
class SplitValueScore:

    max_value = 0
    
    def __init__(self, value, score):
        self.value = value
        self.score = score
        SplitValueScore.max_value = max(SplitValueScore.max_value, self.score)

In [55]:
s1  = SplitValueScore(value=3, score=.02)
s1

<__main__.SplitValueScore at 0x24352febc10>

In [51]:
s1.value = 34

In [34]:
s1.value

3

In [35]:
def find_best_split_value(score_func: Callable, targets: Iterable, features: Iterable) -> SplitValueScore:
    options = {
                split_value: split_target_by_feature(
                                                        targets     = targets,
                                                        features    = features,
                                                        split_value = split_value
                                                    )
                for split_value
                in set(features)
            }
    scores = {
                split_value: weighted_metric_score(score_func, *split_results)
                for split_value, split_results
                in options.items()
            }
    best_split_value, min_score = min(scores.items(), key=lambda tup: tup[1])
    return SplitValueScore(value=best_split_value, score=min_score)

In [36]:
find_best_split_value(score_func=gini_score, targets=[1,1,2,2,1,1,2,1,3,1,3,4,1], features=[3,1,9,9,1,5,2,1,7,1,3,4,9])

SplitValueScore(value=1, score=0.49572649572649574)

In [39]:
score_func = gini_score
targets    = [1,1,2,2,1,1,2,1,3,1,3,4,1]
features   = [[0,1,2,3,0,2,3,1,2,2,3,0,0],
              [3,3,2,2,2,2,2,2,7,1,3,4,9],
              [3,1,9,9,1,5,2,1,7,1,3,4,9],
              [0,1,2,3,0,2,3,1,2,2,3,0,0],
              [3,1,9,9,1,5,2,1,7,1,3,4,9]]

In [41]:
def find_best_split_feature(score_func: Callable, targets: Iterable, features: Iterable) -> int:
    features_split_options = (
                                (i, find_best_split_value(score_func=score_func, targets=targets, features=row))
                                for i, row
                                in enumerate(features)
                            )
    best_feature_ind, split_value_score = min(features_split_options, key=lambda tup: tup[1].score)
    return best_feature_ind, split_value_score

In [42]:
find_best_split_feature(score_func, targets, features)

(0, SplitValueScore(value=2, score=0.47179487179487184))

## Node

In [52]:
from itertools import count

In [82]:
class Node:

    counter = count()

    def __init__(self, values: Iterable, parent: Optional["Node"] = None, children: Optional[List["Node"]] = None):
        self.values = values
        self.parent = parent
        self.children = children
        self.id = next(Node.counter)

    def __repr__(self) -> str:
        return f"Node {self.id}"

    def is_root(self) -> bool:
        return self.parent is None

    def is_leaf(self) -> bool:
        return self.children is None


In [83]:
node_1 = Node(values=[1,2,3])
node_2 = Node(values=[2,3], parent=node_1)

In [86]:
node_1.id

0

In [87]:
node_2.id

1

## Condition & ConditionsGroup

In [12]:
from functools import reduce
from operator import add

In [13]:
class ConditionsGroup:

    def __init__(self, conditions: List["Condition"]):
        self.conditions = conditions

    def __repr__(self) -> str:
        return " | ".join(map(str, self.conditions))

    def __add__(self, other: Union["Condition", "ConditionsGroup"]) -> "ConditionsGroup":
        if isinstance(other, ConditionsGroup):
            return reduce(add, [*self.conditions, *other.conditions])
        else:
            return other + self


In [14]:
class Condition:

    def __init__(self, feature: Union[str, int], greater: bool, value: Union[int, float]):
        self.feature = feature
        self.greater = greater
        self.value = value

    def __repr__(self) -> str:
        sign = ">" if self.greater else "<="
        return f"{self.feature} {sign} {self.value}"

    def __add__(self, other: Union["Condition", ConditionsGroup]) -> Union["Condition", ConditionsGroup]:
        if isinstance(other, Condition):
            return self._add_to_condition(other=other)
        if isinstance(other, ConditionsGroup):
            return self._add_to_conditions_group(other=other)
        raise ValueError("Expected or Condition or ConditionsGroup")

    def is_like(self, other: "Condition") -> bool:
        return (self.feature == other.feature) & (self.greater == other.greater)

    def _add_to_condition(self, other: "Condition") -> Union["Condition", ConditionsGroup]:
        if not self.is_like(other):
            return ConditionsGroup(conditions=[self, other])
        if self.greater:
            value = max(self.value, other.value)
        else:
            value = min(self.value, other.value)
        return Condition(feature=self.feature, greater=self.greater, value=value)

    def _add_to_conditions_group(self, other: ConditionsGroup) -> ConditionsGroup:
        similar_conditions   = [c for c in other.conditions if self.is_like(c)]
        different_conditions = [c for c in other.conditions if not self.is_like(c)]
        united_condition = reduce(add, similar_conditions, self)
        return ConditionsGroup(conditions=[united_condition, *different_conditions])


In [15]:
c1 = Condition(feature="рост", greater=False, value=180)
c2 = Condition(feature="рост", greater=True, value=160)
c3 = Condition(feature="рост", greater=True, value=150)

c4 = Condition(feature="вес", greater=False, value=90)
c5 = Condition(feature="вес", greater=False, value=85)
c6 = Condition(feature="вес", greater=True, value=45)

In [16]:
c1, c2

(рост <= 180, рост > 160)

In [17]:
c2 + c3 + c1 + (c3 + c4 + c5)

рост > 160 | вес <= 85 | рост <= 180

In [18]:
sum([c1, c2, c3, c4, c5, c6], start=c1)

вес > 45 | вес <= 85 | рост > 160 | рост <= 180

---

# Decorators

In [61]:
def say_hello(name: str, times: int = 1) -> str:
    return f"Hello, {name}" + "!" * times

In [60]:
say_hello("sergei", times=4)

'Hello, sergei!!!!'

In [62]:
"sergei".capitalize()

'Sergei'

In [63]:
def make_capital(func: Callable) -> Callable:
    
    def wrapped(name: str, *args, **kwargs) -> str:
        name = name.capitalize()
        print("finished capitalization")
        result = func(name, *args, **kwargs)
        print("got results")
        return result
    
    return wrapped

In [64]:
capitalized_say_hello = make_capital(func=say_hello)
capitalized_say_hello

<function __main__.make_capital.<locals>.wrapped(name: str, *args, **kwargs) -> str>

In [65]:
capitalized_say_hello(name="sergei", times=3)

finished capitalization
got results


'Hello, Sergei!!!'

In [66]:
@make_capital
def say_hello_2(name: str, times: int = 10) -> str:
    return f"Helloooooooooooo, {name}" + "!" * times

In [67]:
say_hello_2

<function __main__.make_capital.<locals>.wrapped(name: str, *args, **kwargs) -> str>

---

In [68]:
def weighted(func: Callable) -> Callable:

    def wrapped(*values: Iterable) -> float:
        total = sum(map(len, values))
        if not total:
            raise ValueError("You must pass at least one array")
        return sum(
                     func(vals) * len(vals)
                     for vals
                     in values
                ) / total

    wrapped.__doc__ = func.__doc__
    return wrapped

In [69]:
@weighted
def gini_score(values: Iterable) -> float:
    counts = Counter(values)
    total = sum(counts.values())
    return 1 - sum(
                    (cnt / total) ** 2 
                    for cnt
                    in counts.values()
                )

In [None]:
@weighted

In [72]:
gini_score(["A", "A", "A", "A", "B", "A", "A", "C", "D"], [1, 2, 1, 2, 0, 1], [1,5,1,5,11,1,1,1])

0.5471014492753624

In [74]:
from itertools import count

In [85]:
class Node:

    counter = count()

    def __init__(self, values: Iterable[int], number: int):
        self._id = next(Node.counter)
        self._number: int
        self.values = values
        self.number = number

    @property
    def number(self) -> int:
        return self._number

    @number.setter
    def number(self, value: int):
        print(f"setted new value {value}")
        self._number = value

    @property
    def id(self) -> int:
        return self._id

In [86]:
n1 = Node(values=[1,2,3], number=44)

setted new value 44


In [84]:
n1.id = 3

AttributeError: can't set attribute

In [77]:
n1._id = 3

0