-
Notifications
You must be signed in to change notification settings - Fork 1
/
unsupervised.py
101 lines (83 loc) · 3.71 KB
/
unsupervised.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
""" Implements the equal width and equal frequency binning """
import pandas as pd
import numpy as np
import warnings
from pandas.api.types import is_numeric_dtype
from ..utils import searchsorted, wrap_with_inf, assign_group, make_series
from .base import Binning
class EqualWidthBinning(Binning):
    """Equal width binning whose cutoff points are observed data values.

    ``n + 1`` evenly spaced targets are laid out between the minimum and the
    maximum of the non-null values, and every target is snapped to the closest
    value that actually occurs in the column.
    """

    def __init__(self,
                 n: int,
                 cols: list = None,
                 bins: dict = None,
                 encode: bool = True,
                 fill: int = -1):
        """
        :param n: Number of bins to split into
        :param cols: A list of columns to perform binning, if set to None, perform binning on all columns.
        :param bins: A dictionary mapping column name to cutoff points
        :param encode: If set to False, the result of transform will be right cutoff point of the interval
                       If the input has missing values, it will be put under a separate group with the largest bin value
        :param fill: Used to fill in missing value.
        """
        super().__init__(cols, bins, encode, fill)
        self.n = n

    def _fit(self, X: pd.Series, y=None, **fit_parmas):
        """ Fit a single feature and return the cutoff points

        :param X: The feature to derive cutoff points from
        :param y: Unused, accepted for interface compatibility
        :return: A list of ``n + 1`` cutoff points (repeated values are possible
                 when several targets snap to the same observation), or None
                 when X is not numeric or holds no non-null value
        """
        if not is_numeric_dtype(X):
            return None

        observed = X[X.notnull()]
        if observed.empty:
            # min()/max() would be NaN and argmin() on an empty array raises
            # ValueError; there is nothing to bin, so skip the feature the same
            # way a non-numeric one is skipped.
            # NOTE(review): assumes the base class treats None identically on
            # both paths - confirm against Binning.
            return None

        def find_nearest_element(series, elem):
            # Position of the observation with the smallest absolute distance
            # to the target; ties resolve to the first occurrence.
            min_idx = (series - elem).abs().values.argmin()
            return series.iloc[min_idx]

        v_min, v_max = observed.min(), observed.max()
        targets = np.linspace(v_min, v_max, self.n + 1)
        return [find_nearest_element(observed, target) for target in targets]
class EqualFrequencyBinning(Binning):
    """Equal frequency binning: cutoff points are evenly spaced ranks of the data.

    The non-null values are sorted and ``n + 1`` evenly spaced positions
    (first and last included) are read off as cutoff points.
    """

    def __init__(self,
                 n: int,
                 cols: list = None,
                 bins: dict = None,
                 encode: bool = True,
                 fill: int = -1):
        """
        :param n: Number of equal frequency intervals to split into
        :param cols: A list of columns to perform binning, if set to None, perform binning on all columns.
        :param bins: A dictionary mapping column name to cutoff points, if provided, n will be ignored
        :param encode: If set to False, the result of transform will be right cutoff point of the interval
        :param fill: Used to fill in missing value.
        """
        super().__init__(cols, bins, encode, fill)
        self.n = n

    def _fit(self, X: pd.Series, y=None, **fit_parmas):
        """ Fit a single feature and return the cutoff points

        :param X: The feature to derive cutoff points from
        :param y: Unused, accepted for interface compatibility
        :return: A set of cutoff points, or None when X is not numeric or
                 holds no non-null value
        """
        if not is_numeric_dtype(X):
            return None

        # Drop missing values before sorting so every position below refers to
        # a real observation.
        ordered = X[X.notnull()].sort_values().reset_index(drop=True)
        if ordered.empty:
            # linspace(0, -1, ...) would produce invalid positions; there is
            # nothing to bin, so skip the feature like a non-numeric one.
            # NOTE(review): assumes the base class treats None identically on
            # both paths - confirm against Binning.
            return None

        # dtype=int truncates the evenly spaced ranks to valid positions.
        positions = np.linspace(0, len(ordered) - 1, self.n + 1, dtype=int)
        cutoff = ordered.iloc[positions]
        # there might be duplicated cutoff points
        return set(cutoff)
def equal_width_binning(X: pd.Series, n: int, encode: bool = True, fill: int = -1):
    """ Shortcut for equal width binning on a Pandas.Series, returns
    the encoded series and the cutoff points

    :param X: The series to bin
    :param n: Number of bins to split into
    :param encode: Forwarded to EqualWidthBinning
    :param fill: Forwarded to EqualWidthBinning
    :return: A tuple of (binned series, cutoff points of that series)
    """
    frame = X.to_frame()
    # Read the column label back from the frame instead of using
    # ``X.name or 0``: an unnamed series still becomes column 0, while a
    # falsy but valid name such as "" is no longer mistaken for "unnamed".
    s_name = frame.columns[0]
    EWB = EqualWidthBinning(n, encode=encode, fill=fill)
    binned = EWB.fit_transform(frame)
    return binned[s_name], EWB.bins[s_name]
def equal_frequency_binning(X: pd.Series, n: int, encode: bool = True, fill: int = -1):
    """ Shortcut for equal frequency binning on a Pandas.Series, returns
    the encoded series and the cutoff points

    :param X: The series to bin
    :param n: Number of equal frequency intervals to split into
    :param encode: Forwarded to EqualFrequencyBinning
    :param fill: Forwarded to EqualFrequencyBinning
    :return: A tuple of (binned series, cutoff points of that series)
    """
    frame = X.to_frame()
    # Read the column label back from the frame instead of using
    # ``X.name or 0``: an unnamed series still becomes column 0, while a
    # falsy but valid name such as "" is no longer mistaken for "unnamed".
    s_name = frame.columns[0]
    EFB = EqualFrequencyBinning(n, encode=encode, fill=fill)
    binned = EFB.fit_transform(frame)
    return binned[s_name], EFB.bins[s_name]
if __name__ == '__main__':
    # Disabled manual smoke test: every statement below is commented out, so
    # this guard currently does nothing.
    # NOTE(review): the sketch hands a Series straight to fit_transform, while
    # the shortcut helpers in this module pass X.to_frame() - confirm which
    # input Binning accepts before re-enabling. EWB is created but never used.
    # s = pd.Series(list(range(20)) + [np.nan] * 4)
    # EWB = EqualWidthBinning(n=5, encode=True)
    # EFB = EqualFrequencyBinning(n=5, encode=False)
    # print(EFB.fit_transform(s))
    # print(EFB.bins)
    # print(EFB.transform(s))
    pass