In [1]:
import os
import numpy as np
import pandas as pd
import functions as func

import concurrent.futures as futures
from functools import partial
from multiprocessing import Process, Pool
from concurrent.futures import ProcessPoolExecutor, Executor, as_completed

In [2]:
def read_data(path, filename):
    """Load a CSV file into a DataFrame and print a short summary of it.

    The column that pandas names ``'Unnamed: 0'`` (a header-less first
    column) is renamed to ``'Id'``.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pd.DataFrame
        The loaded data with ``'Unnamed: 0'`` renamed to ``'Id'``.
    """
    full_path = os.path.join(path, filename)
    print(f'reading file = {full_path}')
    data = pd.read_csv(full_path)
    data = data.rename(columns={'Unnamed: 0':'Id'})
    print(f'data shape = {data.shape}')
    # Name the counts explicitly instead of passing ``columns=`` to the
    # DataFrame constructor: since pandas 2.0 ``value_counts`` returns a Series
    # named 'count', and selecting a differently named column from it yields
    # no data at all.
    types_info = data.dtypes.value_counts().rename('columns_count').to_frame()
    print('types info about df columns: ')
    print(types_info)
    return data

In [3]:
ls -la -h Datasets/GiveMeSomeCredit/

total 28272
drwx------@ 6 adam  staff   192B Dec 23 21:30 [34m.[m[m/
drwxr-xr-x  8 adam  staff   256B Dec 23 21:36 [34m..[m[m/
-rwxr-xr-x@ 1 adam  staff    15K Dec 11  2019 [31mData Dictionary.xls[m[m*
-rwxr-xr-x@ 1 adam  staff   4.8M Dec 11  2019 [31mcs-test.csv[m[m*
-rwxr-xr-x@ 1 adam  staff   7.2M Dec 11  2019 [31mcs-training.csv[m[m*
-rwxr-xr-x@ 1 adam  staff   1.8M Dec 11  2019 [31msampleEntry.csv[m[m*


In [3]:
# Load the training and test CSVs (in that order) through the same helper.
train_data, test_data = (
    read_data(path='Datasets/GiveMeSomeCredit/', filename=csv_name)
    for csv_name in ('cs-training.csv', 'cs-test.csv')
)
# Column descriptions shipped with the dataset.
descript = pd.read_excel("Datasets/GiveMeSomeCredit/Data Dictionary.xls")
# sample_data = pd.read_csv("Datasets/GiveMeSomeCredit/sampleEntry.csv")

reading file = Datasets/GiveMeSomeCredit/cs-training.csv
data shape = (150000, 12)
types info about df columns: 
         columns_count
int64                8
float64              4
reading file = Datasets/GiveMeSomeCredit/cs-test.csv
data shape = (101503, 12)
types info about df columns: 
         columns_count
int64                7
float64              5


In [4]:
def transform_to_description(data):
    """Replace every cell value ``x`` with the degenerate interval ``(x, x)``.

    Parameters
    ----------
    data : pd.DataFrame
        Frame whose cells are turned into intervals.

    Returns
    -------
    pd.DataFrame
        New frame with the same columns and index as ``data``; every cell
        holds a 2-tuple ``(x, x)`` built from the original value.
    """
    def as_point_interval(value):
        return (value, value)

    interval_columns = {col: data[col].apply(as_point_interval) for col in data}
    return pd.DataFrame(interval_columns, columns=data.columns)

In [5]:
# Replace missing values with 0 and use the Id column as the client index.
train_data = train_data.fillna(0).set_index('Id')
test_data = test_data.fillna(0).set_index('Id')

# Keep the class label separately, then remove it from the training features.
train_label = train_data['SeriousDlqin2yrs'].copy()
train_data = train_data.drop(columns='SeriousDlqin2yrs')
# Drop the class column from the test data too, since it carries no information there.
test_data = test_data.drop(columns='SeriousDlqin2yrs')

In [6]:
train_data.shape, test_data.shape

((150000, 10), (101503, 10))

In [7]:
# Round every float-typed training column to two decimals; other columns are left as is.
float_cols = train_data.select_dtypes('float').columns
train_data = train_data.round(dict.fromkeys(float_cols, 2))
train_data.head()

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.77,45,2,0.8,9120.0,13,0,6,0,2.0
2,0.96,40,0,0.12,2600.0,4,0,0,0,1.0
3,0.66,38,1,0.09,3042.0,2,1,0,0,0.0
4,0.23,30,0,0.04,3300.0,5,0,0,0,0.0
5,0.91,49,1,0.02,63588.0,7,0,1,0,0.0


In [8]:
# Round every float-typed test column to two decimals; other columns are left as is.
float_cols = test_data.select_dtypes('float').columns
test_data = test_data.round(dict.fromkeys(float_cols, 2))
test_data.head()

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.89,43,0,0.18,5700.0,4,0,0,0,0.0
2,0.46,57,0,0.53,9141.0,15,0,4,0,2.0
3,0.04,59,0,0.69,5083.0,12,0,1,0,2.0
4,0.28,38,1,0.93,3200.0,7,0,2,0,0.0
5,1.0,27,0,0.02,3865.0,4,0,0,0,1.0


In [9]:
transformed_train = transform_to_description(train_data)
transformed_test = transform_to_description(test_data)
transformed_train.head(5)

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"(0.77, 0.77)","(45, 45)","(2, 2)","(0.8, 0.8)","(9120.0, 9120.0)","(13, 13)","(0, 0)","(6, 6)","(0, 0)","(2.0, 2.0)"
2,"(0.96, 0.96)","(40, 40)","(0, 0)","(0.12, 0.12)","(2600.0, 2600.0)","(4, 4)","(0, 0)","(0, 0)","(0, 0)","(1.0, 1.0)"
3,"(0.66, 0.66)","(38, 38)","(1, 1)","(0.09, 0.09)","(3042.0, 3042.0)","(2, 2)","(1, 1)","(0, 0)","(0, 0)","(0.0, 0.0)"
4,"(0.23, 0.23)","(30, 30)","(0, 0)","(0.04, 0.04)","(3300.0, 3300.0)","(5, 5)","(0, 0)","(0, 0)","(0, 0)","(0.0, 0.0)"
5,"(0.91, 0.91)","(49, 49)","(1, 1)","(0.02, 0.02)","(63588.0, 63588.0)","(7, 7)","(0, 0)","(1, 1)","(0, 0)","(0.0, 0.0)"


In [10]:
transformed_train.shape, transformed_test.shape

((150000, 10), (101503, 10))

In [11]:
def similarity(vect1, vect2):
    """Return, feature by feature, the smallest interval covering both inputs.

    Each element of ``vect1`` and ``vect2`` is an interval ``(low, high)``.
    The elements are paired positionally and each pair is merged into
    ``(min(low1, low2), max(high1, high2))``.

    Parameters
    ----------
    vect1, vect2 : pd.Series
        Interval descriptions to merge.

    Returns
    -------
    pd.Series
        Merged intervals, indexed like ``vect1``.
    """
    merged = [
        (min(left[0], right[0]), max(left[1], right[1]))
        for left, right in zip(vect1, vect2)
    ]
    return pd.Series(merged, index=vect1.index)

def inclusion(obj, patterns):
    """Tell whether ``obj`` equals at least one element of ``patterns``.

    Equality is decided by ``obj.equals(pattern)``.

    Parameters
    ----------
    obj : pd.Series
        Description to look for.
    patterns : iterable of pd.Series
        Candidate descriptions.

    Returns
    -------
    bool
        True if any pattern equals ``obj``, False otherwise (including when
        ``patterns`` is empty).
    """
    for candidate in patterns:
        if obj.equals(candidate):
            return True
    return False

**Алгоритм из работы Алексея (QBCA)**

In [12]:
# Mining hyper-parameters.
alpha = 0.003  # multiplied by a class size to get the allowed number of covered objects
sample_ratio = 0.003  # fraction of the mined class drawn into each random sample
num_iters = 1000  # number of random samples tried per mining run

# Look the class sizes up by label value (0 = negative, 1 = positive) rather
# than by position in ``value_counts()``: the positional order depends on
# which class happens to be the most frequent.
label_counts = train_label.value_counts()
N_neg = int(label_counts.get(0, 0))
N_pos = int(label_counts.get(1, 0))
N_neg, N_pos

(139974, 10026)

In [13]:
# Split the interval descriptions by class label.
is_positive = train_label.eq(1)
is_negative = train_label.eq(0)
train_pos = transformed_train.loc[train_label.index[is_positive.to_numpy()]]
train_neg = transformed_train.loc[train_label.index[is_negative.to_numpy()]]
# The index is not reset on purpose: it is the client Id and carries meaningful information.
train_neg.shape, train_pos.shape

((139974, 10), (10026, 10))

Берём выборку займов для начального тестирования работоспособности

In [14]:
# Keep a fixed-seed subsample of 1000 objects per class.
train_pos, train_neg = (
    frame.sample(n=1000, replace=False, random_state=123)
    for frame in (train_pos, train_neg)
)
train_pos.shape, train_neg.shape

((1000, 10), (1000, 10))

**`get_similarity_sample_repr` и `is_included_in_repr`, по-видимому, работают правильно**

По крайней мере на тестовых объектах показали правильные результаты

In [15]:
def get_similarity_sample_repr(sample: pd.DataFrame):
    """Fold all rows of ``sample`` into one description with ``similarity``.

    Parameters
    ----------
    sample : pd.DataFrame
        Rows of interval descriptions, taken from one class.

    Returns
    -------
    pd.Series or None
        The first row combined with every following row by ``similarity``,
        in row order. A single-row sample returns that row unchanged; an
        empty sample returns None.
    """
    rows = (row for _, row in sample.iterrows())
    pattern = next(rows, None)
    for row in rows:
        pattern = similarity(pattern, row)
    return pattern
    
# Find the objects that fall inside a feature representation
def is_included_in_repr(d, train_data):
    """Return the rows of ``train_data`` that are covered by description ``d``.

    A row counts as covered when merging it into ``d`` with ``similarity``
    leaves ``d`` unchanged, i.e. ``d.equals(similarity(row, d))``.

    Parameters
    ----------
    d : pd.Series
        Interval description to test against.
    train_data : pd.DataFrame
        Candidate rows.

    Returns
    -------
    list of pd.Series
        The covered rows, in their original order (empty list if none).
    """
    return [
        row
        for _, row in train_data.iterrows()
        if d.equals(similarity(row, d))
    ]

Что нужно посмотреть:

    1. Сравнение количества генерируемых гипотез в зависимости от критерия и области генерации выборки.
    (Возможно изобразить всего 4 варианта: старый подход - локальная или случайная выборка, новый подход - 
    локальная или случайная выборка)
    
    2. Сравнение генерируемых гипотез для старого и нового критерия в зависимости от ширины локальной области генерации
    (Возможно, есть какая-то оптимальная ширина окна)
    

### Mining step


    Для положительного класса нас интересуют объекты отрицательного класса, 
    а для отрицательного — положительные.
    Уже после определения объектов из другого класса, попадающих в признаковое представление сэмпла данных, будет приниматься решение
     о включении этого признакового представления в список гипотез-представлений (областей или интервальных представлений).
    

To-do-list:

1. **Нужно также проанализировать время работы и качество** done!!!
2. **классификации при разном включении признаков из исходного множества**

To-do-list
1. Подумать о том, каким образом генерить гипотезы. То есть выбирать не случайно выборку из множества объектов,
    а какую-то локальную область. (Делать будем через расширение области объекта; более того, будем искать 
    оптимальное значение расширения локальной области)

In [46]:
def generate_gypothesis(iteration: int,
                        obj: pd.Series,
                        train_data: pd.DataFrame,
                        other_data: pd.DataFrame,
                        sample_size: int,
                        hypothesis_criterion: str,
                        verbose: bool,
                        other_data_size: int,
                        alpha: float):
    """Build one candidate hypothesis from a random sample and test it.

    A random sample of ``sample_size`` rows of ``train_data`` (drawn without
    replacement) is extended with ``obj`` and folded into one interval
    description ``d``. ``d`` is accepted as a hypothesis when the number of
    ``other_data`` objects it covers does not exceed a threshold.

    Parameters
    ----------
    iteration : int
        Iteration number; used only for the progress message.
    obj : pd.Series
        Object under classification; it must have a name, which becomes its
        row label in the sample.
    train_data : pd.DataFrame
        Objects of the class being mined.
    other_data : pd.DataFrame
        Objects of the opposite class.
    sample_size : int
        Number of ``train_data`` rows drawn into the sample.
    hypothesis_criterion : str
        ``'contr_class'``: threshold is ``int(other_data_size * alpha)``.
        ``'both_classes'``: threshold is ``int(k * alpha)`` where ``k`` is the
        number of ``train_data`` objects covered by ``d``.
    verbose : bool
        Print progress messages.
    other_data_size : int
        Size used for the ``'contr_class'`` threshold.
    alpha : float
        Multiplier that turns a size into the threshold.

    Returns
    -------
    pd.Series or None
        The description ``d`` if it passes the criterion; None if it does not
        or if ``hypothesis_criterion`` is not one of the two known values.
    """
    if verbose:
        print(f'iteration: {iteration}')

    # An unseeded RandomState is created on every call -- presumably so that
    # worker processes do not repeat each other's draws (TODO confirm).
    inds = np.random.RandomState().choice(train_data.index, replace=False, size=sample_size)
    # DataFrame.append was removed in pandas 2.0; add the object as a
    # one-row frame labelled with the object's name instead.
    sample = pd.concat([train_data.loc[inds, :], obj.to_frame().T])

    d = get_similarity_sample_repr(sample)
    if verbose:
        print('got feature represantation for sample')

    # Always a list (possibly empty), so no None check is needed.
    d_other_objects = is_included_in_repr(d, train_data=other_data)

    # Criterion check
    if hypothesis_criterion == 'contr_class':
        threshold = int(other_data_size * alpha)
    elif hypothesis_criterion == 'both_classes':
        # Additionally count the objects of the mined class covered by d.
        d_target_objects = is_included_in_repr(d, train_data=train_data)
        threshold = int(len(d_target_objects) * alpha)
    else:
        return None

    return d if len(d_other_objects) <= threshold else None

def mining_step(test_obj: pd.Series, train_pos: pd.DataFrame, train_neg: pd.DataFrame,
                num_iters: int, sample_ratio: float, alpha: float, hypothesis_criterion: str,
                mining_type: str = 'pos', verbose : bool = False, n_jobs : int = 4):
    """Mine hypotheses for ``test_obj`` from one class using a process pool.

    ``generate_gypothesis`` is run ``num_iters`` times in worker processes and
    every non-None result is kept.

    Parameters
    ----------
    test_obj : pd.Series
        Object under classification; added to every random sample.
    train_pos, train_neg : pd.DataFrame
        Interval descriptions of the positive and negative training objects.
    num_iters : int
        Number of candidate hypotheses to generate.
    sample_ratio : float
        Fraction of the mined class drawn into each random sample.
    alpha : float
        Multiplier used for the acceptance threshold.
    hypothesis_criterion : str
        ``'contr_class'`` for the base (old) criterion, which looks at the
        intersection with the opposite class only; ``'both_classes'`` for the
        new criterion, which looks at the intersection with both classes.
    mining_type : str
        ``'pos'`` to mine from ``train_pos`` against ``train_neg``,
        ``'neg'`` for the reverse.
    verbose : bool
        Passed through to ``generate_gypothesis``.
    n_jobs : int
        Number of worker processes.

    Returns
    -------
    list of pd.Series
        The accepted hypotheses.

    Raises
    ------
    ValueError
        If ``mining_type`` or ``hypothesis_criterion`` is not a known value.
    """
    # Fail fast: a misspelt option would otherwise silently mine the wrong
    # class or silently return an empty list after all iterations.
    if mining_type not in ('pos', 'neg'):
        raise ValueError(f"mining_type must be 'pos' or 'neg', got {mining_type!r}")
    if hypothesis_criterion not in ('contr_class', 'both_classes'):
        raise ValueError(
            "hypothesis_criterion must be 'contr_class' or 'both_classes', "
            f"got {hypothesis_criterion!r}"
        )

    if mining_type == 'pos':
        train_data, other_data = train_pos, train_neg
    else:
        train_data, other_data = train_neg, train_pos
    other_data_size = other_data.shape[0]

    sample_size = int(train_data.shape[0] * sample_ratio)
    print(f'using sample size = {sample_size}')
    print(f'other data size = {other_data_size}')
    print(f'other data rate size = {int(other_data_size * alpha)}')
    print('start generating hypothesises')

    mining = partial(generate_gypothesis,
                     obj=test_obj,
                     train_data=train_data,
                     other_data=other_data,
                     sample_size=sample_size,
                     hypothesis_criterion=hypothesis_criterion,
                     verbose=verbose,
                     other_data_size=other_data_size,
                     alpha=alpha
                    )

    # The partial carries both DataFrames and is pickled once per submitted
    # chunk, so batching iterations cuts the serialisation overhead.
    chunksize = max(1, num_iters // (4 * n_jobs)) if n_jobs else 1

    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        results = list(executor.map(mining, range(num_iters), chunksize=chunksize))

    return [res for res in results if res is not None]
    

In [23]:
test_sample = transformed_test.loc[:2]
test_sample

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"(0.89, 0.89)","(43, 43)","(0, 0)","(0.18, 0.18)","(5700.0, 5700.0)","(4, 4)","(0, 0)","(0, 0)","(0, 0)","(0.0, 0.0)"
2,"(0.46, 0.46)","(57, 57)","(0, 0)","(0.53, 0.53)","(9141.0, 9141.0)","(15, 15)","(0, 0)","(4, 4)","(0, 0)","(2.0, 2.0)"


In [24]:
num_iters, alpha, sample_ratio

(1000, 0.003, 0.003)

In [26]:
%%time
for i, obj in test_sample.iterrows():
    
    print(f'start mining from pos objects')
    pos_hyps = mining_step(test_obj=obj, train_pos=train_pos, train_neg=train_neg, 
                           num_iters=num_iters,sample_ratio=sample_ratio, alpha = alpha,
                           hypothesis_criterion='contr_class', mining_type='pos',
                           verbose=True, n_jobs=4
                          )
    
    print(f'start mining from neg objects')
    neg_hyps = mining_step(test_obj=obj, train_pos=train_pos, train_neg=train_neg,
                           num_iters=num_iters, sample_ratio=sample_ratio, alpha = alpha, 
                           hypothesis_criterion='contr_class',mining_type='neg', 
                           verbose=True, n_jobs=4
                          )

start mining from pos objects
using sample size = 3
other data size = 1000
other data rate size = 3
start generating hypothesises
iteration: 0
got feature represantation for sample
iteration: 1
got feature represantation for sample
iteration: 2
got feature represantation for sample
iteration: 3
got feature represantation for sample
got 18 d_sim_objects
got 44 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 4
got feature represantation for sample
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 5
got feature represantation for sample
got 14 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 6
got feature represantation for sample
got 39 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 7
got feature represantation for sample
got 75 d_sim_objects
thresh for hypothesis = 3
got objects

iteration: 55
got feature represantation for sample
got 14 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got 22 d_sim_objects
thresh for hypothesis = 3
iteration: 56
got objects that is included in sample represantation
iteration: 57
got feature represantation for sample
got feature represantation for sample
got 65 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 58
got feature represantation for sample
got 12 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 59
got feature represantation for sample
got 4 d_sim_objects
got 142 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 60
thresh for hypothesis = 3
got feature represantation for sample
got objects that is included in sample represantation
iteration: 61
got feature represantation for sample
got 15 d_sim_objects
thresh fo

got objects that is included in sample represantation
got 39 d_sim_objects
thresh for hypothesis = 3
iteration: 109
got objects that is included in sample represantation
got feature represantation for sample
iteration: 110
got 25 d_sim_objects
got feature represantation for sample
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 111
got feature represantation for sample
got 8 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 112
got feature represantation for sample
got 25 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 113
got feature represantation for sample
got 64 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 114
got feature represantation for sample
got 57 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 115
got feat

got feature represantation for sample
iteration: 162
thresh for hypothesis = 3
got objects that is included in sample represantation
got feature represantation for sample
iteration: 163
got feature represantation for sample
got 16 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 164
got feature represantation for sample
got 27 d_sim_objects
thresh for hypothesis = 3
got 15 d_sim_objects
got objects that is included in sample represantation
thresh for hypothesis = 3
got 121 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 165
got objects that is included in sample represantation
got feature represantation for sample
iteration: 166
iteration: 167
got feature represantation for sample
got feature represantation for sample
got 7 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 168
got feature represantation for sample
got 45

got 61 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 216
got feature represantation for sample
got 21 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 217
got feature represantation for sample
got 30 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 218
got feature represantation for sample
got 63 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 219
got feature represantation for sample
got 16 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 220
got 4 d_sim_objects
thresh for hypothesis = 3
got feature represantation for sample
got objects that is included in sample represantation
iteration: 221
got feature represantation for sample
got 120 d_sim_objects
thresh for hypothesis = 3
got objects that is included 

got objects that is included in sample represantation
iteration: 269
got feature represantation for sample
got 29 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 270
got feature represantation for sample
got 5 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 271
got feature represantation for sample
got 42 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 272
got feature represantation for sample
got 20 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 273
got feature represantation for sample
got 22 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 274
got feature represantation for sample
got 3 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 275
got featu

iteration: 322
got feature represantation for sample
got 10 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 323
got feature represantation for sample
got 8 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 324
got feature represantation for sample
got 47 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 325
got feature represantation for sample
got 98 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 326
got feature represantation for sample
got 3 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 327
got feature represantation for sample
got 16 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 328
got feature represantation for sample
got 14 d_sim_objects
thre

got 57 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 376
got feature represantation for sample
got 8 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 377
got feature represantation for sample
got 187 d_sim_objects
thresh for hypothesis = 3
got 88 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got objects that is included in sample represantation
got 5 d_sim_objects
iteration: 378
thresh for hypothesis = 3
iteration: 379
got feature represantation for sample
got objects that is included in sample represantation
got feature represantation for sample
iteration: 380
got feature represantation for sample
got 9 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 381
got feature represantation for sample
got 19 d_sim_objects
thresh for hypothesis = 3
got objects that is included in

got objects that is included in sample represantation
iteration: 429
got feature represantation for sample
got 22 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 430
got feature represantation for sample
got 10 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 431
got feature represantation for sample
got 19 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 432
got feature represantation for sample
got 53 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 433
got feature represantation for sample
got 28 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 434
got feature represantation for sample
got 73 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got 45 d_sim_objects
t

iteration: 482
got feature represantation for sample
got 18 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 483
got feature represantation for sample
got 4 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 484
got feature represantation for sample
got 3 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 485
got feature represantation for sample
got 95 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got 212 d_sim_objects
thresh for hypothesis = 3
iteration: 486
got objects that is included in sample represantation
got feature represantation for sample
got 54 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 488
got feature represantation for sample
iteration: 487
got feature represantation for sample
got 11 d_sim_objects
thr

got 38 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 536
got feature represantation for sample
got 9 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 537
got feature represantation for sample
got 217 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got 30 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 539
iteration: 538
got feature represantation for sample
got feature represantation for sample
got 53 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 540
got feature represantation for sample
got 52 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 541
got feature represantation for sample
got 18 d_sim_objects
got 10 d_sim_objects
thresh for hypothesis = 3
thresh f

got 11 d_sim_objects
got objects that is included in sample represantation
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 590
got feature represantation for sample
iteration: 589
got 9 d_sim_objects
got feature represantation for sample
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 591
got feature represantation for sample
got 30 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 592
got feature represantation for sample
got 26 d_sim_objects
thresh for hypothesis = 3
got 19 d_sim_objects
got objects that is included in sample represantation
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 593
got feature represantation for sample
iteration: 594
got 3 d_sim_objects
got feature represantation for sample
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 595
got featu

iteration: 642
got feature represantation for sample
got 5 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 643
got feature represantation for sample
got 37 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 644
got feature represantation for sample
got 24 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 645
got feature represantation for sample
got 144 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 646
got feature represantation for sample
got 10 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 647
got feature represantation for sample
got 22 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 648
got feature represantation for sample
got 22 d_sim_objects
th

got 20 d_sim_objects
thresh for hypothesis = 3
got 4 d_sim_objects
got objects that is included in sample represantation
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 696
got feature represantation for sample
iteration: 697
got feature represantation for sample
got 60 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 698
got feature represantation for sample
got 78 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 699
got feature represantation for sample
got 157 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 700
got 5 d_sim_objects
got feature represantation for sample
thresh for hypothesis = 3
got objects that is included in sample represantation
got 73 d_sim_objects
iteration: 701
thresh for hypothesis = 3
got objects that is included in sample represantation
got feature re

got objects that is included in sample represantation
got 113 d_sim_objects
thresh for hypothesis = 3
iteration: 749
got objects that is included in sample represantation
got feature represantation for sample
iteration: 750
got feature represantation for sample
got 13 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 751
got feature represantation for sample
got 18 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 752
got feature represantation for sample
got 13 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 753
got feature represantation for sample
got 55 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 754
got feature represantation for sample
got 39 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 755
got fe

iteration: 802
got 8 d_sim_objects
thresh for hypothesis = 3
got feature represantation for sample
got objects that is included in sample represantation
iteration: 803
got feature represantation for sample
got 189 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 804
got feature represantation for sample
got 34 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 805
got feature represantation for sample
got 56 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got 72 d_sim_objects
iteration: 806
thresh for hypothesis = 3
got objects that is included in sample represantation
got feature represantation for sample
iteration: 807
got feature represantation for sample
got 13 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 808
got feature represantation for sample
got 2 d_sim_objects
thr

got 29 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 856
got feature represantation for sample
got 22 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 857
got feature represantation for sample
got 7 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 858
got 14 d_sim_objects
got feature represantation for sample
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 859
got feature represantation for sample
got 8 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got 60 d_sim_objects
thresh for hypothesis = 3
iteration: 860
got objects that is included in sample represantation
got feature represantation for sample
iteration: 861
got feature represantation for sample
got 62 d_sim_objects
thresh for hypothesis = 3
got objects that is included in

got objects that is included in sample represantation
iteration: 909
got 117 d_sim_objects
got feature represantation for sample
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 910
got feature represantation for sample
got 4 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 911
got feature represantation for sample
got 336 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 912
got feature represantation for sample
got 21 d_sim_objects
got 25 d_sim_objects
thresh for hypothesis = 3
thresh for hypothesis = 3
got objects that is included in sample represantation
got objects that is included in sample represantation
iteration: 913
got feature represantation for sample
iteration: 914
got feature represantation for sample
got 11 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 915
got fe

got feature represantation for sample
thresh for hypothesis = 3
iteration: 962
got objects that is included in sample represantation
got feature represantation for sample
iteration: 963
got feature represantation for sample
got 19 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 964
got feature represantation for sample
got 1 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 965
got 4 d_sim_objects
got 22 d_sim_objects
thresh for hypothesis = 3
got feature represantation for sample
got objects that is included in sample represantation
thresh for hypothesis = 3
got objects that is included in sample represantation
iteration: 966
iteration: 967
got feature represantation for sample
got feature represantation for sample
got 13 d_sim_objects
thresh for hypothesis = 3
got objects that is included in sample represantation
got 3 d_sim_objects
iteration: 968
got 208 d_sim_objects
got 

In [108]:
# a = pd.Series({'feat1':(1,1), 'feat2':(0, 1.4), 'feat3':(3, 4)})
# b = pd.Series({'feat1':(1,2), 'feat2':(1, 1.4), 'feat3':(7, 7)})
# c = pd.Series({'feat1':(1,1), 'feat2':(0.8, 1.6), 'feat3':(3, 4)})
# d = pd.Series({'feat1':(2,3), 'feat2':(0, 1.4), 'feat3':(4, 6)})


# print(a.equals(b))
# print(a == b)

# test1 = pd.Series({'feat1':(1,3), 'feat2':(1, 3.4), 'feat3':(1, 1.9)})
# test2 = pd.Series({'feat1':(2,2.2), 'feat2':(2, 2.4), 'feat3':(3, 3.9)})
# test3 = pd.Series({'feat1':(3,3), 'feat2':(4, 4.4), 'feat3':(4, 4.9)})

g1 = pd.Series({'feat1':(1,1), 'feat2':(1.5, 1.5)})
g2 = pd.Series({'feat1':(-1,-1), 'feat2':(0, 0)})
g3 = pd.Series({'feat1':(0.5,0.5), 'feat2':(1, 1)})

test1 = pd.Series({'feat1':(0.1, 1), 'feat2':(-0.5, 0)})
test2 = pd.Series({'feat1':(-0.1, 1), 'feat2':(0.5, 0.8)})
tst = pd.Series({'feat1':(-1, 1), 'feat2':(0, 1.5)})

In [85]:
d = similarity(g1, g2)
d

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

In [86]:
d

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

In [96]:
d.equals(tst)

True

In [92]:
tst

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

In [103]:
inter_sample = get_similarity_sample_repr(sample)
inter_sample

feat1    (0.5, 1)
feat2    (1, 1.5)
dtype: object

In [109]:
sample = pd.DataFrame([g1, g2, g3, test1, test2])
sample

Unnamed: 0,feat1,feat2
0,"(1, 1)","(1.5, 1.5)"
1,"(-1, -1)","(0, 0)"
2,"(0.5, 0.5)","(1, 1)"
3,"(0.1, 1)","(-0.5, 0)"
4,"(-0.1, 1)","(0.5, 0.8)"


In [104]:
g1

feat1        (1, 1)
feat2    (1.5, 1.5)
dtype: object

In [111]:
sample

Unnamed: 0,feat1,feat2
0,"(1, 1)","(1.5, 1.5)"
1,"(-1, -1)","(0, 0)"
2,"(0.5, 0.5)","(1, 1)"
3,"(0.1, 1)","(-0.5, 0)"
4,"(-0.1, 1)","(0.5, 0.8)"


In [124]:
is_included_in_repr(d, sample)

[feat1        (1, 1)
 feat2    (1.5, 1.5)
 Name: 0, dtype: object, feat1    (-1, -1)
 feat2      (0, 0)
 Name: 1, dtype: object, feat1    (0.5, 0.5)
 feat2        (1, 1)
 Name: 2, dtype: object, feat1     (-0.1, 1)
 feat2    (0.5, 0.8)
 Name: 4, dtype: object]

In [125]:
d

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

Тестовая функция проверки np.random.choice

In [40]:
def compute_stat(times, data):
    """Draw 10 distinct elements of ``data`` at random and return their sum.

    Parameters
    ----------
    times : int
        Task number; only printed.
    data : array-like
        Population to draw from; needs at least 10 elements because the
        draw is without replacement.

    Returns
    -------
    The sum of the drawn elements, as a NumPy scalar.
    """
    print(f"times = {times}")
    rng = np.random.RandomState()  # unseeded, so each call draws independently
    sample = rng.choice(data, size=10, replace=False)
    print(f"sample = {sample}")
    sample_sum = sample.sum()
    print(f"sample sum = {sample_sum}")
    return sample_sum

def run_computing(n_jobs=2):
    """Run ``compute_stat`` for 26 task numbers in a process pool.

    Every task draws from the same population ``0..99``.

    Parameters
    ----------
    n_jobs : int
        Number of worker processes.

    Returns
    -------
    iterator
        The iterator returned by ``executor.map``; it yields the sample sums
        in task order.
    """
    worker = partial(compute_stat, data=list(range(100)))
    task_ids = list(range(26))

    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        outcome = pool.map(worker, task_ids)

    return outcome

In [42]:
t = run_computing(n_jobs=6)

times = 0
times = 1
times = 2
times = 3
times = 4
times = 5
sample = [32  5 50 21  9 43 94  8 96 38]
sample = [11 35 37 89 88 72 50 28 38 34]
sample = [97 53 52 93 81 86 58 35 49 68]
sample = [72  5 41 66 65 18 26 71 74 44]
sample = [85 23 90 86 18 21 27 80 62 76]
sample = [19  6 31 10 98 18 76 71 35 43]
sample sum = 482
sample sum = 396
sample sum = 672
sample sum = 482
sample sum = 568
sample sum = 407
times = 6
times = 7
times = 8
times = 9
times = 10
times = 11
sample = [77 67  8 51 54 27 10 52  3 11]
sample = [68 85 39 41 77 10 19 81 71 53]
sample = [12 88 89 32 26 54 38  2 17 42]
sample = [84 89 72 36  3 58 20 17 15 52]
sample = [94 46 49  2 79 53 42 91  6 55]
sample = [90  3 95  5 76 30 73 56 37 34]
sample sum = 360
sample sum = 544
sample sum = 400
sample sum = 446
times = 12
sample sum = 499
sample sum = 517
times = 14
times = 13
times = 16
times = 17
sample = [70 43 42 85 41 61 33 34 93 10]
times = 15
sample = [33 14 60 28  7 45 36 68 91 29]
sample = [32 48 45 34 66 47  9 73 