In [1]:
import os
import json
import imp
import numpy as np
import pandas as pd
import functions as utils
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from functools import partial
from concurrent.futures import ProcessPoolExecutor, Executor, as_completed

In [2]:
def read_data(path, filename):
    print(f'reading file = {os.path.join(path, filename)}')
    data = pd.read_csv(os.path.join(path, filename))
    data = data.rename(columns={'Unnamed: 0':'Id'})
    print(f'data shape = {data.shape}')
    types_info = pd.DataFrame(data.dtypes.value_counts(), columns=['columns_count'])
    print('types info about df columns: ')
    print(types_info)
    return data

In [3]:
# ls -la -h Datasets/GiveMeSomeCredit/

In [6]:
train_data = read_data(path='Datasets/GiveMeSomeCredit/', filename='cs-training.csv')
# test_data = read_data(path='Datasets/GiveMeSomeCredit/', filename='cs-test.csv')
# descript = pd.read_excel("Datasets/GiveMeSomeCredit/Data Dictionary.xls")
# sample_data = pd.read_csv("Datasets/GiveMeSomeCredit/sampleEntry.csv")

reading file = Datasets/GiveMeSomeCredit/cs-training.csv
data shape = (150000, 12)
types info about df columns: 
         columns_count
int64                8
float64              4


In [7]:
revolve_thresh = round(np.quantile(train_data.RevolvingUtilizationOfUnsecuredLines.values, q=[0.99])[0], 3)
# debtratio_thresh = round(np.quantile(train_data.DebtRatio.values, q=[0.9])[0], 3)
debtratio_thresh = 2.1
revolve_thresh

train_data = train_data[(train_data.RevolvingUtilizationOfUnsecuredLines < revolve_thresh) & 
                        (train_data.DebtRatio < debtratio_thresh)
                       ]
train_data.shape

(117910, 12)

In [8]:
train_data.fillna(int(train_data.MonthlyIncome.mean()), inplace=True)

In [10]:
train_data.fillna(0, inplace=True)
# test_data.fillna(0, inplace=True)

#ставим колонку Id как индекс клиента
train_data.set_index('Id', inplace=True)
# test_data.set_index('Id', inplace=True)

#сохраняем метку класса
train_label = train_data['SeriousDlqin2yrs'].copy()
train_data.drop('SeriousDlqin2yrs', axis=1, inplace=True)
#удаляем колонку класса из тестовых данных, так как она не несет никакой информации
# test_data.drop('SeriousDlqin2yrs', axis=1, inplace=True)

In [14]:
train_data['NumberOfDependents'] = train_data.NumberOfDependents.astype('int')
train_data['MonthlyIncome'] = train_data.MonthlyIncome.astype('int')

In [15]:
train_data.shape#, test_data.shape

(117910, 10)

In [16]:
float_cols = train_data.select_dtypes('float').columns
train_data.loc[:, float_cols] = train_data.loc[:, float_cols].round(2)
# train_data.head()

In [199]:
# float_cols = test_data.select_dtypes('float').columns
# test_data.loc[:, float_cols] = test_data.loc[:, float_cols].round(2)
# test_data.head()

In [17]:
transformed_train = utils.transform_to_description(train_data)
# transformed_test = transform_to_description(test_data)
# transformed_train.head(5)

In [18]:
transformed_train.shape#, transformed_test.shape

(117910, 10)

In [19]:
trainX, testX, trainY, testY = train_test_split(transformed_train, train_label, test_size=0.4)

In [20]:
valX, testX, valY, testY = train_test_split(testX, testY, test_size=0.5)

In [21]:
print(trainX.shape, valX.shape, testX.shape)
print(trainY.shape, valY.shape, testY.shape)

(70746, 10) (23582, 10) (23582, 10)
(70746,) (23582,) (23582,)


In [22]:
trainY.value_counts(normalize=True)

0    0.93382
1    0.06618
Name: SeriousDlqin2yrs, dtype: float64

In [23]:
valY.value_counts(normalize=True)

0    0.934229
1    0.065771
Name: SeriousDlqin2yrs, dtype: float64

In [24]:
testY.value_counts(normalize=True)

0    0.932576
1    0.067424
Name: SeriousDlqin2yrs, dtype: float64

**Алгоритм из работы Алексея(QBCA)**

In [14]:
# alpha = 0.005
# sample_ratio = 0.01
# num_iters = 100
# N_neg = train_label.value_counts().reset_index().iloc[0, 1]
# N_pos = train_label.value_counts().reset_index().iloc[1, 1]
# N_neg, N_pos

**не обновляем индекс так как индекс - это id клиента, имеет значимую информацию**

In [26]:
# train_pos = transformed_train.loc[train_label[train_label == 1].index]
# train_neg = transformed_train.loc[train_label[train_label == 0].index]
train_pos = trainX.loc[trainY[trainY == 1].index]
train_neg = trainX.loc[trainY[trainY == 0].index]

train_neg.shape, train_pos.shape

((66064, 10), (4682, 10))

In [27]:
classes_ratio_pos = train_pos.shape[0] / train_neg.shape[0]
classes_ratio_neg = train_neg.shape[0] / train_pos.shape[0]
classes_ratio_pos, classes_ratio_neg

(0.07087067086461613, 14.110209312259718)

**Берем выборку заемов, для начального тестирования работоспособности**

In [87]:
train_pos = train_pos.sample(n=1000, replace=False)
train_neg = train_neg.sample(n=1000, replace=False)
train_pos.shape, train_neg.shape

((2000, 10), (2000, 10))

In [210]:
train_pos.shape, train_neg.shape

((4412, 10), (65621, 10))

In [342]:
test_sample = valX.sample(20)

### Mining step


    для положительного класса нас интересуют объекты отрицательного класса, 
    а для отрицательного - положительные
    Уже после определения объектов из другого класса, попадающие в признаковое представление семпла данных, будет приниматься решение
     о включение этого признакого представление в список гипотез представления(областей или интервальных представлений)
    

Что нужно посмотреть:

    1. Сравнение количества генерируемых гипотез в зависимости от критерия и области генерации выборки.
    (Возможно изобразить всего 4 варианта: старый подход - локальная или случайная выборка, новый подход - 
    локальная или случайная выборка)
    
    2. Сравнение генерируемых гипотез для старого и нового критерия в зависимости от ширины локальной области генерации
    (Возможно есть какая то оптимальная ширина окна)
    

To-do-list:
1. **классификации при разном включении признаков из исходного множества**

2. **Подумать о том, каким образом генерить гипотезы. То есть выбирать не случайно выборку из множества объектов,
    а какую-то локальную область. (Делать будем через расширение области объекта; более того, будем искать 
    оптимальное значение расширения локальной области) done !!!**

In [78]:
# d = utils.find_opt_local_area(obj=test_sample.iloc[0],
#                         train_data=train_pos,
#                         trainx_min=trainx_min,
#                         trainx_max=trainx_max,
#                         frac=0.15,
#                         num_iters=60
#                        )

# utils.generate_random_sample(train_data=train_pos, sample_size=10, d=d)

In [67]:
utils = imp.reload(utils)

In [72]:
def generate_hypothesis(iteration: int, obj: pd.Series, object_area: pd.Series, train_data: pd.DataFrame, 
                        other_data: pd.DataFrame, sample_size: int, classes_ratio: float,
                        hypothesis_criterion: str, sample_type:str, verbose: bool, alpha: float):
        
        print(f'iteration: {iteration}')
        
        if sample_type == 'local' and object_area is None:
            print(f'Cannot generate sample from local area. Got None as local object area param!')
            raise NotImplementedError('Wrong params for local sampling!')
    
        if sample_type == 'random' and object_area is not None:
            print(f'got misleading params values. Got sample_type = None and local object area is not None')
            return NotImplementedError('Wrong params for random sampling!')

        #generating random sample using sample strategy = {sample_type}
        sample = utils.generate_random_sample(train_data=train_data, sample_size=sample_size, d=object_area) 
        sample.append(obj)
        
        d = utils.get_similarity_sample_repr(sample)
        if verbose:
            print('got feature represantation for sample')
        
        d_other_objects = utils.is_included_in_repr(d, train_data=other_data)
        if d_other_objects is not None:
            print(f'got {d_other_objects.shape[0]} d_other_objects')
            print(f'thresh for hypothesis = {int(other_data.shape[0] * alpha)}')
    
        if d_other_objects is None:
            return d    
        
        result_hypothesis = utils.check_criterion(d=d, train_data=train_data, 
                                                  hypothesis_criterion=hypothesis_criterion, 
                                                  d_other_objects=d_other_objects,
                                                  other_data=other_data, alpha=alpha, 
                                                  classes_ratio=classes_ratio
                                                 )

        return result_hypothesis
        

        

def mining_step(test_obj: pd.Series, train_pos: pd.DataFrame, train_neg: pd.DataFrame, num_iters: int, 
                sample_ratio: float, alpha: float, hypothesis_criterion: str, sample_type: str,
                trainx_min: pd.Series, trainx_max:pd.Series, classes_ratio: float, 
                mining_type: str = 'pos', verbose : bool = False, n_jobs : int = 4):
    """
    hypothesis_criterion: 'contr_class', если используем базовый критерий, 
                                когда смотрится пересечение с противоположным классом(старый критерий отбора гипотез)
                           'both_classes', когда интересует пересечение по обоим классам(новый критерий отбора гипотез)
                           
    sample_type: 'random', если берем произвольную выборку интервальных представлений
                 'local', если берем произвольную выборку из локальной области
    
    returns list of hypothesises
    """
    
    if sample_type == 'local':
        #sampling
        print(f'start searching optimal local area')
        object_area = utils.find_opt_local_area(obj=test_obj,
                                                train_data=train_data,
                                                trainx_min=trainx_min,
                                                trainx_max=trainx_max,
                                                frac=0.15,
                                                num_iters=30
                                               )
    else:
        print(f'using random sample from train data')
        object_area = None
    
    train_data = train_pos if mining_type == 'pos' else train_neg
    other_data = train_neg if mining_type == 'pos' else train_pos
    sample_size = int(train_data.shape[0] * sample_ratio)
    print('start generating hypothesises')
    
    mining = partial(generate_hypothesis, 
                     obj=test_obj, 
                     object_area=object_area,
                     train_data=train_data, 
                     other_data=other_data, 
                     sample_size=sample_size,
                     classes_ratio=classes_ratio,
                     hypothesis_criterion=hypothesis_criterion,
                     sample_type=sample_type,
                     verbose=verbose,
                     alpha=alpha
                    )
    
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        hypothesises = executor.map(mining, range(num_iters))

    hypothesises = [res for res in hypothesises if res is not None]
    print(f'All hypothesises are generated!')
    return hypothesises

In [20]:
results_pos = {(0.0, 0.001, 1000): 1000,
 (0.0, 0.002, 1000): 608,
 (0.0, 0.004, 1000): 17,
 (0.001, 0.001, 1000): 1000,
 (0.001, 0.002, 1000): 740,
 (0.001, 0.004, 1000): 37,
 (0.002, 0.001, 1000): 1000,
 (0.002, 0.002, 1000): 790,
 (0.002, 0.004, 1000): 51}

**Генерация гипотез для полож класса занимает 10 мин (с 4 процессами) c 3000 iterations**
**c 1000 iterations - 2 mins**

In [294]:
# %%time
# for i, obj in test_sample.iterrows():
#     print(type(obj))
#     print(f'start mining from pos objects')
#     pos_hyps = mining_step(test_obj=obj, train_pos=train_pos, train_neg=train_neg, 
#                            num_iters=num_iters,sample_ratio=sample_ratio, alpha = alpha,
#                            hypothesis_criterion='contr_class',
#                            sample_type='random',
#                            mining_type='pos',
#                            verbose=False, n_jobs=4
#                           )
    
#     print(f'start mining from neg objects')
#     neg_hyps = mining_step(test_obj=obj, train_pos=train_pos, train_neg=train_neg,
#                            num_iters=num_iters, sample_ratio=sample_ratio, alpha = alpha, 
#                            hypothesis_criterion='contr_class',mining_type='neg', 
#                            sample_type='random',
#                            verbose=False, n_jobs=4
#                           )

In [104]:
pos_hyps = pd.DataFrame(pos_hyps)
neg_hyps = pd.DataFrame(neg_hyps)

In [39]:
pos_hyps = pd.DataFrame(pos_hyps)
neg_hyps = pd.DataFrame(neg_hyps)

In [40]:
pos_hyps.shape, neg_hyps.shape

((761, 10), (789, 10))

In [25]:
# neg_hyps.to_csv('hypothesises/test_neg_hyps.csv', header=True, index=False)
# pos_hyps = pd.read_csv('hypothesises/test_pos_hyps.csv')
# neg_hyps = pd.read_csv('hypothesises/test_neg_hyps.csv')
# pos_hyps.shape, neg_hyps.shape

In [110]:
utils = imp.reload(utils)

#### Исследования количества итераций, необходимых для достижения определенной доли объектов обучающей выборки

In [211]:
# utils.generate_local_area(test_sample.iloc[0], trainx_min, trainx_max)

In [28]:
train_pos = train_pos.sample(n=1000, replace=False)
train_neg = train_neg.sample(n=1000, replace=False)
train_pos.shape, train_neg.shape

((1000, 10), (1000, 10))

In [30]:
trainx_min = train_data.min()
trainx_max = train_data.max()
test_sample = valX.sample(30)

In [33]:
fractions = list(range(1, 23, 2))
fractions = [x/100 for x in fractions]
num_iterations = 1000
print(fractions)

[0.01, 0.03, 0.05, 0.07, 0.09, 0.11, 0.13, 0.15, 0.17, 0.19, 0.21]


In [34]:
iterations_list_pos = {}
iterations_list_neg = {}

for frac in fractions:
    iterations_list_pos[frac] = []
    iterations_list_neg[frac] = []
num_iterations

1000

In [341]:
for frac in fractions:
    print(f'frac = {frac}')
    for i, test_obj in test_sample.iterrows():
        print(f'i = {i}')
        d, num_iter = utils.find_opt_local_area(obj=test_obj, 
                                                train_data=train_pos,
                                                trainx_min=trainx_min,
                                                trainx_max=trainx_max,
                                                frac=frac, 
                                                num_iters=num_iterations
                                               )
        iterations_list_pos[frac].append(num_iter)

In [None]:
iterations_list_pos_str = json.dumps(iterations_list_pos)
iterations_list_pos_str

In [None]:
with open('iterations_list_pos1.txt', 'w') as f:
    f.write(iterations_list_pos_str)
    
# for key, val in iterations_list_pos.items():
#     iterations_list_pos[key] = np.mean(val)

In [None]:
iterations_list_pos = pd.DataFrame.from_dict(data=iterations_list_pos, orient='index', columns=['iters'])
iterations_list_pos.reset_index(drop=False, inplace=True)
iterations_list_pos.rename(columns={'index':'frac'}, inplace=True)
iterations_list_pos.head()

In [340]:
for frac in fractions:
    print(f'frac = {frac}')
    for i, test_obj in test_sample.iterrows():
        print(f'i = {i}')
        d, num_iter = utils.find_opt_local_area(obj=test_obj, 
                                                train_data=train_neg,
                                                trainx_min=trainx_min,
                                                trainx_max=trainx_max,
                                                frac=frac, 
                                                num_iters=num_iterations
                                               )
        iterations_list_neg[frac].append(num_iter)

In [82]:
iterations_list_neg_str=json.dumps(iterations_list_neg)
iterations_list_neg_str

'{"0.01": [19, 10, 12, 6, 14, 16, 23, 12, 20, 100, 13, 21, 15, 14, 15, 34, 13, 40, 16, 13, 16, 12, 17, 17, 16, 20, 10, 39, 16, 11, 11, 6, 5, 15, 14, 15, 19, 64, 15, 25], "0.03": [23, 14, 15, 9, 46, 21, 27, 16, 29, 800, 22, 27, 21, 16, 19, 50, 16, 94, 21, 18, 22, 15, 23, 30, 20, 26, 13, 58, 25, 26, 16, 12, 11, 24, 20, 21, 25, 89, 21, 38], "0.05": [25, 16, 16, 15, 81, 24, 30, 18, 33, 1001, 26, 33, 26, 17, 21, 54, 18, 100, 24, 24, 26, 19, 29, 38, 22, 32, 16, 77, 28, 36, 21, 17, 14, 28, 24, 24, 27, 109, 24, 42], "0.07": [26, 19, 18, 19, 91, 28, 35, 22, 36, 1001, 32, 36, 32, 20, 24, 58, 20, 202, 27, 29, 32, 22, 33, 42, 25, 35, 18, 91, 33, 42, 24, 23, 19, 32, 32, 25, 31, 117, 30, 65], "0.09": [27, 22, 21, 25, 97, 30, 38, 24, 40, 1001, 36, 40, 40, 22, 26, 63, 23, 1001, 30, 35, 41, 24, 36, 48, 26, 37, 20, 99, 36, 47, 27, 29, 23, 34, 35, 26, 33, 127, 37, 77], "0.11": [27, 23, 23, 31, 99, 32, 43, 26, 43, 1001, 39, 43, 43, 24, 28, 67, 25, 1001, 32, 41, 54, 26, 39, 52, 29, 38, 23, 108, 40, 61, 29,

In [None]:
with open('iterations_list_neg.txt', 'w') as f:
    f.write(iterations_list_neg_str)

In [None]:
iterations_list_neg = pd.DataFrame.from_dict(data=iterations_list_neg, orient='index', columns=['iters'])
iterations_list_neg.reset_index(drop=False, inplace=True)
iterations_list_neg.rename(columns={'index':'frac'}, inplace=True)
iterations_list_neg.head()

In [37]:
# iterations_list_pos.to_csv('iterations_list_pos.csv', header=True, index=False)
# iterations_list_neg.to_csv('iterations_list_neg.csv', header=True, index=False)

In [39]:
# sns.lineplot(y='iters', x='frac', data=iterations_list_pos)
# sns.lineplot(y='iters', x='frac', data=iterations_list_neg)

**Исследование количества генерируемых гипотез в зависимости от набора параметров для нового подхода отбора гипотез**

*Сначала проверим построение старого подхода генерации гипотез*

In [101]:
alpha_params = [0.000, 0.001, 0.002, 0.003, 0.004]
sample_ratio_params = [0.001, 0.002, 0.004, 0.006, 0.008, 0.01]
alpha_params, sample_ratio_params

results_pos = {}
results_neg = {}
pos_hyps = []
neg_hyps = []

for alpha in alpha_params:
    results_pos[alpha] = {}
    results_neg[alpha] = {}
    
for alpha in alpha_params:
    for sample_ratio in sample_ratio_params:
        results_pos[alpha][sample_ratio] = []
        results_neg[alpha][sample_ratio] = []

In [103]:
params = {
    'train_pos': train_pos,
    'train_neg': train_neg,
    'num_iters': 1000,
    'hypothesis_criterion': 'contr_class',
    'sample_type': 'random',
    'trainx_min': trainx_min,
    'trainx_max': trainx_max,
    'verbose': False,
    'n_jobs': 4
}

In [2]:
%%time
for alpha in alpha_params:
    for sample_ratio in sample_ratio_params:
        for i, obj in test_sample[:1].iterrows():
            print(f"using params: alpha = {alpha}, sample_ratio = {sample_ratio}, num_iters = {params['num_iters']}")
            pos_hyps = mining_step(test_obj=obj, sample_ratio=sample_ratio,
                                   alpha = alpha, mining_type='pos', classes_ratio=classes_ratio_pos,
                                   **params
                                  )
            
            pos_hyps_shape = pd.DataFrame(pos_hyps).shape[0] if len(pos_hyps) > 0 else 0
            
            results_pos[alpha][sample_ratio].append(pos_hyps_shape)
            
            neg_hyps = mining_step(test_obj=obj, sample_ratio=sample_ratio, 
                                   alpha = alpha, mining_type='neg', classes_ratio=classes_ratio_neg,
                                   **params
                                  )
            
            neg_hyps_shape = pd.DataFrame(neg_hyps).shape[0] if len(neg_hyps) > 0 else 0
            
            results_neg[alpha][sample_ratio].append(neg_hyps_shape)
            
            

*Новый подход генерации гипотез*

In [100]:
alpha_params = [1.0, 1.05, 1.1, 1.3, 1.5, 1.7, 2.0]
sample_ratio_params = [0.001, 0.002, 0.004, 0.006, 0.008, 0.01]
alpha_params, sample_ratio_params

results_pos = {}
results_neg = {}
pos_hyps = []
neg_hyps = []

In [None]:
params = {
    'train_pos': train_pos,
    'train_neg': train_neg,
    'num_iters': 1000,
    'hypothesis_criterion': 'both_classes',
    'sample_type': 'local',
    'trainx_min': trainx_min,
    'trainx_max': trainx_max,
    'verbose': False,
    'n_jobs': 4
}

In [None]:
%%time
for alpha in alpha_params:
    for sample_ratio in sample_ratio_params:
        for i, obj in test_sample[:1].iterrows():
            print(f"using params: alpha = {alpha}, sample_ratio = {sample_ratio}, num_iters = {params['num_iters']}")
            pos_hyps = mining_step(test_obj=obj, sample_ratio=sample_ratio,
                                   alpha = alpha, mining_type='pos', classes_ratio=classes_ratio_pos,
                                   **params
                                  )
            
            pos_hyps_shape = pd.DataFrame(pos_hyps).shape[0] if len(pos_hyps) > 0 else 0
            
            results_pos[alpha][sample_ratio].append(pos_hyps_shape)
            
            neg_hyps = mining_step(test_obj=obj, sample_ratio=sample_ratio, 
                                   alpha = alpha, mining_type='neg', classes_ratio=classes_ratio_neg,
                                   **params
                                  )
            
            neg_hyps_shape = pd.DataFrame(neg_hyps).shape[0] if len(neg_hyps) > 0 else 0
            
            results_neg[alpha][sample_ratio].append(neg_hyps_shape)

**Оценивание модели, вычисление ROC AUC and Gini coefficient**

##### Проделаем для стандартного метода, который уже использовался

    Берем множество объеков(тестовое)
    Для каждого объекта считаем множество положительных и негативных гипотез
    Разница между количеством гипотез будет как раз скором, а знак этой разницы - классом

In [339]:
for i, test_obg in test_sample[:1].iterrows():
    pos_hyps = mining_step(test_obj=test_obj, train_pos=train_pos, train_neg=train_neg, 
                            num_iters=600,sample_ratio=0.003, alpha = 0.004,
                                   hypothesis_criterion='contr_class', sample_type='random',
                                   mining_type='pos', trainx_min=trainx_min, trainx_max=trainx_max,
                                   classes_ratio=classes_ratio_pos, verbose=False, n_jobs=4
                        )

In [338]:
for i, test_obg in test_sample[:1].iterrows():
    neg_hyps = mining_step(test_obj=test_obj, train_pos=train_pos, train_neg=train_neg, 
                            num_iters=600,sample_ratio=0.003, alpha = 0.004,
                                   hypothesis_criterion='contr_class', sample_type='random',
                                   mining_type='neg', trainx_min=trainx_min, trainx_max=trainx_max,
                                   classes_ratio=classes_ratio_pos, verbose=False, n_jobs=2
                        )

In [291]:
pos_hyps = pd.DataFrame(pos_hyps)
neg_hyps = pd.DataFrame(neg_hyps)

In [295]:
from time import sleep

In [304]:
num_iters=600
sample_ratio=0.003
alpha = 0.004
hypothesis_criterion = 'contr_class'
sample_type = 'random'

In [303]:
def calculate_metric_for_obj(test_obj: pd.Series, train_pos: pd.DataFrame, train_neg: pd.DataFrame, 
                            num_iters, sample_ratio, alpha, hypothesis_criterion, sample_type,
                            trainx_min, trainx_max, classes_ratio_pos, classes_ratio_neg, verbose, n_jobs):
    
    
    pos_hyps = mining_step(test_obj=test_obj, train_pos=train_pos, train_neg=train_neg, 
                           num_iters=num_iters,sample_ratio=sample_ratio, alpha = alpha,
                           hypothesis_criterion=hypothesis_criterion, sample_type=sample_type,
                           mining_type='pos', trainx_min=trainx_min, trainx_max=trainx_max,
                           classes_ratio=classes_ratio_pos, verbose=verbose, n_jobs=n_jobs
                        )
    pos_hyps = pd.DataFrame(pos_hyps)
    
    sleep(10)
    
    neg_hyps = mining_step(test_obj=test_obj, train_pos=train_pos, train_neg=train_neg, 
                           num_iters=num_iters,sample_ratio=sample_ratio, alpha = alpha,
                           hypothesis_criterion=hypothesis_criterion, sample_type=sample_type,
                           mining_type='neg', trainx_min=trainx_min, trainx_max=trainx_max,
                           classes_ratio=classes_ratio_neg, verbose=verbose, n_jobs=n_jobs
                        )
    neg_hyps = pd.DataFrame(neg_hyps)
    
    sleep(5)
    
    diff = pos_hyps.shape[0] - neg_hyps.shape[0]
    label = None
    if diff > 0:
        label = 1
    elif diff < 0:
        label = 0
    else:
        label = -1
        
    return diff, label

In [302]:
def evaluate_model(test_sample: pd.DataFrame, train_pos, train_neg,  num_iters, sample_ratio,
                   alpha, hypothesis_criterion, sample_type, trainx_min, trainx_max,
                   classes_ratio_pos, classes_ratio_neg, verbose=False, n_jobs=2):
    
    results = {}
    
    print(f'using params: num_iters = {num_iters}, sample_ratio = {sample_ratio}, alpha = {alpha}')
    
    for i, test_obj in test_sample.iterrows():
        diff, label = calculate_metric_for_obj(test_obj=test_obj, train_pos=train_pos, train_neg=train_neg, 
                                num_iters=num_iters, sample_ratio=sample_ratio, alpha=alpha, 
                                hypothesis_criterion=hypothesis_criterion, sample_type=sample_type,
                                trainx_min=trainx_min, trainx_max=trainx_max,
                                 classes_ratio_pos=classes_ratio_pos, classes_ratio_neg=classes_ratio_neg, 
                                verbose=verbose, n_jobs=n_jobs)
        results[i] = (diff, label)
        
    return results


In [None]:
results = {}

In [335]:
# %%time
# results = evaluate_model(test_sample=test_sample[8:9], train_pos=train_pos, train_neg=train_neg, 
#                          num_iters=1500, sample_ratio=sample_ratio, alpha=alpha, 
#                                 hypothesis_criterion=hypothesis_criterion, sample_type=sample_type,
#                                 trainx_min=trainx_min, trainx_max=trainx_max,
#                                  classes_ratio_pos=classes_ratio_pos, classes_ratio_neg=classes_ratio_neg, 
#                                 verbose=False, n_jobs=4)

In [336]:
# valY[test_sample[8:12].index]

In [337]:
# valY.sample(10)

In [24]:
# %%time
# t = hypothesises_to_feat_matrix(pos_hyps=pos_hyps, neg_hyps=neg_hyps, trainX=train_pos.sample(1))

**Обучение, основанное на признаках, полученных из сгенерированных гипотез**

In [276]:
def to_binary_repr(num: int, pos_hyps: pd.DataFrame, neg_hyps: pd.DataFrame, indices: list, trainX: pd.DataFrame):
    ind = indices[num]
    obj = trainX.loc[ind]
    pos_features = [f'pos_feat_{feat_num}' for feat_num in pos_hyps.index]
    neg_features = [f'neg_feat_{feat_num}' for feat_num in neg_hyps.index]
    features = pos_features + neg_features
    start_values = np.zeros(shape=(1, len(features)))
#     print(start_values.shape)
#     print(obj.name)
    result = pd.DataFrame(data=start_values,index=[obj.name], columns=features, dtype='int')
    ind = obj.name
    for pi in range(pos_hyps.shape[0]):
        feat_repr = utils.similarity(obj, pos_hyps.iloc[pi])
        is_included = pos_hyps.iloc[pi].equals(feat_repr)
        if is_included:
            result.loc[ind, f'pos_feat_{pi}'] = 1
        else:
            result.loc[ind, f'pos_feat_{pi}'] = 0
        
    for pi in range(neg_hyps.shape[0]):
        feat_repr = utils.similarity(obj, neg_hyps.iloc[pi])
        is_included = neg_hyps.iloc[pi].equals(feat_repr)
        if is_included:
            result.loc[ind, f'neg_feat_{pi}'] = 1
        else:
            result.loc[ind, f'neg_feat_{pi}'] = 0
                          
    return result

def transform_to_feature_matrix(pos_hyps: pd.DataFrame, neg_hyps: pd.DataFrame, trainX: pd.DataFrame, n_jobs: int = 4):
    
    indices = trainX.index
    transform_func = partial(to_binary_repr, pos_hyps=pos_hyps, 
                             neg_hyps=neg_hyps, indices=indices, trainX=trainX)
    
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        obj_features = executor.map(transform_func, range(len(indices)))

    features = pd.concat(obj_features)#pd.DataFrame(obj_features)
    
    return features

In [290]:
%%time
gen = transform_to_feature_matrix(pos_hyps=pos_hyps, neg_hyps=neg_hyps, trainX=trainX.sample(512))

CPU times: user 4.07 s, sys: 442 ms, total: 4.51 s
Wall time: 3min 47s


In [285]:
1.18, 1.25, 1.71, 2 range 1 4
3.95(8), 7.35(16), 29(64), 512

(1.18, 1.25, 1.71)

In [248]:
t.sum(axis=1)

28125    3
dtype: int64

In [140]:
def hypothesises_to_feat_matrix(pos_hyps: pd.DataFrame, neg_hyps: pd.DataFrame, trainX: pd.DataFrame):
    pos_features = [f'pos_feat_{feat_num}' for feat_num in pos_hyps.index]
#     neg_features = [f'neg_feat_{feat_num}' for feat_num in neg_hyps[pi]s.index]
    
    result = pd.DataFrame(index=trainX.index, columns=pos_features)# + neg_features)
    
    for i in range(trainX.shape[0]):
        for pi in range(pos_hyps.shape[0]):

            feat_repr = utils.similarity(obj, pos_hyps.iloc[pi])
            is_included = pos_hyps.iloc[pi].equals(feat_repr)
            if is_included:
                result.loc[i, f'pos_feat_{pi}'] = 1
            else:
                result.loc[i, f'pos_feat_{pi}'] = 0
                
        for pi in range(neg_hyps.shape[0]):
            feat_repr = utils.similarity(obj, neg_hyps.iloc[pi])
            is_included = neg_hyps.iloc[pi].equals(feat_repr)
            if is_included:
                result.loc[i, f'neg_feat_{pi}'] = 1
            else:
                result.loc[i, f'neg_feat_{pi}'] = 0
    
    
    return result
    

In [None]:
###Данный вариант не прокатит, слишком долго
### Нужно придумать другой вариант

In [70]:
# %%time
# result = hypothesises_to_feat_matrix(pos_hyps=pos_hyps, neg_hyps=neg_hyps, trainX=trainX.sample(1000))

**Обучение композиции, основанное на гипотезах, считая их слабыми классификаторами(Бустинг)**

In [36]:
pos_hyps = pd.read_csv('hypothesises/test_pos_hyps.csv')
neg_hyps = pd.read_csv('hypothesises/test_neg_hyps.csv')
hyps = pd.concat([pos_hyps, neg_hyps])
hyps.reset_index(drop=True, inplace=True)

In [43]:
def hyps_boosting(hypothesises: pd.DataFrame, num_iterations):
    '''
    у гипотез есть индекс, просто число, от 0 до кол-ва гипотез, будем использовать его за id этой гипотезы
    '''
    
    ansamble = []
    weights = list(range(1, 11))
    weights = [w / 10 for w in weights]
    
    hyps_indices = list(hypothesises.index)
    
    ##допустим первую берем случайно(потом переделаем, чтоб брать осознанно)
    index = np.random.choice(hyps_indices, replace=False)
    h = hypothesises.loc[index]
    hyps_indices.remove(index)
    ansamble.append(h)
    
    #тут еще проверяется критерий того, что данный 
    while not hyps_indices and num_iterations > 0:
        
        index = np.random.choice(hyps_indices, replace=False)
        h = hypothesises.loc[index]
        
        
    
    return weights
    
    

In [42]:
hyps_boosting(hypothesises=hyps, num_iterations=10)

[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

In [83]:
inds = list(range(13))
inds

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]

In [104]:
np.random.choice(inds)

5

In [62]:
# a = pd.Series({'feat1':(1,1), 'feat2':(0, 1.4), 'feat3':(3, 4)})
# b = pd.Series({'feat1':(1,2), 'feat2':(1, 1.4), 'feat3':(7, 7)})
# c = pd.Series({'feat1':(1,1), 'feat2':(0.8, 1.6), 'feat3':(3, 4)})
# d = pd.Series({'feat1':(2,3), 'feat2':(0, 1.4), 'feat3':(4, 6)})


# print(a.equals(b))
# print(a == b)

# test1 = pd.Series({'feat1':(1,3), 'feat2':(1, 3.4), 'feat3':(1, 1.9)})
# test2 = pd.Series({'feat1':(2,2.2), 'feat2':(2, 2.4), 'feat3':(3, 3.9)})
# test3 = pd.Series({'feat1':(3,3), 'feat2':(4, 4.4), 'feat3':(4, 4.9)})

g1 = pd.Series({'feat1':(1,1), 'feat2':(1.5, 1.5)})
g2 = pd.DataFrame({'feat1':(-1,-1), 'feat2':(0, 0)})
g3 = pd.Series({'feat1':(0.5,0.5), 'feat2':(1, 1)})

test1 = pd.Series({'feat1':(0.1, 1), 'feat2':(-0.5, 0)})
test2 = pd.DataFrame({'feat1':(-0.1, 1), 'feat2':(0.5, 0.8)})
tst = pd.Series({'feat1':(-1, 1), 'feat2':(0, 1.5)})

In [287]:
# for i,obj in sample.iterrows():
#     print(similarity(pd.DataFrame(d).T, obj))

In [286]:
# sample = pd.DataFrame([g1, g2, g3, test1, test2])
# sample

In [248]:
g1

feat1        (1, 1)
feat2    (1.5, 1.5)
dtype: object

In [95]:
type(d), type(sample)

(pandas.core.series.Series, pandas.core.frame.DataFrame)

In [186]:
# is_included_in_repr(d, sample)

Тестовая функция провекри np.random.choice

In [40]:
def compute_stat(times, data):
    print(f"times = {times}")
    sample = np.random.RandomState().choice(data, replace=False, size=10)
    print(f"sample = {sample}")
    sample_sum = np.sum(sample)
    print(f"sample sum = {sample_sum}")
    return sample_sum

def run_computing(n_jobs=2):
    data = list(range(100))
    
    compute_func = partial(compute_stat, data=data)
    
    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        results = executor.map(compute_func, list(range(26)))
    
    return results

In [284]:
# t = run_computing(n_jobs=6)