In [2]:
import os
import numpy as np
import pandas as pd
import functions as func

from functools import partial
from concurrent.futures import ProcessPoolExecutor, Executor, as_completed

In [66]:
def read_data(path, filename):
    """Load a CSV file into a DataFrame and print a short summary of it.

    The leading unnamed column (labelled ``'Unnamed: 0'`` by pandas) is renamed
    to ``'Id'``. The file path, the frame shape and the number of columns per
    dtype are printed.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The loaded data with ``'Unnamed: 0'`` renamed to ``'Id'`` when present.
    """
    full_path = os.path.join(path, filename)
    print(f'reading file = {full_path}')
    data = pd.read_csv(full_path).rename(columns={'Unnamed: 0': 'Id'})
    print(f'data shape = {data.shape}')
    # Name the counts explicitly instead of passing ``columns=`` to the
    # DataFrame constructor: the constructor selects by the Series name, which
    # is ``'count'`` on pandas >= 2.0, and would yield a column of NaN.
    types_info = data.dtypes.value_counts().rename('columns_count').to_frame()
    print('types info about df columns: ')
    print(types_info)
    return data

In [4]:
ls -la -h Datasets/GiveMeSomeCredit/

total 28272
drwx------@ 6 adam  staff   192B Dec 23 21:30 [34m.[m[m/
drwxr-xr-x  8 adam  staff   256B Dec 23 21:36 [34m..[m[m/
-rwxr-xr-x@ 1 adam  staff    15K Dec 11  2019 [31mData Dictionary.xls[m[m*
-rwxr-xr-x@ 1 adam  staff   4.8M Dec 11  2019 [31mcs-test.csv[m[m*
-rwxr-xr-x@ 1 adam  staff   7.2M Dec 11  2019 [31mcs-training.csv[m[m*
-rwxr-xr-x@ 1 adam  staff   1.8M Dec 11  2019 [31msampleEntry.csv[m[m*


In [72]:
# Load the training split only; the test split, the data dictionary and the
# sample submission are currently disabled (commented out below).
train_data = read_data(path='Datasets/GiveMeSomeCredit/', filename='cs-training.csv')
# test_data = read_data(path='Datasets/GiveMeSomeCredit/', filename='cs-test.csv')
# descript = pd.read_excel("Datasets/GiveMeSomeCredit/Data Dictionary.xls")
# sample_data = pd.read_csv("Datasets/GiveMeSomeCredit/sampleEntry.csv")

reading file = Datasets/GiveMeSomeCredit/cs-training.csv
data shape = (150000, 12)
types info about df columns: 
         columns_count
int64                8
float64              4


In [73]:
def transform_to_description(data):
    """Replace every cell value ``x`` of ``data`` with the pair ``(x, x)``.

    Each column is mapped element-wise, so the result has the same columns
    and index as ``data`` and holds one tuple per cell.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose values are to be wrapped.

    Returns
    -------
    pandas.DataFrame
        Frame of ``(x, x)`` tuples with the columns of ``data``.
    """
    interval_columns = {
        name: data[name].apply(lambda value: (value, value))
        for name in data
    }
    return pd.DataFrame(interval_columns, columns=data.columns)

In [74]:
# NOTE: every step below mutates `train_data` in place, so this cell is not
# re-runnable on its own: a second run fails at set_index because 'Id' is no
# longer a column. Re-run the loading cell first.
train_data.fillna(0, inplace=True)
# test_data.fillna(0, inplace=True)

# use the Id column as the client index
train_data.set_index('Id', inplace=True)
# test_data.set_index('Id', inplace=True)

# keep the class label
train_label = train_data['SeriousDlqin2yrs'].copy()
train_data.drop('SeriousDlqin2yrs', axis=1, inplace=True)
# drop the class column from the test data because it carries no information
# test_data.drop('SeriousDlqin2yrs', axis=1, inplace=True)

In [75]:
# Cast the two count/amount columns to integers. Missing values were already
# replaced with 0 by the fillna call in the preprocessing cell, so no NaN
# should reach astype('int') here.
train_data['NumberOfDependents'] = train_data.NumberOfDependents.astype('int')
train_data['MonthlyIncome'] = train_data.MonthlyIncome.astype('int')

In [80]:
train_data.shape#, test_data.shape

(150000, 10)

In [81]:
# Round every float column to two decimals (written back into train_data).
float_cols = train_data.select_dtypes('float').columns
train_data.loc[:, float_cols] = train_data.loc[:, float_cols].round(2)
# train_data.head()

In [82]:
# float_cols = test_data.select_dtypes('float').columns
# test_data.loc[:, float_cols] = test_data.loc[:, float_cols].round(2)
# test_data.head()

In [83]:
# Wrap every value x of the training frame into the pair (x, x).
transformed_train = transform_to_description(train_data)
# transformed_test = transform_to_description(test_data)
# transformed_train.head(5)

In [84]:
transformed_train.shape#, transformed_test.shape

(150000, 10)

In [18]:
def similarity(vect1, vect2):
    """Merge two interval descriptions element by element.

    The elements of ``vect1`` and ``vect2`` are paired positionally. Each
    element is an indexable pair ``(low, high)``; the merged element is
    ``(min(low1, low2), max(high1, high2))``.

    Parameters
    ----------
    vect1, vect2 : pandas.Series
        Series of ``(low, high)`` pairs.

    Returns
    -------
    pandas.Series
        Merged pairs, indexed by ``vect1.index``.
    """
    merged = [
        (min(left[0], right[0]), max(left[1], right[1]))
        for left, right in zip(vect1, vect2)
    ]
    return pd.Series(merged, index=vect1.index)

def inclusion(obj, patterns):
    """Check whether ``obj`` is included in an iterable of patterns.

    ``obj`` counts as included when ``obj.equals(elem)`` is true for at least
    one ``elem`` in ``patterns``.

    Parameters
    ----------
    obj : pandas.Series
        Object to look for.
    patterns : iterable
        Candidate patterns to compare against.

    Returns
    -------
    bool
        ``True`` if some pattern equals ``obj``, ``False`` otherwise
        (including when ``patterns`` is empty).
    """
    # A generator lets any() stop at the first match instead of comparing
    # against every pattern first.
    return any(obj.equals(elem) for elem in patterns)

**Алгоритм из работы Алексея (QBCA)**

In [86]:
alpha = 0.003  # scales the threshold on opposite-class objects a hypothesis may cover
sample_ratio = 0.003  # parameter: share of the class used as the sample size
num_iters = 1000  # parameter: number of hypothesis-generation iterations
# Count the classes by label value. Reading rows 0/1 of value_counts() relies
# on class 0 being the most frequent one and silently swaps the two numbers
# otherwise.
N_neg = int((train_label == 0).sum())
N_pos = int((train_label == 1).sum())
N_neg, N_pos

(139974, 10026)

In [87]:
# Split the transformed training frame by class label (1 -> pos, 0 -> neg).
train_pos, train_neg = (
    transformed_train.loc[train_label[train_label == class_value].index]
    for class_value in (1, 0)
)
# the index is not reset: it is the client id and carries meaningful information
train_neg.shape, train_pos.shape

((139974, 10), (10026, 10))

Берём выборку займов для начального тестирования работоспособности

In [88]:
# Keep a fixed-seed subsample of 1000 rows per class, drawn without replacement.
train_pos, train_neg = (
    class_frame.sample(n=1000, random_state=123, replace=False)
    for class_frame in (train_pos, train_neg)
)
train_pos.shape, train_neg.shape

((1000, 10), (1000, 10))

**get_similarity_sample_repr и is_included_in_repr, по-видимому, работают правильно**

По крайней мере на тестовых объектах показали правильные результаты

In [253]:
def get_similarity_sample_repr(sample: pd.DataFrame):
    """Fold all rows of ``sample`` into one feature representation.

    The first row is the starting pattern; every following row is merged into
    it with ``similarity``.

    Parameters
    ----------
    sample : pandas.DataFrame
        Feature representations taken from the pos or neg class dataset.

    Returns
    -------
    pandas.Series or None
        The merged representation, or ``None`` when ``sample`` has no rows.
    """
    rows = (row for _, row in sample.iterrows())
    # The first row seeds the pattern; None is kept when there are no rows.
    pattern = next(rows, None)
    for row in rows:
        pattern = similarity(pattern, row)
    return pattern
    
# find the objects that are covered by a feature representation
def is_included_in_repr(d, train_data):
    """Select the rows of ``train_data`` that are included in representation ``d``.

    A row is included when merging it into ``d`` with ``similarity`` leaves
    ``d`` unchanged, i.e. ``d.equals(similarity(row, d))``.

    Parameters
    ----------
    d : pandas.Series
        Feature representation to test against.
    train_data : pandas.DataFrame
        Candidate objects (pos or neg training data).

    Returns
    -------
    pandas.DataFrame
        The included rows; an empty frame when no row is included.
    """
    covered = [
        row
        for _, row in train_data.iterrows()
        if d.equals(similarity(row, d))
    ]
    return pd.DataFrame(covered)

### Mining step


    Для положительного класса нас интересуют объекты отрицательного класса,
    а для отрицательного — положительные.
    Уже после определения объектов из другого класса, попадающих в признаковое представление семпла данных, будет приниматься решение
     о включении этого признакового представления в список гипотез представления (областей или интервальных представлений).
    

Что нужно посмотреть:

    1. Сравнение количества генерируемых гипотез в зависимости от критерия и области генерации выборки.
    (Возможно изобразить всего 4 варианта: старый подход - локальная или случайная выборка, новый подход - 
    локальная или случайная выборка)
    
    2. Сравнение генерируемых гипотез для старого и нового критерия в зависимости от ширины локальной области генерации
    (Возможно, есть какая-то оптимальная ширина окна)
    

To-do-list:
1. **классификации при разном включении признаков из исходного множества**

2. **Подумать о том, каким образом генерировать гипотезы. То есть выбирать не случайную выборку из множества объектов,
    а какую-то локальную область. (Делать будем через расширение области объекта; более того, будем искать 
    оптимальное значение расширения локальной области)**

In [237]:
def generate_local_area(obj: pd.DataFrame):
    """Widen every interval of a one-row interval frame by a type-dependent step.

    Only the first row of ``obj`` is used. For each column the cell
    ``(left, right)`` becomes ``(left - eps, right + eps)`` where

    * integer ``left``: ``eps = 1`` if ``abs(left) < 100`` else ``eps = 100``;
    * other real ``left``: ``eps = 0.01``;
    * anything else: the interval is copied unchanged.

    Parameters
    ----------
    obj : pandas.DataFrame
        Frame whose first row holds ``(left, right)`` tuples.

    Returns
    -------
    pandas.DataFrame
        One-row frame with the same columns, indexed by the first index label
        of ``obj``.

    Raises
    ------
    IndexError
        If ``obj`` has no rows.
    """
    # Local import: the abstract number types also match numpy scalars such as
    # numpy.int64, which ``isinstance(x, int)`` does not.
    import numbers

    index = obj.index.values[0]
    widened = {}
    for key in obj.columns:
        left_val, right_val = obj[key].iloc[0]
        # eps is chosen per column; it must never leak from the previous one.
        if isinstance(left_val, numbers.Integral):
            eps = 1 if abs(left_val) // 100 == 0 else 100
        elif isinstance(left_val, numbers.Real):
            eps = 0.01
        else:
            eps = None
        if eps is None:
            widened[key] = [(left_val, right_val)]
        else:
            widened[key] = [(left_val - eps, right_val + eps)]
    return pd.DataFrame(widened, index=[index], columns=obj.columns)

def generate_local_sample(obj: pd.DataFrame, train_data: pd.DataFrame, 
                              sample_size: int, frac: float = 0.15, num_iters=10):
    """Draw a random sample of training objects from a local area around ``obj``.

    The area starts at ``obj`` and is widened with ``generate_local_area`` once
    per iteration. As soon as the area covers more than
    ``int(len(train_data) * frac)`` training objects (and at least
    ``sample_size`` of them), ``sample_size`` objects are drawn from it without
    replacement.

    Parameters
    ----------
    obj : pandas.DataFrame
        One-row interval frame describing the object.
    train_data : pandas.DataFrame
        Objects to search for neighbours.
    sample_size : int
        Number of objects to draw.
    frac : float, default 0.15
        Share of ``train_data`` the area must exceed before sampling.
    num_iters : int, default 10
        Maximum number of widening steps.

    Returns
    -------
    pandas.DataFrame or None
        The sampled objects, or ``None`` when no area was large enough within
        ``num_iters`` steps.
    """
    min_objects = int(train_data.shape[0] * frac)
    d_local_area = obj
    for iters in range(num_iters, 0, -1):
        print(f'itr: {iters}')
        # Widen the previous area; recomputing it from ``obj`` each time would
        # make every iteration identical.
        d_local_area = generate_local_area(d_local_area)
        # is_included_in_repr compares Series, so pass the single row of the
        # one-row area frame.
        d_local_objects = is_included_in_repr(d=d_local_area.iloc[0], train_data=train_data)
        found = d_local_objects.shape[0]
        # choice(..., replace=False) needs at least sample_size candidates.
        if found > min_objects and found >= sample_size:
            inds = np.random.RandomState().choice(d_local_objects.index, replace=False, size=sample_size)
            return d_local_objects.loc[inds, :].copy()

    # No iteration produced a large enough area.
    return None

In [22]:
def generate_hypothesis(iteration: int,
                        obj: pd.Series, 
                        train_data: pd.DataFrame, 
                        other_data: pd.DataFrame, 
                        sample_size: int, 
                        hypothesis_criterion: str,
                        sample_type:str,
                        verbose: bool,
                        other_data_size: int, 
                        alpha: float):
    """Try to build one hypothesis (interval pattern) that contains ``obj``.

    A sample of ``train_data`` is drawn, ``obj`` is added to it, and the rows
    are merged into a pattern ``d`` with ``get_similarity_sample_repr``. The
    pattern is returned only when it covers few enough objects of the other
    class.

    Parameters
    ----------
    iteration : int
        Iteration number; only printed.
    obj : pandas.Series
        Interval description of the object being classified.
    train_data : pandas.DataFrame
        Objects of the class the hypothesis is mined from.
    other_data : pandas.DataFrame
        Objects of the opposite class.
    sample_size : int
        Number of ``train_data`` objects to draw.
    hypothesis_criterion : str
        ``'contr_class'``: accept when the number of covered ``other_data``
        objects is ``<= int(other_data_size * alpha)``.
        ``'both_classes'``: accept when it is
        ``<= int(n_covered_train_objects * alpha)``.
        Any other value rejects the pattern.
    sample_type : str
        ``'random'`` for a uniform random sample, ``'local'`` for a sample
        from the local area of ``obj``.
    verbose : bool
        Print progress messages.
    other_data_size : int
        Size of the opposite class used by the ``'contr_class'`` threshold.
    alpha : float
        Threshold multiplier.

    Returns
    -------
    pandas.Series or None
        The accepted pattern, or ``None`` when it is rejected, the sample type
        or criterion is unknown, or no local sample could be drawn.
    """
    print(f'iteration: {iteration}')

    # Both sampling branches need ``obj`` as a one-row frame.
    obj_frame = obj.to_frame().T if sample_type in ('random', 'local') else None

    if sample_type == 'random':
        inds = np.random.RandomState().choice(train_data.index, replace=False, size=sample_size)
        sample = train_data.loc[inds, :].copy()
    elif sample_type == 'local':
        # draw the sample from the local area of the object
        sample = generate_local_sample(obj_frame, train_data, sample_size)
        if sample is None:
            print('did not find any hypothesis on this iteration')
            return None
    else:
        print('Не задали тип семплирования')
        return None

    # pd.concat replaces DataFrame.append, which returned a new frame (so the
    # discarded result was a no-op) and was removed in pandas 2.0.
    sample = pd.concat([sample, obj_frame])

    d = get_similarity_sample_repr(sample)
    if verbose:
        print('got feature represantation for sample')

    d_other_objects = is_included_in_repr(d, train_data=other_data)

    # Defensive guard kept from the original code.
    if d_other_objects is None:
        print('did not find any hypothesis on this iteration')
        return None

    # check the acceptance criterion
    if hypothesis_criterion == 'contr_class':
        threshold = int(other_data_size * alpha)
    elif hypothesis_criterion == 'both_classes':
        # additionally count the objects of the target class (the one mined on
        # this iteration) that fall into pattern d
        d_target_objects = is_included_in_repr(d, train_data=train_data)
        threshold = int(d_target_objects.shape[0] * alpha)
    else:
        return None

    return d if d_other_objects.shape[0] <= threshold else None

def mining_step(test_obj: pd.Series, train_pos: pd.DataFrame, train_neg: pd.DataFrame,
                num_iters: int, sample_ratio: float, alpha: float, hypothesis_criterion: str,
                sample_type: str = 'random',
                mining_type: str = 'pos', verbose : bool = False, n_jobs : int = 4):
    """Mine hypotheses for ``test_obj`` from one class, in parallel.

    ``generate_hypothesis`` is run ``num_iters`` times in a process pool and
    the non-``None`` results are collected.

    Parameters
    ----------
    test_obj : pandas.Series
        Interval description of the object being classified.
    train_pos, train_neg : pandas.DataFrame
        Positive and negative training objects.
    num_iters : int
        Number of hypothesis-generation attempts.
    sample_ratio : float
        Sample size as a share of the mined class:
        ``int(len(class) * sample_ratio)``.
    alpha : float
        Threshold multiplier passed to ``generate_hypothesis``.
    hypothesis_criterion : str
        ``'contr_class'`` for the basic (old) criterion that looks only at the
        intersection with the opposite class; ``'both_classes'`` for the new
        criterion that looks at the intersection with both classes.
    sample_type : str, default 'random'
        ``'random'`` to take a random sample of interval representations,
        ``'local'`` to take a random sample from the local area.
    mining_type : str, default 'pos'
        ``'pos'`` mines from ``train_pos`` against ``train_neg``; any other
        value mines from ``train_neg`` against ``train_pos``.
    verbose : bool, default False
        Passed to ``generate_hypothesis``.
    n_jobs : int, default 4
        Number of worker processes.

    Returns
    -------
    list
        The hypotheses found (possibly empty).
    """
    mine_pos = mining_type == 'pos'
    train_data = train_pos if mine_pos else train_neg
    other_data = train_neg if mine_pos else train_pos
    other_data_size = other_data.shape[0]

    sample_size = int(train_data.shape[0] * sample_ratio)
    print(f'using sample size = {sample_size}')
    print(f'other data size = {other_data_size}')
    print(f'other data rate size = {int(other_data_size * alpha)}')
    print('start generating hypothesises')

    mining = partial(generate_hypothesis, 
                     obj=test_obj, 
                     train_data=train_data, 
                     other_data=other_data, 
                     sample_size=sample_size, 
                     hypothesis_criterion=hypothesis_criterion,
                     sample_type=sample_type,
                     verbose=verbose, 
                     other_data_size=other_data_size,
                     alpha=alpha
                    )

    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        # Materialise inside the with-block so worker errors surface here and
        # no lazy iterator outlives the pool.
        results = list(executor.map(mining, range(num_iters)))

    return [res for res in results if res is not None]
    

In [234]:
test_local_obj = transformed_train.sample(1)
test_local_obj

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
44881,"(0.04, 0.04)","(78, 78)","(0, 0)","(1438.0, 1438.0)","(0, 0)","(8, 8)","(0, 0)","(1, 1)","(0, 0)","(0, 0)"


In [235]:
generated_obj = generate_local_area(test_local_obj)
generated_obj

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
44881,"(0.03, 0.05)","(77, 79)","(-1, 1)","(1437.99, 1438.01)","(-1, 1)","(7, 9)","(-1, 1)","(0, 2)","(-1, 1)","(-1, 1)"


In [236]:
generate_local_area(generated_obj)

Unnamed: 0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
44881,"(0.019999999999999997, 0.060000000000000005)","(76, 80)","(-2, 2)","(1437.98, 1438.02)","(-2, 2)","(6, 10)","(-2, 2)","(-1, 3)","(-2, 2)","(-2, 2)"


RevolvingUtilizationOfUnsecuredLines 0.01
age 1
NumberOfTime30-59DaysPastDueNotWorse 1
DebtRatio 0.01
MonthlyIncome 100
NumberOfOpenCreditLinesAndLoans 1
NumberOfTimes90DaysLate 1
NumberRealEstateLoansOrLines 1
NumberOfTime60-89DaysPastDueNotWorse 1
NumberOfDependents 1


In [25]:
# test_sample = transformed_test.loc[:2]
# test_sample

In [24]:
num_iters, alpha, sample_ratio

(1000, 0.003, 0.003)

In [241]:
%%time
# NOTE: `test_sample` must be defined before this cell runs; the only
# definition in this notebook is commented out (it needs `transformed_test`).
# pos_hyps / neg_hyps are overwritten on every iteration, so only the results
# for the last object remain after the loop.
for i, obj in test_sample.iterrows():
    
    print(f'start mining from pos objects')
    pos_hyps = mining_step(test_obj=obj, train_pos=train_pos, train_neg=train_neg, 
                           num_iters=num_iters,sample_ratio=sample_ratio, alpha = alpha,
                           hypothesis_criterion='contr_class', sample_type='random',
                           mining_type='pos',
                           verbose=True, n_jobs=4
                          )
    
    print(f'start mining from neg objects')
    neg_hyps = mining_step(test_obj=obj, train_pos=train_pos, train_neg=train_neg,
                           num_iters=num_iters, sample_ratio=sample_ratio, alpha = alpha, 
                           hypothesis_criterion='contr_class', sample_type='random',
                           mining_type='neg', 
                           verbose=True, n_jobs=4
                          )

In [242]:
# a = pd.Series({'feat1':(1,1), 'feat2':(0, 1.4), 'feat3':(3, 4)})
# b = pd.Series({'feat1':(1,2), 'feat2':(1, 1.4), 'feat3':(7, 7)})
# c = pd.Series({'feat1':(1,1), 'feat2':(0.8, 1.6), 'feat3':(3, 4)})
# d = pd.Series({'feat1':(2,3), 'feat2':(0, 1.4), 'feat3':(4, 6)})


# print(a.equals(b))
# print(a == b)

# test1 = pd.Series({'feat1':(1,3), 'feat2':(1, 3.4), 'feat3':(1, 1.9)})
# test2 = pd.Series({'feat1':(2,2.2), 'feat2':(2, 2.4), 'feat3':(3, 3.9)})
# test3 = pd.Series({'feat1':(3,3), 'feat2':(4, 4.4), 'feat3':(4, 4.9)})

# Toy objects: each feature is a (low, high) pair with low == high.
g1 = pd.Series({'feat1':(1,1), 'feat2':(1.5, 1.5)})
g2 = pd.Series({'feat1':(-1,-1), 'feat2':(0, 0)})
g3 = pd.Series({'feat1':(0.5,0.5), 'feat2':(1, 1)})

# Toy interval patterns; `tst` is the expected result of similarity(g1, g2).
test1 = pd.Series({'feat1':(0.1, 1), 'feat2':(-0.5, 0)})
test2 = pd.Series({'feat1':(-0.1, 1), 'feat2':(0.5, 0.8)})
tst = pd.Series({'feat1':(-1, 1), 'feat2':(0, 1.5)})

In [243]:
d = similarity(g1, g2)
d

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

In [244]:
d

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

In [245]:
d.equals(tst)

True

In [92]:
tst

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

In [103]:
inter_sample = get_similarity_sample_repr(sample)
inter_sample

feat1    (0.5, 1)
feat2    (1, 1.5)
dtype: object

In [247]:
sample = pd.DataFrame([g1, g2, g3, test1, test2])
sample

Unnamed: 0,feat1,feat2
0,"(1, 1)","(1.5, 1.5)"
1,"(-1, -1)","(0, 0)"
2,"(0.5, 0.5)","(1, 1)"
3,"(0.1, 1)","(-0.5, 0)"
4,"(-0.1, 1)","(0.5, 0.8)"


In [248]:
g1

feat1        (1, 1)
feat2    (1.5, 1.5)
dtype: object

In [249]:
sample

Unnamed: 0,feat1,feat2
0,"(1, 1)","(1.5, 1.5)"
1,"(-1, -1)","(0, 0)"
2,"(0.5, 0.5)","(1, 1)"
3,"(0.1, 1)","(-0.5, 0)"
4,"(-0.1, 1)","(0.5, 0.8)"


In [252]:
pd.DataFrame(is_included_in_repr(d, sample)).shape

(4, 2)

In [125]:
d

feat1     (-1, 1)
feat2    (0, 1.5)
dtype: object

Тестовая функция проверки np.random.choice

In [40]:
def compute_stat(times, data):
    """Draw 10 elements of ``data`` without replacement and return their sum.

    A new, unseeded ``RandomState`` is created on every call. The call
    number, the drawn sample and its sum are printed.

    Parameters
    ----------
    times : int
        Call number; only printed.
    data : array-like
        Population to draw from (needs at least 10 elements).

    Returns
    -------
    The sum of the drawn elements.
    """
    print(f"times = {times}")
    generator = np.random.RandomState()
    sample = generator.choice(data, replace=False, size=10)
    print(f"sample = {sample}")
    total = np.sum(sample)
    print(f"sample sum = {total}")
    return total

def run_computing(n_jobs=2):
    """Run ``compute_stat`` for call numbers 0..25 in a process pool.

    Every call draws from the population ``0..99``.

    Parameters
    ----------
    n_jobs : int, default 2
        Number of worker processes.

    Returns
    -------
    iterator
        The iterator returned by ``executor.map``; it yields one sample sum
        per call number.
    """
    population = list(range(100))
    call_numbers = list(range(26))
    worker = partial(compute_stat, data=population)

    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        sums = pool.map(worker, call_numbers)

    return sums

In [42]:
t = run_computing(n_jobs=6)

times = 0
times = 1
times = 2
times = 3
times = 4
times = 5
sample = [32  5 50 21  9 43 94  8 96 38]
sample = [11 35 37 89 88 72 50 28 38 34]
sample = [97 53 52 93 81 86 58 35 49 68]
sample = [72  5 41 66 65 18 26 71 74 44]
sample = [85 23 90 86 18 21 27 80 62 76]
sample = [19  6 31 10 98 18 76 71 35 43]
sample sum = 482
sample sum = 396
sample sum = 672
sample sum = 482
sample sum = 568
sample sum = 407
times = 6
times = 7
times = 8
times = 9
times = 10
times = 11
sample = [77 67  8 51 54 27 10 52  3 11]
sample = [68 85 39 41 77 10 19 81 71 53]
sample = [12 88 89 32 26 54 38  2 17 42]
sample = [84 89 72 36  3 58 20 17 15 52]
sample = [94 46 49  2 79 53 42 91  6 55]
sample = [90  3 95  5 76 30 73 56 37 34]
sample sum = 360
sample sum = 544
sample sum = 400
sample sum = 446
times = 12
sample sum = 499
sample sum = 517
times = 14
times = 13
times = 16
times = 17
sample = [70 43 42 85 41 61 33 34 93 10]
times = 15
sample = [33 14 60 28  7 45 36 68 91 29]
sample = [32 48 45 34 66 47  9 73 