In [17]:
import os
from concurrent.futures import ProcessPoolExecutor, Executor, as_completed
from functools import partial
from multiprocessing import Process, Pool

import numpy as np
import pandas as pd

import functions as func

In [3]:
def read_data(path, filename):
    """
    Read a CSV file into a DataFrame and print a short summary of it.

    The unnamed first CSV column (parsed by pandas as ``'Unnamed: 0'``) is
    renamed to ``'Id'``.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pd.DataFrame
        The loaded data with the ``'Id'`` column renamed.
    """
    full_path = os.path.join(path, filename)
    print(f'reading file = {full_path}')
    data = pd.read_csv(full_path).rename(columns={'Unnamed: 0': 'Id'})
    print(f'data shape = {data.shape}')
    # Name the counts explicitly before turning them into a frame: building
    # pd.DataFrame(series, columns=[...]) from a *named* Series (value_counts()
    # names its result 'count' in pandas >= 2.0) selects a non-existent column
    # and prints an empty frame.
    types_info = data.dtypes.value_counts().rename('columns_count').to_frame()
    print('types info about df columns: ')
    print(types_info)
    return data

In [4]:
ls -la -h Datasets/GiveMeSomeCredit/

total 28272
drwx------@ 6 adam  staff   192B Dec 23 21:30 [34m.[m[m/
drwxr-xr-x  8 adam  staff   256B Dec 23 21:36 [34m..[m[m/
-rwxr-xr-x@ 1 adam  staff    15K Dec 11  2019 [31mData Dictionary.xls[m[m*
-rwxr-xr-x@ 1 adam  staff   4.8M Dec 11  2019 [31mcs-test.csv[m[m*
-rwxr-xr-x@ 1 adam  staff   7.2M Dec 11  2019 [31mcs-training.csv[m[m*
-rwxr-xr-x@ 1 adam  staff   1.8M Dec 11  2019 [31msampleEntry.csv[m[m*


In [5]:
# Load the training and test tables with the read_data helper (it also prints
# the shape and a per-dtype column count for each file).
train_data = read_data(path='Datasets/GiveMeSomeCredit/', filename='cs-training.csv')
test_data = read_data(path='Datasets/GiveMeSomeCredit/', filename='cs-test.csv')
# Data dictionary shipped next to the CSV files, read from the .xls file.
descript = pd.read_excel("Datasets/GiveMeSomeCredit/Data Dictionary.xls")
# The sample-entry file is currently not loaded.
# sample_data = pd.read_csv("Datasets/GiveMeSomeCredit/sampleEntry.csv")

reading file = Datasets/GiveMeSomeCredit/cs-training.csv
data shape = (150000, 12)
types info about df columns: 
         columns_count
int64                8
float64              4
reading file = Datasets/GiveMeSomeCredit/cs-test.csv
data shape = (101503, 12)
types info about df columns: 
         columns_count
int64                7
float64              5


In [6]:
def transform_to_description(data):
    """
    Turn every cell of ``data`` into a degenerate interval ``(value, value)``.

    The result has the same columns as ``data``; each cell holds a 2-tuple
    whose lower and upper bound are both the original value.
    """
    def _point_interval(value):
        # lower bound == upper bound == the observed value
        return (value, value)

    described = pd.DataFrame(columns=data.columns)
    for name in data.columns:
        described[name] = data[name].apply(_point_interval)

    return described

In [7]:
# Replace every missing value with 0 (done in place on both tables).
train_data.fillna(0, inplace=True)
test_data.fillna(0, inplace=True)

# use the Id column as the client index
train_data.set_index('Id', inplace=True)
test_data.set_index('Id', inplace=True)

# keep the class label in a separate Series before removing it from the features
train_label = train_data['SeriousDlqin2yrs'].copy()
train_data.drop('SeriousDlqin2yrs', axis=1, inplace=True)
# drop the class column from the test data as well, because it carries no information there
# NOTE(review): every statement here mutates the frames in place, so the cell
# cannot be re-run without reloading the data first.
test_data.drop('SeriousDlqin2yrs', axis=1, inplace=True)

In [8]:
train_data.shape, test_data.shape

((150000, 10), (101503, 10))

In [9]:
# Round all float columns of the training data to two decimals, then preview it.
float_cols = train_data.select_dtypes('float').columns
train_data.loc[:, float_cols] = train_data.loc[:, float_cols].round(2)
train_data.head()

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.77,45,2,0.8,9120.0,13,0,6,0,2.0
2,0.96,40,0,0.12,2600.0,4,0,0,0,1.0
3,0.66,38,1,0.09,3042.0,2,1,0,0,0.0
4,0.23,30,0,0.04,3300.0,5,0,0,0,0.0
5,0.91,49,1,0.02,63588.0,7,0,1,0,0.0


In [10]:
# Same rounding for the test data; note that float_cols is re-bound to the
# float columns of test_data here.
float_cols = test_data.select_dtypes('float').columns
test_data.loc[:, float_cols] = test_data.loc[:, float_cols].round(2)
test_data.head()

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0.89,43,0,0.18,5700.0,4,0,0,0,0.0
2,0.46,57,0,0.53,9141.0,15,0,4,0,2.0
3,0.04,59,0,0.69,5083.0,12,0,1,0,2.0
4,0.28,38,1,0.93,3200.0,7,0,2,0,0.0
5,1.0,27,0,0.02,3865.0,4,0,0,0,1.0


In [11]:
# Convert both tables to the interval representation: every cell becomes
# a (value, value) tuple.
transformed_train = transform_to_description(train_data)
transformed_test = transform_to_description(test_data)
transformed_train.head(5)

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"(0.77, 0.77)","(45, 45)","(2, 2)","(0.8, 0.8)","(9120.0, 9120.0)","(13, 13)","(0, 0)","(6, 6)","(0, 0)","(2.0, 2.0)"
2,"(0.96, 0.96)","(40, 40)","(0, 0)","(0.12, 0.12)","(2600.0, 2600.0)","(4, 4)","(0, 0)","(0, 0)","(0, 0)","(1.0, 1.0)"
3,"(0.66, 0.66)","(38, 38)","(1, 1)","(0.09, 0.09)","(3042.0, 3042.0)","(2, 2)","(1, 1)","(0, 0)","(0, 0)","(0.0, 0.0)"
4,"(0.23, 0.23)","(30, 30)","(0, 0)","(0.04, 0.04)","(3300.0, 3300.0)","(5, 5)","(0, 0)","(0, 0)","(0, 0)","(0.0, 0.0)"
5,"(0.91, 0.91)","(49, 49)","(1, 1)","(0.02, 0.02)","(63588.0, 63588.0)","(7, 7)","(0, 0)","(1, 1)","(0, 0)","(0.0, 0.0)"


In [12]:
transformed_train.shape, transformed_test.shape

((150000, 10), (101503, 10))

In [13]:
def similarity(vect1, vect2):
    """
    Combine two interval descriptions into the smallest one covering both.

    Both arguments are iterated position by position; every element is a
    ``(low, high)`` pair. The result holds, for each position,
    ``(min of the two lows, max of the two highs)`` and is indexed like
    ``vect1``.
    """
    covering = [
        (min(left[0], right[0]), max(left[1], right[1]))
        for left, right in zip(vect1, vect2)
    ]
    return pd.Series(covering, index=vect1.index)

def inclusion(obj, patterns):
    """
    Tell whether ``obj`` is equal (via ``obj.equals``) to at least one
    element of ``patterns``; an empty ``patterns`` gives ``False``.
    """
    for candidate in patterns:
        if obj.equals(candidate):
            return True
    return False

**Алгоритм из работы Алексея(QBCA)**

In [98]:
# Share of the opposite class that a hypothesis may cover and still be kept
# (compared as int(other_data_size * alpha) in generate_gypothesis).
alpha = 0.0001
# Share of the own class drawn as a random sample on every iteration.
sample_ratio = 0.003
# Number of hypothesis-generation attempts per test object.
num_iters = 10
# Count the classes by label value rather than by frequency rank:
# value_counts() sorts by count, so positional access would silently swap
# the two numbers if label 1 ever became the majority class.
N_neg = (train_label == 0).sum()
N_pos = (train_label == 1).sum()
N_neg, N_pos

(139974, 10026)

In [15]:
# Split the interval-valued training table by class label (1 -> pos, 0 -> neg).
train_pos = transformed_train.loc[train_label[train_label == 1].index]
train_neg = transformed_train.loc[train_label[train_label == 0].index]
# the index is deliberately not reset: it is the client id and carries meaningful information
train_neg.shape, train_pos.shape

((139974, 10), (10026, 10))

In [16]:
def get_similarity_sample_repr(sample: pd.DataFrame):
    """
    Build the interval description that covers every object of ``sample``.

    Every cell of ``sample`` is expected to be a ``(low, high)`` pair. For each
    column the result holds ``(smallest low, largest high)`` over all rows,
    which is what folding the rows one by one with the pairwise similarity
    operation yields.

    Parameters
    ----------
    sample : pd.DataFrame
        Interval-valued objects, one per row.

    Returns
    -------
    pd.Series or None
        One ``(low, high)`` tuple per column, indexed by ``sample.columns``;
        ``None`` when ``sample`` has no rows.
    """
    if len(sample) == 0:
        return None

    bounds = []
    # Work column by column instead of building a new Series for every row:
    # the result is the same, without one pandas allocation per object.
    for position in range(sample.shape[1]):
        intervals = sample.iloc[:, position].tolist()
        lowest = min(interval[0] for interval in intervals)
        highest = max(interval[1] for interval in intervals)
        bounds.append((lowest, highest))

    return pd.Series(bounds, index=sample.columns)
    
#операция нахождения объектов по признаковому представлению
def is_included_in_repr(d, train_data):
    """
    Return the objects of ``train_data`` that fall inside the description ``d``.

    An object is inside ``d`` when, for every feature, its interval lies within
    the interval of ``d`` (``d_low <= obj_low`` and ``obj_high <= d_high``).
    This is the same condition as "merging the object into ``d`` leaves ``d``
    unchanged".

    Parameters
    ----------
    d : pd.Series
        Interval description: one ``(low, high)`` tuple per feature.
    train_data : pd.DataFrame
        Interval-valued objects to test, one per row.

    Returns
    -------
    list of pd.Series
        The matching rows of ``train_data`` in their original order
        (empty list when nothing matches).
    """
    # A description labelled differently from the rows can never match.
    if not d.index.equals(train_data.columns):
        return []

    inside = np.ones(len(train_data), dtype=bool)
    # Check the bounds feature by feature; this avoids building a Series and
    # running a full equality check for every single row.
    for position, (low, high) in enumerate(d.tolist()):
        cells = train_data.iloc[:, position].tolist()
        inside &= np.fromiter(
            (low <= cell[0] and cell[1] <= high for cell in cells),
            dtype=bool,
            count=len(cells),
        )

    matched = train_data.iloc[np.flatnonzero(inside)]
    return [row for _, row in matched.iterrows()]


def generate_hypothesises():
    """Placeholder without an implementation yet; always gives ``None``."""
    return

### Mining step


    Для положительного класса нас интересуют объекты отрицательного класса,
    а для отрицательного — положительные.
    Уже после определения объектов из другого класса, попадающих в признаковое представление сэмпла данных, будет приниматься решение
    о включении этого признакового представления в список гипотез (областей, или интервальных представлений).
    

При первом запуске сразу видно, что требуется очень много времени для генерации гипотез

1) Нужно переписать цикл, и выразить его в виде функции
2) Протестировать время работы генерации признаков для полож и отриц класса
3) Посмотреть как можно добавить распараллеливание
4) Протестировать в параллельном режиме

To-do-list:

1. **Нужно добавить логирование**
2. **Нужно также проанализировать время работы и качество классификации при разном включении признаков из исходного множества**

To-do-list
1. Нужно распараллелить через процессы
2. Подумать о том, каким образом генерировать гипотезы: выбирать не случайную выборку из множества объектов, а какую-то локальную область.

In [92]:
def generate_gypothesis(iteration: int,
                        obj: pd.Series,
                        train_data: pd.DataFrame,
                        other_data: pd.DataFrame,
                        sample_size: int,
                        verbose: bool,
                        other_data_size: int,
                        alpha: float):
    """
    Try to generate one hypothesis for ``obj`` from a random sample of its class.

    Steps:
      1. draw ``sample_size`` distinct objects from ``train_data`` and add ``obj``;
      2. compute the interval description ``d`` covering that sample;
      3. collect the objects of ``other_data`` that fall inside ``d``;
      4. keep ``d`` only if at most ``int(other_data_size * alpha)`` objects
         of the other class are covered.

    Parameters
    ----------
    iteration : int
        Iteration number, used only for the progress message.
    obj : pd.Series
        Interval-valued test object.
    train_data : pd.DataFrame
        Objects of the class the hypothesis is mined from.
    other_data : pd.DataFrame
        Objects of the opposite class.
    sample_size : int
        Number of objects drawn from ``train_data`` (without replacement).
    verbose : bool
        Print a message after each intermediate step.
    other_data_size : int
        Size of the opposite class used for the tolerance threshold.
    alpha : float
        Tolerated share of opposite-class objects inside the description.

    Returns
    -------
    pd.Series or None
        The description ``d`` when it is accepted, otherwise ``None``.
    """
    print(f'iteration: {iteration}')

    # A fresh generator per call: worker processes forked from the notebook
    # kernel inherit the same global numpy random state and would otherwise
    # draw identical samples.
    rng = np.random.default_rng()
    inds = rng.choice(train_data.index.to_numpy(), size=sample_size, replace=False)
    # DataFrame.append was removed in pandas 2.0; concatenate the test object
    # as a one-row frame instead.
    sample = pd.concat([train_data.loc[inds, :], obj.to_frame().T])

    d = get_similarity_sample_repr(sample)
    if verbose:
        print('got feature represantation for sample')

    # is_included_in_repr always returns a list, so no None check is needed.
    d_sim_objects = is_included_in_repr(d, train_data=other_data)
    if verbose:
        print('got objects that is included in sample represantation')

    if len(d_sim_objects) <= int(other_data_size * alpha):
        return d
    return None

def mining_step(test_obj: pd.Series, train_pos: pd.DataFrame, train_neg: pd.DataFrame,
                num_iters: int, sample_ratio: float, alpha: float, mining_type: str = 'pos',
                verbose: bool = False, n_jobs: int = 4):
    """
    Run ``num_iters`` hypothesis-generation attempts for one test object.

    Parameters
    ----------
    test_obj : pd.Series
        Interval-valued test object.
    train_pos, train_neg : pd.DataFrame
        Interval-valued training objects of the positive / negative class.
    num_iters : int
        Number of attempts (one random sample each).
    sample_ratio : float
        Share of the mined class drawn as a sample on every attempt.
    alpha : float
        Tolerated share of opposite-class objects inside a hypothesis.
    mining_type : str
        ``'pos'`` mines from ``train_pos`` against ``train_neg``; any other
        value mines from ``train_neg`` against ``train_pos``.
    verbose : bool
        Forwarded to ``generate_gypothesis``.
    n_jobs : int
        Number of worker processes (defaults to the previously fixed value 4).

    Returns
    -------
    list
        The accepted hypotheses; attempts that produced ``None`` are dropped.
    """
    if mining_type == 'pos':
        train_data, other_data = train_pos, train_neg
    else:
        train_data, other_data = train_neg, train_pos
    other_data_size = other_data.shape[0]

    sample_size = int(train_data.shape[0] * sample_ratio)
    print(f'using sample size = {sample_size}')
    print(f'other data size = {other_data_size}')
    print(f'other data rate size = {int(other_data_size * alpha)}')
    print('start generating hypothesises')

    # Bind everything except the iteration number, which executor.map supplies.
    mining = partial(generate_gypothesis,
                     obj=test_obj,
                     train_data=train_data,
                     other_data=other_data,
                     sample_size=sample_size,
                     verbose=verbose,
                     other_data_size=other_data_size,
                     alpha=alpha
                    )

    with ProcessPoolExecutor(max_workers=n_jobs) as executor:
        # Materialise the results while the pool is still alive so that worker
        # errors surface here and not during a later, unrelated iteration.
        results = list(executor.map(mining, range(num_iters)))

    return [res for res in results if res is not None]
    

In [84]:
# Label-based slice on the Id index: the displayed result contains the
# rows with Id 1 and 2.
test_sample = transformed_test.loc[:2]
test_sample

Unnamed: 0_level_0,RevolvingUtilizationOfUnsecuredLines,age,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,"(0.89, 0.89)","(43, 43)","(0, 0)","(0.18, 0.18)","(5700.0, 5700.0)","(4, 4)","(0, 0)","(0, 0)","(0, 0)","(0.0, 0.0)"
2,"(0.46, 0.46)","(57, 57)","(0, 0)","(0.53, 0.53)","(9141.0, 9141.0)","(15, 15)","(0, 0)","(4, 4)","(0, 0)","(2.0, 2.0)"


In [99]:
num_iters, alpha, sample_ratio

(10, 0.0001, 0.003)

In [80]:
a = [1, 2, 3]
b = [10, 10]

In [81]:
b.extend(a)

In [100]:
# Mine hypotheses from both classes for the test objects.
# NOTE: the loop stops after the first object because of the trailing `break`.
for i, obj in test_sample.iterrows():
    
    # hypotheses built from positive-class samples, checked against the negative class
    print(f'start mining from pos objects')
    pos_hyps = mining_step(test_obj=obj, 
                           train_pos=train_pos,
                           train_neg=train_neg,
                           num_iters=num_iters,
                           sample_ratio=sample_ratio, 
                           alpha = alpha, 
                           mining_type='pos', 
                           verbose=False
                          )
    # hypotheses built from negative-class samples, checked against the positive class
    print(f'start mining from neg objects')
    neg_hyps = mining_step(test_obj=obj,
                           train_pos=train_pos,
                           train_neg=train_neg,
                           num_iters=num_iters,
                           sample_ratio=sample_ratio,
                           alpha = alpha,
                           mining_type='neg',
                           verbose=False
                          )
    break

start mining from pos objects
using sample size = 30
other data size = 139974
other data rate size = 13
start generating hypothesises
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9
start mining from neg objects
using sample size = 419
other data size = 10026
other data rate size = 1
start generating hypothesises
iteration: 0
iteration: 1
iteration: 2
iteration: 3
iteration: 4
iteration: 5
iteration: 6
iteration: 7
iteration: 8
iteration: 9


[]

In [58]:
# some_data = [1,1,1,3,3,3,5,5,5]
def some_func_internal(x, data=None):
    """
    Print ``x`` and the data, then return the sum of the first ``x`` items.

    ``data`` is the sequence to sum over. When it is omitted the function
    falls back to a module-level ``some_data`` (the previous behaviour), which
    raises ``NameError`` if no such global exists.
    """
    values = some_data if data is None else data
    print(x)
    print(values)
    return sum(values[:x])


def some_func_external(some_data):
    """
    Run ``some_func_internal`` for ``x = 0..7`` in two worker processes.

    The data is handed to the workers explicitly through ``partial``: a local
    variable of this function is not visible inside the worker function.
    ``some_data`` may be ``None``, in which case ``list(range(16))`` is used.

    Returns the list of the eight partial sums.
    """
    if some_data is None:
        some_data = list(range(16))

    worker = partial(some_func_internal, data=some_data)
    with ProcessPoolExecutor(max_workers=2) as executor:
        # consume the results inside the pool so that worker errors are raised
        return list(executor.map(worker, range(8)))
        
#     print(results)

In [59]:
some_func_external(some_data)

NameError: name 'some_data' is not defined

In [60]:
from functools import partial

In [71]:
def some_strange_func(x, y, z):
    """Combine the three arguments with ``+``, left to right."""
    partial_total = x + y
    return partial_total + z

In [76]:
new_some_func = partial(some_strange_func, z=2, y=5)

In [82]:
# #здесь явно нужно написать отдельнцю функцию для этого
# for i, obj in test_sample.iterrows():
#     pos_gypothesis = [] 
#     neg_gypothesis = []
#     sample_size = int(N_pos * sample_ratio)
#     print('start generating pos gypothesis')
#     for itr in range(num_iters):
#         print(f'[{itr}] ...')
#         #generating sample
#         inds = np.random.choice(range(N_pos), size=sample_size)
#         sample = train_pos.loc[inds, :]
#         #append test object to pos sample
#         sample = sample.append(obj)
#         #get feature represantation for this objects sample
#         #test object + random sample from pos class)
#         d = get_similarity_sample_repr(sample)
#         #get objects from neg class that has feature represantation
#         #which is included in d
#         d_sim_objects = is_included_in_repr(d, train_data=train_neg)
#         if d_sim_objects is None:
#             continue
#         if len(d_sim_objects) <= int(N_neg * alpha):
#             pos_gypothesis.append(xd)
            
#     #the same mining step for neg class
#     sample_size = int(N_neg * sample_ratio)
#     print('start generating neg gypothesis')
#     for itr in range(num_iters):
#         print(f'[{itr}] ...')
#         #generating sample
#         inds = np.random.choice(range(N_neg), size=sample_size)
#         sample = train_neg.loc[inds, :]
#         #append test object to neg sample
#         sample = sample.append(obj)
#         #get feature represantation for this objects sample
#         #test object + random sample from neg class)
#         d = get_similarity_sample_repr(sample)
#         #get objects from pos class that has feature represantation
#         #which is included in d
#         d_sim_objects = is_included_in_repr(d, train_data=train_pos)
#         if d_sim_objects is None:
#             continue
#         if len(d_sim_objects) <= int(N_pos * alpha):
#             neg_gypothesis.append(d)

Id                                        (1, 143650)
SeriousDlqin2yrs                             (0.0, 1)
RevolvingUtilizationOfUnsecuredLines       (0.0, 1.0)
age                                          (29, 62)
NumberOfTime30-59DaysPastDueNotWorse           (0, 9)
DebtRatio                               (0.0, 2538.0)
MonthlyIncome                           (0.0, 7028.0)
NumberOfOpenCreditLinesAndLoans               (1, 23)
NumberOfTimes90DaysLate                        (0, 5)
NumberRealEstateLoansOrLines                   (0, 2)
NumberOfTime60-89DaysPastDueNotWorse           (0, 2)
NumberOfDependents                         (0.0, 1.0)
dtype: object

In [30]:
# Toy interval-valued descriptions (one (low, high) tuple per feature)
# used to try out Series comparison.
a = pd.Series({'feat1':(1,1), 'feat2':(0, 1.4), 'feat3':(3, 4)})
b = pd.Series({'feat1':(1,2), 'feat2':(1, 1.4), 'feat3':(7, 7)})
c = pd.Series({'feat1':(1,1), 'feat2':(0.8, 1.6), 'feat3':(3, 4)})
d = pd.Series({'feat1':(2,3), 'feat2':(0, 1.4), 'feat3':(4, 6)})

# Only `a` and `c` are put into the pattern list; `b` and `d` are left out.
patterns = []
patterns.append(a)
# patterns.append(b)
patterns.append(c)
# patterns.append(d)

# Compare whole-Series equality (`equals`) with the `==` operator.
print(a.equals(b))
print(a == b)

# Further toy descriptions; they are not used elsewhere in the visible notebook.
test1 = pd.Series({'feat1':(1,3), 'feat2':(1, 3.4), 'feat3':(1, 1.9)})
test2 = pd.Series({'feat1':(2,2.2), 'feat2':(2, 2.4), 'feat3':(3, 3.9)})
test3 = pd.Series({'feat1':(3,3), 'feat2':(4, 4.4), 'feat3':(4, 4.9)})

In [7]:
from time import sleep

In [69]:
def compute(x):
    """Toy workload for the multiprocessing experiments: gives ``x * 2``."""
    doubled = x * 2
    return doubled

In [23]:
vals = [3, 5, 8]
processes = []
# one worker process per input value
for x in vals:
    p = Process(target=compute, args=(x,))
    processes.append(p)

for p in processes:
    p.start()

# Wait for every worker to finish; without join() the child processes are
# never reaped and the cell returns before the work is done.
# Note: the return value of compute() is not passed back to the parent here.
for p in processes:
    p.join()
       

starting computing for 3
starting computing for 5
starting computing for 8
got the result for 3
got the result for 5
got the result for 8
