In [1]:
import numpy as np
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score,\
                            accuracy_score, f1_score

In [3]:
from catboost import CatBoostClassifier

In [4]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
data = pd.read_csv("data_banknote.txt", header=None)
data.tail(3)

Unnamed: 0,0,1,2,3,4
1369,-3.7503,-13.4586,17.5932,-2.7771,1
1370,-3.5637,-8.3827,12.393,-1.2823,1
1371,-2.5419,-0.65804,2.6842,1.1952,1


In [6]:
data[4].value_counts()

0    762
1    610
Name: 4, dtype: int64

In [7]:
X = data[[0, 1, 2, 3]]
y = data[[4]]

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                    random_state=16)

In [9]:
model = CatBoostClassifier()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

Learning rate set to 0.010429
0:	learn: 0.6757039	total: 147ms	remaining: 2m 26s
1:	learn: 0.6569636	total: 150ms	remaining: 1m 15s
2:	learn: 0.6409992	total: 154ms	remaining: 51.2s
3:	learn: 0.6242331	total: 157ms	remaining: 39.2s
4:	learn: 0.6114354	total: 161ms	remaining: 32s
5:	learn: 0.5992250	total: 164ms	remaining: 27.2s
6:	learn: 0.5821320	total: 167ms	remaining: 23.7s
7:	learn: 0.5669161	total: 171ms	remaining: 21.2s
8:	learn: 0.5519002	total: 174ms	remaining: 19.2s
9:	learn: 0.5364176	total: 178ms	remaining: 17.6s
10:	learn: 0.5230850	total: 181ms	remaining: 16.3s
11:	learn: 0.5101693	total: 187ms	remaining: 15.4s
12:	learn: 0.5014238	total: 190ms	remaining: 14.4s
13:	learn: 0.4866533	total: 193ms	remaining: 13.6s
14:	learn: 0.4760870	total: 197ms	remaining: 12.9s
15:	learn: 0.4658104	total: 200ms	remaining: 12.3s
16:	learn: 0.4555858	total: 204ms	remaining: 11.8s
17:	learn: 0.4446134	total: 207ms	remaining: 11.3s
18:	learn: 0.4350671	total: 211ms	remaining: 10.9s
19:	learn: 

In [10]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 100.00%
roc: 100.00%
recall: 100.00%
precision: 100.00%


# PU learning

In [11]:
mod_data = data.copy()
pos_ind = np.where(mod_data[4].values == 1)[0]
np.random.shuffle(pos_ind)
pos_sample_len = int(np.ceil(0.3 * len(pos_ind)))
print(f'pos_sample={pos_sample_len}')
pos_sample = pos_ind[:pos_sample_len]

pos_sample=183


In [12]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print(mod_data['class_test'].value_counts())

-1    1189
 1     183
Name: class_test, dtype: int64


In [13]:
mod_data.head(3)

Unnamed: 0,0,1,2,3,4,class_test
0,3.6216,8.6661,-2.8073,-0.44699,0,-1
1,4.5459,8.1674,-2.4586,-1.4621,0,-1
2,3.866,-2.6383,1.9242,0.10645,0,-1


In [14]:
x_data = mod_data[[0, 1, 2, 3]].values
y_labeled = mod_data['class_test'].values
y_positive = mod_data[4].values

# negative sampling

In [15]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)

sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(183, 6) (183, 6)


In [16]:
model = CatBoostClassifier()

model.fit(sample_train[[0, 1, 2, 3]].values, 
          sample_train[4].values)
y_predict = model.predict(sample_test[[0, 1, 2, 3]].values)
evaluate_results(sample_test[4].values, y_predict)

Learning rate set to 0.006707
0:	learn: 0.6828054	total: 3.22ms	remaining: 3.21s
1:	learn: 0.6726170	total: 6.05ms	remaining: 3.02s
2:	learn: 0.6622930	total: 9.1ms	remaining: 3.02s
3:	learn: 0.6522978	total: 12ms	remaining: 2.99s
4:	learn: 0.6442056	total: 14.8ms	remaining: 2.94s
5:	learn: 0.6351283	total: 17.9ms	remaining: 2.97s
6:	learn: 0.6269035	total: 20.6ms	remaining: 2.92s
7:	learn: 0.6188260	total: 23.4ms	remaining: 2.9s
8:	learn: 0.6091944	total: 26.2ms	remaining: 2.88s
9:	learn: 0.6013036	total: 29.2ms	remaining: 2.89s
10:	learn: 0.5921882	total: 32.1ms	remaining: 2.89s
11:	learn: 0.5830987	total: 34.9ms	remaining: 2.88s
12:	learn: 0.5772302	total: 37.8ms	remaining: 2.87s
13:	learn: 0.5686986	total: 40.5ms	remaining: 2.85s
14:	learn: 0.5594184	total: 43.4ms	remaining: 2.85s
15:	learn: 0.5536593	total: 46ms	remaining: 2.83s
16:	learn: 0.5462503	total: 48.9ms	remaining: 2.83s
17:	learn: 0.5377690	total: 52.6ms	remaining: 2.87s
18:	learn: 0.5303361	total: 55.7ms	remaining: 2.88

##### 1) На размеченных  данных классификатор CatBoost отработал идеально.

##### 2) Ниже представлены метрики качества в зависимости от доли участия в обучении множества P (positives). Увеличивать долю более 0,3 не имеет смысла.

In [23]:
data = {'': ['1%', '10%', '30%'], 'f1': [86.61, 91.00, 97.82],
        'roc': [87.98, 92.76, 98.76], 'recall': [87.83, 99.05, 100.00],
        'precision': [85.41, 84.17, 95.73]} 
 
df = pd.DataFrame(data)
df.style.hide_index()

Unnamed: 0,f1,roc,recall,precision
1%,86.61,87.98,87.83,85.41
10%,91.0,92.76,99.05,84.17
30%,97.82,98.76,100.0,95.73
