Расмотрим пример на датасете из репозитория UCI

Описание данных - https://archive.ics.uci.edu/ml/datasets/Heart+Disease

In [76]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [77]:
data = pd.read_csv("processed.cleveland.data", header=None)
data.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1


У нас есть 13 признаков и 1 целевая переменная - нужно определить есть сердечно-сосудистые заболевания или нет. Поле "цель" относится к наличию у пациента сердечно-сосудистых заболеваний. Это целое число, равное от 0 (отсутствие присутствия) до 4. Эксперименты с базой данных Кливленда были сосредоточены на простой попытке отличить присутствие (значения 1,2,3,4) от отсутствия (значение 0).

In [78]:
print(data.shape)

(303, 14)


In [79]:
data[13].value_counts()

0    164
1     55
2     36
3     35
4     13
Name: 13, dtype: int64

Переименуем столбец с целевой переменной и сделаем ее бинарной. 0 - отсутсвие сердечно сосудистых заболеваний. 1,2,3,4 - наличие

In [80]:
data.loc[data[13] != 0, 13] = 1
data.rename(columns={13: 'target'}, inplace=True)
data['target'] = pd.to_numeric(data['target'])

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [81]:
data['target'].value_counts()

0    164
1    139
Name: target, dtype: int64

In [82]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
0         303 non-null float64
1         303 non-null float64
2         303 non-null float64
3         303 non-null float64
4         303 non-null float64
5         303 non-null float64
6         303 non-null float64
7         303 non-null float64
8         303 non-null float64
9         303 non-null float64
10        303 non-null float64
11        303 non-null object
12        303 non-null object
target    303 non-null int64
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [83]:
data[11] = pd.to_numeric(data[11], errors='coerce', downcast='float')

In [84]:
data[11] = data[11].fillna(0)

In [85]:
data[12] = pd.to_numeric(data[11], errors='coerce', downcast='float')

In [86]:
data[12] = data[12].fillna(0)

In [87]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
0         303 non-null float64
1         303 non-null float64
2         303 non-null float64
3         303 non-null float64
4         303 non-null float64
5         303 non-null float64
6         303 non-null float64
7         303 non-null float64
8         303 non-null float64
9         303 non-null float64
10        303 non-null float64
11        303 non-null float32
12        303 non-null float32
target    303 non-null int64
dtypes: float32(2), float64(11), int64(1)
memory usage: 30.9 KB


In [88]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,0.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,2.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,0.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,0.0,0


In [89]:
x_data = data.iloc[:,:-1]
y_data = data.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.3, random_state=42)

In [90]:
model = GradientBoostingClassifier(random_state=42)

model.fit(x_train, y_train)
y_predict = model.predict(x_test)

Проверяем качество

In [91]:
metrics=[]
def evaluate_results(model, y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0))
    
    result = [model, prc, rec, roc, f1]
    metrics.append(result)
    
evaluate_results('Model', y_test, y_predict)

Classification results:
f1: 79.55%
roc: 80.28%
recall: 81.40%
precision: 77.78%


### Теперь очередь за PU learning

Представим, что нам неизвестны негативы и часть позитивов

In [92]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 35/139 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [93]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    268
 1     35
Name: class_test, dtype: int64


In [94]:
mod_data.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,target,class_test
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,0.0,0,-1
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1,-1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,2.0,1,-1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,0.0,0,-1
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,0.0,0,-1
5,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,0.0,0,-1
6,62.0,0.0,4.0,140.0,268.0,0.0,2.0,160.0,0.0,3.6,3.0,2.0,2.0,1,1
7,57.0,0.0,4.0,120.0,354.0,0.0,0.0,163.0,1.0,0.6,1.0,0.0,0.0,0,-1
8,63.0,1.0,4.0,130.0,254.0,0.0,2.0,147.0,0.0,1.4,2.0,1.0,1.0,1,1
9,53.0,1.0,4.0,140.0,203.0,1.0,2.0,155.0,1.0,3.1,3.0,0.0,0.0,1,-1


In [95]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [96]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(35, 15) (35, 15)


In [98]:
model = GradientBoostingClassifier(random_state=42)

model.fit(sample_train.iloc[:,:-2].values, 
          sample_train.iloc[:,-2].values)
y_predict = model.predict(sample_test.iloc[:,:-2].values)
evaluate_results('sample_Model', sample_test.iloc[:,-2].values, y_predict)

Classification results:
f1: 74.64%
roc: 79.00%
recall: 86.67%
precision: 65.55%


In [99]:
df_metrics = pd.DataFrame(np.array(metrics),
                    columns=["model", "precision", "recall", "roc_auc", "f_score"])

df_metrics

Unnamed: 0,model,precision,recall,roc_auc,f_score
0,Model,0.7777777777777778,0.813953488372093,0.8028100775193799,0.7954545454545455
1,sample_Model,0.6554621848739496,0.8666666666666667,0.78997668997669,0.7464114832535885
