1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. Обучить любой классификатор (какой вам нравится)
3. Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть
4. Применить random negative sampling для построения классификатора в новых условиях
5. Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)
6. *Поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [30]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix
import itertools

import matplotlib.pyplot as plt

from google.colab import files

files.upload()

In [32]:
df = pd.read_csv("cardiovascular_diseases_dv3.csv", sep=';')
df.head(3)

Unnamed: 0,AGE,GENDER,HEIGHT,WEIGHT,AP_HIGH,AP_LOW,CHOLESTEROL,GLUCOSE,SMOKE,ALCOHOL,PHYSICAL_ACTIVITY,CARDIO_DISEASE
0,50,2,168,62,110,80,1,1,0,0,1,0
1,55,1,156,85,140,90,3,1,0,0,1,1
2,52,1,165,64,130,70,3,1,0,0,0,1


In [33]:
df['CARDIO_DISEASE'].value_counts()

0    34742
1    34041
Name: CARDIO_DISEASE, dtype: int64

In [34]:
df.rename({'CARDIO_DISEASE': 'target'}, axis=1, inplace=True)

In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68783 entries, 0 to 68782
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   AGE                68783 non-null  int64
 1   GENDER             68783 non-null  int64
 2   HEIGHT             68783 non-null  int64
 3   WEIGHT             68783 non-null  int64
 4   AP_HIGH            68783 non-null  int64
 5   AP_LOW             68783 non-null  int64
 6   CHOLESTEROL        68783 non-null  int64
 7   GLUCOSE            68783 non-null  int64
 8   SMOKE              68783 non-null  int64
 9   ALCOHOL            68783 non-null  int64
 10  PHYSICAL_ACTIVITY  68783 non-null  int64
 11  target             68783 non-null  int64
dtypes: int64(12)
memory usage: 6.3 MB


In [36]:
df.describe()

Unnamed: 0,AGE,GENDER,HEIGHT,WEIGHT,AP_HIGH,AP_LOW,CHOLESTEROL,GLUCOSE,SMOKE,ALCOHOL,PHYSICAL_ACTIVITY,target
count,68783.0,68783.0,68783.0,68783.0,68783.0,68783.0,68783.0,68783.0,68783.0,68783.0,68783.0,68783.0
mean,53.326781,1.348749,164.361252,74.121934,126.614585,81.381591,1.364727,1.225986,0.088001,0.053632,0.803367,0.494904
std,6.768163,0.476578,8.18485,14.331392,16.763763,9.676679,0.678976,0.571961,0.283299,0.225293,0.397455,0.499978
min,30.0,1.0,55.0,11.0,60.0,40.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,58.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,65.0,2.0,250.0,200.0,240.0,190.0,3.0,3.0,1.0,1.0,1.0,1.0


In [37]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], random_state=23)

In [38]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        return pd.get_dummies(X, prefix=self.key)[self.columns]

In [39]:
continuous_columns = ['AGE', 'HEIGHT', 'WEIGHT', 'AP_HIGH', 'AP_LOW', 'CHOLESTEROL']

In [40]:
final_transformers = list()

for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    
    final_transformers.append((cont_col, cont_transformer))
    
feats = FeatureUnion(final_transformers)
feature_processing = Pipeline([('feats', feats)])

In [41]:
pipeline = Pipeline([('features', feats),
                ('classifier', XGBClassifier(random_state=23))])

In [42]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('AGE',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='AGE'))])),
                                                ('HEIGHT',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='HEIGHT'))])),
                                                ('WEIGHT',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='WEIGHT'))])),
                                                ('AP_HIGH',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='AP_HIGH'))])),
                           

In [43]:
preds = pipeline.predict_proba(X_test)[:, 1]

In [44]:
metrics_df = pd.DataFrame(columns=['model', 'thresh', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC


In [45]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.36596012115478516, F-Score=0.737, Precision=0.676, Recall=0.809


In [46]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.7968396720295399

In [47]:
metrics_df = metrics_df.append({
    'model': 'XGB',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,XGB,0.36596,0.736718,0.676482,0.80873,0.79684


In [48]:
df_copy = X_train.copy()
df_copy['target'] = y_train
df_copy.head()

Unnamed: 0,AGE,GENDER,HEIGHT,WEIGHT,AP_HIGH,AP_LOW,CHOLESTEROL,GLUCOSE,SMOKE,ALCOHOL,PHYSICAL_ACTIVITY,target
26935,40,1,159,106,100,80,1,1,0,0,1,0
52920,64,2,168,71,160,90,1,1,0,0,1,1
62061,51,2,171,115,120,80,1,1,0,0,1,0
38458,58,1,165,65,120,79,3,3,0,0,0,0
36479,46,1,165,70,130,90,1,1,0,0,1,1


In [49]:
pos_ind = df_copy[df_copy['target'] == 1].sample(frac=1, random_state=42).index

perc = 0.25
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 6380/25519 as positives and unlabeling the rest


In [50]:
df_copy['class_test'] = -1
df_copy.loc[pos_sample,'class_test'] = 1
print('target variable:\n', df_copy.iloc[:,-1].value_counts())

target variable:
 -1    45207
 1     6380
Name: class_test, dtype: int64


In [51]:
df_copy.head(10)

Unnamed: 0,AGE,GENDER,HEIGHT,WEIGHT,AP_HIGH,AP_LOW,CHOLESTEROL,GLUCOSE,SMOKE,ALCOHOL,PHYSICAL_ACTIVITY,target,class_test
26935,40,1,159,106,100,80,1,1,0,0,1,0,-1
52920,64,2,168,71,160,90,1,1,0,0,1,1,-1
62061,51,2,171,115,120,80,1,1,0,0,1,0,-1
38458,58,1,165,65,120,79,3,3,0,0,0,0,-1
36479,46,1,165,70,130,90,1,1,0,0,1,1,-1
36685,52,1,162,77,150,90,3,1,0,0,1,1,-1
42031,64,2,156,66,160,90,1,1,1,1,0,1,-1
53300,60,2,177,78,120,80,1,1,0,0,0,1,-1
33158,50,1,154,58,110,70,1,1,0,0,1,0,-1
4897,39,2,177,82,110,80,1,1,1,1,1,0,-1


In [52]:
df_copy = df_copy.sample(frac=1)


data_N = df_copy[df_copy['class_test'] == -1]
data_P = df_copy[df_copy['class_test'] == 1]

neg_sample = data_N[:data_P.shape[0]]
sample_test = data_N[data_P.shape[0]:]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(6380, 13) (6380, 13)


In [53]:
sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

PU_pipeline = Pipeline([('features', feats),
                ('classifier', XGBClassifier(random_state=23))])

PU_pipeline.fit(sample_train.drop(columns=['class_test','target']), 
             sample_train['class_test'])

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('AGE',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='AGE'))])),
                                                ('HEIGHT',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='HEIGHT'))])),
                                                ('WEIGHT',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='WEIGHT'))])),
                                                ('AP_HIGH',
                                                 Pipeline(steps=[('selector',
                                                                  NumberSelector(key='AP_HIGH'))])),
                           

In [54]:
preds_pu = PU_pipeline.predict_proba(X_test)[:, 1]

In [55]:
precision, recall, thresholds = precision_recall_curve(y_test, preds_pu)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.3878251314163208, F-Score=0.734, Precision=0.650, Recall=0.843


In [56]:
roc_auc = roc_auc_score(y_test, preds_pu)
roc_auc

0.789381071070674

In [57]:
metrics_df = metrics_df.append({
    'model': 'PU_XGB',
    'thresh': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc
}, ignore_index=True)

In [58]:
metrics_df

Unnamed: 0,model,thresh,F-Score,Precision,Recall,ROC AUC
0,XGB,0.36596,0.736718,0.676482,0.80873,0.79684
1,PU_XGB,0.387825,0.733626,0.649656,0.842525,0.789381
