### Задание 1

1. Взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

In [180]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, precision_score, precision_recall_curve
from sklearn.metrics import classification_report, confusion_matrix

import itertools

import matplotlib.pyplot as plt

%matplotlib inline

In [181]:
columns = ['#0', '#1', '#2', '#3', '#4', '#5', '#6', '#7', '#8', '#9', '#10', '#11', '#12', '#13', '#14'] 

df = pd.read_csv("adult.data", header=None, names = columns)
df.head(10)

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


**Задача: спрогнозировать, превышает ли доход 50 тысяч долларов в год**

In [182]:
binary_to_numbers = {' <=50K': 0, ' >50K': 1}

df['#14'] = df['#14'].replace(binary_to_numbers)
df.head(3)

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0


Посмотрим на соотношение классов (P vs U), где P - позитивы и U - все остальные неразмеченные данные

In [183]:
df['#14'].value_counts()

0    24720
1     7841
Name: #14, dtype: int64

Будем используем обычный random negative sample

In [184]:
# разделим данные на train/test
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['#14']), df['#14'], random_state=0)

In [185]:
#соберем наш простой pipeline, но нам понадобится написать класс для выбора нужного поля
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    

class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [186]:
df.head(3)

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0


Зададим списки признаков

In [187]:
categorical_columns = ['#1', '#3', '#5', '#6', '#7', '#8', '#9', '#13']
continuous_columns = ['#0', '#2', '#4', '#10', '#11', '#12']

Теперь нам нужно под каждый признак создать трансформер и объединить их в список.

In [188]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col)),
                ('scaler', StandardScaler())
            ])
    
    final_transformers.append((cont_col, cont_transformer))

In [189]:
feats = FeatureUnion(final_transformers)

### Задание 2

Обучить любой классификатор (какой вам нравится)

In [190]:
pipeline = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])

In [191]:
# обучим наш пайплайн
pipeline.fit(X_train, y_train)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('#1',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='#1')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='#1'))])),
                                                ('#3',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='#3')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='#3'))])),
                                                ('#5',
                                                 Pipeline(steps=[('selector',
                                                

In [192]:
# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.02538151, 0.03246543, 0.15726247, 0.61258093, 0.64075648,
       0.55177425, 0.04456638, 0.05776223, 0.06937272, 0.00785221])

In [193]:
metrics = pd.DataFrame(columns=['Model', 'Threshold', 'F-Score', 'Precision', 'Recall', 'ROC AUC'])

In [194]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)
fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.3693507219240659, F-Score=0.721, Precision=0.700, Recall=0.742


In [195]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9220348782818708

In [196]:
metrics = metrics.append({
    'Model': 'GradientBoostingClassifier',
    'Threshold': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc}, ignore_index=True)
metrics

Unnamed: 0,Model,Threshold,F-Score,Precision,Recall,ROC AUC
0,GradientBoostingClassifier,0.369351,0.720725,0.700476,0.74218,0.922035


### Задание 3

Разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные примеры (класс 1), а только лишь часть

Представим, что нам неизвестны негативы и часть позитивов

In [197]:
mod_data = X_train.copy()
mod_data['#14'] = y_train
mod_data = mod_data.reset_index(drop=True)

# mod_data = data.copy()
# get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:, -1].values == 1)[0]

# shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
perc = 0.25
pos_sample_len = int(np.ceil(perc * len(pos_ind)))

print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1465/5859 as positives and unlabeling the rest


Создаем столбец для новой целевой переменной, где у нас два класса - P (1) и U (-1)

In [198]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    22955
 1     1465
Name: class_test, dtype: int64


In [199]:
mod_data.head(10)

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14,class_test
0,59,Private,61885,12th,8,Divorced,Transport-moving,Other-relative,Black,Male,0,0,35,United-States,0,-1
1,71,Private,180733,Masters,14,Never-married,Other-service,Not-in-family,White,Female,0,0,20,United-States,0,-1
2,42,Private,107762,Masters,14,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,1,-1
3,26,Private,35917,HS-grad,9,Married-civ-spouse,Sales,Husband,White,Male,0,0,40,United-States,0,-1
4,46,Private,256522,1st-4th,2,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,40,Puerto-Rico,0,-1
5,30,Private,345705,Some-college,10,Married-civ-spouse,Exec-managerial,Other-relative,White,Male,0,0,40,United-States,0,-1
6,18,Private,90934,Some-college,10,Never-married,Sales,Own-child,White,Male,0,0,28,United-States,0,-1
7,22,Private,180060,HS-grad,9,Never-married,Exec-managerial,Not-in-family,White,Male,0,0,40,Yugoslavia,0,-1
8,76,Private,316185,7th-8th,4,Widowed,Protective-serv,Not-in-family,White,Female,0,0,12,United-States,0,-1
9,37,Private,312766,HS-grad,9,Divorced,Other-service,Not-in-family,White,Female,0,0,40,United-States,0,-1


### Задание 4

Применить random negative sampling для построения классификатора в новых условиях

Помним, что (x_data) содержит целевой признак, который будем использовать для оценки качества

Отделими [:-2] как истиный класс для проверки, и [:-1] как данные для входной разметки PUL

In [200]:
mod_data = mod_data.sample(frac=1)


data_N = mod_data[mod_data['class_test'] == -1]
data_P = mod_data[mod_data['class_test'] == 1]

neg_sample = data_N[:data_P.shape[0]]
sample_test = data_N[data_P.shape[0]:]
pos_sample = data_P.copy()

print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1465, 16) (1465, 16)


In [201]:
sample_train

Unnamed: 0,#0,#1,#2,#3,#4,#5,#6,#7,#8,#9,#10,#11,#12,#13,#14,class_test
15700,43,Private,104196,Some-college,10,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,0,-1
12117,42,Private,136986,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,35,United-States,1,1
2586,23,Private,239663,Some-college,10,Never-married,Handlers-cleaners,Own-child,White,Male,2597,0,50,United-States,0,-1
23827,21,Private,129980,9th,5,Never-married,Transport-moving,Own-child,White,Male,0,0,40,United-States,0,-1
18557,21,Private,205838,HS-grad,9,Never-married,Other-service,Own-child,White,Male,0,0,37,United-States,0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17577,47,Private,155489,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,7688,0,55,United-States,1,1
23457,29,Private,54932,Some-college,10,Divorced,Craft-repair,Unmarried,White,Male,0,0,35,United-States,1,1
17126,40,State-gov,345969,Assoc-acdm,12,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,0,40,United-States,1,1
17710,42,Private,303155,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,50,United-States,1,1


In [202]:
sample_train.loc[sample_train['class_test'] == -1, 'class_test'] = 0

pipeline = Pipeline([
    ('features', feats),
    ('classifier', GradientBoostingClassifier(random_state=42)),
])


pipeline.fit(sample_train.drop(columns=['class_test', '#14']), 
             sample_train['class_test'])

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('#1',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='#1')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='#1'))])),
                                                ('#3',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='#3')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='#3'))])),
                                                ('#5',
                                                 Pipeline(steps=[('selector',
                                                

In [203]:
# наши прогнозы для тестовой выборки
preds = pipeline.predict_proba(X_test)[:, 1]
preds[:10]

array([0.06994508, 0.08608894, 0.48456857, 0.22914361, 0.33939285,
       0.43966226, 0.11839005, 0.20129512, 0.26479319, 0.01718793])

In [204]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.5753967903323184, F-Score=0.695, Precision=0.646, Recall=0.752


In [205]:
roc_auc = roc_auc_score(y_test, preds)
roc_auc

0.9067291202901122

### Задание 5

Сравнить качество с решением из пункта 3 (построить отчет - таблицу метрик)

In [206]:
metrics = metrics.append({
    'Model': 'pu-learning',
    'Threshold': thresholds[ix],
    'F-Score': fscore[ix],
    'Precision': precision[ix],
    'Recall': recall[ix],
    'ROC AUC': roc_auc}, ignore_index=True)
metrics

Unnamed: 0,Model,Threshold,F-Score,Precision,Recall,ROC AUC
0,GradientBoostingClassifier,0.369351,0.720725,0.700476,0.74218,0.922035
1,pu-learning,0.575397,0.694801,0.64586,0.751766,0.906729
