1.взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

2.сделать feature engineering

3.обучить любой классификатор (какой вам нравится)

4.далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

5.применить random negative sampling для построения классификатора в новых условиях

6.сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

7.поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

Возьму датасет https://archive.ics.uci.edu/ml/datasets/Adult и попробую развести данные по определению заработка >50k на основе данных. Датасет содержит 2 файла разбитых по принципу 2/3 к 1/3. 

In [1]:
import pandas as pd
import numpy as np
data_train = pd.read_csv("adult.data", header = None)
data_train.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
print(data_train.shape)

(32561, 15)


In [3]:
data_train[14]

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
32556     <=50K
32557      >50K
32558     <=50K
32559     <=50K
32560      >50K
Name: 14, Length: 32561, dtype: object

In [4]:
data_train[14] = data_train[14].apply(lambda x: 1 if x==" >50K" else 0, 1)

In [5]:
data_train.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0


In [6]:
data_train = data_train.rename(columns={0: "age", 1: "employment", 2:"income", 3:"education", 4:"number_emp_years", 
                          5:"marrital", 6:"position", 7:"family", 8:"race",
                          9:"sex", 10:"capital-gain", 11:"capital-loss",
                          12:"hours-per-week", 13:"country", 14:"target"})

In [7]:
data_train.head(3)

Unnamed: 0,age,employment,income,education,number_emp_years,marrital,position,family,race,sex,capital-gain,capital-loss,hours-per-week,country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0


In [8]:
data_train["target"].value_counts()

0    24720
1     7841
Name: target, dtype: int64

In [9]:
x_data = data_train.iloc[:,:-1]
y_data = data_train.iloc[:,-1]

In [10]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [11]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X[self.column]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in self.columns:
            if col_ not in test_columns:
                X[col_] = 0
        return X[self.columns]

In [12]:
categorical_columns = ["employment", "education", "marrital", "position", "family", "race",
                       "sex", "country"]
continuous_columns = ["age", "income", "number_emp_years", "capital-gain", "capital-loss",
                      "hours-per-week"]

In [13]:
from sklearn.pipeline import FeatureUnion

In [14]:
final_transformers = list()

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [15]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

pipeline = Pipeline([
    ('features',feats),
    ('classifier', GradientBoostingClassifier(random_state = 42)),
])

In [17]:
pipeline.fit(x_data, y_data)

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('employment',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='employment')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='employment'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='education')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='education'))])),
                                                ('marrital',
                                                 Pipeline(steps=[('selector

Обработаю тестовые данные

In [18]:
data_test = pd.read_csv("adult.test", header=None)
data_test.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [19]:
data_test = data_test.rename(columns={0: "age", 1: "employment", 2:"income", 3:"education", 4:"number_emp_years", 
                          5:"marrital", 6:"position", 7:"family", 8:"race",
                          9:"sex", 10:"capital-gain", 11:"capital-loss",
                          12:"hours-per-week", 13:"country", 14:"target"})

In [20]:
data_test.head(5)

Unnamed: 0,age,employment,income,education,number_emp_years,marrital,position,family,race,sex,capital-gain,capital-loss,hours-per-week,country,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [21]:
data_test["target"] = data_test["target"].apply(lambda x: 1 if x==" >50K." else 0, 1)

In [22]:
data_test.head(5)

Unnamed: 0,age,employment,income,education,number_emp_years,marrital,position,family,race,sex,capital-gain,capital-loss,hours-per-week,country,target
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,0
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,0
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,1
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,0


In [23]:
x_test = data_test.iloc[:,:-1]
y_test = data_test.iloc[:,-1]

In [24]:
y_test.describe()

count    16281.000000
mean         0.236226
std          0.424776
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: target, dtype: float64

In [25]:
y_predict = pipeline.predict(x_test)

In [26]:
preds = pipeline.predict_proba(x_test)[:, 1]
preds[:10]

array([0.01243884, 0.1994794 , 0.29944207, 0.96205302, 0.00605449,
       0.01398297, 0.02711329, 0.66658173, 0.01006423, 0.07003246])

In [27]:
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix

In [28]:
precision, recall, thresholds = precision_recall_curve(y_test, preds)

fscore = (2 * precision * recall) / (precision + recall)
# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        precision[ix],
                                                                        recall[ix]))

Best Threshold=0.346544, F-Score=0.713, Precision=0.679, Recall=0.749


In [29]:
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    #roc = roc_auc_score(y_test, y_predict)
    #print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 

    
evaluate_results(y_test, y_predict)

Classification results:
f1: 69.20%
recall: 61.23%
precision: 79.56%


В итоге - попробовала бустингом обучить бинарно модель. Теперь перейдем к варианту PU

In [30]:
#data = pd.concat([data_train, data_test])
data = data_train.append(data_test, ignore_index=True, sort=False)

In [31]:
data.head(5)

Unnamed: 0,age,employment,income,education,number_emp_years,marrital,position,family,race,sex,capital-gain,capital-loss,hours-per-week,country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [32]:
data.shape

(48842, 15)

In [33]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 2922/11687 as positives and unlabeling the rest


In [34]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    45920
 1     2922
Name: class_test, dtype: int64


In [35]:
X_data = mod_data.iloc[:,:-2].values # just the X 
Y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
Y_positive = mod_data.iloc[:,-2].values # original class

RANDOM NEGATIVE 

In [36]:
final_transformers = list()

categorical_columns = ["employment", "education", "marrital", "position", "family", "race",
                       "sex", "country"]
continuous_columns = ["age", "income", "number_emp_years", "capital-gain", "capital-loss",
                      "hours-per-week"]

for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', FeatureSelector(column=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))
    
for cont_col in continuous_columns:
    cont_transformer = Pipeline([
                ('selector', NumberSelector(key=cont_col))
            ])
    final_transformers.append((cont_col, cont_transformer))

In [37]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [39]:
import xgboost as xgb

pipeline = Pipeline([
    ('features',feats),
    #('classifier', GradientBoostingClassifier(random_state = 42)),
    ('classifier', xgb.XGBClassifier(random_state = 42)),
])

In [40]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(2922, 16) (2922, 16)


In [41]:
sample_train.head(10)

Unnamed: 0,age,employment,income,education,number_emp_years,marrital,position,family,race,sex,capital-gain,capital-loss,hours-per-week,country,target,class_test
20955,40,Private,286750,11th,7,Separated,Machine-op-inspct,Not-in-family,Black,Male,0,0,36,United-States,0,-1
23840,26,Self-emp-not-inc,68729,HS-grad,9,Never-married,Sales,Other-relative,Asian-Pac-Islander,Male,0,0,50,United-States,1,1
33438,20,Private,237956,HS-grad,9,Never-married,Protective-serv,Own-child,White,Male,0,0,40,Cuba,0,-1
44597,57,Private,144012,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1,1
26579,49,State-gov,131302,Assoc-voc,11,Divorced,Adm-clerical,Not-in-family,Black,Female,0,0,44,United-States,0,-1
28329,25,State-gov,262664,Bachelors,13,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,1,1
32025,50,Private,139703,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,1,1
34924,47,Private,171807,Some-college,10,Divorced,Tech-support,Not-in-family,White,Female,0,0,40,United-States,0,-1
34274,44,Local-gov,32627,Masters,14,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,1887,60,United-States,1,1
28647,49,Private,250733,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,46,United-States,0,-1


In [43]:
#pipeline.fit(sample_train.iloc[:,:-2].values, sample_train.iloc[:,-2].values)
pipeline.fit(sample_train.iloc[:,:-2], sample_train.iloc[:,-2])

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('employment',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='employment')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='employment'))])),
                                                ('education',
                                                 Pipeline(steps=[('selector',
                                                                  FeatureSelector(column='education')),
                                                                 ('ohe',
                                                                  OHEEncoder(key='education'))])),
                                                ('marrital',
                                                 Pipeline(steps=[('selector

In [44]:
#pipeline.fit(sample_train.iloc[:,:-2].values, 
          #sample_train.iloc[:,-2].values)
#Y_predict = pipeline.predict(sample_test.iloc[:,:-2].values)
Y_predict = pipeline.predict(sample_test.iloc[:,:-2])
#evaluate_results(sample_test.iloc[:,-2].values, Y_predict)
evaluate_results(sample_test.iloc[:,-2], Y_predict)

Classification results:
f1: 62.34%
recall: 88.61%
precision: 48.09%


Данные плохие...  но на .values ошибка вылетает с индексами. Не понимаю, как ее исправить.. 