Домашнее задание
1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

Описание данных: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

In [2]:
df = pd.read_csv("bank-additional-full.csv", sep=';')
df.head(3)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [4]:
len(df)

41188

In [5]:
df.iloc[:, -1].value_counts()

no     36548
yes     4640
Name: y, dtype: int64

In [6]:
list_feature_for_change = ['default', 'housing', 'loan']

In [7]:
for feature in list_feature_for_change:
    df.loc[df[feature] == 'yes', feature] = 1
    df.loc[df[feature] == 'no', feature] = -1
    df.loc[df[feature] == 'unknown', feature] = 0

In [8]:
df.loc[df['y'] == 'yes', 'y'] = 1
df.loc[df['y'] == 'no', 'y'] = 0

In [9]:
list_cat_feature = ['job', 'marital', 'education', 'contact', 'month', 'day_of_week', 'poutcome']

In [10]:
for cat_feature in list_cat_feature:
    df = pd.concat([df, pd.get_dummies(df[cat_feature])], axis=1)
    df = df.drop([cat_feature], axis=1)

In [11]:
x_data = df.iloc[:,:-1]
y_data = df.iloc[:,-1]

x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=7)

In [12]:
model = GradientBoostingClassifier(random_state = 21)

In [13]:
model.fit(x_train, y_train)
y_predict = model.predict(x_test)

In [14]:
dict_result = dict()

In [15]:
def evaluate_results(y_test, y_predict):
    print('Classification results:')
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    return [prc, rec, f1, roc]

    
dict_result['Graient Boosting'] = evaluate_results(y_test, y_predict)

Classification results:
f1: 100.00%
roc: 100.00%
recall: 100.00%
precision: 100.00%


Применим random negative sampling для построения классификатора в новых условиях

In [16]:
def random_negative_sampling(p, df):
    mod_data = df.copy()
    pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
    np.random.shuffle(pos_ind)
    pos_sample_len = int(np.ceil(p * len(pos_ind)))
    print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
    pos_sample = pos_ind[:pos_sample_len]
    mod_data['class_test'] = -1
    mod_data.loc[pos_sample,'class_test'] = 1
    x_data = mod_data.iloc[:,:-2].values # just the X 
    y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
    y_positive = mod_data.iloc[:,-2].values # original class
    mod_data = mod_data.sample(frac=1)
    neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
    sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
    pos_sample = mod_data[mod_data['class_test']==1]
    sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    
    model = GradientBoostingClassifier(learning_rate=0.1, n_estimators=50, random_state = 21)
    model.fit(sample_train.iloc[:,:-2].values, sample_train.iloc[:,-2].values)
    y_predict = model.predict(sample_test.iloc[:,:-2].values)
    dict_result['Random Negative Sampling p =' + str(p)] = evaluate_results(sample_test.iloc[:,-2].values, y_predict)
    print()

In [17]:
list_p = [0.001, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.75, 0.8, 0.9]
for p in list_p:
    random_negative_sampling(p, df)

Using 2/1373 as positives and unlabeling the rest
Classification results:
f1: 20.93%
roc: 83.72%
recall: 90.74%
precision: 11.83%

Using 14/1373 as positives and unlabeling the rest
Classification results:
f1: 95.03%
roc: 99.82%
recall: 100.00%
precision: 90.54%

Using 28/1373 as positives and unlabeling the rest
Classification results:
f1: 94.99%
roc: 99.82%
recall: 100.00%
precision: 90.45%

Using 42/1373 as positives and unlabeling the rest
Classification results:
f1: 94.94%
roc: 99.82%
recall: 100.00%
precision: 90.36%

Using 55/1373 as positives and unlabeling the rest
Classification results:
f1: 94.88%
roc: 99.82%
recall: 100.00%
precision: 90.27%

Using 69/1373 as positives and unlabeling the rest
Classification results:
f1: 94.83%
roc: 99.82%
recall: 100.00%
precision: 90.17%

Using 83/1373 as positives and unlabeling the rest
Classification results:
f1: 94.78%
roc: 99.82%
recall: 100.00%
precision: 90.07%

Using 97/1373 as positives and unlabeling the rest
Classification resul

In [18]:
df_result = pd.DataFrame.from_dict(dict_result, orient='index', columns=['Presicion', 'Recall', 'F1', 'ROC AUC'])
df_result

Unnamed: 0,Presicion,Recall,F1,ROC AUC
Graient Boosting,1.0,1.0,1.0,1.0
Random Negative Sampling p =0.001,0.118296,0.907367,0.209304,0.837239
Random Negative Sampling p =0.01,0.905396,1.0,0.95035,0.998216
Random Negative Sampling p =0.02,0.904506,1.0,0.949859,0.998215
Random Negative Sampling p =0.03,0.903598,1.0,0.949358,0.998215
Random Negative Sampling p =0.04,0.902673,1.0,0.948847,0.998214
Random Negative Sampling p =0.05,0.901662,1.0,0.948288,0.998214
Random Negative Sampling p =0.06,0.900699,1.0,0.947756,0.998213
Random Negative Sampling p =0.07,0.899576,1.0,0.947133,0.998213
Random Negative Sampling p =0.08,1.0,1.0,1.0,1.0
