## Домашнее задание, урок 5

### Задание 1. Загрузка данных

In [194]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.metrics import recall_score, precision_score, roc_auc_score, f1_score

from catboost import CatBoostClassifier

Для нашей задачи lookalike возьмем данные по доходу населения, полученные в результате переписи: https://archive.ics.uci.edu/ml/datasets/Adult

In [195]:
data = pd.read_csv("adult.data", header=None, names=['age', 'workclass', 'fnlwgt', 'education', 'education-num', 
                                                     'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                                                     'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 
                                                     'income'])

In [196]:
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### Задание 2. Feature Engineering

In [197]:
# Посмотрим на распределение целевой переменной
data.income.value_counts(dropna=False)

 <=50K    24720
 >50K      7841
Name: income, dtype: int64

Будем считать положительным классом людей с доходом выше чем 50 тыс. 
Соответственно, сделаем замену значений: >50K на 1, <=50K на 0.

In [198]:
data = data.replace({'income': {' <=50K': 0, ' >50K':1}})


In [199]:
data.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


Для классификациии будем использовать CatBoost, поэтому нет необходимости преобразовывать категориальные переменные в вещественные (например, с помощью дамми-переменных): это будет сделано  автоматически. Генерировать новые фичи пока не будем.

In [200]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  native-country  32561 non-null  object
 14  income          32561 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 3.7+ MB


In [201]:
cat_features = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                'native-country']

In [202]:
data[cat_features]

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


### Задание 3. Обучение классификатора

Разобьем данные на тренировочную и тестовую выборки.

In [203]:
X_train, X_test, y_train, y_test = train_test_split(data.drop('income', 1), data['income'], test_size=0.3, random_state=0)

In [204]:
model = CatBoostClassifier(iterations=20, thread_count=2, random_state=42)

In [205]:
model.fit(X_train, y_train, cat_features)

Learning rate set to 0.5
0:	learn: 0.4815147	total: 64ms	remaining: 1.22s
1:	learn: 0.4056302	total: 113ms	remaining: 1.01s
2:	learn: 0.3674883	total: 176ms	remaining: 995ms
3:	learn: 0.3430232	total: 257ms	remaining: 1.03s
4:	learn: 0.3295063	total: 287ms	remaining: 860ms
5:	learn: 0.3218110	total: 315ms	remaining: 735ms
6:	learn: 0.3147391	total: 360ms	remaining: 668ms
7:	learn: 0.3100631	total: 416ms	remaining: 624ms
8:	learn: 0.3072806	total: 450ms	remaining: 549ms
9:	learn: 0.3036997	total: 479ms	remaining: 479ms
10:	learn: 0.3016267	total: 516ms	remaining: 422ms
11:	learn: 0.2994465	total: 540ms	remaining: 360ms
12:	learn: 0.2970549	total: 563ms	remaining: 303ms
13:	learn: 0.2961017	total: 603ms	remaining: 258ms
14:	learn: 0.2937480	total: 641ms	remaining: 214ms
15:	learn: 0.2926430	total: 671ms	remaining: 168ms
16:	learn: 0.2910962	total: 711ms	remaining: 125ms
17:	learn: 0.2900102	total: 746ms	remaining: 82.9ms
18:	learn: 0.2891439	total: 776ms	remaining: 40.9ms
19:	learn: 0.28

<catboost.core.CatBoostClassifier at 0x7fd2c21f4d90>

In [206]:
y_predict= model.predict(X_test)

Создадим dataframe для хранения метрик, полученных после проведенного выше обучения и после random negative sampling.

In [207]:
metrics = pd.DataFrame(columns=['f1-score', 'ROC-AUC', 'recall', 'precision'])

Рассчитаем метрики для проведенной классификации.

In [208]:
def evaluate_results(y_test, y_predict):
    f1 = f1_score(y_test, y_predict)
    print("f1: %.2f%%" % (f1 * 100.0)) 
    roc = roc_auc_score(y_test, y_predict)
    print("roc: %.2f%%" % (roc * 100.0)) 
    rec = recall_score(y_test, y_predict, average='binary')
    print("recall: %.2f%%" % (rec * 100.0)) 
    prc = precision_score(y_test, y_predict, average='binary')
    print("precision: %.2f%%" % (prc * 100.0)) 
    return f1, roc, rec, prc

In [209]:
metrics.loc['CatBoost'] = evaluate_results(y_test, y_predict)

f1: 68.09%
roc: 77.68%
recall: 61.43%
precision: 76.37%


In [210]:
metrics

Unnamed: 0,f1-score,ROC-AUC,recall,precision
CatBoost,0.680901,0.776846,0.61431,0.763684


### Задание 4. Разделение данных на P и U

Представим, что нам неизвестны негативы и часть позитивов

In [211]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1961/7841 as positives and unlabeling the rest


In [212]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    30600
 1     1961
Name: class_test, dtype: int64


Таким образом, у нас получился 1961 объект (из 7841), отмеченный как P (positive), и 30600 объектов U (unlabeled).

In [213]:
mod_data.head(10)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,class_test
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0,-1
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0,-1
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0,-1
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0,-1
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0,-1
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,0,-1
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,0,-1
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,1,-1
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,1,-1
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,1,1


In [214]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### Задание 5. Random negative sampling

In [215]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1961, 16) (1961, 16)


In [216]:
model = CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True)
model.fit(sample_train.iloc[:,:-2], 
          sample_train.iloc[:,-1], cat_features)

<catboost.core.CatBoostClassifier at 0x7fd2c80f7370>

In [217]:
y_predict

array([0, 0, 0, ..., 1, 0, 0])

In [218]:
y_predict = model.predict(sample_test.iloc[:,:-2])

Заменим в y_predict -1 на 0, чтобы можно было сравнивать предсказанные значения с известными.

In [219]:
y_predict = np.where(y_predict == -1, 0, y_predict)

In [220]:
metrics.loc['Random negative sampling_25%'] = evaluate_results(sample_test.iloc[:,-2], y_predict)

f1: 62.76%
roc: 82.52%
recall: 86.04%
precision: 49.40%


### Задание 6. Сравнение результатов

In [221]:
metrics

Unnamed: 0,f1-score,ROC-AUC,recall,precision
CatBoost,0.680901,0.776846,0.61431,0.763684
Random negative sampling_25%,0.627615,0.82522,0.860436,0.493957


Видим, что CatBoost дал существенно более высокие результаты. Впрочем, это ожидаемо, учитывая, что во втором случае мы намеренно "утеряли" часть полезных данных.

### Задание 7. Подбор доли P

##### Попробуем повторить шаги 4-6, взяв для P не 25% исходного класса положительных элементов, а 35%.

In [222]:
mod_data = data.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 35% of the positives marked
pos_sample_len = int(np.ceil(0.35 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 2745/7841 as positives and unlabeling the rest


In [223]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    29816
 1     2745
Name: class_test, dtype: int64


In [224]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [225]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(2745, 16) (2745, 16)


In [226]:
model = CatBoostClassifier(iterations=20, thread_count=2, random_state=42, silent=True)
model.fit(sample_train.iloc[:,:-2], 
          sample_train.iloc[:,-1], cat_features)

<catboost.core.CatBoostClassifier at 0x7fd2c80f7940>

In [227]:
y_predict = model.predict(sample_test.iloc[:,:-2])

In [228]:
y_predict = np.where(y_predict == -1, 0, y_predict)

In [229]:
metrics.loc['Random negative sampling_35%'] = evaluate_results(sample_test.iloc[:,-2], y_predict)

f1: 59.15%
roc: 82.01%
recall: 85.14%
precision: 45.32%


In [230]:
metrics

Unnamed: 0,f1-score,ROC-AUC,recall,precision
CatBoost,0.680901,0.776846,0.61431,0.763684
Random negative sampling_25%,0.627615,0.82522,0.860436,0.493957
Random negative sampling_35%,0.591511,0.820104,0.851386,0.453183


Видим, что при изменении объема выборки P качество почти не изменилось, даже немного ухудшилось.