# Урок 6. Задача look-alike
1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
2. сделать feature engineering
3. обучить любой классификатор (какой вам нравится)
4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
5. применить random negative sampling для построения классификатора в новых условиях
6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [78]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, RobustScaler

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import confusion_matrix, f1_score, roc_auc_score, \
                            precision_score, classification_report, precision_recall_curve

import time

In [2]:
def corr_matrix(data, features):
    corr_matrix = np.round(data.loc[:, features].corr(), 2)
    corr_matrix[np.abs(corr_matrix) < 0.1] = 0
    
    plt.figure(figsize = (9,7))
    sns.heatmap(corr_matrix, annot=True, linewidths=.5, cmap='GnBu')
    plt.title('Correlation matrix')
    plt.show()

# EDA

In [3]:
DATA_PATH = "./materials/train.csv"

In [4]:
df = pd.read_csv(DATA_PATH)
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income_>50K
0,67,Private,366425,Doctorate,16,Divorced,Exec-managerial,Not-in-family,White,Male,99999,0,60,United-States,1
1,17,Private,244602,12th,8,Never-married,Other-service,Own-child,White,Male,0,0,15,United-States,0
2,31,Private,174201,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1


In [5]:
df.dropna(inplace=True)

In [6]:
X = df[['age', 'workclass', 'fnlwgt', 'education', 'educational-num',
       'marital-status', 'occupation', 'relationship', 'race', 'gender',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country']]
y = df['income_>50K']

In [7]:
num_features = ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']

oe_features = ['education']
education_prio = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', '10th', '11th', '12th', \
                  'HS-grad', 'Prof-school', 'Assoc-acdm', 'Assoc-voc', \
                  'Some-college', 'Bachelors', 'Masters', 'Doctorate']

ohe_features = ['workclass', 'marital-status', 'occupation', 'relationship', \
                'race', 'gender', 'native-country']

# In the next homeworks I'll work with nans and will try to evaluate usefullness of the features
columns_with_nan = ['workclass', 'occupation', 'native-country']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

#### Создаем Pipeline

In [9]:
rs = RobustScaler()
oe = OrdinalEncoder(categories=[education_prio])
ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

ct = make_column_transformer((rs, num_features),
                             (oe, oe_features),
                             (ohe, ohe_features),
                             remainder='passthrough')


gbc = GradientBoostingClassifier(random_state=42)


pipe = make_pipeline(ct, gbc)

#### Анализируем данные, после трансформации

In [10]:
df_trans = ct.fit_transform(df)
df_trans = pd.DataFrame(df_trans)
df_trans

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
0,1.578947,1.557604,99999.0,0.0,4.0,15.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,16.0,1.0
1,-1.052632,0.549052,0.0,0.0,-5.0,7.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0
2,-0.315789,-0.033786,0.0,0.0,0.0,13.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,1.0
3,1.105263,-0.563648,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0
4,-0.631579,-0.240368,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40722,0.789474,-0.904876,0.0,0.0,2.0,13.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,1.0
40723,-0.947368,-0.510969,0.0,0.0,0.0,8.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,0.0
40724,-0.368421,0.162803,0.0,0.0,3.6,12.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,10.0,0.0
40725,0.473684,-0.665610,0.0,0.0,-1.0,13.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,13.0,0.0


#### При трансформации имена колонок становятся не явными. Для того, чтобы разобраться что произошло, нужны следующие 2 ячейки

In [11]:
ct.transformers_

[('robustscaler',
  RobustScaler(),
  ['age', 'fnlwgt', 'capital-gain', 'capital-loss', 'hours-per-week']),
 ('ordinalencoder',
  OrdinalEncoder(categories=[['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th',
                              '10th', '11th', '12th', 'HS-grad', 'Prof-school',
                              'Assoc-acdm', 'Assoc-voc', 'Some-college',
                              'Bachelors', 'Masters', 'Doctorate']]),
  ['education']),
 ('onehotencoder',
  OneHotEncoder(handle_unknown='ignore', sparse=False),
  ['workclass',
   'marital-status',
   'occupation',
   'relationship',
   'race',
   'gender',
   'native-country']),
 ('remainder', 'passthrough', [4, 14])]

In [12]:
ct.transformers_[2][1].get_feature_names()

array(['x0_Federal-gov', 'x0_Local-gov', 'x0_Private', 'x0_Self-emp-inc',
       'x0_Self-emp-not-inc', 'x0_State-gov', 'x0_Without-pay',
       'x1_Divorced', 'x1_Married-AF-spouse', 'x1_Married-civ-spouse',
       'x1_Married-spouse-absent', 'x1_Never-married', 'x1_Separated',
       'x1_Widowed', 'x2_Adm-clerical', 'x2_Armed-Forces',
       'x2_Craft-repair', 'x2_Exec-managerial', 'x2_Farming-fishing',
       'x2_Handlers-cleaners', 'x2_Machine-op-inspct', 'x2_Other-service',
       'x2_Priv-house-serv', 'x2_Prof-specialty', 'x2_Protective-serv',
       'x2_Sales', 'x2_Tech-support', 'x2_Transport-moving', 'x3_Husband',
       'x3_Not-in-family', 'x3_Other-relative', 'x3_Own-child',
       'x3_Unmarried', 'x3_Wife', 'x4_Amer-Indian-Eskimo',
       'x4_Asian-Pac-Islander', 'x4_Black', 'x4_Other', 'x4_White',
       'x5_Female', 'x5_Male', 'x6_Cambodia', 'x6_Canada', 'x6_China',
       'x6_Columbia', 'x6_Cuba', 'x6_Dominican-Republic', 'x6_Ecuador',
       'x6_El-Salvador', 'x6_Englan

#### При таком кол-ве фичей строить матрицу корреляции бесполезно, поэтому посмотрим только на корелляцию фичей с таргетом

In [13]:
corr_matrix = np.round(df_trans.loc[:, df_trans.columns].corr(), 2)
corr_matrix[np.abs(corr_matrix) < 0.1] = 0
corr_matrix = pd.DataFrame(corr_matrix)

corr_matrix[89].loc[corr_matrix[89] != 0]

0     0.24
2     0.22
3     0.15
4     0.23
5     0.26
8    -0.12
9     0.14
13   -0.13
15    0.45
17   -0.32
20   -0.10
23    0.21
27   -0.17
29    0.18
34    0.40
35   -0.20
37   -0.22
38   -0.15
39    0.12
45   -0.22
46    0.22
88    0.33
89    1.00
Name: 89, dtype: float64

Из-за трансформатора OHE + OE, цифровые колонки стали колонками 0-4, включительно. Колонка 5 - образование. Колонка 88 - наш таргет
  
Видим, что больше всего на таргет влияет:  
  
0  'age' (0.24),  
2  'capital-gain' (0.22),   
3  'capital-loss' (0.15),  
4  'hours-per-week' (0.23),  
5  'education' (0.26),
  
13 'x1_Divorced' (-0.13),  
15 'x1_Married-civ-spouse' (0.45),   
17 'x1_Never-married' (-0.32),  
  
20 'x2_Adm-clerical'  (-0.10),  
23 'x2_Exec-managerial' (0.21),  
27 'x2_Other-service' (-0.17),  
29 'x2_Prof-specialty' (0.18),  
  
34 'x3_Husband' (0.40),  
35 'x3_Not-in-family' (-0.20),  
37 'x3_Own-child' (-0.22),  
38 'x3_Unmarried' (-0.15),  
39 'x3_Wife' (0.12),  
  
45 'x5_Female' (-0.22),   
46 'x5_Male' (0.22)  
  
Колонка 88 - это тоже education, только проставленное в изначальных данных, не сразу это увидел и сделал свою версию через OrdinalEncoder :)  

***
Чуть ниже я дропну колонки, которые вносят меньше 0.1 коэффициента корреляции. Возможно стоит повысить этот трешолд

# Try GradientBoosting Perfimance on the Data

In [14]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('robustscaler',
                                                  RobustScaler(),
                                                  ['age', 'fnlwgt',
                                                   'capital-gain',
                                                   'capital-loss',
                                                   'hours-per-week']),
                                                 ('ordinalencoder',
                                                  OrdinalEncoder(categories=[['Preschool',
                                                                              '1st-4th',
                                                                              '5th-6th',
                                                                              '7th-8th',
                                                                       

In [15]:
cv_score = cross_val_score(pipe, X, y, cv=3, scoring='f1_weighted')
cv_score.mean()

0.8571297575629545

#### Quick data cleaning after correlation analysis

In [16]:
columns_to_drop = ['fnlwgt', 'educational-num', 'workclass', 'race', 'native-country', 'gender']
df_new = df.drop(columns=columns_to_drop)
df_new

Unnamed: 0,age,education,marital-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,income_>50K
0,67,Doctorate,Divorced,Exec-managerial,Not-in-family,99999,0,60,1
1,17,12th,Never-married,Other-service,Own-child,0,0,15,0
2,31,Bachelors,Married-civ-spouse,Exec-managerial,Husband,0,0,40,1
3,58,7th-8th,Married-civ-spouse,Transport-moving,Husband,0,0,40,0
4,25,Some-college,Never-married,Other-service,Not-in-family,0,0,40,0
...,...,...,...,...,...,...,...,...,...
43952,52,Bachelors,Married-civ-spouse,Exec-managerial,Husband,0,0,50,1
43953,19,HS-grad,Never-married,Other-service,Own-child,0,0,40,0
43954,30,Some-college,Divorced,Sales,Not-in-family,0,0,58,0
43955,46,Bachelors,Never-married,Sales,Not-in-family,0,0,35,0


In [17]:
X_new = df_new.loc[:, df_new.columns[:-1]]
y_new = df_new['income_>50K']

#### Quick tuning of the hyperparameters 

In [34]:
ct = make_column_transformer((rs, ['age', 'capital-gain', 'capital-loss', 'hours-per-week']),
                             (oe, ['education']),
                             (ohe, ['marital-status', 'occupation', 'relationship']),
                             remainder='passthrough')

pipe = make_pipeline(ct, gbc)

params={'gradientboostingclassifier__learning_rate':[0.1, 0.05, 0.01],
        'gradientboostingclassifier__n_estimators':[200],
        'gradientboostingclassifier__min_samples_leaf':[1, 5, 10],
        'gradientboostingclassifier__max_depth':[3, 5, 7]
        }

grid = GridSearchCV(pipe,
                    param_grid=params,
                    cv=2,
                    refit=False,
#                     verbose=20
                   )

t1 = time.time()
search = grid.fit(X_new, y_new)
t2 = time.time()

results = search.best_params_

print(f'Model tuning took {t2-t1} sec\n\nResults:')
for parameter in results:
    print(f'{parameter}: {results[parameter]}')

Model tuning took 298.65485644340515 sec

Results:
gradientboostingclassifier__learning_rate: 0.05
gradientboostingclassifier__max_depth: 5
gradientboostingclassifier__min_samples_leaf: 1
gradientboostingclassifier__n_estimators: 200


In [35]:
gbc = GradientBoostingClassifier(random_state=42, learning_rate=0.05, n_estimators=200, min_samples_leaf=1, max_depth=5)

pipe = make_pipeline(ct, gbc)

cv_score = cross_val_score(pipe, X_new, y_new, cv=3, scoring='f1_weighted')
cv_score.mean()

0.861915322584994

In [36]:
final_results = {}
final_results['original_data'] = cv_score.mean()

# PU Learning

### Mocking the Data (50% of positive samples leaves)

In [90]:
pos_sample_len = int(np.ceil(0.5 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')

pos_sample = pos_ind[:pos_sample_len]
df_alt['class_test'] = 0
df_alt.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', df_alt.iloc[:,-1].value_counts())

Using 5046/10092 as positives and unlabeling the rest
target variable:
 0    35681
1     5046
Name: class_test, dtype: int64


In [91]:
X_alt = df_alt.iloc[:,:-2]
y_alt = df_alt.iloc[:,-1]
y_orig = df_alt.iloc[:,-2]

In [92]:
X_train, X_test, y_train, y_test, y_train_orig, y_test_orig = train_test_split(X_alt, y_alt, y_orig, test_size=0.2, random_state=42)

In [93]:
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)

#### Evaluate the model

In [94]:
f1_score_alt = f1_score(y_test, preds, average='weighted')
print(f'Score of how could the model predict new labels: {f1_score_alt}')

f1_score_25 = f1_score(y_test_orig, preds, average='weighted')
print(f'Model score on original data (with real labels): {f1_score_25}')
final_results['.5_pos'] = f1_score_25

Score of how could the model predict new labels: 0.8378135538928996
Model score on original data (with real labels): 0.7138940283707828


### Mocking the Data (25% of positive samples leaves)

In [100]:
df_alt = df_new.copy()

pos_ind = df_alt.loc[df_alt['income_>50K'] == 1].index.tolist()
np.random.shuffle(pos_ind)

pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')

pos_sample = pos_ind[:pos_sample_len]
df_alt['class_test'] = 0
df_alt.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', df_alt.iloc[:,-1].value_counts())

Using 2523/10092 as positives and unlabeling the rest
target variable:
 0    38204
1     2523
Name: class_test, dtype: int64


In [101]:
X_alt = df_alt.iloc[:,:-2]
y_alt = df_alt.iloc[:,-1]
y_orig = df_alt.iloc[:,-2]

In [102]:
X_train, X_test, y_train, y_test, y_train_orig, y_test_orig = train_test_split(X_alt, y_alt, y_orig, test_size=0.2, random_state=42)

In [103]:
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)

#### Evaluate the model

In [104]:
f1_score_alt = f1_score(y_test, preds, average='weighted')
print(f'Score of how could the model predict new labels: {f1_score_alt}')

f1_score_25 = f1_score(y_test_orig, preds, average='weighted')
print(f'Model score on original data (with real labels): {f1_score_25}')
final_results['.25_pos'] = f1_score_25

Score of how could the model predict new labels: 0.9087651929150573
Model score on original data (with real labels): 0.6495625527961028


### Mocking the Data (10% of positive samples leaves)

In [95]:
pos_sample_len = int(np.ceil(0.1 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')

pos_sample = pos_ind[:pos_sample_len]
df_alt['class_test'] = 0
df_alt.loc[pos_sample, 'class_test'] = 1
print('target variable:\n', df_alt.iloc[:,-1].value_counts())

Using 1010/10092 as positives and unlabeling the rest
target variable:
 0    39717
1     1010
Name: class_test, dtype: int64


In [96]:
X_alt = df_alt.iloc[:,:-2]
y_alt = df_alt.iloc[:,-1]
y_orig = df_alt.iloc[:,-2]

In [97]:
X_train, X_test, y_train, y_test, y_train_orig, y_test_orig = train_test_split(X_alt, y_alt, y_orig, test_size=0.2, random_state=42)

In [98]:
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)

In [99]:
f1_score_alt = f1_score(y_test, preds, average='weighted')
print(f'Score of how could the model predict new labels: {f1_score_alt}')

f1_score_25 = f1_score(y_test_orig, preds, average='weighted')
print(f'Model score on original data (with real labels): {f1_score_25}')
final_results['.1_pos'] = f1_score_25

Score of how could the model predict new labels: 0.9579693018355241
Model score on original data (with real labels): 0.6478880961721948


***
# Conclusions:

In [105]:
final_results

{'original_data': 0.861915322584994,
 '.25_pos': 0.6495625527961028,
 '.5_pos': 0.7138940283707828,
 '.1_pos': 0.6478880961721948}

Метод random negative sampling нужен тогда, когда у нас много данных без лейбла. И смотреть на результаты моделей надо с этой перспективы  
  
Мы смоделировали ситуацию, где "подселили" в выборку "неразмеченных" (0) данных, данные, для которых размечен наш таргет, и с этой точки зрения получился очень даже неплохой результат  
  
Вместо оригинального F1 - 0.86, оставив всего 10% размеченных таргетом данных, мы получили F1 - 0.65  
  
Естественно методы 2-step approach и Spy выглядят более надежно в теории. И в будущем предпочтительно попробовать использовать их