# GeekBrains
## Машинное обучение в бизнесе
## ДЗ Урока 6 (Задача look-alike)
## Виталий Казанцев

### Домашнее задание

1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)
3. сделать feature engineering
4. обучить любой классификатор (какой вам нравится)
5. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть
6. применить random negative sampling для построения классификатора в новых условиях
7. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)
8. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score

__Задание 1__

# HR Analytics: Job Change of Data Scientists
https://www.kaggle.com/datasets/arashnic/hr-analytics-job-change-of-data-scientists

In [2]:
df = pd.read_csv('../data/aug_train.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19158 entries, 0 to 19157
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   enrollee_id             19158 non-null  int64  
 1   city                    19158 non-null  object 
 2   city_development_index  19158 non-null  float64
 3   gender                  14650 non-null  object 
 4   relevent_experience     19158 non-null  object 
 5   enrolled_university     18772 non-null  object 
 6   education_level         18698 non-null  object 
 7   major_discipline        16345 non-null  object 
 8   experience              19093 non-null  object 
 9   company_size            13220 non-null  object 
 10  company_type            13018 non-null  object 
 11  last_new_job            18735 non-null  object 
 12  training_hours          19158 non-null  int64  
 13  target                  19158 non-null  float64
dtypes: float64(2), int64(2), object(10)
me

In [3]:
df.shape

(19158, 14)

In [4]:
df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,,1,36,1.0
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0.0
2,11561,city_21,0.624,,No relevent experience,Full time course,Graduate,STEM,5,,,never,83,0.0
3,33241,city_115,0.789,,No relevent experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1.0
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0.0


__Задание 2__

In [5]:
df['target'] = df['target'].astype(np.int8)

In [6]:
for col in df.select_dtypes('object').columns:
    df[col] = df[col].fillna(df[col].value_counts().index[0])

__Задание 3__

In [7]:
def quality_model(model, X_train, y_train, X_test, y_test):
    """
    Обучить и оценить модель.
    """
    model = CatBoostClassifier(cat_features=cat_feats)
    model.fit(X_train, y_train, verbose=False)
    y_pred = model.predict(X_test)
    
    f1 = f1_score(y_test, y_pred)
    roc = roc_auc_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, average='binary')
    rec = recall_score(y_test, y_pred, average='binary')
    
    return {'f1': [f1], 'roc-auc': [roc], 'precision': [prec], 'recall': [rec]}

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['target']), df['target'], test_size=0.2, random_state=42)

In [9]:
cat_feats = ['city', 'gender', 'relevent_experience', 'enrolled_university', 'education_level',
             'major_discipline', 'experience', 'company_size', 'company_type', 'last_new_job']

In [10]:
metrics = pd.DataFrame(quality_model(CatBoostClassifier(cat_features=cat_feats),
                                      X_train,
                                      y_train,
                                      X_test,
                                      y_test))


In [11]:
metrics

Unnamed: 0,f1,roc-auc,precision,recall
0,0.510812,0.673614,0.575758,0.459034


In [12]:
def create_unlabeled(df, pos_frac=0.2):
    """
    Сэмплирует долю pos_frac наблюдений класса 1 как positive, остальные как unlabeled.
    """
    pos_mask = (df['target'] == 1)
    pos_ind = df[pos_mask].sample(frac=pos_frac).index
    unlab_ind = df[~df.index.isin(pos_ind)].index
    
    # Помечаем данные признаком is_labeled - Positive = 1, Unlabeled = 0
    df.loc[pos_ind, 'is_labeled'] = 1
    df.loc[unlab_ind, 'is_labeled'] = 0
    df['is_labeled'] = df['is_labeled'].astype(int)
    return df

In [13]:
rns_df = create_unlabeled(df, pos_frac=0.2)
rns_df.head()

Unnamed: 0,enrollee_id,city,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,target,is_labeled
0,8949,city_103,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,50-99,Pvt Ltd,1,36,1,1
1,29725,city_40,0.776,Male,No relevent experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0,0
2,11561,city_21,0.624,Male,No relevent experience,Full time course,Graduate,STEM,5,50-99,Pvt Ltd,never,83,0,0
3,33241,city_115,0.789,Male,No relevent experience,no_enrollment,Graduate,Business Degree,<1,50-99,Pvt Ltd,never,52,1,1
4,666,city_162,0.767,Male,Has relevent experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0,0


__Задание 5__

In [14]:
def get_rns_samples(rns_df):
    """
    Создает тренировочную и тестовую выборки для RNS на основе признака is_labeled.
    """
    rns_df = rns_df.sample(frac=1)

    pos_sample = rns_df[rns_df['is_labeled'] == 1]
    neg_sample = rns_df[rns_df['is_labeled'] == 0][:pos_sample.shape[0]]
    train_samples = pd.concat([neg_sample, pos_sample]).sample(frac=1)
    test_samples = rns_df[rns_df['is_labeled'] == 0][pos_sample.shape[0]:]
    
    return train_samples, test_samples

In [15]:
train_samples, test_samples = get_rns_samples(rns_df)

In [16]:
metrics_task5 = quality_model(CatBoostClassifier(cat_features=cat_feats),
                             train_samples.iloc[:, :-2],
                             train_samples['is_labeled'],
                             test_samples.iloc[:, :-2],
                             test_samples['target'])

In [17]:
metrics = metrics.append(pd.DataFrame(metrics_task5))
metrics.index = ['normal', 'RNS']
metrics

Unnamed: 0,f1,roc-auc,precision,recall
normal,0.510812,0.673614,0.575758,0.459034
RNS,0.53574,0.732966,0.423915,0.727701


In [18]:
rns_metrics = pd.DataFrame(columns=['f1', 'roc-auc', 'precision', 'recall'])

fracs = np.linspace(0.1, 0.9, 9)
for frac in fracs:
    train_samples, test_samples = get_rns_samples(create_unlabeled(df, pos_frac=frac))
    frac_metrics = quality_model(CatBoostClassifier(cat_features=cat_feats),
                                 train_samples.iloc[:, :-2],
                                 train_samples['is_labeled'],
                                 test_samples.iloc[:, :-2],
                                 test_samples['target'])
    rns_metrics = rns_metrics.append(pd.DataFrame(frac_metrics))

In [19]:
rns_metrics.index = fracs
rns_metrics

Unnamed: 0,f1,roc-auc,precision,recall
0.1,0.527744,0.701156,0.459993,0.6189
0.2,0.538991,0.734759,0.428595,0.725989
0.3,0.497887,0.723715,0.379129,0.724976
0.4,0.483696,0.735138,0.362139,0.728087
0.5,0.447618,0.740399,0.319734,0.745993
0.6,0.393118,0.729862,0.270833,0.71673
0.7,0.34449,0.74928,0.222927,0.75763
0.8,0.26172,0.733853,0.159829,0.721992
0.9,0.162094,0.754175,0.090717,0.760355
