In [1]:
import pandas as pd; pd.set_option('display.max_columns', None)
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import f1_score, roc_auc_score, precision_score, classification_report, precision_recall_curve, confusion_matrix, log_loss
import catboost as catb
import xgboost as xgb
import lightgbm as lgb

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

from sklearn.base import BaseEstimator, TransformerMixin

from scipy.sparse import hstack
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.pipeline import FeatureUnion

%matplotlib inline

In [2]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.key]
    
class NumberSelector(BaseEstimator, TransformerMixin):
    """
    Transformer to select a single column from the data frame to perform additional transformations on
    Use on numeric columns in the data
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[[self.key]]
    
class OHEEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        self.columns = []

    def fit(self, X, y=None):
        self.columns = [col for col in pd.get_dummies(X, prefix=self.key).columns]
        return self

    def transform(self, X):
        X = pd.get_dummies(X, prefix=self.key)
        test_columns = [col for col in X.columns]
        for col_ in test_columns:
            if col_ not in self.columns:
                X[col_] = 0
        return X[self.columns]

### 1. взять любой набор данных для бинарной классификации (можно скачать один из модельных с https://archive.ics.uci.edu/ml/datasets.php)

### http://archive.ics.uci.edu/ml/datasets/in-vehicle+coupon+recommendation

In [3]:
df = pd.read_csv('./data/in-vehicle-coupon-recommendation.csv')

### 2. сделать feature engineering

In [4]:
df.head()

Unnamed: 0,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,maritalStatus,has_children,education,occupation,income,car,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,No Urgent Place,Alone,Sunny,55,2PM,Restaurant(<20),1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,0,0,0,1,1
1,No Urgent Place,Friend(s),Sunny,80,10AM,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,0,0,0,1,0
2,No Urgent Place,Friend(s),Sunny,80,10AM,Carry out & Take away,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,1
3,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,2h,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,0
4,No Urgent Place,Friend(s),Sunny,80,2PM,Coffee House,1d,Female,21,Unmarried partner,1,Some college - no degree,Unemployed,$37500 - $49999,,never,never,,4~8,1~3,1,1,0,0,1,0


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12684 entries, 0 to 12683
Data columns (total 26 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   destination           12684 non-null  object
 1   passanger             12684 non-null  object
 2   weather               12684 non-null  object
 3   temperature           12684 non-null  int64 
 4   time                  12684 non-null  object
 5   coupon                12684 non-null  object
 6   expiration            12684 non-null  object
 7   gender                12684 non-null  object
 8   age                   12684 non-null  object
 9   maritalStatus         12684 non-null  object
 10  has_children          12684 non-null  int64 
 11  education             12684 non-null  object
 12  occupation            12684 non-null  object
 13  income                12684 non-null  object
 14  car                   108 non-null    object
 15  Bar                   12577 non-null

In [6]:
droped_cols = ['occupation', 'car']
df = df.drop(droped_cols,1)

In [7]:
df = df.rename(columns={df.columns[-1]: "target"})
df['target'].value_counts()

1    7210
0    5474
Name: target, dtype: int64

In [8]:
continuous_columns = df.select_dtypes(include=[np.number]).drop('target',1)
print(f"count of numeric_features {continuous_columns.shape[1]}")
continuous_columns = continuous_columns.columns.to_list()
continuous_columns

count of numeric_features 7


['temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp']

In [9]:
cat_feature_num = [
    feature for feature in continuous_columns
    if len(df[feature].unique())<20
]
categorical_columns = df.select_dtypes(include=[np.object]).columns.to_list()
categorical_columns = list(categorical_columns + cat_feature_num)
categorical_columns

['destination',
 'passanger',
 'weather',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'education',
 'income',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50',
 'temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp']

In [10]:
result = []

In [11]:
final_transformers = list()
    
for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))

In [12]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [13]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('target', 1), 
                                                    df['target'], test_size=0.3, random_state=0)

### 3. обучить любой классификатор (какой вам нравится) и посчитать метрики качества (roc auc, pr/rec/f1, logloss)

In [14]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', lgb.LGBMClassifier(random_state = 42)),
])


#запустим кросс-валидацию
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=10 ,scoring='roc_auc')
cv_score = np.mean(cv_scores)
cv_score_std = np.std(cv_scores)
print('CV score is {}+-{}'.format(cv_score, cv_score_std))

#обучим пайплайн на всем тренировочном датасете
pipeline.fit(X_train, y_train)
y_predict = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)[:, 1]

CV score is 0.8173681906203882+-0.013899731640974255


In [15]:
precision, recall, thresholds = precision_recall_curve(y_test, y_score)


fscore = (2 * precision * recall) / (precision + recall)
roc = roc_auc_score(y_test, y_predict)
log_los = log_loss(y_test, y_score)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Roc_auc=%.3f, Log_loss=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        roc,
                                                                        log_los,
                                                                        precision[ix],
                                                                        recall[ix]
                                                                                     ))
result.append({"method":"light_gbm_normal" ,"roc_auc" : roc, "fscore" : fscore[ix],
               "precision" : precision[ix], "recall" : recall[ix],
               "log_los" : log_los
              })



Best Threshold=0.453093, F-Score=0.789, Roc_auc=0.735, Log_loss=0.516, Precision=0.726, Recall=0.864


### 4. далее разделить ваш набор данных на два множества: P (positives) и U (unlabeled). Причем брать нужно не все положительные (класс 1) примеры, а только лишь часть

In [16]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.25 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 1803/7210 as positives and unlabeling the rest


In [17]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    10881
 1     1803
Name: class_test, dtype: int64


In [18]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

### 5. применить random negative sampling для построения классификатора в новых условиях

In [19]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(1803, 25) (1803, 25)


In [20]:
continuous_columns = sample_train.select_dtypes(include=[np.number]).drop('target',1)
print(f"count of numeric_features {continuous_columns.shape[1]}")
continuous_columns = continuous_columns.columns.to_list()
continuous_columns = continuous_columns[:-1]
continuous_columns

count of numeric_features 8


['temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp']

In [21]:
cat_feature_num = [
    feature for feature in continuous_columns
    if len(sample_train[feature].unique())<20
]
categorical_columns = sample_train.select_dtypes(include=[np.object]).columns.to_list()
categorical_columns = list(categorical_columns + cat_feature_num)
categorical_columns = categorical_columns[:-1]
categorical_columns

['destination',
 'passanger',
 'weather',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'education',
 'income',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50',
 'temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same']

In [22]:
final_transformers = list()
    
for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))

In [23]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [24]:
droped_colums = ['target', 'class_test']

In [25]:
X_train, X_test, y_train, y_test = train_test_split(sample_train.drop(['target','class_test'], 1), 
                                                    sample_train[droped_colums], test_size=0.3, random_state=0)

In [26]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', lgb.LGBMClassifier(random_state = 42)),
])

#обучим пайплайн на всем тренировочном датасете
pipeline.fit(X_train, y_train['class_test'])
y_predict = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)[:, 1]

In [27]:
precision, recall, thresholds = precision_recall_curve(y_test['target'], y_score)

fscore = (2 * precision * recall) / (precision + recall)
roc = roc_auc_score(y_test['target'], y_predict)
log_los = log_loss(y_test['target'], y_score)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Roc_auc=%.3f, Log_loss=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        roc,
                                                                        log_los,
                                                                        precision[ix],
                                                                        recall[ix]
                                                                                     ))

result.append({"method":"light_gbm_PU_0.25" ,"roc_auc" : roc, "fscore" : fscore[ix],
               "precision" : precision[ix], "recall" : recall[ix],
               "log_los" : log_los
              })

Best Threshold=0.167189, F-Score=0.868, Roc_auc=0.652, Log_loss=0.651, Precision=0.779, Recall=0.979


### 6. сравнить качество с решением из пункта 4 (построить отчет - таблицу метрик)

In [28]:
models_results = pd.DataFrame(result)
pd.pivot_table(models_results, columns = 'method').reset_index()

method,index,light_gbm_PU_0.25,light_gbm_normal
0,fscore,0.8678,0.788745
1,log_los,0.651368,0.515703
2,precision,0.77931,0.725896
3,recall,0.97896,0.863507
4,roc_auc,0.651726,0.734661


### 7. поэкспериментировать с долей P на шаге 5 (как будет меняться качество модели при уменьшении/увеличении размера P)

PU__0.10

In [29]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.10 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 721/7210 as positives and unlabeling the rest


In [30]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    11963
 1      721
Name: class_test, dtype: int64


In [31]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [32]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(721, 25) (721, 25)


In [33]:
continuous_columns = sample_train.select_dtypes(include=[np.number]).drop('target',1)
print(f"count of numeric_features {continuous_columns.shape[1]}")
continuous_columns = continuous_columns.columns.to_list()
continuous_columns = continuous_columns[:-1]
continuous_columns

count of numeric_features 8


['temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp']

In [34]:
cat_feature_num = [
    feature for feature in continuous_columns
    if len(sample_train[feature].unique())<20
]
categorical_columns = sample_train.select_dtypes(include=[np.object]).columns.to_list()
categorical_columns = list(categorical_columns + cat_feature_num)
categorical_columns = categorical_columns[:-1]
categorical_columns

['destination',
 'passanger',
 'weather',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'education',
 'income',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50',
 'temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same']

In [35]:
final_transformers = list()
    
for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))

In [36]:
feats = FeatureUnion(final_transformers)

feature_processing = Pipeline([('feats', feats)])

In [37]:
droped_colums = ['target', 'class_test']

In [38]:
X_train, X_test, y_train, y_test = train_test_split(sample_train.drop(['target','class_test'], 1), 
                                                    sample_train[droped_colums], test_size=0.3, random_state=0)

In [39]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', lgb.LGBMClassifier(random_state = 42)),
])

#обучим пайплайн на всем тренировочном датасете
pipeline.fit(X_train, y_train['class_test'])
y_predict = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)[:, 1]

In [40]:
precision, recall, thresholds = precision_recall_curve(y_test['target'], y_score)

fscore = (2 * precision * recall) / (precision + recall)
roc = roc_auc_score(y_test['target'], y_predict)
log_los = log_loss(y_test['target'], y_score)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Roc_auc=%.3f, Log_loss=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        roc,
                                                                        log_los,
                                                                        precision[ix],
                                                                        recall[ix]
                                                                                     ))

result.append({"method":"light_gbm_PU_0.10" ,"roc_auc" : roc, "fscore" : fscore[ix],
               "precision" : precision[ix], "recall" : recall[ix],
               "log_los" : log_los
              })

Best Threshold=0.008156, F-Score=0.880, Roc_auc=0.545, Log_loss=0.849, Precision=0.785, Recall=1.000


PU__0.40

In [41]:
mod_data = df.copy()
#get the indices of the positives samples
pos_ind = np.where(mod_data.iloc[:,-1].values == 1)[0]
#shuffle them
np.random.shuffle(pos_ind)
# leave just 25% of the positives marked
pos_sample_len = int(np.ceil(0.40 * len(pos_ind)))
print(f'Using {pos_sample_len}/{len(pos_ind)} as positives and unlabeling the rest')
pos_sample = pos_ind[:pos_sample_len]

Using 2884/7210 as positives and unlabeling the rest


In [42]:
mod_data['class_test'] = -1
mod_data.loc[pos_sample,'class_test'] = 1
print('target variable:\n', mod_data.iloc[:,-1].value_counts())

target variable:
 -1    9800
 1    2884
Name: class_test, dtype: int64


In [43]:
x_data = mod_data.iloc[:,:-2].values # just the X 
y_labeled = mod_data.iloc[:,-1].values # new class (just the P & U)
y_positive = mod_data.iloc[:,-2].values # original class

In [44]:
mod_data = mod_data.sample(frac=1)
neg_sample = mod_data[mod_data['class_test']==-1][:len(mod_data[mod_data['class_test']==1])]
sample_test = mod_data[mod_data['class_test']==-1][len(mod_data[mod_data['class_test']==1]):]
pos_sample = mod_data[mod_data['class_test']==1]
print(neg_sample.shape, pos_sample.shape)
sample_train = pd.concat([neg_sample, pos_sample]).sample(frac=1)

(2884, 25) (2884, 25)


In [45]:
continuous_columns = sample_train.select_dtypes(include=[np.number]).drop('target',1)
print(f"count of numeric_features {continuous_columns.shape[1]}")
continuous_columns = continuous_columns.columns.to_list()
continuous_columns = continuous_columns[:-1]
continuous_columns

count of numeric_features 8


['temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same',
 'direction_opp']

In [46]:
cat_feature_num = [
    feature for feature in continuous_columns
    if len(sample_train[feature].unique())<20
]
categorical_columns = sample_train.select_dtypes(include=[np.object]).columns.to_list()
categorical_columns = list(categorical_columns + cat_feature_num)
categorical_columns = categorical_columns[:-1]
categorical_columns

['destination',
 'passanger',
 'weather',
 'time',
 'coupon',
 'expiration',
 'gender',
 'age',
 'maritalStatus',
 'education',
 'income',
 'Bar',
 'CoffeeHouse',
 'CarryAway',
 'RestaurantLessThan20',
 'Restaurant20To50',
 'temperature',
 'has_children',
 'toCoupon_GEQ5min',
 'toCoupon_GEQ15min',
 'toCoupon_GEQ25min',
 'direction_same']

In [47]:
final_transformers = list()
    
for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))

In [48]:
final_transformers = list()
    
for cat_col in categorical_columns:
    cat_transformer = Pipeline([
                ('selector', ColumnSelector(key=cat_col)),
                ('ohe', OHEEncoder(key=cat_col))
            ])
    final_transformers.append((cat_col, cat_transformer))

In [49]:
droped_colums = ['target', 'class_test']

In [50]:
X_train, X_test, y_train, y_test = train_test_split(sample_train.drop(['target','class_test'], 1), 
                                                    sample_train[droped_colums], test_size=0.3, random_state=0)

In [51]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', lgb.LGBMClassifier(random_state = 42)),
])

#обучим пайплайн на всем тренировочном датасете
pipeline.fit(X_train, y_train['class_test'])
y_predict = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)[:, 1]

In [52]:
pipeline = Pipeline([
    ('features',feats),
    ('classifier', lgb.LGBMClassifier(random_state = 42)),
])

#обучим пайплайн на всем тренировочном датасете
pipeline.fit(X_train, y_train['class_test'])
y_predict = pipeline.predict(X_test)
y_score = pipeline.predict_proba(X_test)[:, 1]

In [53]:
precision, recall, thresholds = precision_recall_curve(y_test['target'], y_score)

fscore = (2 * precision * recall) / (precision + recall)
roc = roc_auc_score(y_test['target'], y_predict)
log_los = log_loss(y_test['target'], y_score)

# locate the index of the largest f score
ix = np.argmax(fscore)
print('Best Threshold=%f, F-Score=%.3f, Roc_auc=%.3f, Log_loss=%.3f, Precision=%.3f, Recall=%.3f' % (thresholds[ix], 
                                                                        fscore[ix],
                                                                        roc,
                                                                        log_los,
                                                                        precision[ix],
                                                                        recall[ix]
                                                                                     ))

result.append({"method":"light_gbm_PU_0.40" ,"roc_auc" : roc, "fscore" : fscore[ix],
               "precision" : precision[ix], "recall" : recall[ix],
               "log_los" : log_los
              })

Best Threshold=0.166455, F-Score=0.849, Roc_auc=0.651, Log_loss=0.652, Precision=0.748, Recall=0.982


In [54]:
models_results = pd.DataFrame(result)
pd.pivot_table(models_results, columns = 'method').reset_index()

method,index,light_gbm_PU_0.10,light_gbm_PU_0.25,light_gbm_PU_0.40,light_gbm_normal
0,fscore,0.87969,0.8678,0.849381,0.788745
1,log_los,0.849301,0.651368,0.652024,0.515703
2,precision,0.785219,0.77931,0.748485,0.725896
3,recall,1.0,0.97896,0.981717,0.863507
4,roc_auc,0.544734,0.651726,0.651039,0.734661


Вывовд : С увеличением P f score уменьшается а roc_auc увеличивается 