In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

from sklearn.metrics import recall_score, precision_score, roc_auc_score, accuracy_score, f1_score, precision_recall_curve, classification_report

from sklearn.feature_selection import SelectFromModel, GenericUnivariateSelect, mutual_info_classif

In [None]:
# Input locations and the seed shared by the seeded estimators below.
TRAIN_DATA = 'data/data_train.csv'
# NOTE(review): this used to repeat the TRAIN_DATA path (copy-paste slip);
# confirm the hold-out file name before relying on it.
TEST_DATA = 'data/data_test.csv'
FEATURES_DATA = 'data/features.csv'
RANDOM_STATE = 9

In [None]:
def undersample_df_by_target(df, target_name, random_state=None):
    """Balance a 0/1-labelled frame by down-sampling the larger class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the label column.
    target_name : str
        Name of the label column; only rows labelled 0 or 1 are kept.
    random_state : int, numpy random generator or None, default None
        Forwarded to ``DataFrame.sample``. With ``None`` the draw comes from
        NumPy's global RNG, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        Rows labelled 0 followed by rows labelled 1, both classes having the
        size of the smaller one. Original index labels are preserved.
    """
    negatives = df[df[target_name] == 0]
    positives = df[df[target_name] == 1]

    if len(positives) > len(negatives):
        # Class 1 is the majority: shrink it instead of failing on an oversized sample.
        positives = positives.sample(len(negatives), random_state=random_state)
    else:
        negatives = negatives.sample(len(positives), random_state=random_state)

    return pd.concat([negatives, positives])

In [None]:
def run_grid_search(estimator, X, y, params_grid, scoring='f1'):
    """Fit a 3-fold grid search on all cores, print a report and return it.

    Parameters
    ----------
    estimator : estimator or pipeline accepted by ``GridSearchCV``
    X, y : training features and labels passed to ``fit``
    params_grid : dict or list of dicts with the candidate parameter values
    scoring : str, default 'f1'
        Metric name handed to ``GridSearchCV`` and echoed in the report.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    search = GridSearchCV(estimator, params_grid, scoring=scoring, cv=3, n_jobs=-1)
    search.fit(X, y)

    # Each header is followed by one blank line, hence end="\n\n".
    print("Best %s score: %.2f" % (scoring, search.best_score_), end="\n\n")
    print("Best parameters set found on development set:", end="\n\n")
    print(search.best_params_, end="\n\n")
    print("Grid scores on development set:", end="\n\n")

    results = search.cv_results_
    per_candidate = zip(results['mean_test_score'], results['std_test_score'], results['params'])
    for mean_score, std_score, candidate in per_candidate:
        # The spread is reported as two standard deviations across the folds.
        print("%0.3f (+/-%0.03f) for %r" % (mean_score, std_score * 2, candidate))

    print()

    return search

In [None]:
def treshold_search(y_true, y_pred):
    """Scan probability cut-offs for the best macro F1 and print a report.

    ``y_pred`` is indexed as ``y_pred[:, 1]``, i.e. column 1 holds the score
    compared against each cut-off. The search starts from 0.5 and tries 20
    evenly spaced cut-offs in [0, 1]; a candidate replaces the current best
    only when its macro F1 is strictly higher. Nothing is returned: the best
    cut-off, its score and the classification report are printed.
    """
    positive_proba = y_pred[:, 1]

    def macro_f1(cutoff):
        return f1_score(y_true, positive_proba > cutoff, average='macro')

    best_cutoff, best_score = 0.5, macro_f1(0.5)
    for cutoff in np.linspace(0, 1, 20):
        score = macro_f1(cutoff)
        if score > best_score:
            best_cutoff, best_score = cutoff, score

    print(f'Лучшая отсечка : {best_cutoff}, Метрика F1_macro: {best_score}')
    print("=" * 80)
    print(classification_report(y_true, positive_proba > best_cutoff))

In [None]:
def preprocess_data_train(prep_data_df, FEATURES_DATA):
    """Build the training matrix: offer rows joined with the closest feature snapshot.

    Parameters
    ----------
    prep_data_df : pandas.DataFrame
        Offer rows with at least ``id`` and ``buy_time`` (epoch seconds).
        The frame is not modified; all work happens on a copy.
    FEATURES_DATA : str
        Path to the tab-separated features file, read lazily with dask.

    Returns
    -------
    result_data : pandas.DataFrame
        Merged rows without ``id`` / ``buy_time``, with exact duplicates removed.
    train_list_index : list
        Unique ``id`` values present in ``prep_data_df``.
    """
    # Copy first: assigning into the caller's frame mutates their data and,
    # when that frame is a slice, triggers SettingWithCopyWarning.
    offers = prep_data_df.copy()
    offers['buy_time'] = pd.to_datetime(offers['buy_time'], unit='s')
    # errors='ignore' lets the function accept frames whose CSV index column was already dropped.
    offers = offers.drop(columns='Unnamed: 0', errors='ignore')
    offers['monthday'] = offers['buy_time'].dt.day
    # merge_asof requires the key to be sorted on both sides.
    offers = offers.sort_values('buy_time')
    # 1 for every repeated occurrence of an id (in time order), 0 for the first one.
    offers['not_first_offer'] = offers.duplicated('id').astype(int)

    train_list_index = list(offers['id'].unique())

    features_data_df = dd.read_csv(FEATURES_DATA, sep='\t')
    features_data_df = features_data_df.drop('Unnamed: 0', axis=1)
    # Filter lazily, then materialise only the ids we actually need.
    features_data_df = features_data_df.loc[features_data_df['id'].isin(train_list_index)].compute()
    features_data_df['buy_time'] = pd.to_datetime(features_data_df['buy_time'], unit='s')
    features_data_df = features_data_df.sort_values(by="buy_time")

    # NOTE(review): direction='nearest' may pick a snapshot taken after the offer — confirm this is intended.
    result_data = pd.merge_asof(offers, features_data_df, on='buy_time', by='id', direction='nearest')

    result_data = result_data.drop(['id', 'buy_time'], axis=1).drop_duplicates()

    return result_data, train_list_index

In [None]:
def preprocess_data_test(prep_data_df, FEATURES_DATA, train_list_index):
    """Build the evaluation matrix, keeping one output row per input row in input order.

    Parameters
    ----------
    prep_data_df : pandas.DataFrame
        Offer rows with at least ``id`` and ``buy_time`` (epoch seconds).
        The frame is not modified; all work happens on a copy.
    FEATURES_DATA : str
        Path to the tab-separated features file, read lazily with dask.
    train_list_index : list
        Ids seen in training; an id found here gets ``not_first_offer = 1``.

    Returns
    -------
    pandas.DataFrame
        Merged rows without ``id`` / ``buy_time``, carrying the index labels of
        ``prep_data_df`` and sorted by them.
    """
    # Copy first: the caller usually passes a slice of the raw frame, and writing
    # into it both mutates their data and triggers SettingWithCopyWarning.
    offers = prep_data_df.copy()
    offers['buy_time'] = pd.to_datetime(offers['buy_time'], unit='s')
    # errors='ignore' lets the function accept frames whose CSV index column was already dropped.
    offers = offers.drop(columns='Unnamed: 0', errors='ignore')
    offers['monthday'] = offers['buy_time'].dt.day
    # merge_asof requires the key to be sorted on both sides.
    offers = offers.sort_values('buy_time')
    # NOTE(review): repeats inside this frame are not flagged, unlike the
    # duplicated('id') rule used for training — confirm this is intended.
    offers['not_first_offer'] = (offers['id'].isin(train_list_index)).astype(int)

    test_list_index = list(offers['id'].unique())

    features_data_df = dd.read_csv(FEATURES_DATA, sep='\t')
    features_data_df = features_data_df.drop('Unnamed: 0', axis=1)
    # Filter lazily, then materialise only the ids we actually need.
    features_data_df = features_data_df.loc[features_data_df['id'].isin(test_list_index)].compute()
    features_data_df['buy_time'] = pd.to_datetime(features_data_df['buy_time'], unit='s')
    features_data_df = features_data_df.sort_values(by="buy_time")

    result_data = pd.merge_asof(offers, features_data_df, on='buy_time', by='id', direction='nearest')

    # merge_asof yields one row per left row, in left order, but with a new
    # RangeIndex. Re-attach the original labels so that sort_index() really
    # restores the caller's row order instead of being a no-op.
    result_data.index = offers.index
    result_data = result_data.drop(['id', 'buy_time'], axis=1).sort_index()

    return result_data

In [None]:
def select_type_cols(merged_data):
    
    X_nunique = merged_data.apply(lambda x: x.nunique(dropna=False))
    f_all = set(X_nunique.index.tolist())
    f_const = set(X_nunique[X_nunique == 1].index.tolist())
    f_categorical = set(X_nunique[X_nunique <= 30].index.tolist())
    f_numeric = (merged_data.fillna(0).astype(int).sum() - merged_data.fillna(0).sum()).abs()
    f_numeric = set(f_numeric[f_numeric > 0].index.tolist())
    f_binary = set(merged_data.loc[:, f_all].columns[(
                      (merged_data.loc[:, f_all].max() == 1) & \
                      (merged_data.loc[:, f_all].min() == 0) & \
                      (merged_data.loc[:, f_all].isnull().sum() == 0))])
    f_categorical = f_categorical - f_const - f_binary
    f_numeric = f_numeric - f_categorical - f_const
    
    assert(X_nunique.shape[0] == len(f_const) + len(f_binary) + len(f_numeric) + len(f_categorical))
    
    f_all = list(f_binary | f_categorical | f_numeric)
    f_binary, f_categorical, f_numeric = list(f_binary), list(f_categorical), list(f_numeric)
    
    return f_all, f_binary, f_categorical, f_numeric


    

In [None]:
train_df = pd.read_csv(TRAIN_DATA)

In [None]:
train_df.head(2)

##### Сперва достанем данные из временной метки, и посмотрим распределение целевого признака в разрезе остальных признаков.

In [None]:
train_df = train_df.drop('Unnamed: 0', axis=1)

In [None]:
train_df['buy_time'] = pd.to_datetime(train_df['buy_time'], unit='s') 

In [None]:
train_df['monthday'] = train_df['buy_time'].dt.day
train_df['month'] = train_df['buy_time'].dt.month

- В процессе выяснил, что все звонки были в воскресенье в 21 час. Поэтому нет смысла добавлять день недели и время звонка. (А жаль, мне кажется, это было бы полезно.)

In [None]:
train_df.head(2)

In [None]:
train_df = train_df.sort_values('buy_time')

In [None]:
train_df['not_first_offer'] = train_df.duplicated('id').astype(int)

- Выделим пользователей, которым делали предложения больше 1 раза.

##### Целевая переменная

In [None]:
plt.figure(figsize=(8, 5))

sns.countplot(x='target', data=train_df)

plt.title('Target variable distribution')
plt.show()

- Таргет имеет сильный дисбаланс

In [None]:
plt.figure(figsize=(10, 8))

sns.countplot(x="vas_id", hue='target', data=train_df)
plt.title('vas_id grouped by target variable')
plt.legend(title='Target', loc='upper right')

plt.show()

- Видим, что у услуги "6" доля положительных откликов относительно отрицательных намного выше, чем у остальных. Также услуга "4" имеет больший спрос.

In [None]:
plt.figure(figsize=(10, 8))

sns.countplot(x="monthday", hue='target', data=train_df)
plt.title('monthday grouped by target variable')
plt.legend(title='Target', loc='upper right')

plt.show()

- Наблюдается сильный перекос отклика в положительную сторону в середине месяца; возможно, это период зарплаты, когда люди более лояльны к новым покупкам. Ну и в целом видно, что в это время пик предложений: видимо, маркетинг тоже пришел к выводу, что лучше звонить в эти дни.

In [None]:
plt.figure(figsize=(10, 8))

sns.countplot(x="not_first_offer", hue='target', data=train_df)
plt.title('not_first_offer grouped by target variable')
plt.legend(title='Target', loc='upper right')

plt.show()

- Интересно, что те, кому звонили не в первый раз, имеют явный перекос в сторону положительного отклика. 

In [None]:
plt.figure(figsize=(10, 8))

sns.countplot(x="month", hue='target', data=train_df)
plt.title('month grouped by target variable')
plt.legend(title='Target', loc='upper right')

plt.show()

- Видно, что в Ноябре \ Декабре имеется явное увеличение положительных откликов. В декабре большое количество предложений, возможно предлагали новогоднюю акцию или старались закрыть план за год.

In [None]:
sample = train_df.loc[(train_df['not_first_offer'] == 1) & (train_df['monthday'].isin([16, 17, 18, 19, 20]))]

In [None]:
sample['target'].value_counts(normalize=True)

In [None]:
plt.figure(figsize=(10, 8))

sns.countplot(x="vas_id", hue='target', data=sample)
plt.title('vas_id grouped by target variable')
plt.legend(title='Target', loc='upper right')

plt.show()

##### Из приведенных выше данных, могу сделать бейзлайн рекомендации "Всегда нужно звонить повторно, звонить стоит в середине месяца". При этом не чаще раза в месяц, и предлагать разные услуги. Даже без дополнительных описательных характеристик пользователя, мы видим, что вероятность положительного отклика на любую услугу в этих рамках много выше. 

#### Посмотрим описательные характеристики

In [None]:
features_df = dd.read_csv(FEATURES_DATA, sep='\t')

In [None]:
features_df.head()

In [None]:
features_df = features_df.drop('Unnamed: 0', axis=1)

- Чтобы смерджить описательные фичи, возьмем из них только те id, которые есть в трейн датасете.

In [None]:
train_list_index = list(train_df['id'].unique())

In [None]:
features_df = features_df.loc[features_df['id'].isin(train_list_index)].compute()

- Т.к. в описательном датасете имеются дубликаты id, подразумевается, что со временем предпочтения менялись, поэтому берем ближайшую по времени информацию. Если данных не будет, получится NaN, который мы обработаем в пайплайне, заполнив какой-нибудь статистикой.

In [None]:
features_df['buy_time'] = pd.to_datetime(features_df['buy_time'], unit='s')

In [None]:
features_df = features_df.sort_values(by="buy_time")

In [None]:
train_data = pd.merge_asof(train_df, features_df, on='buy_time', by='id', direction='nearest')

- Проверим, совпадают ли значения

In [None]:
import random

rid = random.choice(train_list_index)
print(rid)
features_df.loc[features_df['id'] == rid]


In [None]:
train_data.loc[train_data['id'] == rid]

In [None]:
train_data.drop(['id', 'buy_time', 'month'], axis=1, inplace=True)

In [None]:
print("ID уникален? ", train_data.index.is_unique)
print("Есть ли дубли в строках?", train_data.duplicated().sum())
print("Сколько процент признаков могут принимать null-значениями? %d%%" % float((train_data.isnull().sum() > 0).sum()/train_data.shape[1]*100))

In [None]:
train_data.drop_duplicates(inplace=True)

In [None]:
train_data['target'].value_counts()

- Посмотрим на корреляцию между целевой и остальными признаками. 

In [None]:
corr_matrix = train_data.corr()[['target']]

In [None]:
corr_matrix.loc[abs(corr_matrix['target']) > 0.3].shape[0] - 1

In [None]:
corr_matrix.loc[abs(corr_matrix['target']) > 0.2].shape[0] - 1

In [None]:
corr_matrix.loc[abs(corr_matrix['target']) > 0.1].shape[0] - 1

In [None]:
corr_matrix.loc[abs(corr_matrix['target']) > 0.05].shape[0] - 1

In [None]:
corr_matrix.loc[abs(corr_matrix['target']) > 0.01].shape[0] - 1

In [None]:
corr_matrix.loc[abs(corr_matrix['target']) > 0.001]

 - Очень слабая корреляция. Либо связь нелинейная, либо признаки просто бесполезные.

### Подготовим данные, разобьем на трейн\тест по времени.

In [None]:
train_df = pd.read_csv(TRAIN_DATA)

- Разделим на трейн\тест. Берем половину последнего месяца.

In [None]:
train_df['buy_time'] = pd.to_datetime(train_df['buy_time'], unit='s')

In [None]:
new_train_df = train_df.loc[~((train_df['buy_time'].dt.month == 12) & (train_df['buy_time'].dt.day > 10))]

In [None]:
valid_df = train_df.loc[((train_df['buy_time'].dt.month == 12) & (train_df['buy_time'].dt.day > 10))]

- Сделаем балансировку андерсемплингом так как данных много

In [None]:
# DataFrame.sample draws from NumPy's global RNG when no random_state is given,
# so seed it here to make the undersampled training set reproducible.
np.random.seed(RANDOM_STATE)
X_train = undersample_df_by_target(new_train_df, 'target')

In [None]:
X_train, true_offers_ids = preprocess_data_train(X_train, FEATURES_DATA)

In [None]:
y_train = X_train['target']

In [None]:
X_train = X_train.drop('target', axis = 1)

In [None]:
y_train.value_counts()

In [None]:
valid_df = preprocess_data_test(valid_df, FEATURES_DATA, true_offers_ids)

In [None]:
y_test = valid_df['target']

In [None]:
X_test = valid_df.drop('target', axis = 1)

In [None]:
y_test.value_counts()

- Итак, для начала сделаем бейзлайн и оценим, что мы можем получить.

In [None]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Pipeline step that narrows a DataFrame down to a configured set of columns."""

    def __init__(self, columns):
        # Column labels to keep; used as-is to index the frame in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.columns]``.

        Raises ``AssertionError`` when ``X`` is not a DataFrame and ``KeyError``
        listing the configured columns that are absent from ``X``.
        """
        assert isinstance(X, pd.DataFrame)

        try:
            subset = X[self.columns]
        except KeyError:
            absent = list(set(self.columns) - set(X.columns))
            raise KeyError("DataFrame не содердит следующие колонки: %s" % absent)
        else:
            return subset

In [None]:
# Seeded like every other model in the notebook so the baseline score is reproducible.
rf = RandomForestClassifier(random_state=RANDOM_STATE)

In [None]:
step_imputer = SimpleImputer(strategy="mean")

In [None]:
baseline = Pipeline([
    ('imuter', step_imputer),
    ('model', rf)
])

In [None]:
baseline.fit(X_train, y_train)

In [None]:
preds = baseline.predict(X_test)

In [None]:
f1_score(y_test, preds, average='macro')

In [None]:
print(classification_report(y_test, preds))

- Весьма неплохой результат для использования данных "как есть": F-Score=0.872, F1-macro 0.67

===========================================================================================================

#### Попробуем глянуть, что за фичи мы имеем.

- Итак, соберем списки признаков

In [None]:
f_all, f_binary, f_categorical, f_numeric = select_type_cols(X_train)

- Создадим пайплайн для препроцессинга. В нем мы стандартизируем вещественные признаки, чтобы попробовать использовать линейную модель, и сделаем OHE категориальных признаков. Также добавим шаги по автозаполнению пропусков.

In [None]:
f_prep_pipeline = make_pipeline(
    ColumnSelector(columns=f_all),
    FeatureUnion(transformer_list=[
        ("numeric_features", make_pipeline(
            ColumnSelector(f_numeric),
            SimpleImputer(strategy="mean"),
            StandardScaler()
        )),
        ("categorical_features", make_pipeline(
            ColumnSelector(f_categorical),
            SimpleImputer(strategy="most_frequent"),
            OneHotEncoder(handle_unknown='ignore')
        )),
        ("boolean_features", make_pipeline(
            ColumnSelector(f_binary),
        ))
    ])
)

- Попробуем наш бейзлайн на новых фичах. По сути для Леса изменилось только кодирование кат.фич.

In [None]:
rf_pipe = make_pipeline(
    f_prep_pipeline,
    RandomForestClassifier(random_state=RANDOM_STATE)
)

In [None]:
rf_pipe.fit(X_train, y_train)

In [None]:
preds = rf_pipe.predict(X_test)

In [None]:
f1_score(y_test, preds, average='macro')

In [None]:
print(classification_report(y_test, preds))

- Неплохой прирост: +4% по метрике F1-macro, это хороший результат.

========================================================================================================================

- Теперь попробуем линейный алгоритм.

In [None]:
lg_pipe = make_pipeline(
    f_prep_pipeline,
    LogisticRegression(random_state=RANDOM_STATE)
)

In [None]:
lg_pipe.fit(X_train, y_train)

In [None]:
preds = lg_pipe.predict(X_test)

In [None]:
f1_score(y_test, preds, average='macro')

In [None]:
print(classification_report(y_test, preds))

- Результат очень близкий, при этом скорость намного выше.

========================================================================================================================

- Произведем отбор фичей. Попробуем использовать метод с Лассо регрессией и Взаимной информации. В качестве контрольной модели будем использовать Лог рег, т.к. она хорошо себя показала, и работает довольно быстро.

In [None]:
lg_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l2', random_state=RANDOM_STATE, solver='liblinear'), threshold=1e-5),
    LogisticRegression(random_state=RANDOM_STATE)
)

In [None]:
params = [
    {"selectfrommodel__max_features": [None,15, 30,80,120,200,250],
     "selectfrommodel__threshold": [-np.inf],
     "selectfrommodel__estimator__C" : [1, 0.5, 0.01, 0.1]
    }
    
]

run_grid_search(lg_fs_pipe, X_train, y_train, params, scoring='f1')

Best f1 score: 0.90

Best parameters set found on development set:

{'selectfrommodel__estimator__C': 1, 'selectfrommodel__max_features': 5, 'selectfrommodel__threshold': -inf}


- Удивительно, но даже при 5 фичах качество практически такое же, как и при полном наборе.

========================================================================================================================

In [None]:
lg_fs_pipe_kbest_selector = make_pipeline(
    f_prep_pipeline,
    GenericUnivariateSelect(score_func=mutual_info_classif, mode='k_best', param=100),
    LogisticRegression(random_state=RANDOM_STATE)
)


In [None]:
params = [
    {'genericunivariateselect__param' : [15, 30,50,70,80,100,120,150,200,250]},
    
]


Best f1 score: 0.90

Best parameters set found on development set:

{'genericunivariateselect__param': 70}


- Будем использовать: SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'), threshold=1e-5). Т.к. 


==================================================================================================

In [None]:
rf_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'), max_features = 29, threshold = -np.inf),
    RandomForestClassifier(random_state=RANDOM_STATE, )
)

In [None]:
params = [
    {'randomforestclassifier__max_features': ['sqrt', 'log2', 2, 5, 8, 10],
     'randomforestclassifier__n_estimators' : [50, 100, 200, 300],    
    }   
]

Best f1 score: 0.90

Best parameters set found on development set:

{'randomforestclassifier__max_features': 8, 'randomforestclassifier__n_estimators': 300}

In [None]:
rf_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'), max_features = 29, threshold = -np.inf),
    RandomForestClassifier(random_state=RANDOM_STATE, max_features = 8, n_estimators = 300)
)

In [None]:
rf_fs_pipe.fit(X_train, y_train)

In [None]:
preds_train = rf_fs_pipe.predict(X_train)
f1_score(y_train, preds_train, average='macro')

In [None]:
preds_test = rf_fs_pipe.predict(X_test)
f1_score(y_test, preds_test, average='macro')

In [None]:
preds_proba_train = rf_fs_pipe.predict_proba(X_train)

In [None]:
preds_proba_test = rf_fs_pipe.predict_proba(X_test)

In [None]:
treshold_search(y_train, preds_proba_train)

In [None]:
treshold_search(y_test, preds_proba_test)

=======================================================================================================

In [None]:
rf_gu_pipe = make_pipeline(
    f_prep_pipeline,
    GenericUnivariateSelect(score_func=mutual_info_classif, mode='k_best', param=50),
    RandomForestClassifier(random_state=RANDOM_STATE, n_estimators = 300)
)

In [None]:
rf_gu_pipe.fit(X_train, y_train)

In [None]:
preds_train = rf_gu_pipe.predict(X_train)
f1_score(y_train, preds_train, average='macro')

In [None]:
preds_test = rf_gu_pipe.predict(X_test)
f1_score(y_test, preds_test, average='macro')

In [None]:
preds_proba_train = rf_gu_pipe.predict_proba(X_train)

In [None]:
treshold_search(y_train, preds_proba_train)

In [None]:
preds_proba_test = rf_gu_pipe.predict_proba(X_test)

In [None]:
treshold_search(y_test, preds_proba_test)

================================================================================================================

In [None]:
xgb_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'), max_features = 29, threshold = -np.inf),
    XGBClassifier(random_state=RANDOM_STATE)
)

In [None]:
params = [
    {'xgbclassifier__max_depth': [1,2,3,4,5],
     'xgbclassifier__n_estimators' : [200, 300, 400],     
    }    
]

Best parameters set found on development set:

{'xgbclassifier__max_depth': 2, 'xgbclassifier__n_estimators': 200}

In [None]:
xgb_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'), max_features = 29, threshold = -np.inf),
    XGBClassifier(random_state=RANDOM_STATE, max_depth = 2, n_estimators = 200)
)

In [None]:
xgb_fs_pipe.fit(X_train, y_train)

In [None]:
preds_train = xgb_fs_pipe.predict(X_train)
f1_score(y_train, preds_train, average='macro')

In [None]:
preds_test = xgb_fs_pipe.predict(X_test)
f1_score(y_test, preds_test, average='macro')

In [None]:
preds_proba_train = xgb_fs_pipe.predict_proba(X_train)

In [None]:
treshold_search(y_train, preds_proba_train)

In [None]:
preds_proba_test = xgb_fs_pipe.predict_proba(X_test)

In [None]:
treshold_search(y_test, preds_proba_test)

=================================================================================

In [None]:
lg_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'), max_features = 29),
    LogisticRegression(random_state=RANDOM_STATE)
)

In [None]:
lg_fs_pipe.fit(X_train, y_train)

In [None]:
preds_train = lg_fs_pipe.predict(X_train)
f1_score(y_train, preds_train, average='macro')

In [None]:
preds_test = lg_fs_pipe.predict(X_test)
f1_score(y_test, preds_test, average='macro')

In [None]:
preds_proba_train = lg_fs_pipe.predict_proba(X_train)

In [None]:
treshold_search(y_train, preds_proba_train)

In [None]:
preds_proba_test = lg_fs_pipe.predict_proba(X_test)

In [None]:
treshold_search(y_test, preds_proba_test)

In [None]:
lgbm_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'),max_features = 29, threshold=1e-5),
    LGBMClassifier(random_state=RANDOM_STATE)
)

In [None]:
params = [
    {
     'lgbmclassifier__n_estimators' : [100, 200, 300]     
    } 
]

In [None]:
lgbm_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'),max_features = 29, threshold=1e-5),
    LGBMClassifier(random_state=RANDOM_STATE, n_estimators = 200)
)

In [None]:
lgbm_fs_pipe.fit(X_train, y_train)

In [None]:
preds_train = lgbm_fs_pipe.predict(X_train)
f1_score(y_train, preds_train, average='macro')

In [None]:
preds_test = lgbm_fs_pipe.predict(X_test)
f1_score(y_test, preds_test, average='macro')

In [None]:
preds_proba_train = lgbm_fs_pipe.predict_proba(X_train)

In [None]:
treshold_search(y_train, preds_proba_train)

In [None]:
preds_proba_test = lgbm_fs_pipe.predict_proba(X_test)

In [None]:
treshold_search(y_test, preds_proba_test)

=================================================================================================

In [None]:
lg_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(LogisticRegression(penalty='l2', random_state=RANDOM_STATE, solver='liblinear', C = 0.1), max_features = 15),
    LogisticRegression(random_state=RANDOM_STATE)
)

In [None]:
lg_fs_pipe.fit(X_train, y_train)

In [None]:
preds_train = lg_fs_pipe.predict(X_train)
f1_score(y_train, preds_train, average='macro')

In [None]:
preds_test = lg_fs_pipe.predict(X_test)
f1_score(y_test, preds_test, average='macro')

In [None]:
preds_proba_train = lg_fs_pipe.predict_proba(X_train)

In [None]:
treshold_search(y_train, preds_proba_train)

In [None]:
preds_proba_test = lg_fs_pipe.predict_proba(X_test)

In [None]:
treshold_search(y_test, preds_proba_test)