In [1]:
import pandas as pd
import numpy as np

import dask.dataframe as dd
from sklearn.linear_model import LogisticRegression

from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

from sklearn.metrics import f1_score, classification_report
from sklearn.feature_selection import SelectFromModel

import pickle

In [2]:
from until import ColumnSelector, undersample_df_by_target, run_grid_search, treshold_search, preprocess_data_train, preprocess_data_test, select_type_cols


In [3]:
# Input locations and the seed shared by the estimators in this notebook.
TRAIN_DATA = 'data/data_train.csv'  # training table with the 'target' column, loaded via pd.read_csv
FEATURES_DATA = 'data/features.csv'  # path handed to preprocess_data_train
RANDOM_STATE = 9  # random_state passed to both LogisticRegression estimators

### Подготовка данных

In [4]:
# Load the raw training table from the path set in the config cell.
train_df = pd.read_csv(filepath_or_buffer=TRAIN_DATA)

In [5]:
# buy_time is read as numeric epoch seconds (unit='s'); overwrite the column
# with the corresponding pandas datetimes.
train_df['buy_time'] = pd.to_datetime(train_df['buy_time'], unit='s')

In [6]:
# Build the training frame from train_df via the project helper, keyed on the
# 'target' column (as its name suggests, it undersamples by class).
# NOTE(review): the sampling ratio and whether the helper is seeded live in the
# `until` module, not here — confirm the result is reproducible.
X_train = undersample_df_by_target(train_df, 'target')

In [7]:
# Run the project's training-time preprocessing. It receives the sampled frame
# and the features CSV path, and returns the processed frame together with an
# iterable of offer ids.
# NOTE(review): presumably joins the features file onto the sample — confirm in `until`.
X_train, true_offers_ids = preprocess_data_train(X_train, FEATURES_DATA)

In [8]:
# Persist the offer ids returned by preprocessing, one id per line
# (the file always ends with a newline).
with open("data/offer_mark.txt", "w") as out_file:
    out_file.write("\n".join(map(str, true_offers_ids)) + "\n")

In [9]:
# Pull the label column out of the processed frame; it is removed from the
# feature frame in the next cell.
y_train = X_train['target']

In [10]:
# Keep only predictors: remove the label column from the feature frame.
X_train = X_train.drop(columns='target')

In [11]:
# Split the feature columns into the four groups used by the preprocessing
# pipeline: all features, binary, categorical and numeric.
# NOTE(review): the saved output of this cell is an AssertionError — confirm the
# helper's checks pass on the current data before trusting the cells that follow.
f_all, f_binary, f_categorical, f_numeric = select_type_cols(X_train)

AssertionError: 

### Обучение модели

In [27]:
# Each branch first narrows the frame to its own column group.
# Numeric columns: mean-impute, then standardise.
numeric_branch = make_pipeline(
    ColumnSelector(f_numeric),
    SimpleImputer(strategy="mean"),
    StandardScaler(),
)
# Categorical columns: fill with the most frequent value, then one-hot encode;
# categories unseen during fit are ignored at transform time.
categorical_branch = make_pipeline(
    ColumnSelector(f_categorical),
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown='ignore'),
)
# Binary columns are only selected — no imputation or scaling is applied.
boolean_branch = make_pipeline(ColumnSelector(f_binary))

# Restrict the input to the known feature columns, then concatenate the
# outputs of the three branches side by side.
f_prep_pipeline = make_pipeline(
    ColumnSelector(columns=f_all),
    FeatureUnion(transformer_list=[
        ("numeric_features", numeric_branch),
        ("categorical_features", categorical_branch),
        ("boolean_features", boolean_branch),
    ]),
)

In [28]:
# Two-stage model on top of the shared preprocessing pipeline:
#   1. an L1-penalised logistic regression (liblinear) scores the features and
#      SelectFromModel keeps at most 29 of them;
#   2. a second logistic regression is fitted on the surviving features.
lg_fs_pipe = make_pipeline(
    f_prep_pipeline,
    SelectFromModel(
        LogisticRegression(penalty='l1', random_state=RANDOM_STATE, solver='liblinear'),
        max_features=29,
    ),
    # With the default iteration budget (max_iter=100) the solver stopped with
    # "TOTAL NO. of ITERATIONS REACHED LIMIT" when this pipeline was fitted, so
    # the budget is raised to let the final classifier converge.
    LogisticRegression(random_state=RANDOM_STATE, max_iter=1000),
)

In [29]:
# Fit preprocessing, feature selection and the final classifier in one call.
lg_fs_pipe.fit(X_train, y=y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [32]:
# Serialise the pipeline to disk using the newest pickle protocol this
# interpreter supports.
with open('data/lg_fs_model.pickle', 'wb') as model_file:
    pickle.Pickler(model_file, protocol=pickle.HIGHEST_PROTOCOL).dump(lg_fs_pipe)