In [1]:
import pandas as pd
import numpy as np
import box

from password_complexity.features.generate import generate_features
from password_complexity.utils.config import load_config
from password_complexity.metrics.metrics import rmsle_score, RMSLE_score, RMSLE
from password_complexity.utils.hyperparameters import params_distribution


## Preprocessing


#### Preprocess NaN passwowrds

In [21]:
common_path='/home/nikita/My_github/password_comp/data/'
train=pd.read_csv(common_path+'train.csv')
test=pd.read_csv(common_path+'Xtest.csv')

#fill nan's
train['Password'] = train['Password'].fillna('No_set_password')
test['Password'] = test['Password'].fillna('No_set_password')

train = train.groupby('Password', as_index=False)['Times'].sum()

train.head()

Unnamed: 0,Password,Times
0,!!!!!!,8
1,!!!!!!!,5
2,!!!!!!55,1
3,!!!!!25,1
4,!!!!!mno,1


In [22]:
train=generate_features(train)
train.head()

Unnamed: 0,Times,password_length,unique_chars_count,unique_chars_prop,is_alnum,is_alpha,is_lower,is_upper,is_numeric,letters_count,letters_prob,difits_count,digits_prob,is_lower_and_upper,palindrom
0,8,6,1,0.166667,0,0,0,0,0,0,0.0,0,0.0,0,True
1,5,7,1,0.142857,0,0,0,0,0,0,0.0,0,0.0,0,True
2,1,8,2,0.25,0,0,0,0,0,0,0.0,2,0.25,0,False
3,1,7,3,0.428571,0,0,0,0,0,0,0.0,2,0.285714,0,False
4,1,8,4,0.5,0,0,1,0,0,3,0.375,0,0.0,1,False


In [23]:
train.isna().sum()

Times                 0
password_length       0
unique_chars_count    0
unique_chars_prop     0
is_alnum              0
is_alpha              0
is_lower              0
is_upper              0
is_numeric            0
letters_count         0
letters_prob          0
difits_count          0
digits_prob           0
is_lower_and_upper    0
palindrom             0
dtype: int64

### Prepare Model

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import GridSearchCV, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler, PowerTransformer, FunctionTransformer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_log_error

from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, PoissonRegressor, GammaRegressor

from lightgbm import LGBMRegressor

from optuna.samplers import RandomSampler
import optuna
# for test
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


In [25]:
y_train = np.log1p(train['Times'])
X_train = train.drop('Times', axis=1)

X_train.shape, y_train.shape

((4151495, 14), (4151495,))

In [26]:
num_cols = X_train.filter(regex='_').columns.tolist()
num_cols

['password_length',
 'unique_chars_count',
 'unique_chars_prop',
 'is_alnum',
 'is_alpha',
 'is_lower',
 'is_upper',
 'is_numeric',
 'letters_count',
 'letters_prob',
 'difits_count',
 'digits_prob',
 'is_lower_and_upper']

In [27]:
## Создаем списки коннилчественных
## и категориальных столбцов
cat_columns = X_train.dtypes[X_train.dtypes == 'object'].index
num_columns = X_train.dtypes[X_train.dtypes != 'object'].index

## Cоздаем конвейер для количественных переменных
num_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

# Создаем конвейер для категориальных переменных
cat_pipe = Pipeline([
    ('imputer', SimpleImputer()),
    ('feature_encoding', OneHotEncoder(sparse=False, handle_unknown='ignore'))
])

# создаем список трехэлементных кортежей, в котором
# первый элемент кортежа - название конвейера с
# преобразованиями для определённого типа признаков
transformers = [('num', num_pipe, num_columns),
               ('cat', cat_pipe, cat_columns)]

# передаем список трансформеров в  Columnransformer
transformer = ColumnTransformer(transformers=transformers)

# задаем итоговый конвейрер
pipe = Pipeline([
    ('transform', transformer),
    ('logreg', LinearRegression())
])

In [28]:
# немного модифицируем конвейер
pipe = Pipeline([('transform', transformer), 
                ('logreg', LogisticRegression())])   

In [29]:
#LinearRegression?

In [30]:
# пишем функцию, которую будем оптимизировать
def objective(trial):
    # задаем пространство поиска
    params = {
        'rank_': trial.suggest_int('rank_', 1, 64),
        'normalize': trial.suggest_categorical('normalize', [True, False])
    }
    pipe.set_params(**params)
    return np.mean(cross_val_score(pipe, X_train, y_train, cv=5))

In [31]:
#y_train2=np.random.randint(2, size=4151495)
y_train2=y_train[:15000]
X_train2=X_train.iloc[:15000]

from sklearn.linear_model import LogisticRegression, LinearRegression
# немного модифицируем конвейер
pipe = Pipeline([('transform', transformer), 
                ('linear', LinearRegression())])

# пишем функцию, которую будем оптимизировать
def objective(trial):
    # задаем пространство поиска
    params = {
        'transform__cat__imputer__strategy': trial.suggest_categorical('transform__cat__imputer__strategy', ['mean', 'median']),
        'linear__normalize': trial.suggest_categorical('linear__normalize', [True, False]),
        'linear__positive': trial.suggest_categorical('linear__positive', [True])
    }
    params_distributions(**params)
    return np.mean(cross_val_score(pipe, X_train2, y_train2, cv=5, scoring=RMSLE_score

    ))


# задаем новое стартовое значение генератора псевдослучайных чисел
sampl = RandomSampler(seed=42)
# создаем задачу оптимизации
study = optuna.create_study(sampler=sampl, direction='minimize')
# отключаем вывод результатов оптимизации в режиме реального времени
optuna.logging.enable_default_handler()
# выполняем оптимизацию
study.optimize(objective, n_trials=10)

[32m[I 2021-09-05 12:28:13,101][0m A new study created in memory with name: no-name-25b1c1c0-b511-407c-8dd0-59aae1a00b37[0m
[32m[I 2021-09-05 12:28:16,203][0m Trial 0 finished with value: -0.1686994193616374 and parameters: {'transform__cat__imputer__strategy': 'median', 'linear__normalize': True, 'linear__positive': True}. Best is trial 0 with value: -0.1686994193616374.[0m
[32m[I 2021-09-05 12:28:16,715][0m Trial 1 finished with value: -0.1686994193616378 and parameters: {'transform__cat__imputer__strategy': 'mean', 'linear__normalize': False, 'linear__positive': True}. Best is trial 1 with value: -0.1686994193616378.[0m
[32m[I 2021-09-05 12:28:17,293][0m Trial 2 finished with value: -0.1686994193616378 and parameters: {'transform__cat__imputer__strategy': 'median', 'linear__normalize': False, 'linear__positive': True}. Best is trial 1 with value: -0.1686994193616378.[0m
[32m[I 2021-09-05 12:28:17,848][0m Trial 3 finished with value: -0.1686994193616378 and parameters: 

In [32]:
optuna_cv_pipe = Pipeline([('transform', transformer), 
                           ('linear', LinearRegression())])
from optuna.integration import OptunaSearchCV
from optuna.distributions import *
params_distribution = {
        'transform__cat__imputer__strategy': CategoricalDistribution(['mean', 'median']),
        'linear__normalize': CategoricalDistribution([True, False]),
        'linear__positive': CategoricalDistribution([True])
    }


optuna_search = OptunaSearchCV(
    optuna_cv_pipe,
    params_distribution,
    scoring=RMSLE_score,
    random_state=42,
    n_trials=10,
    verbose=1,
    cv=5
)


optuna_search.fit(X_train2, y_train2)


  optuna_search = OptunaSearchCV(
[32m[I 2021-09-05 12:28:22,225][0m A new study created in memory with name: no-name-6a1327ee-34d1-4176-af97-0de6c269a8db[0m
[32m[I 2021-09-05 12:28:22,231][0m Searching the best hyperparameters using 15000 samples...[0m
[32m[I 2021-09-05 12:28:22,866][0m Trial 0 finished with value: -0.1686994193616378 and parameters: {'transform__cat__imputer__strategy': 'median', 'linear__normalize': False, 'linear__positive': True}. Best is trial 0 with value: -0.1686994193616378.[0m
[32m[I 2021-09-05 12:28:23,457][0m Trial 1 finished with value: -0.1686994193616374 and parameters: {'transform__cat__imputer__strategy': 'mean', 'linear__normalize': True, 'linear__positive': True}. Best is trial 1 with value: -0.1686994193616374.[0m
[32m[I 2021-09-05 12:28:24,037][0m Trial 2 finished with value: -0.1686994193616378 and parameters: {'transform__cat__imputer__strategy': 'median', 'linear__normalize': False, 'linear__positive': True}. Best is trial 1 with v

OptunaSearchCV(estimator=Pipeline(steps=[('transform',
                                          ColumnTransformer(transformers=[('num',
                                                                           Pipeline(steps=[('imputer',
                                                                                            SimpleImputer()),
                                                                                           ('scaler',
                                                                                            StandardScaler())]),
                                                                           Index(['password_length', 'unique_chars_count', 'unique_chars_prop',
       'is_alnum', 'is_alpha', 'is_lower', 'is_upper', 'is_numeric',
       'letters_count', 'letters_prob', 'difits_count', 'digits_prob',...
                                                                           Index([], dtype='object'))])),
                                         (

#### Предскажем значения с помощью нашего оптимизированного пайплайна

In [33]:

RMSLE(y_train2, optuna_search.best_estimator_.predict(X_train2))


0.17236045613766668

In [34]:
X_valid=X_train.iloc[15000:30000]
y_valid=y_train[15000:30000]

preds = optuna_search.best_estimator_.predict(X_valid)
RMSLE(y_valid, preds)


0.2541710383954245

# Норм , теперь добавим сохранение модели в формате pickle

## 

In [35]:
# Прописываем путь
save_path=os.path.join('..', 'models', 'model_config.joblib')


# Сохраняем модель
joblib.dump(optuna_search.best_estimator_, save_path)
# Загружаем модель
model=joblib.load(save_path)
# 
preds=model.predict(X_train.iloc[15000:30000])
y_valid=y_train[15000:30000]

RMSLE(y_valid, preds2)

NameError: name 'joblib' is not defined

0.2541710383954245

In [50]:
import pickle
import joblib


s = joblib.dump(optuna_search.best_estimator_, 'test.joblib')
model2=joblib.load('test.joblib')
preds2=model2.predict(X_train.iloc[15000:30000])
y_valid=y_train[15000:30000]

RMSLE(y_valid, preds)

0.2541710383954245

In [51]:
preds

array([0.97233739, 0.97233739, 0.97233739, ..., 0.76620565, 0.97233739,
       0.75245622])

In [52]:
y_valid

15000    0.693147
15001    0.693147
15002    0.693147
15003    0.693147
15004    0.693147
           ...   
29995    1.386294
29996    3.871201
29997    0.693147
29998    4.043051
29999    0.693147
Name: Times, Length: 15000, dtype: float64

In [53]:
study.best_trial

FrozenTrial(number=1, values=[-0.1686994193616378], datetime_start=datetime.datetime(2021, 8, 25, 17, 54, 17, 229964), datetime_complete=datetime.datetime(2021, 8, 25, 17, 54, 17, 808088), params={'transform__cat__imputer__strategy': 'mean', 'linear__normalize': False, 'linear__positive': True}, distributions={'transform__cat__imputer__strategy': CategoricalDistribution(choices=('mean', 'median')), 'linear__normalize': CategoricalDistribution(choices=(True, False)), 'linear__positive': CategoricalDistribution(choices=(True,))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE, value=None)

In [54]:
study.best_params

{'transform__cat__imputer__strategy': 'mean',
 'linear__normalize': False,
 'linear__positive': True}

In [55]:
pipe.set_params(**study.best_params)

Pipeline(steps=[('transform',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['password_length', 'unique_chars_count', 'unique_chars_prop',
       'is_alnum', 'is_alpha', 'is_lower', 'is_upper', 'is_numeric',
       'letters_count', 'letters_prob', 'difits_count', 'digits_prob',
       'is_lower_and_upper', 'palindrom'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer()),
                                       

In [56]:
study.best_trial

FrozenTrial(number=1, values=[-0.1686994193616378], datetime_start=datetime.datetime(2021, 8, 25, 17, 54, 17, 229964), datetime_complete=datetime.datetime(2021, 8, 25, 17, 54, 17, 808088), params={'transform__cat__imputer__strategy': 'mean', 'linear__normalize': False, 'linear__positive': True}, distributions={'transform__cat__imputer__strategy': CategoricalDistribution(choices=('mean', 'median')), 'linear__normalize': CategoricalDistribution(choices=(True, False)), 'linear__positive': CategoricalDistribution(choices=(True,))}, user_attrs={}, system_attrs={}, intermediate_values={}, trial_id=1, state=TrialState.COMPLETE, value=None)

In [18]:
y_train2=y_train[:16000]

print(set(y_train2))


{1.791759469228055, 2.1972245773362196, 1.6094379124341003, 1.0986122886681096, 2.0794415416798357, 1.3862943611198906, 2.4849066497880004, 3.6375861597263857, 2.772588722239781, 2.302585092994046, 3.58351893845611, 4.61512051684126, 6.851184927493743, 6.295266001439646, 6.293419278846481, 3.295836866004329, 3.1354942159291497, 4.574710978503383, 4.02535169073515, 5.768320995793772, 2.5649493574615367, 2.995732273553991, 3.044522437723423, 3.4965075614664802, 3.7612001156935624, 4.04305126783455, 4.812184355372417, 4.356708826689592, 4.248495242049359, 4.07753744390572, 5.062595033026967, 4.2626798770413155, 4.143134726391533, 3.2188758248682006, 3.332204510175204, 3.784189633918261, 3.8712010109078907, 3.8066624897703196, 4.844187086458591, 4.442651256490317, 4.330733340286331, 5.0106352940962555, 5.043425116919247, 5.783825182329737, 3.5553480614894135, 2.8903717578961645, 3.6635616461296463, 3.8501476017100584, 3.5263605246161616, 4.59511985013459, 4.007333185232471, 4.0943445622221

In [58]:
params={'transform__cat__imputer__strategy': 'median', 'logreg__C': 0.3860863615203717, 'logreg__solver': 'lbfgs'}

pipe.fit(X_train2, y_train2, **params)

KeyError: 'logreg'

In [18]:


Trial 8 finished with value: 0.4998 and parameters: {'strategy': 'median', 'C': 0.014652044500958163, 'solver': 'lbfgs'}. Best is trial 8 with value: 0.4998

Trial 5 finished with value: -0.49362297187419496 and parameters: {'strategy': 'median', 'C': 0.10077311730135345, 'solver': 'lbfgs'}. Best is trial 5 with value: -0.49362297187419496.


Trial 2 finished with value: -0.4877954550601661 and parameters: {'strategy': 'median', 'C': 0.004427367513953402, 'solver': 'liblinear'}. Best is trial 2 with value: -0.4877954550601661.


SyntaxError: invalid syntax (1360556822.py, line 1)

In [18]:
def func():
    raise NameError('strange name')

func()


NameError: strange name

In [None]:
import logging
class OptunaTuner:
    def __init__(
        self,
        eval_metric,
        init_params={},
        vevrbose=True,
        n_jobs=-1,
        randoma_state=42,
        ):
        if eval_metric.name not in [          
            "auc",
            "logloss",
            "rmse",
            "mse",
            "mae",
            "mape",
            "r2",
            "spearman",
            "pearson",
            "f1",
            "average_precision",
            "accuracy",
            "user_defined_metric"
            ]:
            raise NameError(f"Metric {eval_metric.name} is not supported")
        
        

In [4]:
# задаем новое стартовое значение генератора псевдослучайных чисел
sampl = RandomSampler(seed=10)
# создаем задачу оптимизации
study = optuna.create_study(sampler=sampl, direction='maximize')
# отключаем вывод результатов оптимизации в режиме реального времени
optuna.logging.disable_default_handler()
# выполняем оптимизацию
study.optimize(objective, n_trials=100)

NameError: name 'RandomSampler' is not defined

In [8]:
num_cols = X_train.filter(regex='_').columns.tolist()
num_transformer = Pipeline([
    ('identity', FunctionTransformer(lambda x: x))
])

preprocessor = ColumnTransformer([
    ('num', num_transformer, num_cols)
])

pipe = Pipeline([
    ('preprocess', preprocessor),
    ('clf', LGBMRegressor(random_state=RANDOM_STATE))
])

cv = KFold(n_splits=3, shuffle=True, random_state=RANDOM_STATE)

param_grid = {
    'clf__num_leaves': [31], 
    'clf__max_depth': [-1]
}

grid_search = GridSearchCV(pipe, param_grid, 
                           scoring='neg_root_mean_squared_error', n_jobs=1, 
                           cv=cv, verbose=2)
grid_search.fit(X_train, y_train)

grid_search.best_score_

NameError: name 'RANDOM_STATE' is not defined