In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import os
import tqdm
import string

from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor

Data extraction

In [2]:
os.chdir('./data')

In [4]:
files = np.sort(np.array(os.listdir()))
files

array(['10-million-password-list-top-100.txt',
       '10-million-password-list-top-1000.txt',
       '10-million-password-list-top-10000.txt',
       '10-million-password-list-top-100000.txt',
       '10-million-password-list-top-1000000.txt',
       '10-million-password-list-top-500.txt', '10k-most-common.txt',
       '500-worst-passwords.txt', 'Xtest.csv.zip',
       'common-passwords-win.txt', 'lgbm_submit1.csv', 'lgbm_submit2.csv',
       'lgbm_submit3.csv', 'lgbm_submit4.csv', 'lgbm_submit5.csv',
       'rockyou-withcount.txt.bz2', 'sample_submission.csv.zip',
       'train.csv.zip', 'words.txt'], dtype='<U40')

In [5]:
train = pd.read_csv('train.csv.zip')
test = pd.read_csv('Xtest.csv.zip')

Functions for features creation

In [6]:
def add_top_features(data, names):
    output = data.copy()
    for name in tqdm.tqdm_notebook(names):
        f_name = 'top_'+ name[:-4].split('-')[-1]
        datka = pd.read_table(name, header=None, names=['Password', f_name])
        datka[f_name] = 1
        output = pd.merge(output, datka, left_on='Password', right_on='Password', how = 'left').fillna(0)
    return output

def all_features(data):
    features = []
    for word in tqdm.tqdm_notebook(data.Password.fillna(0).values):
        features.append(features_create(word))
    features = np.array(features)
    data['N_letter'], data['UpLetter'], data['DownLetter'], data['Num'] = features[:, 0], features[:, 1], features[:, 2], features[:, 3] 
    return data

def features_create(word):
    features = [[],[],[],[]]
    features[0] = len(word)
    up = 0
    lo = 0
    nu = 0
    for symb in word:
        if symb.isnumeric():
            nu += 1
        elif symb.isupper():
            up += 1
        else:
            lo += 1
    features[1] = up
    features[2] = lo
    features[3] = nu
    return features

In [7]:
symbols = string.printable[:-6]
def num_of_symbols(data):
    output = data.copy()
    features = np.zeros((len(symbols), output.shape[0]))
    words = output.Password.values
    for i in tqdm.tqdm_notebook(range(len(words))):
        word = words[i]
        for symb in word:
            try:
                features[symbols.find(symb)][i] += 1
            except BaseException:
                pass
    for symb in symbols:
        output[symb] = features[symbols.find(symb)]
    return output

In [8]:
def add_words_features(data):
    output = data.copy()
    f_name = 'words'
    datka = pd.read_table('words.txt', header=None, names=['Password', f_name])
    datka[f_name] = 1
    output = pd.merge(output, datka, left_on='Password', right_on='Password', how = 'left').fillna(0)
    return output

Features creation

In [9]:
X = add_top_features(train.fillna('0'), files[:7])
X = all_features(X)
X = num_of_symbols(X)
X = add_words_features(X)
y = np.log(X.Times.values)
X_train, X_test, y_train, y_test = train_test_split(X.drop(['Password', 'Times'], axis=1), y, random_state=47)

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4151496), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4151496), HTML(value='')))




Model control

In [10]:
model = LGBMRegressor(random_state=47)
model.fit(X_train, y_train, eval_set=[(np.array(X_test), np.array(y_test))], eval_metric='rmse')

[1]	valid_0's l2: 0.249188	valid_0's rmse: 0.499188
[2]	valid_0's l2: 0.209294	valid_0's rmse: 0.457487
[3]	valid_0's l2: 0.176934	valid_0's rmse: 0.420636
[4]	valid_0's l2: 0.150732	valid_0's rmse: 0.388242
[5]	valid_0's l2: 0.12944	valid_0's rmse: 0.359777
[6]	valid_0's l2: 0.112155	valid_0's rmse: 0.334896
[7]	valid_0's l2: 0.0981449	valid_0's rmse: 0.313281
[8]	valid_0's l2: 0.0867697	valid_0's rmse: 0.294567
[9]	valid_0's l2: 0.0775341	valid_0's rmse: 0.278449
[10]	valid_0's l2: 0.0700117	valid_0's rmse: 0.264597
[11]	valid_0's l2: 0.0638967	valid_0's rmse: 0.252778
[12]	valid_0's l2: 0.0589501	valid_0's rmse: 0.242796
[13]	valid_0's l2: 0.0548251	valid_0's rmse: 0.234148
[14]	valid_0's l2: 0.0514705	valid_0's rmse: 0.226871
[15]	valid_0's l2: 0.0487829	valid_0's rmse: 0.220869
[16]	valid_0's l2: 0.0465319	valid_0's rmse: 0.215713
[17]	valid_0's l2: 0.0447161	valid_0's rmse: 0.211462
[18]	valid_0's l2: 0.0432339	valid_0's rmse: 0.207928
[19]	valid_0's l2: 0.0419873	valid_0's rmse:

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       importance_type='split', learning_rate=0.1, max_depth=-1,
       min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
       n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
       random_state=47, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [11]:
def rmsle(y, y_pred):
    assert len(y) == len(y_pred)
    terms_to_sum = [(math.log(y_pred[i] + 1) - math.log(y[i] + 1)) ** 2.0 for i,pred in enumerate(y_pred)]
    return (sum(terms_to_sum) * (1.0/len(y))) ** 0.5

In [12]:
rmsle(y_test, model.predict(X_test))

0.10178206768299114

Prediction

In [13]:
%%time
model = LGBMRegressor(random_state=47, max_depth=7, n_estimators=500)
model.fit(X.drop(['Password', 'Times'], axis=1), y, eval_metric='rmse')

CPU times: user 13min 1s, sys: 26.5 s, total: 13min 28s
Wall time: 1min 54s


In [14]:
X_control = add_top_features(test.fillna('0'), files[:7])
X_control = all_features(X_control)
X_control = num_of_symbols(X_control)
X_control = add_words_features(X_control)
prediction = np.exp(model.predict(X_control.drop(['Password', 'Id'], axis=1)))
pd.DataFrame(prediction, index=range(test.shape[0]), columns=['Times']).to_csv('lgbm_submit6.csv', index_label='Id')

HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1037875), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1037875), HTML(value='')))


