# ❇️VECTORIZERS #

##  ` 0️⃣Opis zadania` ### 


## 🇵🇱 Zadanie 2.2.8 (opcjonalnie)

Nowy zbiór z `KLEJ` - [PolEmo2.0-IN](https://clarin-pl.eu/dspace/handle/11321/710). To zestaw recenzji online z dziedziny medycyny i hoteli. Zadaniem jest przewidzenie sentymentu recenzji.

* input: `../input/klej/klej_polemo2.0-in/train.tsv`
* models: `../models/word2vec/klej_polemo2.0-in_train`

##  ` 1️⃣Import bibliotek` ### 

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from collections import Counter, defaultdict
import spacy

from sklearn.model_selection import cross_val_score, cross_val_predict, StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import random
import os

from gensim.utils import simple_preprocess
from tensorflow.keras.utils import to_categorical

import mlflow
from itertools import product
import time
import datetime
import warnings
import tensorflow as tf

import team_helper as th
from team_helper import recall, precision, f1, get_y
from team_helper import random_polemo2_opinion as random_opinion

from sklearn.model_selection import cross_validate
from scikitplot.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score

In [15]:
from scikitplot.estimators import plot_learning_curve

##  ` 3️⃣Usunięcie niepotrzebnych ostrzeżeń + ustawienie mlflow` ### 

In [2]:
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)

In [20]:
# Point MLflow at the file-based tracking store used for this notebook's runs.
MLFLOW_TRACKING_URI = "file:///home/jovyan/nlp2/shared-mlruns/team-three/mariusz"
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)

##  ` 4️⃣Próba zapewnienia powtarzalności eksperymentu` ### 

In [3]:
# Seed every random number generator the notebook relies on with one shared value.
seed_value = 0

# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so setting it
# here likely only affects child processes, not the running kernel — confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)

# Python stdlib, NumPy and TensorFlow (TF2 API and the v1 compatibility API).
for seed_fn in (random.seed,
                np.random.seed,
                tf.random.set_seed,
                tf.compat.v1.set_random_seed):
    seed_fn(seed_value)

##  ` 5️⃣Eksploracja zbioru` ### 

In [4]:
# Load the tab-separated training split into a DataFrame.
TRAIN_PATH = '../input/klej/klej_polemo2.0-in/train.tsv'
df = pd.read_csv(TRAIN_PATH, sep='\t')

In [5]:
# Display the loaded frame (columns: sentence, target); the notebook renders a truncated view.
df

Unnamed: 0,sentence,target
0,Super lekarz i człowiek przez duże C . Bardzo ...,__label__meta_plus_m
1,Bardzo olewcze podejscie do pacjenta . Przypro...,__label__meta_minus_m
2,Lekarz zalecił mi kurację alternatywną do doty...,__label__meta_amb
3,Konsumenci oczywiście kierują się ceną . Te l...,__label__meta_zero
4,Pani Doktor Iwona jest profesjonalistką w każd...,__label__meta_plus_m
...,...,...
5739,"Centralne skrzydło jest jednokondygnacyjne , z...",__label__meta_zero
5740,Ogólnie w hotelu panuje balagan informacyjny -...,__label__meta_minus_m
5741,Przybyli śmy z rodziną na krotki wypoczynek . ...,__label__meta_amb
5742,Opinię może wyrazić dzisiaj każdy i jest to je...,__label__meta_zero


In [6]:
# Share of each sentiment label in the training data (fractions summing to 1).
target_column = df['target']
target_column.value_counts(normalize=True)

__label__meta_minus_m    0.380223
__label__meta_plus_m     0.270369
__label__meta_amb        0.181929
__label__meta_zero       0.167479
Name: target, dtype: float64

##  ` 6️⃣Wyodrębnienie zmiennych i preprocessing` ### 

In [7]:
# Build two text variants from the raw sentences; each ends up as a single
# space-joined string per document so it can be fed to a text vectorizer.
sentences = df['sentence']

# Variant 1: tokens from th.simple_tokens, re-joined with spaces.
X_oryg = sentences.map(th.simple_tokens).apply(' '.join)

# Variant 2: th.preprocessing followed by gensim's simple_preprocess, re-joined with spaces.
Xprep = (
    sentences
    .map(th.preprocessing)
    .map(simple_preprocess)
    .apply(' '.join)
)

# Integer-encode the string labels; only the codes are kept.
y = df['target'].factorize()[0]

In [8]:
def lemmatize_text(doc):
    """Return `doc` with every token replaced by its lemma, joined by single spaces.

    The text is processed with `th.nlp_sm` and the `lemma_` attribute of each
    resulting token is used.
    """
    lemmas = (token.lemma_ for token in th.nlp_sm(doc))
    return ' '.join(lemmas)
# Replace the preprocessed texts with their lemmatized form.
Xprep = Xprep.apply(lemmatize_text)

In [9]:
import texthero as hero
# Alternative cleaning path: run texthero's default cleaning on the raw sentences
# and keep the result next to the original text as a new column.
cleaned_sentences = hero.clean(df["sentence"])
df["clean_text"] = cleaned_sentences

In [10]:
# Candidate model input: the texthero-cleaned text.
# NOTE: the following cells reassign SELECTED_X to Xprep, so this choice does not
# take effect when the notebook is run top to bottom.
SELECTED_X = df['clean_text']

In [11]:
# Use the preprocessed + lemmatized text as model input
# (the experiment configuration cell below assigns the same value again).
SELECTED_X = Xprep

In [12]:
# --- Experiment configuration -------------------------------------------------

# MLflow experiment name and the text variant fed to the vectorizers.
EXP_NAME = 'Vectorizers_polemo2_0_IN_X_prep_t'
SELECTED_X = Xprep

# Vocabulary size limits to try.
max_features_list = [300, 500, 1000]

# Tokenizer callables passed to the vectorizers.
tokenizer_list = [th.polish_tokenizer_md]

# Stop-word variants: the list built by th.get_stopwords, and no stop words at all.
md_stop_words = th.get_stopwords(th.spacy_stop_words_md)
stop_words_list = [md_stop_words, []]

# Vectorizer classes to compare.
vectorizers_list = [CountVectorizer, TfidfVectorizer]

# Document-frequency cut-offs. In scikit-learn a float is a proportion of
# documents and an int is an absolute document count, so the trailing `1`
# in min_df_list means "at least one document".
min_df_list = [0.01, 0.1, 1]
max_df_list = [0.3, 0.5, 1.0]

# Models to evaluate; the experiment loop iterates this as (name, estimator) pairs.
models_list = th.get_models(use_dummy=False)

##  ` 7️⃣Eksperyment` ### 

In [None]:
# Grid experiment: for every vectorizer configuration, vectorize SELECTED_X once,
# then cross-validate every model on the resulting matrix and record parameters,
# metrics and diagnostic plots as one MLflow run per (configuration, model) pair.

# Every combination of vectorizer class and vectorizer hyper-parameters.
param_grid = list(product(vectorizers_list, max_features_list, tokenizer_list,
                          stop_words_list, min_df_list, max_df_list))
params_cnt = len(param_grid)

exp_start = datetime.datetime.now()

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
scoring = ['f1_micro']

# Looked up once instead of once per run; assumes th._eid(EXP_NAME) returns the
# same experiment id on every call — TODO confirm.
experiment_id = th._eid(EXP_NAME)

for n, (Vectorizer, max_features, tokenizer, stop_words, min_df, max_df) in enumerate(param_grid, 1):

    model_str = f'max_features = {max_features} tokenizer={tokenizer.__name__}, vectorizer={Vectorizer.__name__}, stop_words = {len(stop_words)}, dfs=min:{min_df}, max:{max_df}'

    print(f'{n}/{params_cnt}')
    print(model_str)

    vectorizer_kwargs = {'max_features': max_features,
                         'tokenizer': tokenizer,
                         'stop_words': stop_words,
                         'min_df': min_df,
                         'max_df': max_df}
    vec = Vectorizer(**vectorizer_kwargs)

    # NOTE(review): the vectorizer is fitted on the full corpus before the
    # cross-validation split, so vocabulary (and IDF weights for TfidfVectorizer)
    # are derived from the held-out folds as well. Kept as-is because refitting
    # per fold would repeat the expensive vectorization step for every fold.
    start = time.time()
    X = vec.fit_transform(SELECTED_X).toarray()
    end = time.time()
    print('##'*10)
    print(f'Vectorization time: {end-start}')
    print('##'*10)

    # Parameters shared by every model run for this vectorizer configuration.
    vectorizer_params = {'max_features': max_features,
                         'tokenizer': tokenizer.__name__,
                         'vectorizer': Vectorizer.__name__,
                         'lenstop_words': len(stop_words),
                         'min_df': min_df,
                         'max_df': max_df}

    for model_name, model_obj in models_list:
        with mlflow.start_run(experiment_id=experiment_id, run_name=f'{model_str}-{model_name}'):
            # Log params first so a run that fails mid-way is still identifiable.
            mlflow.log_params({'model': model_name, **vectorizer_params})

            start = time.time()
            scores = cross_validate(model_obj, X, y, cv=cv, scoring=scoring, return_train_score=True)

            # Summary statistics are rounded to 2 decimals, matching what is printed and logged.
            train_mean = np.around(np.mean(scores['train_f1_micro']), 2)
            train_std = np.around(np.std(scores['train_f1_micro']), 2)
            test_mean = np.around(np.mean(scores['test_f1_micro']), 2)
            test_std = np.around(np.std(scores['test_f1_micro']), 2)

            print(f'model = {model_name} score = {test_mean}')

            y_pred = cross_val_predict(model_obj, X, y, cv=cv)

            # Draw on explicitly created axes and close each figure after saving;
            # otherwise every iteration leaves several figures open and memory
            # grows over the whole grid.
            fig_cm, ax_cm = plt.subplots()
            plot_confusion_matrix(y, y_pred, title='model: {}'.format(model_name), ax=ax_cm)
            fig_cm.savefig("confusion_matrix.png")
            plt.close(fig_cm)

            fig_lc, ax_lc = plt.subplots()
            plot_learning_curve(model_obj, X, y, title='model: {}'.format(model_name), ax=ax_lc)
            fig_lc.savefig("learning_curve.png")
            plt.close(fig_lc)

            # The measured time covers cross-validation, prediction and plotting.
            end = time.time()
            print('time: ', round((end-start), 2))
            print('-'*10)

            # log metrics
            mlflow.log_metrics({'mean_train_f1_micro': float(train_mean),
                                'mean_test_f1_micro': float(test_mean),
                                'std_train_f1_micro': float(train_std),
                                'std_test_f1_micro': float(test_std),
                                'calc_time': (end-start)})

            # log artifacts (the PNG files are overwritten on every iteration)
            mlflow.log_artifact("confusion_matrix.png")
            mlflow.log_artifact("learning_curve.png")


exp_end = datetime.datetime.now()
print('='*10)
print(f'Start of experiment: {exp_start}')
print(f'End of experiment: {exp_end}')
th.calculate_delta((exp_end-exp_start))

# Collect all runs of this experiment into a DataFrame for later analysis.
dfruns = mlflow.search_runs(experiment_ids=[experiment_id])

1/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.01, max:0.3
####################
Vectorization time: 167.49834942817688
####################
model = decision tree score = 0.48
time:  1.62
----------
model = random forest score = 0.55
time:  6.88
----------
model = extra-trees score = 0.45
time:  6.29
----------
model = lightgbm score = 0.69
time:  8.13
----------
model = catboost score = 0.72
time:  33.27
----------
model = xgboost score = 0.72
time:  45.41
----------
2/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.01, max:0.5




####################
Vectorization time: 155.29947757720947
####################
model = decision tree score = 0.48
time:  1.42
----------
model = random forest score = 0.55
time:  6.34
----------
model = extra-trees score = 0.47
time:  5.38
----------
model = lightgbm score = 0.69
time:  7.38
----------
model = catboost score = 0.72
time:  29.61
----------
model = xgboost score = 0.72
time:  56.84
----------
3/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.01, max:1.0




####################
Vectorization time: 160.43988275527954
####################
model = decision tree score = 0.48
time:  1.54
----------
model = random forest score = 0.55
time:  6.85
----------
model = extra-trees score = 0.45
time:  5.94
----------
model = lightgbm score = 0.7
time:  9.74
----------
model = catboost score = 0.72
time:  33.51
----------
model = xgboost score = 0.72
time:  64.27
----------
4/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:0.3




####################
Vectorization time: 159.390074968338
####################
model = decision tree score = 0.49
time:  0.61
----------
model = random forest score = 0.58
time:  5.0
----------
model = extra-trees score = 0.43
time:  3.86
----------
model = lightgbm score = 0.61
time:  4.66
----------
model = catboost score = 0.62
time:  6.48
----------
model = xgboost score = 0.61
time:  8.93
----------
5/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:0.5




####################
Vectorization time: 149.3212649822235
####################
model = decision tree score = 0.53
time:  0.57
----------
model = random forest score = 0.57
time:  4.64
----------
model = extra-trees score = 0.43
time:  3.5
----------
model = lightgbm score = 0.61
time:  4.38
----------
model = catboost score = 0.62
time:  6.57
----------
model = xgboost score = 0.62
time:  9.15
----------
6/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:1.0




####################
Vectorization time: 151.7968955039978
####################
model = decision tree score = 0.53
time:  0.69
----------
model = random forest score = 0.59
time:  5.23
----------
model = extra-trees score = 0.44
time:  3.93
----------
model = lightgbm score = 0.61
time:  6.8
----------
model = catboost score = 0.62
time:  9.05
----------
model = xgboost score = 0.62
time:  14.53
----------
7/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:0.3




####################
Vectorization time: 156.3796694278717
####################
model = decision tree score = 0.48
time:  1.46
----------
model = random forest score = 0.55
time:  6.15
----------
model = extra-trees score = 0.45
time:  5.66
----------
model = lightgbm score = 0.69
time:  6.87
----------
model = catboost score = 0.72
time:  31.05
----------
model = xgboost score = 0.71
time:  48.55
----------
8/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:0.5




####################
Vectorization time: 157.00638055801392
####################
model = decision tree score = 0.48
time:  1.49
----------
model = random forest score = 0.55
time:  6.28
----------
model = extra-trees score = 0.47
time:  5.95
----------
model = lightgbm score = 0.69
time:  7.6
----------
model = catboost score = 0.72
time:  29.3
----------
model = xgboost score = 0.72
time:  45.54
----------
9/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:1.0




####################
Vectorization time: 158.28506994247437
####################
model = decision tree score = 0.48
time:  1.46
----------
model = random forest score = 0.56
time:  6.33
----------
model = extra-trees score = 0.45
time:  5.62
----------
model = lightgbm score = 0.7
time:  7.21
----------
model = catboost score = 0.72
time:  29.27
----------
model = xgboost score = 0.72
time:  45.62
----------
10/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 0, dfs=min:0.01, max:0.3
####################
Vectorization time: 147.01375198364258
####################
model = decision tree score = 0.48
time:  1.58
----------
model = random forest score = 0.54
time:  6.54
----------
model = extra-trees score = 0.44
time:  6.03
----------
model = lightgbm score = 0.7
time:  8.55
----------
model = catboost score = 0.72
time:  30.29
----------
model = xgboost score = 0.73
time:  45.71
----------
11/108
max_features = 300 tokenizer=polish_tokenizer_



####################
Vectorization time: 171.61307191848755
####################
model = decision tree score = 0.48
time:  2.81
----------
model = random forest score = 0.55
time:  8.57
----------
model = extra-trees score = 0.45
time:  9.21
----------
model = lightgbm score = 0.7
time:  13.35
----------
model = catboost score = 0.72
time:  52.04
----------
model = xgboost score = 0.72
time:  73.61
----------
20/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.01, max:0.5




####################
Vectorization time: 153.91492676734924
####################
model = decision tree score = 0.48
time:  2.35
----------
model = random forest score = 0.55
time:  7.6
----------
model = extra-trees score = 0.46
time:  7.8
----------
model = lightgbm score = 0.7
time:  10.52
----------
model = catboost score = 0.72
time:  46.6
----------
model = xgboost score = 0.72
time:  81.29
----------
21/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.01, max:1.0




####################
Vectorization time: 155.5303144454956
####################
model = decision tree score = 0.48
time:  2.57
----------
model = random forest score = 0.55
time:  7.97
----------
model = extra-trees score = 0.45
time:  7.9
----------
model = lightgbm score = 0.7
time:  8.76
----------
model = catboost score = 0.73
time:  45.71
----------
model = xgboost score = 0.73
time:  73.89
----------
22/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:0.3




####################
Vectorization time: 151.66817545890808
####################
model = decision tree score = 0.49
time:  0.55
----------
model = random forest score = 0.58
time:  4.55
----------
model = extra-trees score = 0.43
time:  3.47
----------
model = lightgbm score = 0.61
time:  4.31
----------
model = catboost score = 0.62
time:  6.41
----------
model = xgboost score = 0.61
time:  9.04
----------
23/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:0.5




####################
Vectorization time: 150.46063470840454
####################
model = decision tree score = 0.53
time:  0.6
----------
model = random forest score = 0.57
time:  4.55
----------
model = extra-trees score = 0.43
time:  3.47
----------
model = lightgbm score = 0.61
time:  4.49
----------
model = catboost score = 0.62
time:  6.69
----------
model = xgboost score = 0.62
time:  9.52
----------
24/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:1.0




####################
Vectorization time: 153.03526043891907
####################
model = decision tree score = 0.53
time:  0.65
----------
model = random forest score = 0.59
time:  4.82
----------
model = extra-trees score = 0.44
time:  3.61
----------
model = lightgbm score = 0.61
time:  5.1
----------
model = catboost score = 0.62
time:  7.2
----------
model = xgboost score = 0.62
time:  10.3
----------
25/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:0.3




####################
Vectorization time: 161.70775961875916
####################
model = decision tree score = 0.48
time:  3.08
----------
model = random forest score = 0.56
time:  9.32
----------
model = extra-trees score = 0.46
time:  9.45
----------
model = lightgbm score = 0.7
time:  9.66
----------
model = catboost score = 0.72
time:  46.41
----------
model = xgboost score = 0.72
time:  77.48
----------
26/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:0.5




####################
Vectorization time: 153.7288920879364
####################
model = decision tree score = 0.48
time:  2.62
----------
model = random forest score = 0.55
time:  7.41
----------
model = extra-trees score = 0.46
time:  7.93
----------
model = lightgbm score = 0.7
time:  8.29
----------
model = catboost score = 0.72
time:  45.58
----------
model = xgboost score = 0.72
time:  73.1
----------
27/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:1.0




####################
Vectorization time: 151.09973764419556
####################
model = decision tree score = 0.48
time:  2.48
----------
model = random forest score = 0.55
time:  7.43
----------
model = extra-trees score = 0.45
time:  8.31
----------
model = lightgbm score = 0.7
time:  8.92
----------
model = catboost score = 0.73
time:  47.28
----------
model = xgboost score = 0.73
time:  78.09
----------
28/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 0, dfs=min:0.01, max:0.3
####################
Vectorization time: 151.48783445358276
####################
model = decision tree score = 0.48
time:  2.64
----------
model = random forest score = 0.55
time:  8.51
----------
model = extra-trees score = 0.45
time:  9.1
----------
model = lightgbm score = 0.71
time:  13.44
----------
model = catboost score = 0.74
time:  57.95
----------
model = xgboost score = 0.73
time:  89.95
----------
29/108
max_features = 500 tokenizer=polish_tokenizer



####################
Vectorization time: 154.04586505889893
####################
model = decision tree score = 0.48
time:  6.63
----------
model = random forest score = 0.53
time:  11.37
----------
model = extra-trees score = 0.45
time:  13.78
----------
model = lightgbm score = 0.7
time:  11.29
----------
model = catboost score = 0.74
time:  87.97
----------
model = xgboost score = 0.74
time:  214.8
----------
38/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.01, max:0.5




####################
Vectorization time: 157.92006158828735
####################
model = decision tree score = 0.48
time:  5.8
----------
model = random forest score = 0.52
time:  9.94
----------
model = extra-trees score = 0.43
time:  12.29
----------
model = lightgbm score = 0.71
time:  12.43
----------
model = catboost score = 0.73
time:  92.89
----------
model = xgboost score = 0.73
time:  154.25
----------
39/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.01, max:1.0




####################
Vectorization time: 158.49378848075867
####################
model = decision tree score = 0.48
time:  6.85
----------
model = random forest score = 0.53
time:  11.29
----------
model = extra-trees score = 0.44
time:  14.51
----------
model = lightgbm score = 0.71
time:  13.16
----------
model = catboost score = 0.73
time:  85.41
----------
model = xgboost score = 0.73
time:  153.77
----------
40/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:0.3




####################
Vectorization time: 155.48419332504272
####################
model = decision tree score = 0.49
time:  0.56
----------
model = random forest score = 0.58
time:  4.66
----------
model = extra-trees score = 0.43
time:  3.53
----------
model = lightgbm score = 0.61
time:  4.52
----------
model = catboost score = 0.62
time:  7.77
----------
model = xgboost score = 0.61
time:  9.85
----------
41/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:0.5




####################
Vectorization time: 192.31816363334656
####################
model = decision tree score = 0.53
time:  1.0
----------
model = random forest score = 0.57
time:  9.46
----------
model = extra-trees score = 0.43
time:  7.18
----------
model = lightgbm score = 0.61
time:  10.59
----------
model = catboost score = 0.62
time:  24.08
----------
model = xgboost score = 0.62
time:  23.51
----------
42/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:0.1, max:1.0




####################
Vectorization time: 157.10232305526733
####################
model = decision tree score = 0.53
time:  0.66
----------
model = random forest score = 0.59
time:  4.77
----------
model = extra-trees score = 0.44
time:  3.68
----------
model = lightgbm score = 0.61
time:  5.46
----------
model = catboost score = 0.62
time:  7.32
----------
model = xgboost score = 0.62
time:  9.88
----------
43/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:0.3




####################
Vectorization time: 182.22296166419983
####################
model = decision tree score = 0.48
time:  13.25
----------
model = random forest score = 0.52
time:  23.08
----------
model = extra-trees score = 0.43
time:  30.9
----------
model = lightgbm score = 0.71
time:  30.77
----------
model = catboost score = 0.73
time:  217.42
----------
model = xgboost score = 0.74
time:  354.33
----------
44/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:0.5




####################
Vectorization time: 191.22986364364624
####################
model = decision tree score = 0.48
time:  6.51
----------
model = random forest score = 0.53
time:  11.18
----------
model = extra-trees score = 0.44
time:  14.14
----------
model = lightgbm score = 0.71
time:  11.73
----------
model = catboost score = 0.74
time:  84.87
----------
model = xgboost score = 0.74
time:  152.12
----------
45/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 381, dfs=min:1, max:1.0




####################
Vectorization time: 158.4516351222992
####################
model = decision tree score = 0.48
time:  6.86
----------
model = random forest score = 0.53
time:  12.2
----------
model = extra-trees score = 0.45
time:  14.89
----------
model = lightgbm score = 0.7
time:  13.24
----------
model = catboost score = 0.74
time:  88.55
----------
model = xgboost score = 0.74
time:  151.31
----------
46/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=CountVectorizer, stop_words = 0, dfs=min:0.01, max:0.3
####################
Vectorization time: 155.23762226104736
####################
model = decision tree score = 0.48
time:  6.97
----------
model = random forest score = 0.53
time:  11.62
----------
model = extra-trees score = 0.43
time:  15.09
----------
model = lightgbm score = 0.72
time:  17.75
----------
model = catboost score = 0.74
time:  91.56
----------
model = xgboost score = 0.75
time:  150.77
----------
47/108
max_features = 1000 tokenizer=polish_t



####################
Vectorization time: 192.5678563117981
####################
model = decision tree score = 0.48
time:  3.6
----------
model = random forest score = 0.55
time:  8.99
----------
model = extra-trees score = 0.47
time:  7.92
----------
model = lightgbm score = 0.69
time:  30.22
----------
model = catboost score = 0.71
time:  130.39
----------
model = xgboost score = 0.71
time:  102.78
----------
56/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.01, max:0.5




####################
Vectorization time: 185.97773575782776
####################
model = decision tree score = 0.48
time:  2.63
----------
model = random forest score = 0.55
time:  10.99
----------
model = extra-trees score = 0.48
time:  6.34
----------
model = lightgbm score = 0.69
time:  34.42
----------
model = catboost score = 0.72
time:  129.51
----------
model = xgboost score = 0.72
time:  143.33
----------
57/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.01, max:1.0




####################
Vectorization time: 205.87580728530884
####################
model = decision tree score = 0.48
time:  3.74
----------
model = random forest score = 0.56
time:  10.36
----------
model = extra-trees score = 0.46
time:  11.62
----------
model = lightgbm score = 0.69
time:  49.38
----------
model = catboost score = 0.71
time:  134.99
----------
model = xgboost score = 0.71
time:  177.46
----------
58/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:0.3




####################
Vectorization time: 181.2309443950653
####################
model = decision tree score = 0.48
time:  1.53
----------
model = random forest score = 0.55
time:  6.49
----------
model = extra-trees score = 0.48
time:  4.03
----------
model = lightgbm score = 0.6
time:  7.78
----------
model = catboost score = 0.61
time:  18.27
----------
model = xgboost score = 0.61
time:  13.9
----------
59/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:0.5




####################
Vectorization time: 167.83669805526733
####################
model = decision tree score = 0.52
time:  0.93
----------
model = random forest score = 0.56
time:  6.55
----------
model = extra-trees score = 0.47
time:  4.05
----------
model = lightgbm score = 0.61
time:  8.67
----------
model = catboost score = 0.62
time:  19.0
----------
model = xgboost score = 0.61
time:  12.02
----------
60/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:1.0




####################
Vectorization time: 155.10988759994507
####################
model = decision tree score = 0.53
time:  0.89
----------
model = random forest score = 0.57
time:  5.98
----------
model = extra-trees score = 0.47
time:  3.64
----------
model = lightgbm score = 0.62
time:  7.28
----------
model = catboost score = 0.62
time:  15.34
----------
model = xgboost score = 0.62
time:  13.24
----------
61/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:0.3




####################
Vectorization time: 165.9286448955536
####################
model = decision tree score = 0.48
time:  2.54
----------
model = random forest score = 0.55
time:  8.77
----------
model = extra-trees score = 0.46
time:  7.23
----------
model = lightgbm score = 0.69
time:  15.76
----------
model = catboost score = 0.71
time:  83.73
----------
model = xgboost score = 0.71
time:  70.6
----------
62/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:0.5




####################
Vectorization time: 198.36026215553284
####################
model = decision tree score = 0.48
time:  2.43
----------
model = random forest score = 0.55
time:  9.57
----------
model = extra-trees score = 0.48
time:  7.55
----------
model = lightgbm score = 0.69
time:  31.5
----------
model = catboost score = 0.72
time:  131.2
----------
model = xgboost score = 0.72
time:  142.12
----------
63/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:1.0




####################
Vectorization time: 204.16364431381226
####################
model = decision tree score = 0.48
time:  2.76
----------
model = random forest score = 0.56
time:  8.47
----------
model = extra-trees score = 0.47
time:  8.9
----------
model = lightgbm score = 0.69
time:  32.4
----------
model = catboost score = 0.71
time:  116.51
----------
model = xgboost score = 0.72
time:  154.2
----------
64/108
max_features = 300 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 0, dfs=min:0.01, max:0.3
####################
Vectorization time: 198.3984730243683
####################
model = decision tree score = 0.48
time:  3.16
----------
model = random forest score = 0.55
time:  12.21
----------
model = extra-trees score = 0.46
time:  7.67
----------
model = lightgbm score = 0.7
time:  45.53
----------
model = catboost score = 0.72
time:  154.61
----------
model = xgboost score = 0.72
time:  57.55
----------
65/108
max_features = 300 tokenizer=polish_tokeniz



####################
Vectorization time: 202.39337491989136
####################
model = decision tree score = 0.48
time:  6.79
----------
model = random forest score = 0.55
time:  16.32
----------
model = extra-trees score = 0.46
time:  16.4
----------
model = lightgbm score = 0.7
time:  318.53
----------
model = catboost score = 0.72
time:  148.64
----------
model = xgboost score = 0.72
time:  694.99
----------
74/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.01, max:0.5




####################
Vectorization time: 188.32303643226624
####################
model = decision tree score = 0.48
time:  4.85
----------
model = random forest score = 0.55
time:  12.69
----------
model = extra-trees score = 0.47
time:  11.99
----------
model = lightgbm score = 0.7
time:  44.73
----------
model = catboost score = 0.72
time:  150.37
----------
model = xgboost score = 0.72
time:  162.93
----------
75/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.01, max:1.0




####################
Vectorization time: 187.92902088165283
####################
model = decision tree score = 0.49
time:  4.72
----------
model = random forest score = 0.55
time:  11.84
----------
model = extra-trees score = 0.46
time:  12.22
----------
model = lightgbm score = 0.7
time:  45.73
----------
model = catboost score = 0.72
time:  150.93
----------
model = xgboost score = 0.73
time:  189.91
----------
76/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:0.3




####################
Vectorization time: 190.41149401664734
####################
model = decision tree score = 0.48
time:  1.01
----------
model = random forest score = 0.55
time:  7.39
----------
model = extra-trees score = 0.48
time:  4.51
----------
model = lightgbm score = 0.6
time:  19.65
----------
model = catboost score = 0.61
time:  26.29
----------
model = xgboost score = 0.61
time:  34.32
----------
77/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:0.5




####################
Vectorization time: 189.22960376739502
####################
model = decision tree score = 0.52
time:  1.02
----------
model = random forest score = 0.56
time:  7.22
----------
model = extra-trees score = 0.47
time:  4.64
----------
model = lightgbm score = 0.61
time:  19.63
----------
model = catboost score = 0.62
time:  27.04
----------
model = xgboost score = 0.61
time:  33.62
----------
78/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:1.0




####################
Vectorization time: 187.35727405548096
####################
model = decision tree score = 0.53
time:  1.1
----------
model = random forest score = 0.57
time:  7.56
----------
model = extra-trees score = 0.47
time:  4.56
----------
model = lightgbm score = 0.62
time:  20.76
----------
model = catboost score = 0.62
time:  28.46
----------
model = xgboost score = 0.62
time:  37.73
----------
79/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:0.3




####################
Vectorization time: 178.84391379356384
####################
model = decision tree score = 0.48
time:  4.73
----------
model = random forest score = 0.55
time:  10.92
----------
model = extra-trees score = 0.46
time:  9.96
----------
model = lightgbm score = 0.69
time:  19.15
----------
model = catboost score = 0.72
time:  135.83
----------
model = xgboost score = 0.72
time:  188.81
----------
80/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:0.5




####################
Vectorization time: 184.4334774017334
####################
model = decision tree score = 0.48
time:  4.6
----------
model = random forest score = 0.55
time:  11.44
----------
model = extra-trees score = 0.47
time:  12.93
----------
model = lightgbm score = 0.7
time:  42.7
----------
model = catboost score = 0.72
time:  150.13
----------
model = xgboost score = 0.72
time:  188.5
----------
81/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:1.0




####################
Vectorization time: 182.3237283229828
####################
model = decision tree score = 0.48
time:  4.19
----------
model = random forest score = 0.55
time:  10.43
----------
model = extra-trees score = 0.46
time:  11.84
----------
model = lightgbm score = 0.7
time:  40.3
----------
model = catboost score = 0.72
time:  149.58
----------
model = xgboost score = 0.72
time:  189.33
----------
82/108
max_features = 500 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 0, dfs=min:0.01, max:0.3
####################
Vectorization time: 181.04524159431458
####################
model = decision tree score = 0.48
time:  5.4
----------
model = random forest score = 0.55
time:  12.42
----------
model = extra-trees score = 0.45
time:  13.64
----------
model = lightgbm score = 0.71
time:  27.25
----------
model = catboost score = 0.73
time:  128.71
----------
model = xgboost score = 0.73
time:  109.97
----------
83/108
max_features = 500 tokenizer=polish_to



####################
Vectorization time: 153.79788446426392
####################
model = decision tree score = 0.49
time:  7.87
----------
model = random forest score = 0.53
time:  12.35
----------
model = extra-trees score = 0.45
time:  14.21
----------
model = lightgbm score = 0.7
time:  19.53
----------
model = catboost score = 0.72
time:  141.15
----------
model = xgboost score = 0.73
time:  160.24
----------
92/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.01, max:0.5




####################
Vectorization time: 154.72181463241577
####################
model = decision tree score = 0.48
time:  8.18
----------
model = random forest score = 0.53
time:  12.76
----------
model = extra-trees score = 0.43
time:  14.74
----------
model = lightgbm score = 0.7
time:  19.8
----------
model = catboost score = 0.72
time:  142.38
----------
model = xgboost score = 0.73
time:  165.01
----------
93/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.01, max:1.0




####################
Vectorization time: 154.3720109462738
####################
model = decision tree score = 0.48
time:  7.91
----------
model = random forest score = 0.53
time:  12.43
----------
model = extra-trees score = 0.43
time:  14.34
----------
model = lightgbm score = 0.7
time:  20.07
----------
model = catboost score = 0.73
time:  142.91
----------
model = xgboost score = 0.73
time:  162.38
----------
94/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:0.3




####################
Vectorization time: 154.7251739501953
####################
model = decision tree score = 0.48
time:  0.82
----------
model = random forest score = 0.55
time:  5.7
----------
model = extra-trees score = 0.48
time:  3.56
----------
model = lightgbm score = 0.6
time:  6.46
----------
model = catboost score = 0.61
time:  14.17
----------
model = xgboost score = 0.61
time:  13.24
----------
95/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:0.5




####################
Vectorization time: 152.89959383010864
####################
model = decision tree score = 0.52
time:  0.86
----------
model = random forest score = 0.56
time:  5.88
----------
model = extra-trees score = 0.47
time:  3.7
----------
model = lightgbm score = 0.61
time:  6.87
----------
model = catboost score = 0.62
time:  14.66
----------
model = xgboost score = 0.61
time:  12.3
----------
96/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:0.1, max:1.0




####################
Vectorization time: 154.52091884613037
####################
model = decision tree score = 0.53
time:  0.93
----------
model = random forest score = 0.57
time:  6.23
----------
model = extra-trees score = 0.47
time:  3.71
----------
model = lightgbm score = 0.62
time:  7.88
----------
model = catboost score = 0.62
time:  15.85
----------
model = xgboost score = 0.62
time:  14.14
----------
97/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:0.3




####################
Vectorization time: 160.47012066841125
####################
model = decision tree score = 0.48
time:  7.94
----------
model = random forest score = 0.53
time:  13.08
----------
model = extra-trees score = 0.45
time:  14.17
----------
model = lightgbm score = 0.7
time:  20.54
----------
model = catboost score = 0.72
time:  143.96
----------
model = xgboost score = 0.73
time:  163.82
----------
98/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:0.5




####################
Vectorization time: 155.87742042541504
####################
model = decision tree score = 0.47
time:  7.89
----------
model = random forest score = 0.54
time:  12.33
----------
model = extra-trees score = 0.43
time:  14.15
----------
model = lightgbm score = 0.7
time:  19.57
----------
model = catboost score = 0.72
time:  142.85
----------
model = xgboost score = 0.73
time:  160.98
----------
99/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 381, dfs=min:1, max:1.0




####################
Vectorization time: 151.5137951374054
####################
model = decision tree score = 0.48
time:  8.58
----------
model = random forest score = 0.53
time:  12.98
----------
model = extra-trees score = 0.46
time:  15.13
----------
model = lightgbm score = 0.7
time:  19.89
----------
model = catboost score = 0.73
time:  145.43
----------
model = xgboost score = 0.73
time:  198.56
----------
100/108
max_features = 1000 tokenizer=polish_tokenizer_md, vectorizer=TfidfVectorizer, stop_words = 0, dfs=min:0.01, max:0.3
####################
Vectorization time: 164.6885859966278
####################
model = decision tree score = 0.48
time:  9.56
----------
model = random forest score = 0.53
time:  14.43
----------
model = extra-trees score = 0.44
time:  16.73
----------
model = lightgbm score = 0.71
time:  30.48
----------
model = catboost score = 0.73
time:  197.13
----------
model = xgboost score = 0.74
time:  197.7
----------
101/108
max_features = 1000 tokenizer=polis

In [None]:
# Start the MLflow web UI on port 5001 against the team's shared run store.
# The backend-store path is the same directory that this notebook passes to
# mlflow.set_tracking_uri (there written as a file:// URI), so the UI lists the
# runs logged by this notebook. The same directory is given as the default
# artifact root.
# NOTE(review): this is a server process started through a shell escape, so the
# cell presumably keeps the kernel busy until it is interrupted — confirm.
!mlflow ui --backend-store-uri /home/jovyan/nlp2/shared-mlruns/team-three/mariusz --default-artifact-root /home/jovyan/nlp2/shared-mlruns/team-three/mariusz --port 5001

In [19]:
# Start the MLflow web UI with no options at all. The captured output shows
# gunicorn listening on http://127.0.0.1:5000 and shutting down after an
# interrupt signal (^C), i.e. the cell ran until it was stopped manually.
# NOTE(review): no --backend-store-uri is passed here, so this presumably serves
# MLflow's default local store rather than the shared team directory used by
# mlflow.set_tracking_uri — confirm which store was intended.
!mlflow ui

[2021-10-24 06:47:24 +0000] [623] [INFO] Starting gunicorn 20.1.0
[2021-10-24 06:47:24 +0000] [623] [INFO] Listening at: http://127.0.0.1:5000 (623)
[2021-10-24 06:47:24 +0000] [623] [INFO] Using worker: sync
[2021-10-24 06:47:24 +0000] [625] [INFO] Booting worker with pid: 625
^C
[2021-10-24 06:48:21 +0000] [623] [INFO] Handling signal: int
[2021-10-24 06:48:21 +0000] [625] [INFO] Worker exiting (pid: 625)
