In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from IPython.display import display, clear_output
from sklearn.externals import joblib
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

import time
import datetime
from sys import stdout# import sys
import os
from itertools import product
from itertools import zip_longest

from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix

RANDOM_SEED = 42
parameter = {'random_state':[RANDOM_SEED]}

%matplotlib inline



### Стандарт на предобработанный датафрейм
Колонки:
1. **id** - идентификатор текста (**не индекс!**)
2. **text** - основной текст документа
3. **subj** - коды отделов
4. **ipv** - коды РЖ
5. **rgnti** - коды ГРНТИ
6. **title** - название документа
7. **body** - тело текста
8. **keywords** - ключевые слова
9. **correct** - признак правильности записи (*###*)

In [2]:
class Codes_helper:
    
    def __init__(self, ipv_name = None, ipv_change = None):
        self.set_ipv_codes(ipv_name)
        self.set_ipv_change(ipv_change)
        self.subj_codes = ['e1', 'e2', 'e3', 'e4', 'e5', 'e7', 'e9',
                           'f1', 'f2', 'f3', 'f4', 'f5', 'f7', 'f8', 'f9'] 
    
    def clear_null(self, data, column_name):
        data.index.name = 'index'
        data = data.reset_index()
        data = data.drop(data[data[column_name].isnull()].index)
        data = data.set_index(data['index']).drop('index', axis=1)
        return data
    
    def set_ipv_codes(self, ipv_name):
        """
        Loads ipv codes if path is valid.

        Args:
        ipv_name    -- absolute or relative path to the ipv codes file.
        """
        if ipv_name and os.path.exists(ipv_name):
            self.ipv_codes = list(pd.read_csv(ipv_name, sep='\t', header=None)[0])
            math = []
            for i in self.ipv_codes:
                if i.startswith('13'):
                    math += [i]
            self.ipv_codes = list(set(self.ipv_codes)-set(math))
        else:
            self.ipv_codes = None
    
    
    def set_ipv_change(self, ipv_change):
        """
        Loads ipv changes file if path is valid.

        Args:
        ipv_change    -- absolute or relative path to the ipv changes file with two columns 
                         separated with tab: original ipv code and it's change.
        """
        if ipv_change and os.path.exists(ipv_change):
            self.ipv_change = pd.read_csv(ipv_change, sep='\t', header=None)
        else:
            self.ipv_change = None
        
        #############################
#         ToDo Test
    
    def change_ipv(self, data):
        """
        Changes ipv column in pd.dataFrame according to ipv_change.

        Args:
        data          -- pd.DataFrame with ipv column.
        """
        data = self.clear_null(data, 'ipv')
        for i in self.ipv_change.index:
            temp = list(self.ipv_change.loc[i])
            data.ipv[data.ipv == temp[0]] = temp[1]
        math = []
        for i in self.ipv_codes:
            if i.startswith('13'):
                math += [i]
        codes = list(set(self.ipv_codes)-set(math))
        for i in list(set(list(data.ipv.unique()))-set(codes)):
            data = data.drop(data[data.ipv == i].index, axis=0)
        return data
    
    def change_subj(self, data):
        data = self.clear_null(data, 'subj')
        if data.subj.isnull().any():
            data.index.name = 'index'
            data = data.reset_index()
        codes = list(set(data.subj.unique())-set(self.subj_codes))
        for i in codes:
            data = data.drop(data[data.subj == i].index, axis=0)
        if data.subj.isnull().any():
            data = data.set_index(data['index']).drop('index', axis=1)
        return data
    
    def change_rgnti(self, data):
        data = self.clear_null(data, 'rgnti')
        data.rgnti = self.cut_rgnti(data.rgnti)
        return data
    #############################
    
    def get_codes(self, name):
        """
        Gives list with all valid codes of a rubricator.

        Args:
        name        -- string rubricator name in VINITI format ("SUBJ", "IPV", etc.)
        """
        if name.lower() == 'ipv':
            return self.ipv_codes
        elif name.lower() == 'subj':
            s = self.subj_codes
        else:
            print('Name must be SUBJ or IPV')
            return None
        return s
    
    
    def cut_rgnti(self, data):
        """
        Transforms rgnti cide into xx.xx format.

        Args:
        data        -- list/pd.Series with rgnti column.
        """
        for i in data.unique():
            if type(i) == str:
                data[data == str(i)] = str(i)[:5]
        return data

In [3]:
class Worker():
#     def __init__(self):
#         self.w2v_model  = None
#         self.w2v_size   = None
#         self.lang       = None
#         self.conv_type  = None
#         self.rubr_id    = None
#         self.clf        = None
#         self.data_train = None
#         self.data_test  = None
#         self.name_train = None
#         self.name_test  = None
#         self.res_folder = None
        
    def init(self, 
             w2v_model  = None, 
             w2v_size   = None, 
             lang       = None, 
             conv_type  = None, 
             rubr_id    = None, 
             clf        = None, 
             data_train = None, 
             data_test  = None, 
             name_train = None, 
             name_test  = None, 
             res_folder = None):
        self.w2v_model  = w2v_model
        self.w2v_size   = w2v_size
        self.lang       = lang
        self.conv_type  = conv_type
        self.rubr_id    = rubr_id
        self.clf        = clf
        self.data_train = data_train
        self.data_test  = data_test
        self.name_train = name_train
        self.name_test  = name_test
        self.res_folder = res_folder
        
    def load_w2v(self, w2v_path):
        """
        Loads word2vec model.

        Args:
        w2v_path (str): absolute or relative path to the w2v model file.
        """
        if os.path.exists(w2v_path):
            self.w2v_model = Word2Vec.load(w2v_path)
            if     '_ru'  in w2v_path:
                self.lang = 'ru'
            elif   '_en'  in w2v_path:
                self.lang = 'en'
            self.w2v_size = self.w2v_model.layer1_size
            return True
        else:
            print('Path to word2vec model is not valid.')
            return False
    
    def load_clf(self, clf_path):
        """
        Loads claffifier from given file name.

        Args:
        clf_path (str): absolute or relative path to the classifier model file.
        """
        if os.path.exists(clf_path):
            if   '_sum'  in clf_path:
                self.conv_type = 'sum'
            elif '_max'  in clf_path:
                self.conv_type = 'max'
            elif '_mean' in clf_path:
                self.conv_type = 'mean'
            if     'ipv'   in  clf_path:
                self.rubr_id = 'ipv'
            elif   'subj'  in  clf_path:
                self.rubr_id = 'subj'
            elif   'rgnti' in  clf_path:
                self.rubr_id = 'rgnti'
            self.clf = joblib.load(clf_path)
            return True
        else:
            return False
    
    def load_data(self, train_path, test_path=None, split_ratio=0.8):
        """
        Loads test and train data. If test_path is equal to None then train set will be splitted into two parts 
        according to split_ratio. 
        Loaded data in not transformed and contains all rubrics and additional fields.

        Args:
            train_path (str): absolute or relative path to train data file.
            test_path (str): absolute or relative path to test data file. If is equal to None, train data 
                           will be separated to make train and test sets.
            split_ratio (int): proportion of train data in splitting.
            
        Returns:
            True or False depending on the success of the task.
        """
        if train_path and os.path.exists(train_path):
            self.name_train = train_path
            if test_path and os.path.exists(test_path):
                self.name_test = test_path
                self.data_train = pd.read_csv(train_path, index_col=0, sep='\t')
                self.data_test = pd.read_csv(test_path, index_col=0, sep='\t')
            else:
                if test_path:
                    print('Test path is not valid, train set will be splitted.')
                self.name_test = train_path
                data = pd.read_csv(train_path, index_col=0, sep='\t')
                train_index, test_index = train_test_split(data.index.unique(), 
                                                           test_size=1-split_ratio)
                self.data_train, self.data_test = data.loc[train_index], data.loc[test_index]
            if   '_sum'  in os.path.split(train_path)[1]:
                self.conv_type = 'sum'
            elif '_max'  in os.path.split(train_path)[1]:
                self.conv_type = 'max'
            elif '_mean' in os.path.split(train_path)[1]:
                self.conv_type = 'mean'
            return True
        else:
            print('Please specify existing train data path.')
            self.data_train = None
            self.data_test = None
            return False
        
    def data_cleaning(self, train_path, split_ratio=None, description=None):
        """
        Creates one or two files with train and test data with only one rubric per string.
        Saves the result 

        Args:
        train_path (str): absolute or relative path to train data file.
        split_ratio (int): proportion of train data in splitting or None if no split is needed.\
        description (str): additional info shoul be added to file names.
        """
        if train_path and os.path.exists(train_path):
            data = pd.read_csv(train_path, index_col=0)
            data = self.split_all_sect(data)
            if split_ratio:
                train_index, test_index = train_test_split(data.index.unique(), 
                                                           test_size=1-split_ratio)
                self.data_train, self.data_test = data.loc[train_index], data.loc[test_index]
                self.name_train, self.name_test = train_path, train_path
                if remarks:
                    train_name = self.create_name('data', self.data_train, description='single_theme'+'_'+description)
                    test_name = self.create_name('data', self.data_test, description='test_single_theme'+'_'+description)
                else:
                    train_name = self.create_name('data', self.data_train, description='single_theme')
                    test_name = self.create_name('data', self.data_test, description='test_single_theme')
                self.save_file(train_name, self.data_train)
                self.save_file(test_name, self.data_test)
            else:
                self.data_train = data
                name = self.create_name('data', self.data_train, description='single_theme')
                self.save_file(name, self.data_train) 
        else:
            print('Please specify existing train data path.')
            self.data_train = None
            return False   
    
    def set_res_folder(self, path):
        """
        Creates directory for saving current working files.

        Args:
        path (str): absolute or relative path to result folder. If folder does not exixts it will be created. 
                       In that case head of path must exists.
        """
        if os.path.exists(path) and os.path.isdir(path):
            self.res_folder = path
            return True
        elif os.path.exists(os.path.split(path)[0]):
            os.makedirs(path)
            self.res_folder = path
            return True
        else:
            print('Result directory creation failed.')
            self.res_folder = None
            return False
    
    def set_rubr_id(self, rubric):
        """
        Checkes the correctness of input data and sets rubric if it is fine.
        
        Args:
        rubric (str): name of rubric to be set.
        """
        rubric = rubric.lower()
        if rubric in ['ipv', 'subj', 'rgnti']:
            self.rubr_id = rubric
            return True
        else:
            print('Not a valid rubric name. Please choose one of "ipv", "subj", "rgnti".')
            return False
    
    def set_conv_type(self, conv_type):
        """
        Checkes the correctness of input data and sets convlution type if it is fine.
        
        Args:
        conv_type (str): name of convolution type to be set.
        """
        conv_type = conv_type.lower()
        if conv_type in ['mean', 'max', 'sum']:
            self.conv_type = conv_type
        else:
            print('Not a valid convolution type name. Please choose one of "mean", "max", "sum".')
    
    def set_clf(self, path):
        """
        Checkes the correctness of input path and setas classification model if it is fine.
        
        Args:
        path (str): absolute or relative path to classification model.
        """
        if os.path.exists(path) and path[-4:] == '.pkl':
            self.load_clf(path)
            return True
        else:
            print('Not a valid path, try again. File type should be ".pkl".')
            return False
    
    def set_w2v(self, path):
        """
        Checkes the correctness of input path and setas word2vec model if it is fine.
        
        Args:
        path (str): absolute or relative path to word2vec model.
        """
        if os.path.exists(path) and path[-6:] == '.model':
            self.load_w2v(path)
            return True
        else:
            print('Not a valid path, try again. File type should be ".model".')
            return False
        
    ################################################
    def set_lang(self, lang):
        if lang in ['ru', 'en']:
            self.lang = lang
            return True
        else:
            print('Not a valid language. Please choose "en" or "ru".')
            return False
    ################################################
    def check_res_folder(self):
        """
        Checkes if result folder is set. 
        If it is not, function asks a new path and checks if it is correct.
        """
        while not self.res_folder:
            print('Please specify result folder:')
            folder = input()
            if os.path.exists(folder):
                self.set_res_folder(folder)
            else:
                print('Not a valid path, try again.')
        return True
           
    def check_rubr_id(self):
        """
        Checkes if rubric is set. 
        If it is not, function asks a new rubric and checks if it is correct.
        """
        while self.rubr_id is None:
            print('Please specify rubric id:')
            rubric = input()
            self.set_rubr_id(rubric)
        return True

    def check_conv_type(self):
        """
        Checkes if convolution type is set. 
        If it is not, function asks a new convolution type and checks if it is correct.
        """
        while self.conv_type is None:
            print('Please specify convolution type:')
            conv_type = input()
            self.set_conv_type(conv_type)
        return True
    
    def check_clf(self):
        """
        Checkes if classifier is set. 
        If it is not, function asks a new classifier path and checks if it is correct.
        """
        while self.clf is None:
            print('Please specify path to classifier file:')
            file = input()
            self.set_clf(file)
        
    def check_w2v(self):
        """
        Checkes if word2vec model is set. 
        If it is not, function asks a new word2vec path and checks if it is correct.
        """
        while self.w2v_model is None:
            print('Please specify path to word2vec model file:')
            file = input()
            self.set_w2v(file)
    ####################################################
    
    def check_lang(self):
        while not self.lang:
            print('Please specify language:')
            lang = input()
            self.set_lang(lang)
    
    def check_data(self):
        while self.data_train is None:
            file_train = '---'
            while not os.path.exists(file_train):
                print('Please specify path to train data file:')
                file_train = input()
            file_test = '---'
            while file_test and not os.path.exists(file_test):
                print('Please specify path to test data file (leave empty if no such file):')
                file_test = input()
                split = None
                if not file_test:
                    file_test = None
                    print('Please specify split fraction(leave empty if no split):')
                    split = float(input().replace(',','.'))
            self.load_data(train_path=file_train, test_path=file_test, split_ratio=split)
    ####################################################
    
    def create_name(self, file_type, save_data, version=1, description=None, info=None):
        """
        Creates saving name for file in result_folder. Always add "test" in description 
            for use it's features in name. By default train dataset is used.

        Args:
        file_type (str): type of data that file contains, can be "data", "clf_model", 
                       "result", "answers", "w2v_model", "w2v_vectors"
        save_data   -- string/DataFrame/dict{int or str:pd.DataFrame}/w2v model/clf model for saving.
        version     -- int or str with version of file.
        description (str): additional data that should be in file name.
        info        -- if file contains additional information or not (bool).
        """
        self.check_res_folder()
        if description:
            description = str(description)
        name = self.res_folder + '/'
        if info is not None:
            name += 'info_'
        name += file_type
        
        if file_type == 'w2v_model':
            name += '_'+str(self.w2v_size)
            name += '_'+self.lang
            name += '_'+str(round((self.data_train.shape[0]+self.data_test.shape[0])/1000))+'k'
            if description:
                name += '_'+description
            
        elif file_type == 'clf_model':
            name += '_'+self.lang
            name += '_'+self.rubr_id
            if description:
                name += '_'+description
            name += '_'+self.conv_type
            name += str(self.w2v_size)
            
        elif file_type == 'data':
            if description:
                name += '_'+description
            if 'test' in description:
                name += '_'+str(round(self.data_test.shape[0]/1000))+'k'
            else:
                name += '_'+str(round(self.data_train.shape[0]/1000))+'k'
            
        elif file_type == 'w2v_vectors':
            if description is None:
                name += '_'+str(round(self.data_train.shape[0]/1000))+'k'
            else:
                name += '_'+description
                if 'test' in description:
                    name += '_'+str(round(self.data_test.shape[0]/1000))+'k'
                else:
                    name += '_'+str(round(self.data_train.shape[0]/1000))+'k'
            name += '_'+self.conv_type
            name += str(self.w2v_size)
        
        elif file_type == 'answers':
            name += '_'+self.rubr_id
            name += '_'+self.lang
            name += '_'+self.conv_type
            name += str(self.w2v_size) 
            if 'test' in description:
                name += '_'+str(round(self.data_test.shape[0]/1000))+'k'
            else:
                name += '_'+str(round(self.data_train.shape[0]/1000))+'k'
            
        elif file_type == 'result':
            name += '_'+self.rubr_id
            name += '_'+self.lang
            name += '_'+self.conv_type
            name += str(self.w2v_size)
            if description and 'test' in description:
                name += '_'+str(round(self.data_test.shape[0]/1000))+'k'
            else:
                name += '_'+str(round(self.data_train.shape[0]/1000))+'k'
        
        name += '_v'+str(version)
        now = datetime.datetime.today()
        date = str(now.day)+'_'+str(now.month)+'_'+str(now.year)[2:]
        name += '_'+date
        
        if type(save_data)   == str:
            name += '.txt'     
        elif type(save_data) == pd.core.frame.DataFrame:
            name += '.csv'
        elif type(save_data) == Word2Vec:
            name += '.model'
        elif type(save_data) == dict:
            name += '.xlsx'
        else:
            name += '.plk'
        return name
    
    # дописать path_ipv_codes и path_replacement
    def create_sets(self, path_ipv_codes='./RJ_code_21017_utf8.txt', path_replacement='./Replacement_RJ_code_utf8.txt', split_ratio=None):  
        """
        Creates clear train and test X and y based on current train and test sets in object.
        Needs right names of chosen rubric column according to format ("subj", "ipv", "rgnti").

        Args:
        split_ratio (int): needed if there ae no test data specified in self.data_test.
        """
        self.check_rubr_id()
        helper = Codes_helper()
        if self.rubr_id == 'ipv':
            helper.set_ipv_codes(path_ipv_codes)
            helper.set_ipv_change(path_replacement)
            if self.data_train is not None:
                self.data_train = helper.change_ipv(self.data_train)
            if self.data_test is not None:
                self.data_test = helper.change_ipv(self.data_test)
        elif self.rubr_id == 'subj':
            if self.data_train is not None:
                self.data_train = helper.change_subj(self.data_train)
            if self.data_test is not None:
                self.data_test = helper.change_subj(self.data_test)
        elif self.rubr_id == 'rgnti':
            if self.data_train is not None:
                self.data_train = helper.change_rgnti(self.data_train)
            if self.data_test is not None:
                self.data_test = helper.change_rgnti(self.data_test)

        if split_ratio is None:
            if self.data_train is None and self.data_test is None:
                print('Please set train data and split ratio or test data.')
            elif self.data_test is None:
                print('Please set test data of split ratio.')
        else:
            if self.data_train is None:
                print('Please set train data.')
            elif self.data_test is None:
                train_index, test_index = train_test_split(self.data_train.index.unique(),
                                                   random_state=42, 
                                                   test_size=1-split_ratio)
                self.data_test = self.data_train.loc[test_index]
                self.data_train = self.data_train.loc[train_index]
        cols = np.array(self.data_train.columns)
        size = 0
        for i in cols:
            if str(i).isdigit():
                if int(i) > size:
                    size = int(i)
       #  size = np.array(list(map(int, cols[list(map(str.isdigit, str(cols)))])))
        if size == 0:
            print("No features columns are found.")
        elif size > 0:
            if self.rubr_id == 'subj':
                y_train = self.data_train.subj
                y_test  = self.data_test.subj
            elif self.rubr_id == 'ipv':
                y_train = self.data_train.ipv
                y_test  = self.data_test.ipv
            elif self.rubr_id == 'rgnti':
                y_train = Codes_helper().cut_rgnti(self.data_train.rgnti)
                y_test  = Codes_helper().cut_rgnti(self.data_test.rgnti)
            X_train = self.data_train[list(map(str,np.arange(size+1)))]
            X_test  = self.data_test[list(map(str,np.arange(size+1)))]
            X_test, y_test = self.change_test(X_test, y_test)
            y_train = list(y_train)
            return X_train, X_test, y_train, y_test
        return None, None, None, None
    
    # ToDo Description
    def create_w2v_vectors(self, data, description=None):
        """
        Creates pd.DataFrame with vectors instead of text column.

        Args:
        data (pd.DataFrame): .
        """
        self.check_conv_type()
        self.check_w2v()
    # No need to check w2v any more
        if self.w2v_model is None:
            print('Word2Vec model is not set.')
            return False
        else:
            columns = list(data.columns)
            columns.remove('text')
            result = pd.DataFrame([], columns=[columns+list(range(self.w2v_size))])
            total_am = data.shape[0]
            for j,i in enumerate(data.index.unique()):
                if j%100 == 0: 
                    clear_output()
                    display(self.conv_type+' '+str(self.w2v_size))
                    display(str(j)+'/'+str(total_am))
                if type(data.loc[i]) != pd.core.series.Series:
                    features = self.vectorize(data.loc[i].text.values[0])
                    for k in data[columns].loc[i].values:
                        inp = pd.DataFrame([list(list(k) + list(features))], 
                                            columns = columns+list(range(self.w2v_size)), index = [i])
                else:
                    features = self.vectorize(data.loc[i].text)
                    inp = pd.DataFrame([list(data.loc[i][columns]) + list(features)], 
                                       columns = columns+list(range(self.w2v_size)), index = [i])
                result = result.append(inp)
        name = self.create_name("w2v_vectors", result, description=description)
        self.save_file(name, result)
        return True
    
    def create_w2v_model(self, size=50, lang=None, description=None):
        """
        Creates word2vec model based on data_train set. Set new model as current w2v model.

        Args:
        size(int): w2v vectors dimension size.
        """
        self.check_lang()
        if 'text' in list(self.data_train.columns):
            self.w2v_size = size
            if lang:
                self.lang = lang
            df = pd.concat([self.data_train, self.data_test], ignore_index=True)
            df.text.to_csv('./only_text.csv', index=False, encoding='utf-8')
            model = Word2Vec(LineSentence('./only_text.csv'), size=size, 
                         window=4, min_count=3, workers=3)
            os.remove('./only_text.csv')
            self.w2v_model = model
            name = self.create_name("w2v_model", model, description=description)
            self.save_file(name, model)
            return True
        else:
            print('Train DataFrame does not contain "text" column.')
            return False
    
    def create_clf(self, model, X_train, X_test, y_train, y_test, parameters=None, description=None, version=1):
        """
        Creates a classifier based on given train and test data and parameters. 
        Saves clf and created description for it's results.

        Args:
        model       -- sklearn model that shoud be trained.
        X_train     -- train objects set (vectors).
        X_test      -- test objects set (vectors).
        y_train     -- rubrics of X_train.
        y_test      -- real rubrics of X_test.
        parameters  -- dict with keys and corresponding  parameters of the classifier.
        description (str): additional data that should be in file name.
        version     -- int or str with version of file. 1 by default.
        """
        clf = model
        if parameters is not None:
            clf.set_params(**parameters)
        clf.fit(X_train, y_train)
        clf_name = self.create_name('clf_model', clf, version=version, description=description)
        self.save_file(clf_name, clf)
        self.clf = clf
        pred = []
        for j in clf.predict_proba(X_test):
            all_prob = pd.Series(j, index=clf.classes_)
            pred.append(list(all_prob.sort_values(ascending=False).index))
        stats = self.count_stats(pred, y_test, amounts=[1,2,3,5,-1])
        name = self.create_name('clf_model', stats, version=version, description=description, info=1)
        self.save_file(name, stats)
        return clf, clf_name, stats
# ToDo: ? Save description of the whole experiment and clf details/modify search.

    def search_for_clf(self, model, parameters, description=None, jobs=3, 
                       skf_folds=2, version=1, scoring='f1_weighted', OneVsAll=False):
        """
        Searches for a best parameters combination and creates a classifier.

        Args:
        model       -- sklearn model that shoud be trained.
        parameters  -- dict with keys and corresponding lists of parameters that should be tested.
        description (str): additional data that should be in file name.
        jobs        -- amount of threads shoud run in parallel during training.
        skf_folds   -- amount of cross-validation folds.
        version     -- int or str with version of file. 1 by default.
        """
        X_train, X_test, y_train, y_test = self.create_sets()
        self.check_conv_type()
        self.check_lang()
        skf = StratifiedKFold(y_train, shuffle=True, n_folds=3)
        p = parameters.copy()
        if OneVsAll:
            for i in list(p.keys()):
                p['estimator__'+i] = p.pop(i)
            model = OneVsRestClassifier(model)
        for i in parameters.keys():
            if len(parameters[i]) > 1:
                gs_clf = GridSearchCV(estimator=model, 
                               param_grid=p, 
                               n_jobs=jobs, 
                               scoring=scoring, 
                               cv=skf, 
                               verbose=20)
                break
        gs_clf.fit(X_train, y_train)
        clf, clf_name, stats = self.create_clf(model, 
                                        X_train, 
                                        X_test, 
                                        y_train, 
                                        y_test,
                                        parameters=gs_clf.best_estimator_.get_params(), 
                                        description=description, 
                                        version=version)
        now = datetime.datetime.today()
        descr = 'Date of creation: ' + str(now.day)+'.'+str(now.month)+'.'+str(now.year)
        descr += '\nTested parameters:'
        for i in parameters.items():
            descr += '\n\t'+ str(i)[1:-1]
        descr += '\nBest prameters:'
        for i in gs_clf.best_estimator_.get_params().items():
            descr += '\n\t'+ str(i)[1:-1]
        descr += '\nTrain and test data sizes and files:\n' + \
            '\t' + str(len(y_train)) + '\t' + self.name_train + '\n' + \
            '\t' + str(len(y_test)) + '\t' + self.name_test + \
            '\nClassifier version: v' + str(version) 
        if description:
            descr += '\nClassifier remarks:\t' + description
        descr += '\nResults (accuracy, precision, recall, f1-score):'
        for i in stats.keys():
            mac = stats[i].loc['macro']
            mic = stats[i].loc['micro']
            macro = str(mac['accuracy']) + '\t' + str(mac['precision']) + '\t' + \
            str(mac['recall']) + '\t' + str(mac['f1-score'])
            micro = str(mic['accuracy']) + '\t' + str(mic['precision']) + '\t' + \
            str(mic['recall']) + '\t' + str(mic['f1-score'])
            descr += '\n\t\tFor ' + str(i) + ' answers :' + '\n\t Macro ' + macro + '\n\t Micro ' + micro
            print('For '+str(i)+'\n\tmicro '+micro+'\n\tmacro'+macro+'\n')
        name = self.create_name('clf_model', descr, version=version, description=description, info=1)
        self.save_file(name, descr)

    def make_res_b(self, predicts, y_test):
        """
        Counts binary acccuracy, precision, recall and f1.

        Args:
        predicts    -- classifiers answers for X_test (0 and 1 for a particular rubric).
        y_test      -- real rubrics of X_test (0 and 1 for a particular rubric).
        """
        ac = accuracy_score(y_test, predicts)
        pr = precision_score(y_test, predicts)
        rec = recall_score(y_test, predicts)
        f1 = f1_score(y_test, predicts)
        return [ac, pr, rec, f1]

    def count_stats(self, predicts, y_test, legend=None, amounts=[1], version=1):
        """
        Counts statistics for predictions of a classifier

        Args:
        predicts    -- classifiers answers for X_test.
        y_test      -- real rubrics of X_test.
        legend      -- list with ordered unique rubrics. If equals to None, legend will be created in alphabet order.
        amounts     -- list with amounts of answers we want to test (-1 means all answers). 1 by default.
        version     -- int or str with version of file. 1 by default.
        """
        if legend is None:
            if self.rubr_id == 'subj':
                legend = Codes_helper().get_codes('subj')
            else:
                legend = [item for sublist in y_test for item in sublist]
                legend = pd.Series(map(str, legend))
                legend = legend.unique()
                legend.sort()
                legend = list(legend)
        if not -1 in amounts:
            amounts = list(map(int,amounts))
            amounts.sort()
            amounts = amounts[::-1]
        else:
            amounts = list(set(amounts)-set([-1]))
            amounts.sort()
            amounts = [-1]+amounts[::-1]
        keys, values = [], []
        for a in amounts:
            k = []
            if a != -1:
                for j in predicts:
                    k += [j[:a]]
            else:
                k = predicts
            cur_pred = k
            stats = pd.DataFrame(columns=['accuracy', 'precision', 'recall', 'f1-score','TP','FP','FN','TN'])
            for i in legend:
                cur_predicts = []
                cur_y_test = []
                for j in zip(cur_pred, y_test):
                    if (type(j[0])==list and i in j[0]) or i==j[0]:
                        cur_predicts += [1]
                    else:
                        cur_predicts += [0]
                    if (type(j[1])==list and i in j[1]) or i==j[1]:
                        cur_y_test += [1]
                    else:
                        cur_y_test += [0]
                temp = []
                for l in self.make_res_b(cur_predicts, cur_y_test):
                    temp += [l]
                mat = confusion_matrix(cur_predicts, cur_y_test)
                if (mat.shape == (1, 1)):
                    conf_matr = [0,0,0]+list(np.array(mat).ravel())
                else:
                    conf_matr = list(np.array(mat).ravel())[::-1]
                stats = stats.append(pd.DataFrame([temp+conf_matr],
                                                  columns=['accuracy', 'precision', 'recall', 'f1-score', 'TP','FP','FN','TN'], index=[i]))
            stats = stats.sort_index()
            stats_mean = stats.mean().values
            tp, fp, fn, tn = stats_mean[4:]
            acc_temp = (tp + tn) / (tp + fp + fn + tn)
            pr_temp = tp / (tp + fp)
            rec_temp = tp / (tp + fn)
            f1_temp = 2 * pr_temp * rec_temp / (pr_temp + rec_temp)
            stats = stats.append(pd.DataFrame([list(stats_mean[0:4])+['-']*4],
                          columns=['accuracy', 'precision', 'recall', 'f1-score','TP','FP','FN','TN'], 
                                              index = ['macro']))
            stats = stats.append(pd.DataFrame([[acc_temp, pr_temp, rec_temp, f1_temp] +list(stats_mean[4:])],
                          columns=['accuracy', 'precision', 'recall', 'f1-score','TP','FP','FN','TN'], 
                                              index = ['micro']))
            if a != -1:
                keys += [str(a)]
            else:
                keys += ['all']
            values += [stats]
        full_stats = dict(zip(keys, values))
        return full_stats
    
    def split_all_sect(self, data):
        """
        Splits all rubrics separated with / to different strings.

        Args:
        data (pd.DataFrame): DataFrame that should be splitted.
        """
        timer = time.time()
        am = data.shape[0]
        df = pd.DataFrame(columns=data.columns)
        col = list(data.columns)
        if 'text' in col:
            col.remove('text')
        size = 0
        for i in col:
            if str(i).isdigit():
                if int(i) > size:
                    size = int(i)
        if size != 0:
            for i in list(map(str,np.arange(size+1))):
                col.remove(i)
        for j,i in enumerate(data.index):
            if j%1000 == 0:
                clear_output()
                display('Splitting rubrics '+str(j)+'/'+str(am))
            temp = []
            no_miss = True
            if size == 0:
                text = data.loc[i]['text']
                for l in data.columns:
                    if not l == 'text':
                        if type(data.loc[i][l]) == str:
                            temp.append(str(data.loc[i][l]).split('\\'))
                        else:
                            no_miss = False
                if no_miss:
                    for k in zip_longest(*temp):
                        df = df.append(pd.DataFrame([list(k)+[text]], columns=col+['text'], index=[i]))
            else:
                vect = data.loc[i][list(map(str,np.arange(size+1)))]
                for l in data.columns:
                    if not str(l).isdigit():
                        if type(data.loc[i][l]) == str:
                            temp.append(str(data.loc[i][l]).split('\\'))
                        else:
                            no_miss = False
                if no_miss:
                    for k in zip_longest(*temp):
                        df = df.append(pd.DataFrame([list(k)+list(vect)], columns=col+list(map(str,np.arange(size+1))), index=[i]))
        print('Work time is', int(((time.time() - timer)%3600)//60), 'minutes',\
              '%.2f'%((time.time() - timer)%60), 'seconds')
        return df 
    
    def change_test(self, X_test, y_test):
        """
        Changes test set to make it possible to deal with several answers to one text.

        Args:
        X_test      -- DataFrame with objects features.
        y_test      -- Series with answers corresponding to X_test. 
        """
        df = pd.DataFrame([], columns=X_test.columns)
        ans = []
        for i in X_test.index.unique():
            if type(y_test.loc[i]) == pd.core.series.Series:
                df = df.append(X_test.loc[i].iloc[0])
                ans.append(list(y_test[i][y_test[i].notnull()]))
            else:
                df = df.append(X_test.loc[i])
                ans.append([y_test[i]])
        return df, pd.Series(ans)
    
    ########################################################
    # ToDo: write, test, description
    def test_with_new_data(self, data_path):
        self.check_res_folder()
        self.check_clf()
        self.check_w2v()
        self.check_rubr_id()
        if os.path.exists(data_path):
            self.data_train = pd.read_csv(data_path, index_col=0)
            self.data_train = self.w2v_vectors_creation(self.data_train)
            self.data_test  = self.data_train
            X_train, X_test, y_train, y_test = self.create_sets()
            pred = []
            for j in self.clf.predict_proba(X_test):
                all_prob = pd.Series(j, index=self.clf.classes_)
                pred.append(list(all_prob.sort_values(ascending=False).index))
            stats = self.count_stats(pred, y_test, amounts=[1,2,3,5,-1])
            name = self.create_name('result', stats, description='test', info=1)
            self.save_file(name, stats)
            for i in stats.keys():
                mac = stats[i].loc['macro']
                mic = stats[i].loc['micro']
                macro = str(mac['accuracy'].round(3)) + '\t' + str(mac['precision'].round(3)) + '\t' + \
                str(mac['recall'].round(3)) + '\t' + str(mac['f1-score'].round(3))
                micro = str(mic['accuracy'].round(3)) + '\t' + str(mic['precision'].round(3)) + '\t' + \
                str(mic['recall'].round(3)) + '\t' + str(mic['f1-score'].round(3))
                print('For '+str(i)+'\n\tmicro '+micro+'\n\tmacro '+macro+'\n')
            answers = []
            all_prob = pd.Series(j, index=self.clf.classes_)
            res = all_prob.sort_values(ascending=False)
            res = res[res!=0]
            temp = ''
            for i, k in zip(res, res.index):
                temp += k+'-'+str(i).replace('.',',')+'\\'
            answers.append(temp[:-1])
            pred = pd.DataFrame(list(zip([self.rubr_id]*len(answers), answers, 
                             ['###']*len(answers))), columns=['rubric id','result', 'correct'], index=X_train.index)
            name = self.create_name('answers', pred, description='test', info=1)
            self.save_file(name, pred)
        else:
            print('Please specify existing test data path.')
            return False
        
    ########################################################
    
    def save_file(self, name, save_data):
        """
        Saves the data with the given file name.

        Args:
        name (str): saving file name.
        file_type   -- string/DataFrame/dict{int or str:pd.DataFrame}/w2v model/clf model for saving.
        """
        path = os.path.split(name)[0]
        if os.path.exists(path):
            if   type(save_data) == dict:
                writer = pd.ExcelWriter(name, engine='xlsxwriter')
                names = list(map(str, save_data.keys()))
                names.sort()
                for i in names:
                    save_data[i].to_excel(writer, sheet_name=self.rubr_id+'_'+str(i))
                writer.save()
            elif type(save_data)   == str:
                f = open(name, 'w')
                f.write(save_data)
                f.close()
            elif type(save_data) == pd.core.frame.DataFrame:
                save_data.to_csv(name, encoding='utf-8', sep='\t')
            elif type(save_data) == Word2Vec:
                save_data.save(name)
            else:
                joblib.dump(save_data, name)
            return True
        else:
            print('Not a valid name.')
            return False
    
    # check_w2v_model()
    def vectorize(self, text): 
        """
        Transforms one text into vector.

        Args:
        text (str): string with text taht should be transformed.
        """
        self.check_conv_type()
        if self.w2v_model:
            tokens = text.split()
            features = [0]*self.w2v_size
            if self.conv_type == 'sum':
                for t in tokens:
                    if t in self.w2v_model:
                        features += self.w2v_model[t]
            elif self.conv_type in ['max','mean']:
                for t in tokens:
                    if t in self.w2v_model:
                        features = np.vstack((features, self.w2v_model[t]))
                if features.shape[0] > 1:
                    if self.conv_type == 'max':
                        features = features.max(axis=0)
                    else:
                        features = features.mean(axis=0)
            return features
        else:
            print('Word2Vec model is not set.')
            return None
    
    

In [4]:
# Гиперпараметры, которые будем перебирать при обучении соответствующих моделей.

parameter = {'random_state':[RANDOM_SEED]}

parameters_lr = {#'class_weight':['balanced'],
                 'C':[0.01, 0.01, 1.0, 10.0],
             'solver':['liblinear'], #newton-cg’, ‘sag’, ‘saga’ and ‘lbfgs’
#              'tol':[5e-4, 1e-6],
#              'max_iter':[500],
#              'fit_intercept':[True],
             'random_state':[RANDOM_SEED]}

parameters_rfc = {'criterion':['entropy'],#, 'gini'],
                  'max_depth':list(range(10,111,50)),
             'min_samples_leaf':list(range(15,27,10)),
              'max_features':list(range(15,46,15)),#list(range(40,56,10)),
#               'bootstrap':[True, False],
#               'min_samples_split':list(range(14,35,20)),
#               'min_impurity_split':[1e-7, 1e-8],
              'random_state':[RANDOM_SEED],
              'warm_start':[True]}

parameters_mlp1 = {'max_iter':[700],
             'hidden_layer_sizes':list(range(15,56,10)),
             'activation': ['logistic', 'tanh', 'relu', 'identity'],
             'alpha':[1],#[0.1, 0.5, 1.0],
             'random_state':[RANDOM_SEED],
             'warm_start':[True]}

parameters_mlp2 = {'max_iter':[700],
             'activation': ['logistic', 'tanh', 'relu', 'identity'],
             'hidden_layer_sizes':list(zip(list(range(15,56,10)),list(range(15,56,10)))),
             'alpha':[1],#[0.1, 0.5, 1.0],
             
             'random_state':[RANDOM_SEED],
             'warm_start':[True]}

parameters_svc = {'kernel':['rbf'],#'linear', 
                  'C':[0.1, 1.0, 5, 10],
            'class_weight':['balanced'],
            'probability':[True],
            'random_state':[RANDOM_SEED]}

In [5]:
worker = Worker()
worker.set_res_folder('.././for_nondet_test')
# worker.load_data('./Samples/Single_educate6_clean.txt', split_ratio=1)
# worker.load_data('./Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_w2v_mean50_48k.csv', split_ratio=0.3)
worker.load_w2v('.././for_rgnti_update(26.03.18)/w2v_model_50_for_rgnti_update_k_v1_25_3_18.model')
worker.set_lang('ru')
# worker.create_w2v_model()
worker.set_conv_type('sum')
# worker.create_w2v_vectors(worker.data_train)

In [39]:
worker.load_data('./for_det_test/w2v_vectors_167k_sum50_v1_3_6_18.csv','./for_det_test/test_temp_full_with_tabs.csv')# './for_rgnti_update(26.03.18)/'+train_p)

True

In [33]:
worker.data_train

Unnamed: 0,subj,ipv,rgnti,0,1,2,3,4,5,6,...,40,41,42,43,44,45,46,47,48,49
16.10-13Б.614,e8,13Б,27.37.15,26.614842,-23.710891,-51.024774,-9.343907,-55.335266,-25.949150,-109.239970,...,-51.335909,12.360015,10.226584,-25.600821,28.770709,-10.293263,-19.148150,0.277277,2.674961,-8.404255
16.08-06Б.43,f4,06Б,73.37.61,27.723075,8.539550,-66.406864,-62.615166,-23.834502,-37.703617,-2.565913,...,-26.856116,3.878465,25.305938,48.799437,43.192581,65.946698,-75.301095,17.622035,-44.445942,-17.310776
16.10-48.142,e9,48,55.13.17,9.085715,-30.521394,-42.225581,-117.283964,15.999149,-14.015607,-22.333458,...,-72.981254,47.923601,72.617281,-7.201991,24.825994,-37.123597,10.714829,63.084432,-60.768044,-11.426489
16.12-19Р1.232,f7,19Р1,65.63.03,-73.527298,-22.482005,7.995107,-83.012389,33.401030,-1.839072,-33.992277,...,-54.508057,-6.450962,68.107552,7.332750,14.849589,-40.670493,-21.821899,-16.249774,43.296319,-20.118556
16.02-18И.263,f5,18И,29.17.43,-156.134090,43.551748,23.397978,373.514337,-75.603388,69.008851,-6.189964,...,124.207459,96.382837,-165.249924,5.311344,67.291482,94.611673,192.453999,134.669320,203.114857,29.317322
16.06-14А.225,e9,14А,55.31.29,31.640468,-9.358885,-33.293762,-32.921136,13.511489,32.508831,-34.516022,...,-31.209783,7.978374,32.216354,25.776818,-12.241494,-83.584669,-26.900720,60.371110,-34.472347,-22.201029
16.05-90.79,f9,90,44.39.01,-7.123358,54.182872,22.327802,-73.503812,27.263478,5.622327,1.234530,...,-1.219658,45.424001,2.509101,51.355751,4.110443,-55.455355,-66.110143,37.247796,-24.538850,-14.493251
16.12-72.620,f3,72,87.21.09,-23.193769,7.690369,-2.658768,-110.714775,-64.250616,35.005645,4.275961,...,-11.530837,-1.605308,73.229435,17.624976,-30.100909,-119.110809,-25.811197,-21.957534,45.427667,-4.586284
16.05-19П.168,f7,19П,61.53.99,93.483995,-81.918330,-66.760058,-104.643561,19.222663,-12.998195,-71.907939,...,-142.537598,-10.751706,65.824358,-59.646607,83.586549,-107.072903,61.892880,22.352296,-90.954281,21.124827
16.03-18Ж.105,f5,18Ж,29.35.37,52.526004,-109.856629,-98.821307,-31.774349,-46.138049,-14.208530,4.580595,...,-33.484874,-63.871863,30.102857,-2.667060,73.429772,-128.669366,45.690162,-16.435148,-69.412661,44.474665


In [34]:
worker.data_test

Unnamed: 0,Unnamed: 0.1,subj,ipv,rgnti,0,1,2,3,4,5,...,40,41,42,43,44,45,46,47,48,49
0,17.09-84.142,f3,72,87.25.03,-19.758386,-36.673104,-62.724414,-93.372798,-64.374654,-52.422450,...,-19.506704,20.350120,51.660320,28.892574,24.697751,-24.833498,-58.305306,-42.427013,84.795810,-22.135544
1,17.09-84.142,,84,87.25.33,-19.758386,-36.673104,-62.724414,-93.372798,-64.374654,-52.422450,...,-19.506704,20.350120,51.660320,28.892574,24.697751,-24.833498,-58.305306,-42.427013,84.795810,-22.135544
2,17.10-24А.243,e4,09А,29.35.19,25.585731,-25.195650,0.183580,-15.993161,-33.686667,2.644306,...,0.460392,-19.397379,-18.374510,3.033325,48.032418,26.424928,2.766526,-10.106796,-14.800390,7.831758
3,17.10-24А.243,e1,18Ж,37.15.03,25.585731,-25.195650,0.183580,-15.993161,-33.686667,2.644306,...,0.460392,-19.397379,-18.374510,3.033325,48.032418,26.424928,2.766526,-10.106796,-14.800390,7.831758
4,17.10-24А.243,f5,24А,47.43.15,25.585731,-25.195650,0.183580,-15.993161,-33.686667,2.644306,...,0.460392,-19.397379,-18.374510,3.033325,48.032418,26.424928,2.766526,-10.106796,-14.800390,7.831758
5,17.04-20А.135,f8,20А,06.51.53,-18.299589,-36.904900,-33.696227,-18.178921,4.280813,-30.645097,...,-72.368484,-7.367394,33.865894,33.655578,49.280049,64.324388,-61.811786,-13.916809,18.783886,5.898275
6,17.04-21Ю.15,f9,21Ю,45.37.31,1.428266,-27.449734,-51.352553,-14.285231,15.614072,-33.331823,...,-54.389081,4.149605,21.281762,11.363471,68.805657,16.361562,16.609074,85.642546,-45.935897,-1.848660
7,17.02-10Г.67,e5,10Г,38.59.15,10.489974,-13.513301,-34.057315,-77.027283,14.233473,45.307344,...,4.871417,25.935527,-0.366845,7.580936,-18.263990,-28.451828,16.179392,10.899248,-41.185763,-1.682674
8,17.02-10Г.67,,08М,52.47.15,10.489974,-13.513301,-34.057315,-77.027283,14.233473,45.307344,...,4.871417,25.935527,-0.366845,7.580936,-18.263990,-28.451828,16.179392,10.899248,-41.185763,-1.682674
9,17.01-19Ф.190,f7,19Ф,61.67.31,60.107315,-102.514691,-98.386475,-236.622102,-13.340682,-16.679546,...,-97.249881,-116.301656,-40.262052,3.174717,-39.062485,-115.266371,162.673646,100.289417,-154.859642,42.287417


In [30]:
df = pd.read_csv('./for_rgnti_update(26.03.18)/'+test_p)
name = './for_det_test/test_temp_full_with_tabs.csv'
df = df.iloc[0:100000]
df.to_csv(name, encoding='utf-8', sep='\t')

In [19]:
worker.data_train.ipv.unique().shape

(199,)

In [9]:
lst = os.listdir(path="./for_rgnti_update(26.03.18)") 
# lst = os.listdir(path="./Attic/temp_remake_folder") 
for i in lst:
    if 'single_theme' in i:
        if '366' in i:
            test_p = i
        else:
            train_p = i
print(test_p, '\n', train_p)

data_single_theme_366k_v1_26_3_18.csv 
 data_single_theme_964k_v1_28_3_18.csv


In [76]:
worker.data_test = worker.data_test.iloc[:int(worker.data_test.shape[0]/2)]

In [40]:
worker.data_train.subj.isnull().any()

False

In [None]:
for i in ['subj', 'ipv']:#, 'rgnti']:
    worker.set_rubr_id(i)
    worker.search_for_clf(LogisticRegression(), parameters_lr, description='lr_det')
    worker.search_for_clf(LogisticRegression(), parameters_lr, description='lr_det_onevsall', OneVsAll=True)
    worker.search_for_clf(RandomForestClassifier(), parameters_rfc, description='rfc_det')
    worker.search_for_clf(RandomForestClassifier(), parameters_rfc, description='rfc_det_onevsall', OneVsAll=True)
    worker.search_for_clf(MLPClassifier(), parameters_mlp1, description='mpl1_det')
    worker.search_for_clf(MLPClassifier(), parameters_mlp1, description='mpl1_det_onevsall', OneVsAll=True)
    worker.search_for_clf(MLPClassifier(), parameters_mlp2, description='mpl2_det')
    worker.search_for_clf(MLPClassifier(), parameters_mlp2, description='mpl2_det_onevsall', OneVsAll=True)
    worker.search_for_clf(SVC(), parameters_svc, description='svc_det')
#     worker.search_for_clf(SVC(), parameters_svc, description='svc_det_onevsall', OneVsAll=True)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  6.4min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  6.7min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed: 13.0min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 13.1min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 13.2min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 20.5min
[Parallel(n_jobs=3)]: Done   8 out of  12 | elapsed: 20.6min remaining: 10.3min
[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 20.7min remaining:  6.9min
[Parallel(n_jobs=3)]: Done  10 out of  12 | elapsed: 28.0min remaining:  5.6min
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 28.2min remaining:    0.0s
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 28.2min finished


For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.721886137628	0.182829206442	0.91414603221	0.30471534407
	macro0.721886137628	0.182467356733	0.898319054083	0.30003296694

For 3
	micro 0.844622443298	0.278222774913	0.834668324738	0.417334162369
	macro0.844622443298	0.278088770786	0.807869690679	0.407528702604

For 2
	micro 0.899693988272	0.373852456018	0.747704912037	0.498469941358
	macro0.899693988272	0.367107018229	0.709787087898	0.477350868482

For 1
	micro 0.943030613365	0.572729600234	0.572729600234	0.572729600234
	macro0.943030613365	0.556319779022	0.521720145352	0.525883940329

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  3.4min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  3.7min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed:  7.0min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed:  7.0min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed:  7.4min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 11.3min
[Parallel(n_jobs=3)]: Done   8 out of  12 | elapsed: 11.3min remaining:  5.6min
[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 11.6min remaining:  3.9min
[Parallel(n_jobs=3)]: Done  10 out of  12 | elapsed: 15.3min remaining:  3.1min
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 15.6min remaining:    0.0s
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 15.6min finished


For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.72189426543	0.182841398144	0.914206990722	0.304735663574
	macro0.72189426543	0.182450392585	0.898394337642	0.300024263043

For 3
	micro 0.844622443298	0.278222774913	0.834668324738	0.417334162369
	macro0.844622443298	0.278063799438	0.807893661893	0.407517757616

For 2
	micro 0.899692362711	0.373846360167	0.747692720335	0.498461813556
	macro0.899692362711	0.367067748041	0.709746376224	0.477322728953

For 1
	micro 0.943045243407	0.572839325555	0.572839325555	0.572839325555
	macro0.943045243407	0.556406705449	0.52182054741	0.525974662934

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed:  3.2min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed:  3.3min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed:  3.5min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  6.4min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:  6.6min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed:  7.0min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed:  9.5min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed:  9.9min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 10.5min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 14.4min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed: 14.8min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed: 15.8min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed: 19.2min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.717794602327	0.17669190349	0.883459517452	0.294486505817
	macro0.717794602327	0.173060742739	0.861005541151	0.284781317954

For 3
	micro 0.83933774673	0.265011033491	0.795033100472	0.397516550236
	macro0.83933774673	0.259130889206	0.758333200888	0.380516353207

For 2
	micro 0.894779919453	0.355424697951	0.710849395901	0.473899597267
	macro0.894779919453	0.345305934212	0.66465539494	0.446262904559

For 1
	micro 0.939625064514	0.547187983858	0.547187983858	0.547187983858
	macro0.939625064514	0.538855153614	0.486439937243	0.493329365785

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  6.0min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed: 11.9min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 11.9min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 12.0min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 23.4min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed: 23.6min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed: 23.7min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed: 34.8min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed: 35.1min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 35.2min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 52.0min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed: 52.2min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed: 52.4min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed: 69.0min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.718843088727	0.178264633091	0.891323165454	0.297107721818
	macro0.718843088727	0.176783209486	0.873248104181	0.289762431762

For 3
	micro 0.841785840557	0.271131268059	0.813393804177	0.406696902088
	macro0.841785840557	0.262871287702	0.784053138353	0.388626058498

For 2
	micro 0.897705928012	0.366397230045	0.73279446009	0.48852964006
	macro0.897705928012	0.351928956623	0.694914528502	0.461477407348

For 1
	micro 0.942269851139	0.567023883545	0.567023883545	0.567023883545
	macro0.942269851139	0.552106447731	0.514753934489	0.520800391065

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:   17.5s
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   22.0s
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:   27.0s
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed:   39.1s
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed:   44.7s
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed:   48.5s
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:  1.1min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed:  1.5min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed:  1.9min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed:  2.3min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.724113155253	0.18616973288	0.930848664399	0.310282888133
	macro0.724113155253	0.186286597565	0.924283478716	0.304887542032

For 3
	micro 0.847285111087	0.284879444383	0.85463833315	0.427319166575
	macro0.847285111087	0.281350500113	0.837527665077	0.414506081703

For 2
	micro 0.902156712142	0.383087670531	0.766175341063	0.510783560709
	macro0.902156712142	0.371039344394	0.74036009489	0.488164163001

For 1
	micro 0.944670803718	0.585031027882	0.585031027882	0.585031027882
	macro0.944670803718	0.56197389668	0.551140342179	0.548710379388

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed:  3.9min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed:  4.2min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  5.8min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:  6.2min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed:  6.2min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed:  8.0min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed:  8.4min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  8.4min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 10.7min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed: 11.0min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed: 11.1min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed: 13.2min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.723388155355	0.185082233032	0.925411165161	0.308470388387
	macro0.723388155355	0.186005261184	0.919330388748	0.302453206552

For 3
	micro 0.84698600799	0.284131686641	0.852395059922	0.426197529961
	macro0.84698600799	0.279340315278	0.837126093538	0.409378636538

For 2
	micro 0.90272240713	0.385209026736	0.770418053473	0.513612035649
	macro0.90272240713	0.373287696931	0.745168118209	0.488231394606

For 1
	micro 0.94578268697	0.593370152274	0.593370152274	0.593370152274
	macro0.94578268697	0.57202699849	0.554194963054	0.551410286874

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:   22.3s
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:   22.4s
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:   29.0s
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed:   43.6s
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed:  1.0min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  1.2min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed:  2.1min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed:  2.4min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed:  2.8min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed:  3.4min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.723922964697	0.185884447045	0.929422235227	0.309807411742
	macro0.723922964697	0.179964727871	0.920346922217	0.297620234194

For 3
	micro 0.847621602071	0.285720671844	0.857162015532	0.428581007766
	macro0.847621602071	0.277877125574	0.836451512014	0.411794628763

For 2
	micro 0.902715904888	0.385184643332	0.770369286663	0.513579524442
	macro0.902715904888	0.374808038291	0.739507455497	0.490593919455

For 1
	micro 0.945314525601	0.589858942004	0.589858942004	0.589858942004
	macro0.945314525601	0.577971254501	0.544752095344	0.549993341902

Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  2.9min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed:  5.7min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed:  5.7min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed:  5.9min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed:  9.1min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed:  9.3min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed:  9.5min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed: 13.0min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed: 13.3min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 13.8min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 17.5min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed: 18.1min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed: 18.3min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed: 21.4min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123519035883

For 5
	micro 0.724540677615	0.186811016422	0.934055082111	0.311351694037
	macro0.724540677615	0.187659440323	0.928502743208	0.306470032338

For 3
	micro 0.847888193962	0.286387151571	0.859161454714	0.429580727357
	macro0.847888193962	0.287602473873	0.843670808139	0.419206444255

For 2
	micro 0.903071902596	0.386519634737	0.773039269473	0.515359512982
	macro0.903071902596	0.381461665731	0.748024038268	0.496109009916

For 1
	micro 0.946080164507	0.5956012338	0.5956012338	0.5956012338
	macro0.946080164507	0.583365339342	0.554417549524	0.559696324686

Fitting 3 folds for each of 4 candidates, totalling 12 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed: 979.8min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 981.4min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed: 981.8min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed: 2598.4min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 2602.5min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 2609.2min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 5052.4min
[Parallel(n_jobs=3)]: Done   8 out of  12 | elapsed: 5068.3min remaining: 2534.1min
[Parallel(n_jobs=3)]: Done   9 out of  12 | elapsed: 5089.0min remaining: 1696.3min
[Parallel(n_jobs=3)]: Done  10 out of  12 | elapsed: 7524.4min remaining: 1504.9min
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 7565.3min remaining:    0.0s
[Parallel(n_jobs=3)]: Done  12 out of  12 | elapsed: 7565.3min finished


In [77]:
for i in ['subj', 'ipv']:#, 'rgnti']:
    worker.set_rubr_id(i)
    worker.search_for_clf(LogisticRegression(), parameters_lr, description='nondet')#, OneVsAll=True)

Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  7.1min
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  7.4min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed:  7.4min finished


For all
	micro 0.0666666666667	0.0666666666667	1.0	0.125
	macro0.0666666666667	0.0666666666667	1.0	0.123501159937

For 5
	micro 0.721859846133	0.182789769199	0.913948845997	0.304649615332
	macro0.721859846133	0.181360768401	0.899272394967	0.298744985004

For 3
	micro 0.844486134386	0.27788200263	0.833646007891	0.416823003946
	macro0.844486134386	0.276713802715	0.807109036288	0.406181490244

For 2
	micro 0.899790585937	0.374214697262	0.748429394524	0.498952929683
	macro0.899790585937	0.366390748799	0.70976652736	0.476919708323

For 1
	micro 0.943077652337	0.573082392528	0.573082392528	0.573082392528
	macro0.943077652337	0.555531942217	0.521047139028	0.525087981601



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fitting 3 folds for each of 1 candidates, totalling 3 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed: 115.8min
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed: 116.1min remaining:    0.0s
[Parallel(n_jobs=3)]: Done   3 out of   3 | elapsed: 116.1min finished
  np.exp(prob, prob)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


For all
	micro 0.035909936088	0.00509330034651	0.94735386445	0.0101321268925
	macro0.035909936088	0.00493413471068	0.96875	0.00977934329882

For 5
	micro 0.976217753032	0.143380858214	0.716904291068	0.238968097023
	macro0.976217753032	0.138777191509	0.700291893001	0.22421259035

For 3
	micro 0.985790216374	0.211953590641	0.635860771922	0.317930385961
	macro0.985790216374	0.200199008086	0.606960925472	0.290290880138

For 2
	micro 0.990229684381	0.281024850304	0.562049700607	0.374699800405
	macro0.990229684381	0.258993394242	0.527106677184	0.333514360982

For 1
	micro 0.994015598855	0.425497490078	0.425497490078	0.425497490078
	macro0.994015598855	0.383728205649	0.386707409997	0.362132023933



In [None]:
worker = Worker()
worker.set_res_folder('./for_1vsAll_update')
# worker.load_w2v('./for_rgnti_update(26.03.18)/w2v_model_50_for_rgnti_update_k_v1_25_3_18.model')
# worker.load_data(train_p, test_p)
worker.lang = 'ru'
worker.set_conv_type('sum')
worker.load_data('./for_rgnti_update(26.03.18)/'+train_p, './for_rgnti_update(26.03.18)/'+test_p)
# worker.load_data('./Samples/balanced_annotations_clear_17-07-17/train_clear_annotation_balanced_w2v_max50_95k.csv')#, './Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_w2v_max50_48k')
for i in ['subj']:#, 'ipv']:#, 'rgnti']:
    worker.set_rubr_id(i)
    worker.search_for_clf(LogisticRegression(), parameters_lr, description='onevsall', OneVsAll=True)
# worker.search_for_clf(RandomForestClassifier(), parameters_rfc, description='rfc_nti')
# worker.search_for_clf(MLPClassifier(), parameters_mlp1, description='mpl1_nti')
# worker.search_for_clf(MLPClassifier(), parameters_mlp2, description='mpl2_nti')
# worker.search_for_clf(SVC(), parameters_svc, description='svc_nti')

In [7]:
clf = joblib.load('./for_1vsAll_test/clf_model_ru_subj_onevsall_max50_v1_1_5_18.plk')

In [8]:
clf.classes_[0]

'e1'

In [19]:
res = []
if type(clf) == OneVsRestClassifier:
    for i in clf.estimators_:
        res.append(i.predict_proba([[1]*50]).T[1])
r = pd.DataFrame(np.array(res).T, columns=clf.classes_)
r = r.iloc[0].sort_values(ascending=False)

In [None]:
 def classify(self, vector):
        if self.clf:
            if self.clf.coef_.T.shape[0] == len(vector):
                if type(clf) == OneVsRestClassifier:
                    res = []
                    for i in clf.estimators_:
                        res.append(i.predict_proba([[1]*50]).T[1])
                    r = pd.DataFrame(np.array(res).T, columns=clf.classes_)
                    result = r.iloc[0].sort_values(ascending=False)
                else:
                    result = pd.Series(self.clf.predict_proba([vector])[0], index=self.clf.classes_)
                    result = result.sort_values(ascending=False)
                # result = result.round(3)
                # result = result[result!=0]
                return result   
            else:
                self.error_occurred.emit('Vector has '+str(len(vector))+' elements. Model can work with vectors that have '+str(clf.coef_.T.shape[0])+'attibutes.')
                return None
        else:
            return None    

In [20]:
r

f5    0.545143
f7    0.095689
e5    0.025877
e3    0.022962
e9    0.022521
e4    0.010626
e1    0.008004
f4    0.007281
f3    0.006997
f1    0.004613
f9    0.001843
f2    0.001313
e2    0.001071
f8    0.000134
e7    0.000071
Name: 0, dtype: float64

In [18]:
# X_train, X_test, y_train, y_test = worker.create_sets()
clf, clf_name, stats = worker.create_clf(LogisticRegression(), X_train, X_test, y_train, y_test, description='lr_grnti_update')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [24]:
description='lr_grnti_update'
version = 1
now = datetime.datetime.today()
descr = 'Date of creation: ' + str(now.day)+'.'+str(now.month)+'.'+str(now.year)
descr += '\nTested parameters:'
for i in clf.get_params().items():
    descr += '\n\t'+ str(i)[1:-1]
descr += '\nBest prameters:'
for i in clf.get_params().items():
    descr += '\n\t'+ str(i)[1:-1]
descr += '\nTrain and test data sizes and files:\n' + \
    '\t' + str(len(y_train)) + '\t' + worker.name_train + '\n' + \
    '\t' + str(len(y_test)) + '\t' + worker.name_test + \
    '\nClassifier version: v' + str(1) 
if description:
    descr += '\nClassifier remarks:\t' + description
descr += '\nResults (accuracy, precision, recall, f1-score):'
for i in stats.keys():
    mac = stats[i].loc['macro']
    mic = stats[i].loc['micro']
    macro = str(mac['accuracy']) + '\t' + str(mac['precision']) + '\t' + \
    str(mac['recall']) + '\t' + str(mac['f1-score'])
    micro = str(mic['accuracy']) + '\t' + str(mic['precision']) + '\t' + \
    str(mic['recall']) + '\t' + str(mic['f1-score'])
    descr += '\n\t\tFor ' + str(i) + ' answers :' + '\n\t Macro ' + macro + '\n\t Micro ' + micro
    print('For '+str(i)+'\n\tmicro '+micro+'\n\tmacro'+macro+'\n')
name = worker.create_name('clf_model', descr, version=version, description=description, info=1)
worker.save_file(name, descr)

For 2
	micro 0.995591188855	0.350037633635	0.527375882491	0.420785416569
	macro0.995591188855	0.323229946002	0.398575519635	0.332133658058

For all
	micro 0.00532483381855	0.00304353367568	0.999978670914	0.00606859697813
	macro0.00532483381855	0.00303656906772	0.997711670481	0.00603549816562

For 1
	micro 0.996911605237	0.488685860984	0.368118703476	0.419919344538
	macro0.996911605237	0.425648517111	0.27457738223	0.302573285983

For 5
	micro 0.989922876219	0.19216853083	0.723685239561	0.303693741874
	macro0.989922876219	0.188407265141	0.571816879778	0.272467617139

For 3
	micro 0.993862330724	0.274043543921	0.619268696722	0.379949094097
	macro0.993862330724	0.2596772896	0.476451599177	0.317104533195



True

In [11]:
for i in ['subj', 'ipv', 'rgnti']:
    worker.set_rubr_id(i)
    worker.search_for_clf(SVC(), parameters_svc, description='svc_nti')

KeyboardInterrupt: 

In [15]:
worker = Worker()
worker.set_res_folder('./lstm experiment')
worker.load_w2v('./w2v50_viniti_ru_100k.model')
worker.load_data('./Samples/balanced_annotations_clear_17-07-17/train_clear_annotation_balanced_w2v_mean50_95k.csv', './Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_w2v_mean50_48k.csv')
worker.set_rubr_id('rgnti')
# X_train, X_test, y_train, y_test = worker.create_sets()

test ok


True

In [5]:
%load_ext autoreload
%autoreload 2
from ANN_v2 import run_ann

Using TensorFlow backend.


In [32]:
np.array(y_test)

array([['30.51', '37.31', '38.63', '52.13', '61.53'], ['34.49'], ['61.31'],
       ..., ['73.37'], ['34.27', '34.27'], ['81.92']], dtype=object)

In [33]:
codes = np.array(list(set(y_train)))
codes.sort()

In [34]:
answers = run_ann(X_train, np.array(y_train), X_test, np.array(y_test), codes)# np.array(helper.get_codes('subj')))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 50)                20200     
_________________________________________________________________
dense_2 (Dense)              (None, 449)               22899     
Total params: 43,099
Trainable params: 43,099
Non-trainable params: 0
_________________________________________________________________
None
Train on 109894 samples, validate on 12211 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [35]:
stats = worker.count_stats(answers, y_test, legend=codes, amounts=[1,2,3,5,-1])

  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [36]:
name = worker.create_name("result", stats)
worker.save_file(name, stats)

True

In [None]:
worker = Worker()
worker.set_res_folder('./for nti stats')
worker.load_w2v('./w2v50_viniti_ru_100k.model')
worker.load_data('./Samples/balanced_annotations_clear_17-07-17/train_clear_annotation_balanced_w2v_max50_95k.csv', './Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_w2v_max50_48k')
worker.set_rubr_id('subj')
worker.search_for_clf(LogisticRegression(), parameters_lr, description='lr_nti')
worker.search_for_clf(RandomForestClassifier(), parameters_rfc, description='rfc_nti')
worker.search_for_clf(MLPClassifier(), parameters_mlp1, description='mpl1_nti')
worker.search_for_clf(MLPClassifier(), parameters_mlp2, description='mpl2_nti')
worker.search_for_clf(SVC(), parameters_svc, description='svc_nti')

In [None]:
worker = Worker()
worker.set_res_folder('./for nti stats')
worker.load_w2v('./w2v50_viniti_ru_100k.model')
worker.load_data('./Samples/balanced_annotations_clear_17-07-17/train_clear_annotation_balanced_w2v_mean50_95k.csv', './Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_w2v_mean50_48k')
worker.set_rubr_id('rgnti')
worker.search_for_clf(SVC(), parameters_svc, description='svc_nti')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fitting 3 folds for each of 8 candidates, totalling 24 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed: 13.1min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 13.1min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed: 13.1min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed: 30.2min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 30.2min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 30.4min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 43.3min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed: 43.3min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed: 43.6min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed: 60.5min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed: 60.5min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 60.9min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 86.4min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed: 86.4min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed: 86.8min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed: 122.8min
[Parallel(n_jobs=3)]: D

For all
	micro 0.0150360537124	0.00339420397478	0.999779054353	0.00676543959285
	macro0.0150360537124	0.00335455205919	0.988317757009	0.00664077405127

For 1
	micro 0.996227580719	0.41025095704	0.284136102519	0.335741014707
	macro0.996227580719	0.299186919907	0.297768768342	0.249393438879

For 2
	micro 0.994913683607	0.313825546165	0.434821034026	0.364545707141
	macro0.994913683607	0.23137949984	0.427463932504	0.257022923026

For 5
	micro 0.989378734488	0.187339833906	0.64877006923	0.290728471357
	macro0.989378734488	0.141058855025	0.59722807267	0.204803256786

For 3
	micro 0.993247774752	0.256539680853	0.533399617028	0.346452361305
	macro0.993247774752	0.190888154091	0.502626329307	0.242290782056



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  2.5min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  5.5min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed:  6.9min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed:  7.3min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 11.3min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 11.9min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed: 13.2min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed: 16.9min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed: 17.6min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed: 19.0min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 21.6min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 23.1min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed: 23.9min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed: 26.3min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed: 31.0min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0150360537124	0.00339420397478	0.999779054353	0.00676543959285
	macro0.0150360537124	0.00335455205919	0.988317757009	0.00664077405127

For 1
	micro 0.996533505983	0.469391919467	0.254124318751	0.329733862105
	macro0.996533505983	0.16901242119	0.0801536858529	0.091084483813

For 2
	micro 0.995573968182	0.35944595822	0.408049786419	0.38220888521
	macro0.995573968182	0.14390020137	0.142416304231	0.124247876381

For 5
	micro 0.990591191922	0.203088260248	0.617027544557	0.305593551093
	macro0.990591191922	0.123175788465	0.254771615951	0.12975999553

For 3
	micro 0.994111432906	0.284772522097	0.499484460156	0.36273683929
	macro0.994111432906	0.135279557699	0.187193911013	0.131272775198



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fitting 3 folds for each of 20 candidates, totalling 60 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed:  4.9min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed:  5.1min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed:  5.3min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed: 11.0min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 11.1min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 11.6min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 16.4min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed: 16.5min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed: 17.3min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed: 19.9min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed: 23.3min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 28.2min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 32.5min
[Parallel(n_jobs=3)]: Done  14 tasks      | elapsed: 32.7min
[Parallel(n_jobs=3)]: Done  15 tasks      | elapsed: 40.5min
[Parallel(n_jobs=3)]: Done  16 tasks      | elapsed: 40.9min
[Parallel(n_jobs=3)]: Do

For all
	micro 0.0150360537124	0.00339420397478	0.999779054353	0.00676543959285
	macro0.0150360537124	0.00335455205919	0.988317757009	0.00664077405127

For 1
	micro 0.996508053396	0.464960081105	0.270216526734	0.34179514649
	macro0.996508053396	0.191712320083	0.0967655936002	0.112019457359

For 2
	micro 0.995546538695	0.360680920434	0.423663278833	0.389643377248
	macro0.995546538695	0.188725225325	0.168294013841	0.150933481114

For 5
	micro 0.990780603646	0.211297249291	0.639564000589	0.317650178779
	macro0.990780603646	0.131471668374	0.301671802576	0.158805345695

For 3
	micro 0.994172099023	0.293024987589	0.521652673442	0.375258278146
	macro0.994172099023	0.166121889534	0.222589622627	0.163030421328



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed: 60.6min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 60.8min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed: 62.3min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed: 203.7min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 204.0min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 206.9min


In [6]:
worker = Worker()
worker.set_res_folder('./for nti stats')
worker.load_w2v('./w2v50_viniti_ru_100k.model')
worker.load_data('./Samples/balanced_annotations_clear_17-07-17/train_clear_annotation_balanced_w2v_mean50_95k.csv', './Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_w2v_mean50_48k')
worker.set_rubr_id('rgnti')
worker.search_for_clf(SVC(), parameters_svc, description='svc_nti')
# worker.search_for_clf(RandomForestClassifier(), parameters_rfc, description='rfc_nti')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Fitting 3 folds for each of 6 candidates, totalling 18 fits


[Parallel(n_jobs=3)]: Done   1 tasks      | elapsed: 59.4min
[Parallel(n_jobs=3)]: Done   2 tasks      | elapsed: 62.0min
[Parallel(n_jobs=3)]: Done   3 tasks      | elapsed: 62.9min
[Parallel(n_jobs=3)]: Done   4 tasks      | elapsed: 171.4min
[Parallel(n_jobs=3)]: Done   5 tasks      | elapsed: 176.2min
[Parallel(n_jobs=3)]: Done   6 tasks      | elapsed: 177.6min
[Parallel(n_jobs=3)]: Done   7 tasks      | elapsed: 233.6min
[Parallel(n_jobs=3)]: Done   8 tasks      | elapsed: 237.4min
[Parallel(n_jobs=3)]: Done   9 tasks      | elapsed: 238.6min
[Parallel(n_jobs=3)]: Done  10 tasks      | elapsed: 316.7min
[Parallel(n_jobs=3)]: Done  11 tasks      | elapsed: 321.1min
[Parallel(n_jobs=3)]: Done  12 tasks      | elapsed: 323.4min
[Parallel(n_jobs=3)]: Done  13 tasks      | elapsed: 379.7min
[Parallel(n_jobs=3)]: Done  14 out of  18 | elapsed: 383.9min remaining: 109.7min
[Parallel(n_jobs=3)]: Done  15 out of  18 | elapsed: 388.3min remaining: 77.7min
[Parallel(n_jobs=3)]: Done  16 out

For 3
	micro 0.994937431355	0.369586756389	0.658538379492	0.473458055133
	macro0.994937431355	0.286952445702	0.42907183567	0.324603942827

For all
	micro 0.00345627567831	0.00345627567831	1.0	0.00688874196532
	macro0.00345627567831	0.00345627567831	1.0	0.00684176529966

For 5
	micro 0.991478359029	0.255656802266	0.766707124205	0.3834522846
	macro0.991478359029	0.200138403751	0.53233831685	0.279538944065

For 1
	micro 0.997050680348	0.620965785004	0.376475780647	0.468756439906
	macro0.997050680348	0.41418989626	0.208295961943	0.258464074444

For 2
	micro 0.996295076882	0.469747587231	0.558534701534	0.510307979233
	macro0.996295076882	0.351407055069	0.339084263426	0.322971906605



In [138]:
worker = Worker()
# worker.load_clf('temp_remake_folder/clf_model_ru_subj_very_nice_max50_v1_23_1_18.plk')
# worker.load_w2v('./w2v50_viniti_ru_100k.model')
# worker.vectorize('text article задача текст биология астрономия магия солнце', 'sum')
worker.set_res_folder('./temp_remake_folder_v2(for_rgnti_experiment)')
# worker.load_data('./Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_w2v_max50_48k_subj.csv')
# worker.load_data('./Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_rj_48k.csv')
worker.load_data('./temp_remake_folder_v2(for_rgnti_experiment)/w2v_vectors_60k_mean75_v1_7_3_18.csv')
# worker.create_w2v_model(75, 'test_start')
worker.load_w2v('./temp_remake_folder_v2(for_rgnti_experiment)/w2v_model_75_test_start_k_v1_7_3_18.model')
worker.conv_type = 'mean'
# worker.create_w2v_vectors(worker.data_test)

# worker.data_cleaning('./Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_rj_48k.csv', description='ddd')
worker.lang = 'ru'
# worker.rubr_id = 'subj'
worker.rubr_id = 'rgnti'

In [103]:
# worker.test_with_new_data('./Samples/balanced_annotations_clear_17-07-17/test_clear_annotation_balanced_rj_48k.csv')

'max 50'

'48600/48666'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


For all
	micro 0.984	0.044	0.705	0.083
	macro 0.984	0.001	0.016	0.001

For 5
	micro 0.996	0.153	0.6	0.244
	macro 0.996	0.002	0.014	0.004

For 3
	micro 0.998	0.217	0.531	0.308
	macro 0.998	0.003	0.012	0.005

For 2
	micro 0.998	0.277	0.47	0.349
	macro 0.998	0.004	0.01	0.005

For 1
	micro 0.999	0.394	0.36	0.376
	macro 0.999	0.005	0.008	0.006



In [135]:
X_train, X_test, y_train, y_test = worker.create_sets()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [23]:
clf, _ = worker.create_clf(LogisticRegression(), X_train, X_test, y_train, y_test, description='lr_xxx_test')

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
