In [1]:
import numpy as np
import pandas as pd
import datetime
import pickle
import os

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier

from nltk.corpus import stopwords

In [2]:
## ADDITIONAL FUNCTIONS
# Directory that receives the pickled vectorizers and classifiers.
# exist_ok=True replaces the separate exists()/makedirs() pair: it does nothing
# when the directory is already there and leaves no gap between the check and
# the creation.
os.makedirs('models', exist_ok=True)

# English + Russian stop words, concatenated into one list. Later passed to
# process_text_layer to filter tokens.
# NOTE(review): stopwords.words() needs the NLTK "stopwords" corpus to be
# installed locally - confirm it is downloaded in the target environment.
pStopWordsList = stopwords.words('english') + stopwords.words('russian')
    
# Characters deleted from the text once stop words have been removed:
# guillemets, round and square brackets, caret and backslash.
_CHARS_TO_STRIP = str.maketrans('', '', '«»()[]^\\')


def process_text_layer(TextLayer, pStopWordsList=None):
    '''Normalise one text layer for vectorization.

    The text is lower-cased and split on whitespace, tokens that are equal to
    a stop word are dropped, the remaining tokens are re-joined with single
    spaces and finally the characters « » ( ) [ ] ^ \\ are deleted.

    Stop words are matched BEFORE the characters are stripped, so a token
    such as "(the" is not treated as the stop word "the".

    Input:
        TextLayer (str): raw text.
        pStopWordsList (iterable of str, optional): stop words, compared
            against the lower-cased tokens. None (default) removes nothing.
    Output:
        str: the cleaned text. Tokens that consisted only of stripped
        characters leave consecutive spaces behind.
    '''
    # A frozenset gives O(1) membership tests instead of scanning a list for
    # every token.
    if pStopWordsList is None:
        stop_words = frozenset()
    else:
        stop_words = frozenset(pStopWordsList)

    kept_tokens = [w for w in TextLayer.lower().split() if w not in stop_words]

    # str.replace works on literal text, not regular expressions, so patterns
    # like '\[' never removed a bare bracket; translate() deletes each listed
    # character unconditionally.
    return " ".join(kept_tokens).translate(_CHARS_TO_STRIP)

def learnBinaryClassifier(ds, ngram_range=(1, 3), max_features=50000,
                          n_estimators=100, random_state=None):
    '''Classifier: fit a word n-gram CountVectorizer and a random forest on it.

    Input:
        ds (dataframe): column 'x' - text layer; column 'target' - label
        ngram_range (tuple): (min_n, max_n) word n-gram sizes for the vectorizer
        max_features (int): vocabulary size limit of the vectorizer
        n_estimators (int): number of trees in the forest
        random_state (int or None): seed for the forest; None (default) keeps
            the previous unseeded behaviour
    Output: (CountVectorizer, RandomForestClassifier), both fitted on ds
    '''
    print("Create vectorizer")
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 preprocessor=None,
                                 stop_words=None,
                                 ngram_range=ngram_range,
                                 max_features=max_features)

    print("Fit and Transform vectorizer")
    # The document-term matrix stays sparse: the forest accepts sparse input,
    # and a dense copy of a matrix with up to max_features columns can exhaust
    # memory for no benefit.
    X_train = vectorizer.fit_transform(ds['x'])
    Y_train = ds['target']

    model = RandomForestClassifier(n_estimators=n_estimators,
                                   random_state=random_state)

    print("Fit Model")
    model.fit(X_train, Y_train)

    return vectorizer, model

In [3]:
# Russian column headers of the label sheet -> English identifiers.
models_rename = {
    "Ограничение по предоставлению залога": "Restriction_on_the_provision_of_collateral",
    "Изменение контроля": "Change_of_control",
    "Случаи дефолта": "Cases_of_default",
    "Кросс-дефолт": "Cross-default",
    "Оговорки о коллективных действиях": "Collective_action_clauses",
    "Ограничение задолженности": "Limitation_on_indebtedness",
    "Ограничение задолженности дочерних компаний": "Limitation_of_debt_of_subsidiaries",
    "Ограничение по платежам": "Limitation_on_payments",
    "Ограничение по инвестициям": "Investment_restriction",
    "Ограничение по платежам в отношении дочерних компаний": "Limitation_on_payments_to_subsidiaries",
    "Ограничение по транзакциям с аффилированными лицами": "Restriction_on_transactions_with_affiliates",
    "Ограничение деятельности": "Restriction_of_activity",
    "Ограничение по продаже активов": "Restriction_on_asset_sales",
    "Ограничение по продаже активов с обратной арендой": "Restriction_on_the_sale_of_assets_with_leaseback",
    "Ограничение по слиянию": "Limitation_on_merger",
    "Триггер рейтингов": "Ratings_trigger",
    "Обозначение прав дочерних компаний (restricted / unrestricted)": "Designation_of_the_rights_of_subsidiaries",
    "Ограничение по наслоению долговых обязательств по рангам": "Restriction_on_the_layering_of_debt_obligations_by_rank",
    "Условие приостановки действия ковенантов": "A_condition_of_suspension_of_the_covenants",
    "Финансовые ковенанты": "Financial_covenants",
}

# Text layers; 'string_value' is forced to str so every row can be processed
# as text.
text_data = pd.read_csv('CLASS PROSPECTUS.csv')
text_data = text_data.assign(string_value=text_data['string_value'].astype(str))

# Label sheet, with its headers renamed in the same step.
class_data = (
    pd.read_excel('CLASS PROSPECTUS.xlsx', sheet_name='DATA')
    .rename(columns=models_rename)
)

In [4]:
# Reduce the size of the problem, because there is too much empty (zero) data:
# keep every tag present in class_data plus a random sample of the other rows.
N_UNLABELED_SAMPLE = 3000  # upper bound on rows sampled from tags absent in class_data
SAMPLE_SEED = 42           # fixed seed so a fresh run selects the same rows

tags = list(class_data['tag'])
unlabeled_rows = text_data[~text_data['tag'].isin(tags)]

# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so the request is capped at what exists.
new_tags = list(
    unlabeled_rows.sample(n=min(N_UNLABELED_SAMPLE, len(unlabeled_rows)),
                          random_state=SAMPLE_SEED)['tag']
)
new_tags = new_tags + tags
text_data = text_data[text_data['tag'].isin(new_tags)]

In [5]:
# Left-join the label columns onto the text rows by 'tag'; rows whose tag is
# missing from class_data get NaN labels. merge() already returns a new frame,
# so no extra copy is needed.
data = text_data.merge(class_data, on='tag', how='left')

# Clean the text column directly. A row-wise DataFrame.apply(axis=1) builds a
# Series per row only to read one field, and does not return a Series on an
# empty frame.
data['string_value'] = data['string_value'].map(
    lambda text: process_text_layer(text, pStopWordsList))

# Every column after the first two is treated as one model / label.
# NOTE(review): this assumes the first two columns of the merged frame are the
# tag and the text - confirm against the layout of the CSV.
models = list(data.columns)[2:]

In [6]:
# Build one frame per label column with the columns normalised to
# 'tag' / 'x' (text) / 'target' (label).
# While debugging, `models` can be narrowed to a single label before this
# loop, e.g. models = ['Change_of_control'].
datasets = {}
for model_name in models:
    renamed = data[['tag', 'string_value', model_name]].rename(
        columns={'string_value': 'x', model_name: 'target'})
    # Missing values become 0 in every column.
    filled = renamed.fillna(0)
    # Rows with an empty-string target are discarded, then exact duplicates.
    has_target = filled['target'] != ''
    datasets[model_name] = filled[has_target].drop_duplicates().copy()

In [7]:
validation_set_part = 0.2  # share of rows held out for validation
SPLIT_SEED = 42            # fixed seed so the split is reproducible on re-run

for model_name in models:
    full_set = datasets[model_name]
    if isinstance(full_set, dict):
        # This cell replaces each frame with a dict of splits; when the cell
        # is executed again the entry is already split, so it is left as-is
        # instead of failing on dict.sample.
        continue

    df_validation = full_set.sample(
        n=round(len(full_set) * validation_set_part),
        random_state=SPLIT_SEED)

    # Training keeps only rows whose tag does not occur in the validation
    # sample, so no tag is shared between the two sets.
    in_validation = full_set['tag'].isin(df_validation['tag'].values)

    datasets[model_name] = {
        'df_validation': df_validation.copy(),
        'df_training': full_set[~in_validation].copy(),
    }

In [8]:
# Train one classifier per label and pickle both fitted objects.
# NOTE(review): the file names contain a literal backslash, which is a path
# separator only on Windows. The evaluation cell opens the same literal
# names, so the two must be changed together if this is made portable.
for model_name in models:
    vector, model = learnBinaryClassifier(datasets[model_name]['df_training'])
    artifacts = (
        ('models\\CLASS_PROSPECTUS_%s.rft', model),    # fitted classifier
        ('models\\VECTOR_PROSPECTUS_%s.txt', vector),  # fitted vectorizer
    )
    for name_template, fitted_object in artifacts:
        with open(name_template % model_name, 'wb') as f:
            pickle.dump(fitted_object, f)

Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Transform vectorizer
Fit Model
Create vectorizer
Fit and Trans

In [9]:
# Predictions whose top class probability is below this level are counted as
# "to validate" instead of being scored.
confidence_level = 0
for model_name in models:
    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load artifacts that were written by this notebook.
    with open(('models\\CLASS_PROSPECTUS_%s.rft' % model_name), 'rb') as f:
        model = pickle.load(f)
    with open(('models\\VECTOR_PROSPECTUS_%s.txt' % model_name), 'rb') as f:
        vector = pickle.load(f)

    ds = datasets[model_name]['df_validation']
    n_rows = len(ds)
    if n_rows == 0:
        # Nothing to score; the percentages below would divide by zero.
        print('Model %s statistic: validation set is empty, skipped' % model_name)
        continue

    # Vectorize once and reuse the matrix for both calls.
    features = vector.transform(ds['x'])
    ds['predict'] = model.predict(features)
    # Highest class probability per row.
    ds['confidence'] = np.amax(model.predict_proba(features), axis=1)
    ds['target'] = ds['target'].map(lambda t: 'empty' if t is None else t)

    is_confident = ds['confidence'] >= confidence_level
    is_wrong = ds['target'] != ds['predict']
    n_confident_wrong = int((is_confident & is_wrong).sum())
    n_below_level = int((ds['confidence'] < confidence_level).sum())

    # The figure printed as "Precision" is 100 minus the percentage of all
    # validation rows that are both confident and wrong, i.e. an
    # accuracy-style number rather than classification precision.
    precision = round(100 - 100 * n_confident_wrong / n_rows, 2)
    to_validate = round(100 * n_below_level / n_rows, 2)

    print('Model %s statistic (validation rows %s): Precision= %s ; To_Validate=%s  at Confidence=%s' % (model_name, n_rows, precision, to_validate, confidence_level))

Model Restriction_on_the_provision_of_collateral statistic (validation rows 685): Precision= 98.83 ; To_Validate=0.0  at Confidence=0
Model Change_of_control statistic (validation rows 685): Precision= 99.12 ; To_Validate=0.0  at Confidence=0
Model Cases_of_default statistic (validation rows 685): Precision= 98.98 ; To_Validate=0.0  at Confidence=0
Model Cross-default statistic (validation rows 685): Precision= 98.54 ; To_Validate=0.0  at Confidence=0
Model Collective_action_clauses statistic (validation rows 685): Precision= 98.54 ; To_Validate=0.0  at Confidence=0
Model Limitation_on_indebtedness statistic (validation rows 685): Precision= 99.71 ; To_Validate=0.0  at Confidence=0
Model Limitation_of_debt_of_subsidiaries statistic (validation rows 685): Precision= 99.85 ; To_Validate=0.0  at Confidence=0
Model Limitation_on_payments statistic (validation rows 685): Precision= 99.71 ; To_Validate=0.0  at Confidence=0
Model Investment_restriction statistic (validation rows 685): Precisi