In [1]:
import pandas as pd
from sklearn.externals import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [2]:
## Load data
# Intent corpus: every distinct label gets an integer id, numbered in the
# order in which unique() returns the labels.
intent_train = pd.read_csv('../data/intent_train.csv')
intent_labels = intent_train['label'].unique()
intent_label_map = dict(zip(intent_labels, range(len(intent_labels))))
print(intent_label_map)

# Support corpus: same label -> id encoding as above.
support_train = pd.read_csv('../data/support_train.csv')
support_labels = support_train['label'].unique()
support_label_map = dict(zip(support_labels, range(len(support_labels))))
print(support_label_map)

# Call-center table, indexed by its 'id' column.
callcenter_train = pd.read_csv('../data/callcenter_train.csv', index_col='id')

{'FAQ - тарифы и услуги': 0, 'мобильная связь - тарифы': 1, 'Мобильный интернет': 2, 'FAQ - интернет': 3, 'тарифы - подбор': 4, 'Баланс': 5, 'Мобильные услуги': 6, 'Оплата': 7, 'Личный кабинет': 8, 'SIM-карта и номер': 9, 'Роуминг': 10, 'запрос обратной связи': 11, 'Устройства': 12, 'мобильная связь - зона обслуживания': 13}
{'positive': 0, 'neutral': 1, 'negative': 2}


In [3]:
## Prepare data
# Text tasks: missing texts become the literal 'none', everything is
# lower-cased, and labels are replaced by their integer ids.
X_intent, y_intent = (
    intent_train['text'].fillna('none').str.lower(),
    intent_train['label'].map(intent_label_map),
)
X_support, y_support = (
    support_train['text'].fillna('none').str.lower(),
    support_train['label'].map(support_label_map),
)

# Call-center columns that are converted to Timedelta offsets below.
time_columns = [
    'Время окончания разговора с оператором',
    'Время переключения на оператора',
    'Время постановки в очередь',
    'Время окончания вызова',
    'Время начала вызова',
]

# Reference instant: it fills missing values and is subtracted from every
# parsed value, so a missing entry ends up as a zero offset.
start_time = pd.Timestamp('00:00:00')

# NOTE: this section mutates callcenter_train in place and finally replaces
# it with a reduced frame, so the cell cannot be re-run without re-running
# the load cell first.
for time_col in time_columns:
    as_timestamp = callcenter_train[time_col].fillna(start_time).apply(pd.Timestamp)
    callcenter_train[time_col] = as_timestamp - start_time

# Absolute gap between every unordered pair of time columns.
for first_pos in range(len(time_columns) - 1):
    first_col = time_columns[first_pos]
    for second_col in time_columns[first_pos + 1:]:
        gap = callcenter_train[first_col] - callcenter_train[second_col]
        callcenter_train[f'{first_col}-{second_col}'] = gap.abs()

# Every column except the label and the duration gets a '<name>_seconds' twin.
# NOTE(review): `.seconds` is the seconds component of a Timedelta (whole days
# are not included), not total_seconds() - confirm that this is intended.
label_and_duration = ['Метка', 'Длительность разговора с оператором, сек']
for delta_col in callcenter_train.columns.drop(label_and_duration):
    callcenter_train[f'{delta_col}_seconds'] = callcenter_train[delta_col].apply(
        lambda delta: delta.seconds
    )

# Keep only the label, the duration and the derived '*seconds*' columns.
seconds_columns = [name for name in callcenter_train.columns if 'seconds' in name]
callcenter_train = callcenter_train[label_and_duration + seconds_columns]

y_callcenter = callcenter_train['Метка']
X_callcenter = callcenter_train.drop('Метка', axis=1)

In [4]:
def generate_model(params):
    """Build an unfitted TF-IDF -> logistic-regression pipeline.

    Parameters
    ----------
    params : mapping
        Must provide 'analyzer', 'range_min', 'range_max', 'min_df' and
        'max_features' (forwarded to ``TfidfVectorizer``; the two range
        values form ``ngram_range``) and 'C' (forwarded to
        ``LogisticRegression``).

    Returns
    -------
    Pipeline
        Two steps, named 'tfidf' and 'lr'.
    """
    vectorizer_kwargs = {
        'analyzer': params['analyzer'],
        'ngram_range': (params['range_min'], params['range_max']),
        'min_df': params['min_df'],
        'max_features': params['max_features'],
    }
    vectorizer = TfidfVectorizer(**vectorizer_kwargs)

    # Everything except C is fixed for all text models in this notebook.
    classifier_kwargs = {
        'C': params['C'],
        'solver': 'liblinear',
        'class_weight': 'balanced',
        'random_state': 42,
        'n_jobs': 1,
    }
    classifier = LogisticRegression(**classifier_kwargs)

    steps = [('tfidf', vectorizer), ('lr', classifier)]
    return Pipeline(steps)

In [5]:
## Fit all models
# Hyper-parameters for the two text pipelines (consumed by generate_model).
intent_params = dict(
    C=10,
    analyzer='char_wb',
    max_features=7500,
    min_df=5,
    range_max=4,
    range_min=3,
)
support_params = dict(
    C=1,
    analyzer='char',
    max_features=10000,
    min_df=7,
    range_max=4,
    range_min=4,
)
# Keyword arguments handed to CatBoostClassifier.
# NOTE(review): early_stopping_rounds is set, but fit() below is called without
# an eval_set, so early stopping presumably has nothing to monitor - confirm.
callcenter_params = dict(
    loss_function='Logloss',
    custom_metric='F1',
    iterations=2000,
    early_stopping_rounds=20,
    verbose=False,
    random_state=42,
    learning_rate=0.03,
    border_count=200,
    depth=6,
    l2_leaf_reg=3,
)

# Build and train each model on its own training set.
intent_model = generate_model(intent_params)
intent_model.fit(X_intent, y_intent)

support_model = generate_model(support_params)
support_model.fit(X_support, y_support)

# Kept as the last expression so the cell still displays the fitted model.
callcenter_model = CatBoostClassifier(**callcenter_params)
callcenter_model.fit(X_callcenter, y_callcenter)

<catboost.core.CatBoostClassifier at 0x10b6155c0>

In [6]:
## Dump all models
# Serialize each fitted model into the solution directory.
for fitted_model, target_path in (
    (intent_model, '../solution/intent_model'),
    (support_model, '../solution/support_model'),
):
    joblib.dump(fitted_model, target_path)

# The final dump stays a bare expression so its return value is displayed.
joblib.dump(callcenter_model, '../solution/callcenter_model')

['../solution/callcenter_model']