In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import re
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import eli5
import warnings
warnings.filterwarnings("ignore")

# Read data

In [2]:
data = pd.read_csv('D:/tele2_data_hack/email_reports.csv')
data = data.sort_values(by='To.queue.time').reset_index(drop=True)
print(data.shape)

(125938, 10)


In [3]:
data.head()

Unnamed: 0,SENDER_ID,SENDER_EMAIL,OPERATOR_ID,ACCEPTOR,SUBJECT,To.queue.time,agent.receive.time,treatment.end.time,CONTENT,ANSWERTEXT
0,token_7747,xxxx@yandex.ru,token_48,spb.vopros@tele2.ru,Обратная связь,2088-06-17 02:31:32,2088-06-18 19:48:26,2088-06-18 20:24:59,Тело письма\r\n\r\nMSISDN: +xxx xxx xx xx\r\nО...,"\r\n\r\nЗдравствуйте, xxxxx xxxxx xxxxx!\r\n\r..."
1,token_14226,xxxx@mail.ru,token_50,Saransk.Vopros@tele2.ru,Re[2]: Обратная связь,2088-06-17 02:31:52,2088-06-18 18:43:45,2088-06-18 18:58:43,\r\nвся необходимая информация отображена в ...,"Здравствуйте, xxxxx xxxxx xxxxx!\r\nБлагодарим..."
2,token_23942,xxxx@mail.ru,token_3,rostov.vopros@tele2.ru,Обратная связь,2088-06-17 02:33:14,2088-06-18 18:43:45,2088-06-18 18:47:06,Тело письма\r\n\r\nMSISDN: +xxx xxx xx xx\r\nО...,"\r\n\r\nЗдравствуйте, xxxxx xxxxx xxxxx!\r\nБл..."
3,token_18368,xxxx@mail.ru,token_53,krasnodar.vopros@tele2.ru,Обратная связь,2088-06-17 02:33:36,2088-06-18 19:49:10,2088-06-18 19:53:57,Тело письма\r\n\r\nMSISDN: +xxx xxx xx xx\r\nО...,"\r\n\r\nЗдравствуйте, xxxxx xxxxx xxxxx! ¦\r\n..."
4,token_21039,xxxx@mail.ru,token_34,kostroma.vopros@tele2.ru,Обратная связь,2088-06-17 02:33:56,2088-06-18 19:49:17,2088-06-18 19:59:48,"Тело письма\r\n\r\nОтметьте, в каком регионе В...","\r\n\r\nАлександр, доброго времени суток! ;)\r..."


# determine fraction of each problem 

In [4]:
# for training classifier use only letters for support 
print(data.shape)
data = data[data['CONTENT'].apply(lambda x: 'Тело письма' in str(x))].reset_index(drop=True)
print(data.shape)

(125938, 10)
(102898, 10)


### Get question from user and topic of this question 

In [5]:
def get_topic(x):
    """""
    Get topic from letter's body  
    """""
    ret = ''
    try:
        ret = x.split('Выберите тему обращения: ')[1].split('\r')[0]
    except:
        pass
    return ret

def get_question(x):
    """""
    Get question from letter's body
    """""
    ret = ''
    try:
        ret = x.split('Выберите тему обращения: ')[1].split('\r')[1].split('\nЗадайте ваш вопрос или опишите проблему: ')[1]
    except:
        pass
    return ret


data['topic'] = data['CONTENT'].apply(lambda x: get_topic(x))
data['question'] = data['CONTENT'].apply(lambda x: get_question(x))

### Statistics of topics 

In [6]:
# calculate statisitics of topic 
temp = data[['topic','CONTENT']].groupby('topic').agg('count').reset_index()
temp = temp.rename_axis({'CONTENT':'Number of questions'},axis=1)
num_q = np.sum(temp['Number of questions'])
temp = temp[temp['Number of questions']>1200].reset_index(drop=True)

temp['Topic fraction'] = temp['Number of questions'] / num_q * 100
print('Most popular topics fraction: ', np.sum(temp['Topic fraction']), '%')

Most popular topics fraction:  97.50335283484615 %


In [7]:
temp.sort_values(by='Topic fraction', ascending=False)

Unnamed: 0,topic,Number of questions,Topic fraction
8,прочее,24434,23.745845
10,тарифыИУслуги,18770,18.241365
3,мобильныйИнтернет,15918,15.469688
6,подключенныеУслуги,9112,8.855371
2,личныйКабинет,7379,7.171179
9,роумингИЗонаОбслуживания,6732,6.542401
7,поступлениеПлатежейОшибочныеПлатежи,6045,5.87475
1,детализацияСчётаИлиЗвонков,5474,5.319831
0,блокировкаИРазблокировкаSimКарты,2412,2.344069
5,подключениеКTele2,2398,2.330463


In [8]:
most_freq_topics = list(temp['topic'])

In [9]:
# keep only freq topics 
data = data[data['topic'].apply(lambda x: x in most_freq_topics)].reset_index(drop=True)

### Prepare data for test classification 

In [10]:
# split texts in train and test part, time-series split, i.e. shuffle=False (data was sorted by time before) 
X_train, X_test, y_train, y_test = train_test_split(data['question'], data['topic'], 
                                                    shuffle=False, 
                                                    test_size=0.1)

In [11]:
# transfrom 'string' labels in 'int' labels 
le = LabelEncoder()
y_train_labels = le.fit_transform(y_train)
y_test_labels = le.transform(y_test)

In [12]:
# del all 'xx', 'xxx', 'xxxx' - i.e. artefacts of tele2 data protection
xs_seq = []
for n_x in np.arange(10,2,-1):
    xs_seq.append('x'*n_x)
    
def preprocess_string(x):
    for xs in xs_seq:
        x = re.sub(xs, ' ', x)
    return x

X_train = X_train.apply(lambda x: preprocess_string(x))
X_test = X_test.apply(lambda x: preprocess_string(x))

In [13]:
# download stopwords from NLTK 
stopWords = set(stopwords.words('russian'))
stopWords = list(stopWords)
stopWords = stopWords + ['вечер', 'добрый', 'день', 'скажите', 'пожалуйста', 'спасибо', 'ли', 'почему', 'вопрос']
stopWords.remove('не')
stopWords = set(stopWords)
print('Number of stop words:', len(stopWords))

Number of stop words: 158


In [14]:
# fit TFIDF
tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=100, max_df=0.99, stop_words=stopWords)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [15]:
tfidf.vocabulary_

{'пока не': 1556,
 'подписку tuboteka': 1537,
 'офисе': 1343,
 'дело': 443,
 'кому': 766,
 'давно': 400,
 'поселке': 1640,
 'ул': 2302,
 'вообще не': 316,
 'писала': 1432,
 'прошу вернуть': 1794,
 'думал': 532,
 'вижу': 282,
 'разы выше': 1842,
 'живу': 547,
 'настроить': 987,
 'интернета': 679,
 'тт': 2274,
 'какие условия': 724,
 'постоянные': 1650,
 'регул': 1862,
 'полное': 1564,
 'поэтому': 1661,
 'минут гб': 864,
 'суд': 2166,
 'которую': 784,
 'безлимитный': 209,
 'телефона': 2242,
 'приложение теле2': 1696,
 'произошло списание': 1760,
 'раньше': 1848,
 'вернуть денежные': 262,
 'оказывается': 1227,
 'звонить': 602,
 'услугу которую': 2334,
 'получить пароль': 1584,
 'здравствуйте хочу': 636,
 'сегодня': 1951,
 'стороны': 2163,
 'перезагрузка': 1397,
 'оператора это': 1236,
 'никакого': 1123,
 'вашими': 246,
 'такие': 2192,
 'деньги счета': 460,
 'вами': 233,
 'нахожусь': 995,
 'не перенесли': 1039,
 'перевел': 1387,
 'входа личный': 342,
 'мтс': 963,
 'нечаянно': 1111,
 'согла

In [16]:
X_train_tfidf.shape, X_test_tfidf.shape

((90296, 2438), (10033, 2438))

In [17]:
# fit logistic regression 
ls = LogisticRegression(solver='saga', verbose=1, n_jobs=-1)
ls.fit(X_train_tfidf, y_train_labels)

convergence after 18 epochs took 32 seconds
convergence after 18 epochs took 32 seconds
convergence after 19 epochs took 34 seconds
convergence after 19 epochs took 34 seconds
convergence after 20 epochs took 36 seconds
convergence after 21 epochs took 37 seconds
convergence after 21 epochs took 37 seconds
convergence after 21 epochs took 37 seconds


[Parallel(n_jobs=-1)]: Done   8 out of  11 | elapsed:   37.5s remaining:   14.0s


convergence after 17 epochs took 10 seconds
convergence after 21 epochs took 8 seconds
convergence after 24 epochs took 10 seconds


[Parallel(n_jobs=-1)]: Done  11 out of  11 | elapsed:   42.8s finished


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=-1,
          penalty='l2', random_state=None, solver='saga', tol=0.0001,
          verbose=1, warm_start=False)

In [18]:
# check accuracy of model on test set 
y_hat = ls.predict(X_test_tfidf)
acc_test = np.mean(y_hat==y_test_labels)

print('Accuracy on test data:', acc_test)

Accuracy on test data: 0.6052028306588259


# Visualization of predictions 

In [19]:
def show_examples(id_example):
    
    text = X_test.reset_index(drop=True).iloc[id_example]
    predicted_label = le.inverse_transform(y_hat[id_example])
    true_label = le.inverse_transform(y_test_labels[id_example])

    print('Id:', id_example)
    print('True label:', true_label)
    print('Predicted label:', predicted_label)
    #print('Text:\n', text)
    display(eli5.show_prediction(ls, 
                         X_test.reset_index(drop=True).iloc[id_example], 
                         vec=tfidf,
                         #feature_names=list(tfidf.vocabulary_),
                         target_names=list(le.classes_),
                         targets=[predicted_label]))

In [20]:
# have a look at examples of text classification 
for idd in [0, 16, 2018, 10002, 10024]:
    show_examples(idd)

Id: 0
True label: прочее
Predicted label: прочее


Contribution?,Feature
0.864,Highlighted in text (sum)
-0.851,<BIAS>


Id: 16
True label: мобильныйИнтернет
Predicted label: мобильныйИнтернет


Contribution?,Feature
1.667,Highlighted in text (sum)
-2.097,<BIAS>


Id: 2018
True label: подключенныеУслуги
Predicted label: подключенныеУслуги


Contribution?,Feature
3.763,Highlighted in text (sum)
-2.717,<BIAS>


Id: 10002
True label: мобильныйИнтернет
Predicted label: прочее


Contribution?,Feature
0.183,Highlighted in text (sum)
-0.851,<BIAS>


Id: 10024
True label: мобильныйИнтернет
Predicted label: мобильныйИнтернет


Contribution?,Feature
1.63,Highlighted in text (sum)
-2.097,<BIAS>
