In [8]:
import sys
import pandas as pd
from datetime import datetime
import re
import collections
import pymorphy2
import warnings
import numpy as np
import matplotlib.pyplot as plt
import logging, datetime, os
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC

#%config Application.log_level="INFO"
#logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
warnings.simplefilter('ignore')
morph = pymorphy2.MorphAnalyzer()

In [9]:
def delete_punctuation(s):
    return ' '.join((re.sub(r'[№"\'-_/.:?!1234567890()%<>;,+#$&\s+]', u' ', s)).split())

def lemmatize(s):    
    return ' '.join([morph.parse(w)[0].normal_form for w in s.split()])

##Получаем слово из словаря по индексу

def get_word_from_dict(dict_name,index):
    return list(dict_name.keys())[list(dict_name.values()).index(int(index))]    

# Создаем класс "Мусор"                   
# n - количество топовых в классе слов, наличие которых проверяется в тексте
# Если ни одного из этих слов нет, то текст отправляется в "мусор"

def create_data_with_garbage(data,n):    
    
    #Создаем словари классов и слов:

    class_indexes = {}
    for i in range(len((set(data['CLASS'])))):
        class_indexes[sorted(set(data['CLASS']))[i]] = i

    cv = CountVectorizer(stop_words=stopwords)    
    word_count_vector = cv.fit_transform(data['TEXT'].tolist())       
    word_indexes = cv.vocabulary_

    #Считаем сколько раз слова встречаются в каждом классе

    class_word_count = np.empty((0,3))

    for class_name in class_indexes:

        text_class = data.loc[data['CLASS'] == class_name]['TEXT'].tolist()  

        # Инициализируем и подгоняем CV:

        cv = CountVectorizer(max_df = 0.5,min_df = 20,stop_words=stopwords)    
        word_count_vector = cv.fit_transform(text_class)           

        # Заполняем массивы для подсчета встерчаемости слов в классах:   

        class_word_count = np.append(class_word_count,[[class_indexes.get(class_name),word_indexes.get(word),word_count_vector.sum(axis = 0)[0, idx]] for word, idx in cv.vocabulary_.items()],axis=0) 
    
    # выделяем мусорный класс
    
    top_words = []
    
    data_new  = data.copy()
    
    for c in set(class_word_count.transpose()[0]):
        
        class_name = get_word_from_dict(class_indexes,c)    
            
        for clss,word,cnt in np.array(pd.DataFrame(class_word_count[class_word_count.transpose()[0] == c]).sort_values(by = [2],ascending = False))[:n]:            
            word = get_word_from_dict(word_indexes,word) 

            top_words.append(word)
    
    top_words = set(top_words)       
    
    for index,row in data_new.iterrows():                              

        if not top_words.isdisjoint(row['TEXT'].split(' ')):                
            pass            
        else:                
            data_new.set_value(index,'CLASS','Мусор')
            
    return data_new      

## Исходные данные 

In [11]:
from stop_words import get_stop_words
import nltk
nltk.download('stopwords')

with open('stopwords.txt','r', encoding='cp1251') as f:
    stopwords_txt = f.read().splitlines() 
    
with open('names.txt','r', encoding='cp1251') as f:
    stopnames = f.read().splitlines()     

stopwords = stopwords_txt + get_stop_words('ru')  + nltk.corpus.stopwords.words('russian') + stopnames    
    
#learning_file = 'naznach_for_24_classes_lemmatized.csv'   
#training_file = 'naznach_for_24_classes_lemmatized.csv'

#learning_file = 'naznach_for_24_classes_1_product_inn_v4_lemmatized.csv'   
#training_file = 'naznach_for_24_classes_1_product_inn_v4_lemmatized.csv'

#learning_file = 'naznach_for_24_classes_1_product_inn_v5_lemmatized.csv'   
#training_file = 'naznach_for_24_classes_1_product_inn_v5_lemmatized.csv'

learning_file = 'data_clean.csv'
learning_file = 'data_clean.csv'

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ivan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Обучение

In [12]:
# Инициализация массива даных:
    
data = pd.read_csv(learning_file, encoding='utf-8', delimiter=';')
data = data.dropna(subset=['TEXT'])
data = data.drop_duplicates().reset_index(drop = True)
data_new = create_data_with_garbage(data,100)

In [None]:
set(data_new[data_new['CLASS']=='Мусор']['TEXT'].values[:100])

In [None]:
data_new['TEXT'].values

In [7]:
# Обучение на всей выборке

text_clf_lr = Pipeline([('tfidf',TfidfVectorizer(stop_words = stopwords)),('clf',OneVsRestClassifier(LinearSVC()))])
text_clf_lr = text_clf_lr.fit(data_new['TEXT'],data_new['CLASS'])
predicted_values = text_clf_lr.predict(data_new['TEXT'])
print(metrics.classification_report(data_new['CLASS'],predicted_values,target_names = data_new['CLASS'].unique()))

                                                        precision    recall  f1-score   support

                                                 Мусор       0.75      0.89      0.82     88044
                                          Автозапчасти       0.93      0.85      0.89    132987
                                            Автомобили       0.65      0.60      0.62      2772
                                       Бытовая техника       0.84      0.64      0.73     21583
                                         Бытовая химия       0.99      0.99      0.99   2676928
                                                   ГСМ       0.84      0.54      0.66     64235
                                          Канцелярский       0.96      0.98      0.97   1061975
                             Компьютерное оборудование       0.86      0.87      0.86     22330
                                                Мебель       0.99      0.98      0.98   1420402
                                       

## Классификация

In [10]:
data = pd.read_csv(training_file, encoding='utf-8', delimiter=';')
data = data.dropna(subset=['TEXT'])
data_new = create_data_with_garbage(data,50)
data_new['PREDICTED'] = text_clf_lr.predict(data_new['TEXT'])
print(metrics.classification_report(data_new['CLASS'],data_new['PREDICTED'],target_names = data_new['CLASS'].unique()))

                                                        precision    recall  f1-score   support

                                                 Мусор       0.71      0.89      0.79     83011
                                          Автозапчасти       0.84      0.85      0.85    121150
                                            Автомобили       0.64      0.60      0.62      2660
                                       Бытовая техника       0.79      0.67      0.72     19697
                                         Бытовая химия       0.97      0.99      0.98   2632677
                                                   ГСМ       0.80      0.54      0.64     61052
                                          Канцелярский       0.86      0.98      0.92    948582
                             Компьютерное оборудование       0.81      0.89      0.85     20785
                                                Мебель       0.96      0.99      0.97   1373192
                                       

In [14]:
data_new[data_new['PREDICTED']=='ГСМ'][['TEXT','PREDICTED']] \
.to_csv('sample_gsm_client_payments.csv',sep = ';',index=False,encoding='cp1251')

In [None]:
data_new[data_new['PREDICTED']=='ГСМ'][['TEXT','PREDICTED']].sample(n=100)

## Классифицируем клиентов с несколькими продуками

In [None]:
data = pd.read_csv('naznach_for_24_classes_v5.csv', encoding='utf-8', delimiter=';')
data = data.rename(columns={'class':'CLASS','text':'TEXT'})
data = data.dropna(subset=['TEXT'])
data_new = create_data_with_garbage(data,50)

In [None]:
data_new['PREDICTED'] = text_clf_lr.predict(data_new['TEXT'])
print(metrics.classification_report(data_new['CLASS'],data_new['PREDICTED'],target_names = data_new['PREDICTED'].unique()))

## Формируем выборку для кластеризации:

In [40]:
data_new[['INN','SUMMA','PREDICTED']].to_csv('clients_clustering_v5.csv',sep = ';',index=False,encoding='cp1251')

## Сэмпл данных

In [69]:
data_new[data_new['CLASS']=='Канцелярский'].sample(n=200).to_csv('data_sample_canc.csv',sep = ';',index=False,encoding='cp1251')

## Классификация текстов платежек случайных клиентов

In [299]:
training_file = 'new_part_test_data_lemmatized.csv'
data_random_clients = pd.read_csv(training_file, encoding='utf-8', delimiter=';')
data_random_clients = data_random_clients.dropna(subset=['text'])
data_random_clients['PREDICTED'] = text_clf_lr.predict(data_random_clients['text'])

In [None]:
pd.options.display.max_colwidth = 200
data_random_clients[data_random_clients['PREDICTED']!='Мусор'][['text','PREDICTED']].sample(n=100)

In [174]:
len(data_random_clients['inn'].drop_duplicates())

168390

In [93]:
len(data_random_clients[data_random_clients['PREDICTED']!='Мусор']['inn'].drop_duplicates())

40765

In [99]:
data_random_clients['COUNT'] = 1

data_random_clients_pivot = pd.pivot_table(data_random_clients,
               values=['COUNT'],
               index = ['inn','PREDICTED'],
               aggfunc={
                   'COUNT' : np.sum,
               }).unstack().fillna(0)

In [113]:
labels_not_garbage = [x for x in list(data_random_clients_pivot.columns) 
                      if 'Мусор' not in x                                                               
                     ]

label_garbage = [x for x in list(data_random_clients_pivot.columns) 
                      if 'Мусор' in x                                                               
                     ]

data_random_clients_pivot['SUM_NOT_GARBAGE'] = data_random_clients_pivot[labels_not_garbage].sum(axis=1) 
data_random_clients_pivot['GARBAGE'] = data_random_clients_pivot[label_garbage].values
data_random_clients_pivot['GARBAGE %'] = data_random_clients_pivot['GARBAGE'] / (data_random_clients_pivot['SUM_NOT_GARBAGE'] + data_random_clients_pivot['GARBAGE'])

In [167]:
data_random_clients_pivot['GARBAGE %'].mean()

0.8628561583026206

In [168]:
from loader import Loader
import os, sys

        
file= 'tmp_select_from_triggers.csv'
query = 'select distinct inn from atb_segmen_tr_all_t'
print(query)

data=Loader(True).save_csv(query, path=file, verbose=1)   
data_triggers = pd.read_csv('tmp_select_from_triggers.csv', encoding='utf-8', delimiter=';')

select distinct inn from atb_segmen_tr_all_t
Connecting...
Getting data ... 
Downloaded 10,000 lines,       0sec. passed
Downloaded 20,000 lines,       0sec. passed
Downloaded 30,000 lines,       0sec. passed
Downloaded 40,000 lines,       0sec. passed
Downloaded 50,000 lines,       0sec. passed
Downloaded 60,000 lines,       0sec. passed
Downloaded 70,000 lines,       0sec. passed
Downloaded 80,000 lines,       0sec. passed
Downloaded 90,000 lines,       1sec. passed
Downloaded 100,000 lines,       1sec. passed
Downloaded 110,000 lines,       1sec. passed
Downloaded 120,000 lines,       1sec. passed
Downloaded 130,000 lines,       1sec. passed
Downloaded 140,000 lines,       1sec. passed
Downloaded 150,000 lines,       1sec. passed
Downloaded 160,000 lines,       1sec. passed
Downloaded 170,000 lines,       1sec. passed
Downloaded 180,000 lines,       1sec. passed
Downloaded 190,000 lines,       2sec. passed
Downloaded 200,000 lines,       2sec. passed
Downloaded 210,000 lines,       

Downloaded 1,800,000 lines,      12sec. passed
Downloaded 1,810,000 lines,      13sec. passed
Downloaded 1,820,000 lines,      13sec. passed
Downloaded 1,830,000 lines,      13sec. passed
Downloaded 1,840,000 lines,      13sec. passed
Downloaded 1,850,000 lines,      13sec. passed
Downloaded 1,860,000 lines,      13sec. passed
Downloaded 1,870,000 lines,      13sec. passed
Downloaded 1,880,000 lines,      13sec. passed
Downloaded 1,890,000 lines,      13sec. passed
Downloaded 1,900,000 lines,      13sec. passed
Downloaded 1,910,000 lines,      13sec. passed
Downloaded 1,920,000 lines,      13sec. passed
Downloaded 1,930,000 lines,      13sec. passed
Downloaded 1,940,000 lines,      13sec. passed
Downloaded 1,950,000 lines,      13sec. passed
Downloaded 1,960,000 lines,      14sec. passed
Downloaded 1,970,000 lines,      14sec. passed
Downloaded 1,980,000 lines,      14sec. passed
Downloaded 1,990,000 lines,      14sec. passed
Downloaded 2,000,000 lines,      14sec. passed
Downloaded 2,

In [172]:
len(pd.merge(data_triggers,pd.DataFrame(data_random_clients[data_random_clients['PREDICTED']!='Мусор']['inn'].drop_duplicates()),on = 'inn')['inn'].drop_duplicates())

19903

In [175]:
len(pd.merge(data_triggers,pd.DataFrame(data_random_clients['inn'].drop_duplicates()),on = 'inn')['inn'].drop_duplicates())

40708

In [243]:
file_1 = 'new_part_test_data.csv'
data_random_clients_src = pd.read_csv(file_1, encoding='utf-8', delimiter=';')
data_random_clients_src = data_random_clients_src.dropna(subset=['text'])

In [241]:
data_random_clients_src['PREDICTED'] = data_random_clients['PREDICTED']

In [259]:
data_payment_sample = data_random_clients[data_random_clients['inn'].isin([ 101000776  #питание
                                                     ,27814792826#гсм
                                                     ,27814973163#фин аудит
                                                     ,27815395349#автомобили
                                                     ,27815497012#электротовары
                                                     ,27615946895#хозяйственный
                                                    ])]

In [261]:
data_payment_sample['text'] = data_random_clients_src[data_random_clients_src['inn'].isin([ 101000776  #питание
                                                     ,27814792826#гсм
                                                     ,27814973163#фин аудит
                                                     ,27815395349#автомобили
                                                     ,27815497012#электротовары
                                                     ,27615946895#хозяйственный
                                                    ])]['text']

In [265]:
data_payment_sample[['inn','text','PREDICTED']].sort_values(by = ['inn','PREDICTED']) \
.to_csv('sample_client_payments.csv',sep = ';',index=False,encoding='cp1251')

In [None]:
# Инициализация массива даных:
    
# data2 = pd.read_csv('naznach_for_24_classes_1_product_inn_v5_lemmatized.csv', encoding='utf-8', delimiter=';')
# data2 = data.dropna(subset=['TEXT'])
# #data = data.drop_duplicates().reset_index(drop = True)
# data_new2 = create_data_with_garbage(data,100)

# Обучение на всей выборке

text_clf_lr2 = Pipeline([('tfidf',TfidfVectorizer(stop_words = stopwords,min_df = 1000,max_df = 0.6)),('clf',OneVsRestClassifier(LinearSVC()))])
text_clf_lr2 = text_clf_lr2.fit(data_new2['TEXT'],data_new2['CLASS'])
predicted_values2 = text_clf_lr2.predict(data_new2['TEXT'])
print(metrics.classification_report(data_new2['CLASS'],predicted_values2,target_names = data_new2['CLASS'].unique()))

#data_random_clients2 = pd.read_csv('new_part_test_data_lemmatized.csv', encoding='utf-8', delimiter=';')
data_random_clients2 = pd.read_csv('new_part_test_data.csv', encoding='utf-8', delimiter=';')
data_random_clients2 = data_random_clients2.dropna(subset=['text'])
data_random_clients2['PREDICTED'] = text_clf_lr2.predict(data_random_clients2['text'])

pd.options.display.max_colwidth = 200
data_random_clients2[data_random_clients2['PREDICTED']!='Мусор'][['text','PREDICTED']].sample(n=100)