# Packages and imports

In [71]:
!pip install names-dataset
!pip install natasha

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [72]:
import pandas as pd
import numpy as np
from names_dataset import NameDataset, NameWrapper

from natasha import (
    Segmenter,
    MorphVocab,
    NewsEmbedding,
    NewsMorphTagger,
    NewsSyntaxParser,
    Doc
)

# Names dataset; queried below through nd.search(token) to decide whether a
# token next to an introduction word is a personal name.
nd = NameDataset()
# Natasha components used by name_from_line: the segmenter splits the text into
# tokens, and the tagger/parser are both built on the same news embedding.
segmenter = Segmenter()
# NOTE(review): morph_vocab is not referenced in the visible cells - confirm it
# is needed further down before keeping it.
morph_vocab = MorphVocab()
emb = NewsEmbedding()
morph_tagger = NewsMorphTagger(emb)
syntax_parser = NewsSyntaxParser(emb)

# Dataset preparation

In [73]:
# Load the dialog transcripts; the cells below read the 'role', 'dlg_id' and
# 'text' columns of this frame.
test_data = pd.read_csv('test_data.csv')

In [74]:
# NOTE: the 'client' and 'manager' labels are mixed up in the 'role' column,
# so the rows labelled 'client' are selected here as the manager's replicas.
data_manager = test_data[test_data['role']=='client'] 

In [75]:
# Distinct dialog ids found among the manager rows; the bare name on the last
# line displays them as the cell output.
dialogs_indexes = data_manager['dlg_id'].unique()
dialogs_indexes

array([0, 1, 2, 3, 4, 5])

In [76]:
# Build one entry per dialog id: the 'text' column of the manager rows that
# belong to that dialog, in the order given by dialogs_indexes.
dialogs = []

for index in dialogs_indexes:
  dialogs.append(data_manager.loc[data_manager['dlg_id'].eq(index), 'text'])

# Dialog checkpoints (for managers):
*   greeting
*   representation (self-introduction)
*   manager name
*   company name
*   farewell
*   greeting & farewell (both present)



In [77]:
# Lower-case substrings that mark a greeting ("hello", "good ..." as in "good day").
greetings = ['здравствуйте','добрый']
# Lower-case substrings that mark a farewell ("goodbye", "all the best", "have a good ...").
farewells = ['до свидания','всего доброго','хорошего']
# Words that signal a self-introduction ("зовут" as in "my name is ...").
represent = ['зовут'] 
# Stem of the word "company"; it is matched as a substring, so inflected forms match too.
company_flag = 'компан'

# Greeting and farewell

In [78]:
def greeting_or_farewell(speech,farewells=farewells,greetings=greetings):
  """Collect the replicas that contain a greeting or a farewell marker.

  Every replica is lower-cased and tested for each marker as a substring, so
  the markers themselves are expected to be lower case.

  Args:
    speech: iterable of replica strings (for example one dialog's texts).
    farewells: substrings that mark a farewell.
    greetings: substrings that mark a greeting.

  Returns:
    Tuple ``(greeting_replics, farewell_replics)``. Both items are the output
    of ``np.unique`` applied to the matching replicas, i.e. NumPy arrays of
    unique values in sorted order (empty arrays when nothing matched).
  """
  farewell_replics, greeting_replics = [], []
  # The two marker lists are scanned independently of each other. Pairing them
  # with zip() stops at the shorter list and silently skips the extra markers.
  for replic in speech:
    lowered = replic.lower()
    if any(farewell in lowered for farewell in farewells):
      farewell_replics.append(replic)
    if any(greeting in lowered for greeting in greetings):
      greeting_replics.append(replic)
  farewell_replics = np.unique(farewell_replics)
  greeting_replics = np.unique(greeting_replics)

  return greeting_replics, farewell_replics

# Representation and name

In [79]:
def Represent_and_name(dialog,repr_flag=represent,country='Russian Federation'):
    """Find the replica where the manager introduces themself and the name used.

    A replica qualifies when it contains one of the introduction words as a
    substring (case-sensitive). Inside such a replica, every whitespace token
    equal to the introduction word has its neighbours checked against the
    names dataset; the token to the right is preferred over the one to the left.

    Args:
      dialog: iterable of replica strings.
      repr_flag: iterable of introduction words.
      country: value that ``NameWrapper(nd.search(token)).country`` must equal
        for the token to be accepted as a name.

    Returns:
      Tuple ``(represent, name)``: the last replica containing an introduction
      word and the last accepted name; each is '' when nothing was found.
    """
    # This local string intentionally reuses the name of the module-level list;
    # the list itself was already captured as the default of repr_flag.
    represent = ''
    name = ''

    def is_name(token):
      # True when the names dataset attributes the token to the requested country.
      return NameWrapper(nd.search(token)).country == country

    for repr_word in repr_flag:
      for replic in dialog:
        if repr_word not in replic:
          continue
        represent = replic
        tokens = represent.split()
        for i, token in enumerate(tokens):
          if token != repr_word:
            continue
          # Guard both neighbours: the word may be the last token (no right
          # neighbour) or the first one (index -1 would wrap to the last token).
          if i + 1 < len(tokens) and is_name(tokens[i+1]):
            name = tokens[i+1]
          elif i > 0 and is_name(tokens[i-1]):
            name = tokens[i-1]
    return represent, name

# Company name

In [80]:
def name_from_line(line,flag=company_flag):
  """Extract a company name from the words that follow the company marker.

  The line is split on whitespace. At the first token that contains ``flag``,
  the remaining tokens are prefixed with the word 'компания' and run through
  the Natasha pipeline; of the up to three tokens after that prefix, those
  tagged as noun, adjective, adverb or proper noun are kept.

  Args:
    line: a single replica.
    flag: substring that identifies the company marker token.

  Returns:
    The kept tokens, each followed by a single space (so a non-empty result
    ends with a space), or '' when no token contains ``flag`` or no token
    has one of the accepted tags.
  """
  name = ''
  tokens = line.split()
  for i in range(len(tokens)):
    if flag in tokens[i]:
      # The marker token is replaced by the base form 'компания' - presumably
      # to give the tagger a clean context word; it is skipped again below.
      doc = Doc('компания ' + ' '.join(tokens[i+1:]))
      doc.segment(segmenter)
      doc.parse_syntax(syntax_parser)
      doc.tag_morph(morph_tagger)
      # tokens[0] is the prepended 'компания'; look at the next three tokens.
      for token in doc.tokens[1:4]:
        # 'PROPN' is the Universal Dependencies tag for proper nouns.
        if token.pos in ['NOUN', 'ADJ', 'ADV', 'PROPN']:
          name += token.text + ' '
      return name
  # No token contained the flag: return the empty name rather than None.
  return name


def company_name(dialog,flag=company_flag):
  """Return the company name taken from the first replica that yields one.

  Replicas are visited in order. Each replica containing ``flag`` is passed to
  ``name_from_line`` until a truthy name has been obtained; the remaining
  replicas are still iterated but ignored.

  Args:
    dialog: iterable of replica strings.
    flag: substring that identifies the company marker.

  Returns:
    The first truthy result of ``name_from_line``, otherwise the last falsy
    value seen ('' when no replica contained the flag).
  """
  found = ''
  for replica in dialog:
    if found or flag not in replica:
      continue
    found = name_from_line(replica, flag)
  return found

# Parsing

In [81]:
def parse_dialog(dialog):
  """Print the checkpoint report for one dialog.

  Runs the greeting/farewell, introduction and company-name extractors on the
  dialog and prints one titled section per checkpoint, each closed by a line
  of 30 dashes, followed by a final line stating whether both a greeting and
  a farewell were found.

  Args:
    dialog: iterable of replica strings.

  Returns:
    None; the report is written to standard output.
  """
  def emit(title, lines):
    # One report section: title, one line per entry, then the separator.
    print(title)
    for entry in lines:
      print(entry)
    print('-'*30)

  # All extractors run before anything is printed.
  greeting, farewell = greeting_or_farewell(dialog)
  repr_line, name = Represent_and_name(dialog)
  company = company_name(dialog)

  emit('GREETING', greeting)
  emit('REPRESENTATION', [repr_line])
  emit('MANAGER NAME', [name])
  emit('COMPANY NAME', [company])
  emit('FAREWELL', farewell)

  has_both = len(greeting) != 0 and len(farewell) != 0
  if has_both:
    print('GREETING & FAREWELL - YES')
  else:
    print('GREETING & FAREWELL - NO')

In [82]:
# Print the checkpoint report of every dialog, numbered by its position in
# `dialogs` and followed by blank lines as a visual separator.
for index,dialog in enumerate(dialogs):
  print(f'DIALOG {index} :') 
  parse_dialog(dialog)
  print('\n\n')

DIALOG 0 :
GREETING
Алло здравствуйте
------------------------------
REPRESENTATION
Меня зовут ангелина компания диджитал бизнес звоним вам по поводу продления лицензии а мы с серым у вас скоро срок заканчивается
------------------------------
MANAGER NAME
ангелина
------------------------------
COMPANY NAME
диджитал бизнес 
------------------------------
FAREWELL
Всего хорошего до свидания
------------------------------
GREETING & FAREWELL - YES



DIALOG 1 :
GREETING
Алло здравствуйте
------------------------------
REPRESENTATION
Меня зовут ангелина компания диджитал бизнес звоню вам по поводу продления а мы сели обратила внимание что у вас срок заканчивается
------------------------------
MANAGER NAME
ангелина
------------------------------
COMPANY NAME
диджитал бизнес 
------------------------------
FAREWELL
До свидания
------------------------------
GREETING & FAREWELL - YES



DIALOG 2 :
GREETING
Алло здравствуйте
------------------------------
REPRESENTATION
Меня зовут ангелина 