In [2]:
# Mount Google Drive at /content/drive (requires the google.colab package, i.e. a Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [20]:
pip install rapidfuzz textdistance catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [3]:
# Sets of look-alike Unicode punctuation characters.
# Each member is written as a literal glyph; the trailing comment gives its code point.

# Hyphen and dash variants.
HYPHENS = {
    '-',  # \u002d Hyphen-minus
    '‐',  # \u2010 Hyphen
    '‑',  # \u2011 Non-breaking hyphen
    '⁃',  # \u2043 Hyphen bullet
    '‒',  # \u2012 figure dash
    '–',  # \u2013 en dash
    '—',  # \u2014 em dash
    '―',  # \u2015 horizontal bar
}

# Minus-sign variants (the ASCII hyphen-minus is also a member of HYPHENS).
MINUSES = {
    '-',  # \u002d Hyphen-minus
    '−',  # \u2212 Minus
    '－',  # \uff0d Full-width Hyphen-minus
    '⁻',  # \u207b Superscript minus
}

# Plus-sign variants.
PLUSES = {
    '+',  # \u002b Plus
    '＋',  # \uff0b Full-width Plus
    '⁺',  # \u207a Superscript plus
}

# Slash variants.
SLASHES = {
    '/',  # \u002f Solidus
    '⁄',  # \u2044 Fraction slash
    '∕',  # \u2215 Division slash
}


# Apostrophe variants.
APOSTROPHES = {
    "'",  # \u0027
    '’',  # \u2019
    '՚',  # \u055a
    'Ꞌ',  # \ua78b
    'ꞌ',  # \ua78c
    '＇',  # \uff07
}

# Single-quote variants (shares \u0027 and \u2019 with APOSTROPHES).
SINGLE_QUOTES = {
    "'",  # \u0027
    '‘',  # \u2018
    '’',  # \u2019
    '‚',  # \u201a
    '‛',  # \u201b

}

# Double-quote variants, including the angle quotes (guillemets).
DOUBLE_QUOTES = {
    '"',  # \u0022
    '“',  # \u201c
    '”',  # \u201d
    '„',  # \u201e
    '‟',  # \u201f
    '«',  # \u00ab
    '»'   # \u00bb
}

# Spacing accent characters.
ACCENTS = {
    '`',  # \u0060
    '´',  # \u00b4
}

# Prime and reversed-prime marks.
PRIMES = {
    '′',  # \u2032
    '″',  # \u2033
    '‴',  # \u2034
    '‵',  # \u2035
    '‶',  # \u2036
    '‷',  # \u2037
    '⁗',  # \u2057
}

# Union of every quote-like set above.
QUOTES = APOSTROPHES | SINGLE_QUOTES | DOUBLE_QUOTES | ACCENTS | PRIMES
# Union of the hyphen and minus sets.
DASHES = HYPHENS | MINUSES

In [6]:
import re
import itertools
from nltk import everygrams
import pandas as pd
import rapidfuzz
from unicodedata import normalize, category


# fix punctuation (translated from the original Russian note)
# Patterns for swapping "й" / "ё" with placeholder markup and back again.
char2idx_1 = re.compile("й")
char2idx_2 = re.compile("ё")
idx2char_1 = re.compile("<p>99</p>")
idx2char_2 = re.compile("<p>98</p>")

# One or more "noise" characters: the punctuation listed here plus every member of QUOTES.
trash_punct = re.compile("[(),№;<>%‰*" + "".join(QUOTES) + "]+")
# Any run of whitespace.
double_space = re.compile(r"\s+")
# A hyphen followed by a space, between two lowercase Latin or Cyrillic letters.
broken_hyphen_re = re.compile(r"(?<=[a-zа-я])- (?=[a-zа-я])")


def _broken_hyphen(text: str) -> str:
    """Re-join words split as "abc- def" by deleting every match of ``broken_hyphen_re``."""
    return re.sub(broken_hyphen_re, "", text)

def remove_accents(text: str) -> str:
    """Strip combining diacritical marks from ``text`` but keep lowercase "й" and "ё".

    Every character except lowercase "й" (U+0439) and "ё" (U+0451) is
    NFD-decomposed and its non-spacing marks (Unicode category ``Mn``) are
    dropped, e.g. "é" becomes "e". Uppercase "Й" / "Ё" are not protected and
    lose their marks.

    The two protected letters are kept by splitting the text around them, not by
    swapping them with placeholder markup. Input that literally contains
    ``<p>99</p>`` or ``<p>98</p>`` is therefore left alone. The text is
    NFC-composed first so that "и" + U+0306 and "е" + U+0308 are recognised as
    "й" and "ё".

    Args:
        text: Input string.

    Returns:
        The text without combining marks, with "й" / "ё" intact.
    """
    def _strip_marks(chunk: str) -> str:
        # Decompose, then drop the non-spacing marks.
        return "".join(ch for ch in normalize("NFD", chunk) if category(ch) != "Mn")

    composed = normalize("NFC", text)
    # The capturing group makes re.split keep the separators: odd positions hold
    # the protected letters, even positions hold the text between them.
    pieces = re.split("([\u0439\u0451])", composed)
    return "".join(
        piece if position % 2 else _strip_marks(piece)
        for position, piece in enumerate(pieces)
    )


def filter_short_paragraphs(text: str, min_token_len: int = 2, min_char_len: int = 10) -> str:
    """Drop paragraphs that are too short, keeping the others in order.

    Paragraphs are the pieces of ``text`` separated by a blank line ("\\n\\n").
    A paragraph is kept only if it has at least ``min_char_len`` characters and
    at least ``min_token_len`` whitespace-separated tokens.

    Returns:
        The surviving paragraphs joined back with "\\n\\n".
    """
    kept = []
    for paragraph in text.split("\n\n"):
        if len(paragraph) < min_char_len:
            continue
        if len(paragraph.split()) < min_token_len:
            continue
        kept.append(paragraph)
    return "\n\n".join(kept)


def preprocess_text(text: str) -> str:
    """Normalise raw text for fuzzy comparison.

    Steps, in order: drop short paragraphs, repair "abc- def" hyphen splits,
    lowercase, strip accents, map every slash / dash / plus variant to its ASCII
    form, replace noise punctuation with a space, collapse whitespace runs and
    trim the ends.
    """
    # NOTE(review): hyphen repair runs before lowercasing and its pattern lists only
    # lowercase letters, so splits inside uppercase words stay as they are - confirm intended.
    cleaned = filter_short_paragraphs(text)
    cleaned = _broken_hyphen(cleaned)
    cleaned = remove_accents(cleaned.lower())

    # One translation table instead of a chain of str.replace calls.
    unify_symbols = str.maketrans({
        **dict.fromkeys(SLASHES, "/"),
        **dict.fromkeys(DASHES, "-"),
        **dict.fromkeys(PLUSES, "+"),
    })
    cleaned = cleaned.translate(unify_symbols)

    cleaned = trash_punct.sub(" ", cleaned)
    return double_space.sub(" ", cleaned).strip()


def get_candidates(documents: dict, golden_name: str, min_similarity: float = 50):
    """Collect word n-grams from every page that fuzzily resemble ``golden_name``.

    Args:
        documents: Mapping of document name to parsed content. Parsed content is
            a mapping ``page_num -> {"text": ...}``. Entries whose value is a
            plain string are skipped (presumably parse failures - TODO confirm).
        golden_name: Reference name to look for. It is normalised with
            ``preprocess_text`` before comparison.
        min_similarity: A candidate is kept only when its
            ``rapidfuzz.fuzz.QRatio`` against the normalised name is strictly
            greater than this value. Defaults to 50, the previously hard-coded
            threshold.

    Returns:
        DataFrame with columns ``golden_name``, ``doc_name``, ``page_num``,
        ``targets`` (the normalised golden name) and ``candidate``. Duplicate
        rows are not removed.
    """
    processed_golden_name = preprocess_text(golden_name)
    golden_token_count = len(processed_golden_name.split())
    # The n-gram window depends only on the golden name, so compute it once.
    # At least 1 token: a zero-length n-gram is an empty string and can never pass the threshold.
    min_len = max(1, golden_token_count // 2)
    max_len = golden_token_count + 10

    candidates = []
    for document_name, parsed_content in documents.items():
        if isinstance(parsed_content, str):
            continue
        for page_num, context in parsed_content.items():
            page_text = context["text"]
            if not page_text:
                continue
            tokens = preprocess_text(page_text).split()
            for ngram in everygrams(tokens, min_len=min_len, max_len=max_len):
                candidate = " ".join(ngram)
                score = rapidfuzz.fuzz.QRatio(processed_golden_name, candidate)
                if score > min_similarity:
                    candidates.append((golden_name,
                                       document_name,
                                       page_num,
                                       processed_golden_name,
                                       candidate))

    return pd.DataFrame(candidates,
                        columns=["golden_name", "doc_name", "page_num", "targets", "candidate"])


In [23]:
import re
import pandas as pd
import rapidfuzz
import textdistance
from logging import getLogger

logger = getLogger()

def matching_numbers(external_name, internal_name):
    """Jaccard overlap of the digit runs found in two strings.

    Each maximal run of ASCII digits counts as one number (compared as text).
    Returns 1 when neither string contains a number, otherwise
    ``|shared numbers| / |all numbers|``.
    """
    digit_run = re.compile(r'[0-9]+')
    left = set(digit_run.findall(external_name))
    right = set(digit_run.findall(internal_name))

    if not left and not right:
        return 1
    return len(left & right) / len(left | right)


def generate_features(df: pd.DataFrame) -> pd.DataFrame:
    """Add string-similarity features for every (``candidate``, ``targets``) pair.

    The feature columns are written into ``df`` itself and the same frame is
    returned. ``df`` must provide the string columns ``candidate`` and
    ``targets``.

    Features are computed from a zipped list of pairs, not with a row-wise
    ``DataFrame.apply``, so an empty frame gets correctly typed empty columns.
    Ratios whose denominator would be zero (empty strings) are guarded: edit
    distances are divided by at least 1, and an overlap of two empty sets is
    1.0, the same convention ``matching_numbers`` uses.

    Args:
        df: Candidate frame, modified in place.

    Returns:
        ``df`` with the feature columns added.
    """
    pairs = list(zip(df["candidate"], df["targets"]))

    def _column(score, dtype=float) -> pd.Series:
        # One value per row, aligned on the frame's own index.
        values = [score(candidate, target) for candidate, target in pairs]
        return pd.Series(values, index=df.index, dtype=dtype)

    def _per_candidate_char(distance):
        # Edit distance normalised by candidate length (at least 1 to avoid 0-division).
        return lambda candidate, target: distance(candidate, target) / max(len(candidate), 1)

    def _jaccard(left, right) -> float:
        left, right = set(left), set(right)
        union = left | right
        return len(left & right) / len(union) if union else 1.0

    logger.info("calculate editions")
    df["qratio"] = _column(rapidfuzz.fuzz.QRatio)
    df["partial_token_set_ratio"] = _column(rapidfuzz.fuzz.partial_token_set_ratio)
    df["token_set_ratio"] = _column(rapidfuzz.fuzz.token_set_ratio)

    logger.info("calculate string sims")
    df["jaro_winkler"] = _column(textdistance.jaro_winkler)
    df["levenshtein"] = _column(_per_candidate_char(textdistance.levenshtein))
    df["damerau_levenshtein"] = _column(_per_candidate_char(textdistance.damerau_levenshtein))
    df["cosine"] = _column(textdistance.cosine)

    logger.info("calculate union chars")
    df["chars_in_common"] = _column(_jaccard)
    df["matching_numbers"] = _column(matching_numbers)
    df["words_in_common"] = _column(
        lambda candidate, target: _jaccard(candidate.split(), target.split()))
    df["target_in_candidate"] = _column(
        lambda candidate, target: int(target in candidate), dtype=int)
    df["candidate_in_targets"] = _column(
        lambda candidate, target: int(candidate in target), dtype=int)

    # Raw string: "\w" / "\s" in a normal literal are invalid escape sequences.
    special_chars = r"[^\w\s]"

    logger.info("calculate candidates characteristics")
    df["candidate_contains_special_chars"] = df["candidate"] \
        .str.contains(special_chars, regex=True) \
        .astype(int)

    logger.info("calculate targets characteristics")
    df["target_contains_special_chars"] = df["targets"] \
        .str.contains(special_chars, regex=True) \
        .astype(int)
    return df


In [10]:
import re
from typing import Union


# Regex alternatives that closing_re looks for at the start of a paragraph;
# closing_re captures everything that precedes such a phrase.
CLOSINGS = [
    'материалы инженерных изысканий',
    'технический отч[ёе]т',
    'проектная документация',
    'отч[ёе]тная документация',
    'инженерные изыскания',
    'отч[ёе]тная техническая документация',
]

# Regex alternatives that opening_re looks for at the start of a paragraph;
# opening_re captures everything that follows such a phrase.
OPENINGS = [
    'государственный контракт',
    'технический отч[ёе]т',
    'научно-проектная документация',
    'размещение объекта',
]

# Group 1: text before the phrase; group 2: the phrase, which must directly follow a blank line.
closing_re = re.compile(
    r"([\S\s]+)((?<=\n\n)(?:" + "|".join(CLOSINGS) + "))",
    re.IGNORECASE,
)
# Group 1: the phrase, which must directly follow a blank line; group 2: the text after it.
opening_re = re.compile(
    r"((?<=\n\n)(?:" + "|".join(OPENINGS) + r"))([\S\s]+)",
    re.IGNORECASE,
)



def _filter_matches(match_: str) -> Union[str, None]:
    """Return the last paragraph of ``match_`` with hyphen splits repaired, or None if it is too short.

    Paragraphs are split on a blank line that is not followed by a lowercase
    Latin/Cyrillic letter. The last piece is trimmed and must be longer than
    5 characters to be returned.
    """
    tail = re.split("\n\n(?![a-zа-я])", match_)[-1].strip()
    if len(tail) <= 5:
        return None
    return _broken_hyphen(tail)


def extract_main_name_candidate(m: str) -> str:
    """Extract the text that may hold a document's main name from a page.

    If ``closing_re`` matches, the text is cut to the part before the closing
    phrase, every closing phrase in that part is removed and the result is
    trimmed. If ``opening_re`` then matches, the text after the opening phrase
    is returned. In every other case the last "\\n\\n"-separated paragraph of
    the (possibly trimmed) text is returned.

    Missing matches are checked explicitly. The function no longer relies on a
    blanket ``except Exception``, so unrelated errors are not hidden.

    Args:
        m: Page text. ``None`` or an empty string yields "".

    Returns:
        The candidate text (may be empty).
    """
    if not m:
        return ""

    closing_match = closing_re.search(m)
    if closing_match is not None:
        m = closing_match.group(1)
        m = re.sub("(?:{})".format("|".join(CLOSINGS)), "", m, flags=re.IGNORECASE).strip()
        opening_match = opening_re.search(m)
        if opening_match is not None:
            return opening_match.group(2)

    return m.split("\n\n")[-1]

def find_golden_name(documents: dict):
    """Search and extract the golden name in a pack of documents.

    For every document whose parsed content is not a plain string, pages with
    ``page_num < 2`` are scanned. The first page that yields a non-empty
    ``_filter_matches(extract_main_name_candidate(text))`` provides that
    document's name. The name found for the most documents wins; on a tie the
    first one encountered is used.

    Args:
        documents: Mapping of document name to parsed content
            (``page_num -> {"text": ...}``).

    Returns:
        The most frequent name, or ``None`` when no document yields one.
    """
    main_names = {}

    for document_name, parsed_content in documents.items():
        if isinstance(parsed_content, str):
            continue
        for page_num, content in parsed_content.items():
            if page_num >= 2:
                continue
            text = content["text"]
            if not text:
                # Nothing to extract from an empty or missing page text.
                continue
            name = _filter_matches(extract_main_name_candidate(text))
            if name:
                main_names[document_name] = name
                break

    names = list(main_names.values())
    if not names:
        # max() on an empty list would raise; callers test the result for truthiness.
        return None
    return max(names, key=names.count)


In [24]:
import re
import joblib
import rapidfuzz

# Columns dropped from the feature frame before it is passed to classifier.predict_proba.
dropcols = ["golden_name", "doc_name", "page_num", "candidate", "targets"]


def get_raw_candidate(page: str, gold:str, candidate: str) -> str:
    """Recover the raw page text that corresponds to a normalised candidate.

    The page is searched, case-insensitively and within a single line, for a
    span that starts with the first 7 characters of a reference string and ends
    with its last 7 characters. The golden name is tried first, after removing
    one leading quote, one trailing "." or "," and one trailing quote. The
    candidate is tried second.

    The reference text is passed through ``re.escape``, so characters such as
    "(", "+" or "[" are matched literally and cannot break the pattern. Empty
    reference strings are skipped, so they no longer raise ``IndexError``.

    Args:
        page: Raw text of the page.
        gold: Golden name as found in the documents.
        candidate: Normalised candidate string.

    Returns:
        The first matching raw span, or ``candidate`` if nothing matches.
    """
    def _first_span(reference: str):
        # Span from the reference's 7-char prefix to its 7-char suffix, or None.
        if not reference:
            return None
        pattern = f"{re.escape(reference[:7])}.*{re.escape(reference[-7:])}"
        found = re.search(pattern, page, re.IGNORECASE)
        return found.group(0) if found is not None else None

    if gold and gold[0] in QUOTES:
        gold = gold[1:]
    if gold and gold[-1] in ['.', ',']:
        gold = gold[:-1]
    if gold and gold[-1] in QUOTES:
        gold = gold[:-1]

    for reference in (gold, candidate):
        span = _first_span(reference)
        if span:
            return span

    return candidate


def predict(all_documents: dict, golden_name: str=''):
    """Find mentions of the golden name across a pack of documents.

    Loads the classifier and threshold from ``artifacts.pkl``, builds fuzzy
    candidates for ``golden_name`` (detected with ``find_golden_name`` when the
    argument is ''), scores them with the classifier and keeps those whose
    probability exceeds the stored threshold. Candidates equal to the
    normalised golden name get probability 1. For each
    (document, page, golden name, target) group the most probable candidate is
    kept and mapped back to the raw page text with ``get_raw_candidate``.

    Args:
        all_documents: Mapping of document name to parsed pages
            (``page_num -> {"text": ...}``).
        golden_name: Name to look for. '' triggers automatic detection.

    Returns:
        ``None`` when no golden name is available. Otherwise a DataFrame with
        columns ``doc_name``, ``page_num`` (1-based), ``golden_name``,
        ``targets``, ``candidate``, ``probability`` and ``similarity``. The
        frame is empty when no candidate is found or none passes the threshold.
    """
    output_columns = ["doc_name", "page_num", "golden_name", "targets",
                      "candidate", "probability", "similarity"]

    # NOTE: joblib.load unpickles arbitrary objects - only load artifacts from a trusted source.
    artifacts = joblib.load('artifacts.pkl')
    classifier = artifacts['classifier']
    best_threshold = artifacts['threshold']

    if golden_name == '':
        golden_name = find_golden_name(all_documents)
    if not golden_name:
        return None

    candidates = get_candidates(all_documents, golden_name)
    if candidates.empty:
        # Nothing to score; row-wise feature generation is not reliable on an empty frame.
        return pd.DataFrame(columns=output_columns)

    # .copy(): generate_features writes new columns into the frame it receives, and writing
    # into the result of a filtering operation can raise pandas' chained-assignment warning.
    candidates = candidates.drop_duplicates(['golden_name', 'doc_name', 'page_num', 'candidate']).copy()
    candidates_featured = generate_features(candidates)
    candidates_featured['probability'] = classifier.predict_proba(candidates_featured.drop(dropcols, axis=1))[:, 1]
    # Page numbers are reported 1-based from here on.
    candidates_featured['page_num'] = candidates_featured['page_num'] + 1
    candidates_featured.loc[candidates_featured["targets"] == candidates_featured["candidate"], "probability"] = 1.

    final_entities = candidates_featured[(candidates_featured['probability'] > best_threshold)]
    if final_entities.empty:
        return pd.DataFrame(columns=output_columns)

    # Sorted by probability first, so "first" picks the most probable candidate of each group.
    # The string "max" replaces the builtin, which newer pandas versions flag as deprecated in agg.
    final_entities = final_entities.sort_values('probability', ascending=False)\
        .groupby(["doc_name", "page_num", "golden_name", "targets"], sort=False) \
        .agg({"candidate": "first",
              "probability": "max"}) \
        .reset_index() \
        .sort_values(["page_num"])
    # page_num was shifted to 1-based above, hence the -1 when indexing the documents.
    final_entities['candidate'] = final_entities.apply(lambda x:
          get_raw_candidate(all_documents[x['doc_name']][x['page_num']-1]['text'],
                            x['golden_name'],
                            x['candidate']), axis=1)
    # Character-level Jaccard overlap between the raw candidate and the golden name.
    final_entities["similarity"] = final_entities \
        .apply(lambda x: len(set(x["candidate"].lower()) & set(x["golden_name"].lower())) / len(set(x["candidate"].lower() + x["golden_name"].lower())), axis=1) \
        .astype(float)
    return final_entities[output_columns]

# model test

In [15]:
import joblib

# Same list as in the prediction cell; repeated so this cell can run on its own.
dropcols = ["golden_name", "doc_name", "page_num", "candidate", "targets"]
# NOTE: joblib.load unpickles arbitrary objects - only load files from a trusted source.
# The full pickle and the selected pack get separate names so that `all_documents`
# always refers to a single pack of documents.
document_packs = joblib.load('all_documents_v3.pkl')
all_documents = document_packs['doc2']

In [16]:
len(all_documents)

14

In [25]:
pred = predict(all_documents)

In [26]:
pred

Unnamed: 0,doc_name,page_num,golden_name,targets,candidate,probability,similarity
17,doc_2_Раздел ПД №1 Часть №2 Изм.1.00001-21_ЕГЭ...,1,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,Строительство и обустройство скважин куста № 1...,1.000000,0.928571
37,doc_2_Том 1.1.00001-21_ЕГЭ-26404.pdf,1,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № 1...,1.000000,0.928571
68,doc_2_Раздел ПД №3_Изм.2.00001-21_ЕГЭ-26404.pdf,1,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,Строительство и обустройство скважин куста № 1...,0.026234,0.821429
3,Том 1.2.00001-21_ЕГЭ-26404.pdf,1,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № 1...,1.000000,0.928571
45,doc_2_Раздел ПД№6 Часть №2_Изм.1.00001-21_ЕГЭ-...,1,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,Строительство и обустройство скважин куста № 1...,1.000000,0.928571
...,...,...,...,...,...,...,...
21,doc_2_Приложение П-ППТ_ПМТ.00001-21_ЕГЭ-26404.pdf,91,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,строительство и обустройство скважин куста 10 ...,1.000000,0.892857
64,doc_2_Приложение П-ППТ_ПМТ.00001-21_ЕГЭ-26404.pdf,93,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,строительство и обустройство скважин куста 10 ...,0.040516,0.687500
84,doc_2_Раздел ПД№6 Часть №3_Изм.1.00001-21_ЕГЭ-...,95,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,строительства по проекту в базисном уровне цен...,0.005332,0.656250
25,doc_2_Приложение П-ППТ_ПМТ.00001-21_ЕГЭ-26404.pdf,98,«СТРОИТЕЛЬСТВО И ОБУСТРОЙСТВО СКВАЖИН КУСТА № ...,строительство и обустройство скважин куста 10 ...,Строительство и обустройство скважин куста № 1...,1.000000,0.928571
