This notebook automates replacing choice / multichoice field values.
See the usage examples in the code cells at the end of the notebook.

In [2]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

# setup django
import os
import sys
# make the project root importable; guarded so that re-running this cell
# does not keep appending the same entry to sys.path
if '../../' not in sys.path:
    sys.path.append('../../')
# the settings module is forced unconditionally (a later setdefault() for the
# same key could never take effect, so it was dropped)
os.environ["DJANGO_SETTINGS_MODULE"] = "settings"
import django
django.setup()

from django.conf import settings
from django.db.models import *
from apps.document.field_type_registry import init_field_type_registry

# prints one "Registered field type: ..." line per field type
init_field_type_registry()

Registered field type: String (vectorizer uses words as tokens) (string)
Registered field type: String (vectorizer uses whole value as a token) (string_no_word_wrap)
Registered field type: Long Text (text)
Registered field type: Integer Number (int)
Registered field type: Boolean (boolean)
Registered field type: Floating Point Number (float)
Registered field type: DateTime: Non-recurring Events (datetime)
Registered field type: Date: Non-recurring Events (date)
Registered field type: Date: Recurring Events (date_recurring)
Registered field type: Company (company)
Registered field type: Duration (duration)
Registered field type: Percent (percent)
Registered field type: Ratio (ratio)
Registered field type: Address (address)
Registered field type: Related Info (related_info)
Registered field type: Choice (choice)
Registered field type: Multi Choice (multi_choice)
Registered field type: Person (person)
Registered field type: Amount (amount)
Registered field type: Money (money)
Registered f

In [21]:
from apps.document.models import DocumentField
from apps.document.models import FieldValue
from apps.document.field_types import TypedField
import apps.document.repository.document_field_repository as dfr


class DocFieldSearchParams:
    """
    Plain parameter holder for one check / replace run over a choice or
    multichoice field.

    Provide either ``old_to_new`` (a mapping from old defined or stored
    values to new ones) or ``new_choices`` (the only allowed choices).

    Attributes:
        field_code: field code ('lease_type') or field title ('Lease Type').
        old_to_new: mapping of old values to new ones,
            e.g. ``{'Choice 1': 'Choice A', ...}``.
        new_choices: list of allowed choices,
            e.g. ``['Choice A', 'Choice B', ...]``.
        document_type: document type to narrow the field lookup
            (e.g. 'k_lease_doc'); an empty value means "do not narrow".
        check_only: True - only check; False - check and replace.
        non_fit_action: what to do with a value that is not listed in the
            substitution list:
            - 'BREAK' - abort the whole procedure;
            - 'AUTO-REPLACE' - replace the misfitting value (the usage
              examples describe this as picking the closest match among
              the choices);
            - 'DELETE' - presumably removes the misfitting value; the
              handling lives in the repository, confirm there.
        show_doc_urls: print document URLs when listing misfitting values.
        rename_field_choices: also rename the options (choices) in the
            field's own settings.
    """

    def __init__(
        self,
        field_code: str,
        old_to_new = None,
        new_choices = None,
        document_type: str = '',
        check_only: bool = True,
        non_fit_action: str = 'BREAK',
        show_doc_urls: bool = False,
        rename_field_choices: bool = True
    ):
        # which field to work on
        self.field_code = field_code

        # the substitution spec: a mapping and / or a plain list of choices
        self.old_to_new = old_to_new
        self.new_choices = new_choices

        # optional narrowing of the field lookup
        self.document_type = document_type

        # run mode and the policy for values outside the substitution spec
        self.check_only = check_only
        self.non_fit_action = non_fit_action

        # reporting / post-processing switches
        self.show_doc_urls = show_doc_urls
        self.rename_field_choices = rename_field_choices


class DocumentFieldValueManager:
    """
    Checks stored values of a choice / multichoice field against a list of
    choices and, optionally, replaces the values that do not fit.

    Entry point is check_misfit_options(); everything it reports is printed
    to stdout.
    """

    def __init__(self):
        self.search_params = None  # DocFieldSearchParams
        self.field = None  # DocumentField
        self.typed_field = None  # TypedField

    def check_misfit_options(self, ptrs: 'DocFieldSearchParams'):
        """
        Run a check (ptrs.check_only is true) or a check-and-replace pass
        for the field described by ptrs.

        Does nothing beyond printing a message if the field cannot be
        resolved or is not a choice field.
        """
        self.search_params = ptrs
        if not self.find_doc_field():
            return

        if self.search_params.check_only:
            self.check_only()
            return
        self.check_and_replace()

    def check_only(self):
        """
        Print stored values that do not correspond to the proposed choices.

        The proposed choices are search_params.new_choices or, when those are
        empty, the field's current choices translated through
        search_params.old_to_new. The choices are assigned to self.field in
        memory only - the field is not saved here.
        """
        new_options = self.search_params.new_choices
        if not new_options:
            if not self.search_params.old_to_new:
                print('No choice options were provided')
                return
            new_options = self.make_updated_choice_list()
        self.field.choices = '\n'.join(new_options)

        repo = dfr.DocumentFieldRepository()
        wrongs = repo.get_wrong_choice_options(self.field,
                                               limit_result=30)
        # The result is indexed as (<has more rows>, <rows>). Testing the
        # container itself is not enough: a 2-item result is always truthy,
        # even when the list of rows is empty.
        has_more = bool(wrongs) and bool(wrongs[0])
        rows = wrongs[1] if wrongs else None
        if not rows:
            print('All field values stored are already OK')
            return

        print("Following field values don't correspond provided choices:")
        for wrong in rows:
            # (<doc_name>,<doc_url>,<wrong_value>,<closest_option>)
            url = f' ({wrong[1]})' if self.search_params.show_doc_urls else ''
            print(f'Doc "{wrong[0]}"{url}: "{wrong[2]}" (suggested is "{wrong[3]}")')
        if has_more:
            print('... and some more')

    def check_and_replace(self):
        """
        Replace misfitting stored values through the repository and print
        the outcome.

        The repository result is read as a mapping with the keys 'deleted',
        'updated' and 'errors'. When search_params.rename_field_choices is
        set, the field's own choice list is rewritten, printed and saved.
        """
        repo = dfr.DocumentFieldRepository()
        result = repo.replace_wrong_choice_options(
            self.field.pk,
            self.search_params.old_to_new,
            self.search_params.new_choices,
            self.search_params.non_fit_action)
        # deleted, updated, errors
        print(f'Deleted {result["deleted"]} values. Updated {result["updated"]} values.')
        if result['errors']:
            print('There were errors:')
            for err in result['errors']:
                print(err)
        if self.search_params.rename_field_choices:
            new_choices = self.search_params.new_choices or self.make_updated_choice_list()
            self.field.choices = '\n'.join(new_choices)
            print(self.field.choices)
            self.field.save()

    def find_doc_field(self) -> bool:
        """
        Resolve search_params.field_code to a field, trying it as a code
        first and as a title second, and store it in self.field /
        self.typed_field.

        Returns False (after printing why) if no single field was found or
        the field is not a choice / multichoice field. A field that allows
        values outside of its choice list is accepted, with a notice.
        """
        field_code = self.search_params.field_code
        document_type = self.search_params.document_type

        found = self.find_doc_field_by_column('code', field_code, document_type) \
            or self.find_doc_field_by_column('title', field_code, document_type)
        if not found:
            print(f'Field with code or title "{field_code}" was not found.')
            return False
        self.typed_field = TypedField.by(self.field)

        if not self.typed_field.is_choice_field:
            print(f'Field "{field_code}" is not a choice / multichoice field.')
            return False

        if self.field.allow_values_not_specified_in_choices:
            print(f'Field "{field_code}" allows for values outside of choice list.')
        return True

    def make_updated_choice_list(self):
        """
        Return the field's current choices with every option that is a key
        of search_params.old_to_new replaced by its mapped value.

        A missing mapping is treated as empty, so the current choices are
        returned unchanged instead of failing on a None lookup.
        """
        old_to_new = self.search_params.old_to_new or {}
        current_options = DocumentField.parse_choice_values(self.field.choices)
        return [old_to_new.get(opt, opt) for opt in current_options]

    def find_doc_field_by_column(self,
                                 column: str,  # 'code' or 'title'
                                 field_code: str,  # 'lease_option'
                                 document_type: str = '') -> bool:
        """
        Look the field up by one column and store it in self.field.

        Any column other than 'code' is treated as 'title'. Returns True
        only when exactly one field matches; an ambiguous match is reported
        and, like no match at all, yields False.
        """
        if column == 'code':
            fields = DocumentField.objects.filter(code=field_code)
        else:
            fields = DocumentField.objects.filter(title=field_code)
        if document_type:
            # NOTE(review): document_type is passed to the filter as is;
            # whether a type code such as 'k_lease_doc' is accepted here
            # depends on the DocumentField model - confirm.
            fields = fields.filter(document_type=document_type)
        fields = list(fields)

        if not fields:
            return False
        if len(fields) > 1:
            print(f'Field with {column} "{field_code}" is found {len(fields)} times.')
            # explicit False: this branch used to fall through and return None
            return False
        self.field = fields[0]
        return True
        

In [13]:
# Dry run: report stored values of field "k_eight" that are not among the
# proposed choices. Nothing is replaced or saved.
# - field_code='k_eight' - the field to check
# - new_choices - the proposed choices for that field
# - check_only=True - only check, do not replace stored values
# - show_doc_urls=True - print each document's URL next to its name
ptrs = DocFieldSearchParams(
    field_code='k_eight',
    new_choices=['A', 'B', 'C'],
    check_only=True,
    show_doc_urls=True,
)
DocumentFieldValueManager().check_misfit_options(ptrs)

Field "k_eight" allows for values outside of choice list.
Following field values dont correspond provided choices:
Doc "1108320_2001-04-02_9.txt" (http://dev.contraxsuite.com/#/contract_analysis/32/annotator/3816): "la-la" (suggested is "A")
Doc "1172852_2008-02-14_7.txt" (http://dev.contraxsuite.com/#/contract_analysis/12/annotator/1482): "la-la" (suggested is "A")
Doc "1130950_2002-03-29_14.txt" (http://dev.contraxsuite.com/#/contract_analysis/32/annotator/3820): "la-la" (suggested is "A")
Doc "1173204_2010-06-14_2.txt" (http://dev.contraxsuite.com/#/contract_analysis/12/annotator/1487): "la-la" (suggested is "A")
Doc "1002037_2005-10-11_2.txt" (http://dev.contraxsuite.com/#/contract_analysis/12/annotator/62): "la-la" (suggested is "A")
Doc "1111559_2007-03-09_6.txt" (http://dev.contraxsuite.com/#/contract_analysis/12/annotator/68): "la-la" (suggested is "A")
Doc "1203957_2010-11-12_2.txt" (http://dev.contraxsuite.com/#/contract_analysis/12/annotator/1484): "la-la" (suggested is "A")

In [10]:
# Replace run (check_only=False) against a new list of choices (new_choices).
# With non_fit_action='AUTO-REPLACE', a stored value that is not in the list
# (e.g. "lah-lah") is replaced by its closest match among new_choices -
# here the close match will be 'Blah-Blah'.
# rename_field_choices=False: the field's own choice list is left as is.
ptrs = DocFieldSearchParams(
    field_code='k_eight',
    new_choices=['Blah-Blah-Blah', 'Blah-Blah', 'Blah'],
    check_only=False,
    non_fit_action='AUTO-REPLACE',
    rename_field_choices=False,
)
DocumentFieldValueManager().check_misfit_options(ptrs)

Field "k_eight" allows for values outside of choice list.
Deleted 0 values. Updated 0 values.


In [24]:
# Replace run (check_only=False) driven by an "old_to_new" map: each key is
# an old value, each mapped value is its replacement.
# Here 'la-la-la' is replaced with 'Blah-Blah-Blah'.
# rename_field_choices keeps its default (True), so the field's choice list
# is rewritten through the same map, printed and saved.
ptrs = DocFieldSearchParams(
    field_code='k_eight',
    old_to_new={'la-la-la': 'Blah-Blah-Blah'},
    check_only=False,
    non_fit_action='AUTO-REPLACE',
)
DocumentFieldValueManager().check_misfit_options(ptrs)

Field "k_eight" allows for values outside of choice list.
Deleted 0 values. Updated 39 values.
Blah
Blah-Blah
Blah-Blah-Blah


In [25]:
# Same kind of replace run for field "k_27": 'Choice 1' becomes 'Choice A',
# both in stored values and (rename_field_choices defaults to True) in the
# field's choice list.
ptrs = DocFieldSearchParams(
    field_code='k_27',
    old_to_new={'Choice 1': 'Choice A'},
    check_only=False,
    non_fit_action='AUTO-REPLACE',
)
DocumentFieldValueManager().check_misfit_options(ptrs)

Field "k_27" allows for values outside of choice list.
Deleted 0 values. Updated 0 values.
Choice A
Choice 2
