### Azure Form Recognizer - Custom Model - Python SDK Demo

#### Importing Azure Form Recognizer Python modules

In [1]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import FormRecognizerClient
from azure.ai.formrecognizer import FormTrainingClient
from azure.core.exceptions import ResourceNotFoundError

In [2]:
# Resource endpoint and key are read from the environment so that real
# credentials are never saved inside the notebook. The dashed placeholders are
# only a fallback for when the variables are unset and must be replaced (or the
# variables exported) before the client cells will work.
AZURE_FORM_RECOGNIZER_ENDPOINT = os.environ.get(
    "AZURE_FORM_RECOGNIZER_ENDPOINT",
    "https://-----------------------------.cognitiveservices.azure.com/",
)
AZURE_FORM_RECOGNIZER_KEY = os.environ.get(
    "AZURE_FORM_RECOGNIZER_KEY",
    "------------------------------",
)

In [3]:
# Short aliases for the configured endpoint and key, used when building the clients.
endpoint, key = AZURE_FORM_RECOGNIZER_ENDPOINT, AZURE_FORM_RECOGNIZER_KEY

In [4]:
# Client used below to train, list, compose and inspect custom models.
form_training_client = FormTrainingClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

In [5]:
# Request the custom models stored in this resource. This result is not read
# here; the list is requested again and iterated in the "Listing Models" section.
saved_model_list = form_training_client.list_custom_models()

#### Training Source Data URL
To generate the training data URL:

1. Download the Cognito Corporation training documents to your local system: https://github.com/udacity/cd0461-building-computer-vision-solutions-with-azure-exercises/tree/main/resources/cognito-corp-docs.
2. Upload the training documents to a blob container at Azure Blob Storage. Training documents are named Cognito-corporation-u*.pdf.
3. Generate a SAS URL of the training data container.
4. Once the model is trained, you will use the Cognito-corporation-test01.png file located in this GitHub directory to perform prediction: https://raw.githubusercontent.com/udacity/cd0461-building-computer-vision-solutions-with-azure-exercises/main/resources/Cognito-corporation-test01.png

In [6]:
# SAS URL of the blob container that holds the training documents. A SAS URL
# carries a signature that grants access to the container, so it is read from
# the environment instead of being saved in the notebook. The placeholder is
# only a fallback for when the variable is unset; replace it (or export
# FORM_RECOGNIZER_TRAINING_DATA_URL) with a freshly generated, unexpired SAS URL.
trainingDataUrl = os.environ.get(
    "FORM_RECOGNIZER_TRAINING_DATA_URL",
    "https://<storage-account>.blob.core.windows.net/<container>?<sas-token>",
)

#### Performing Unlabeled Training
##### use_training_labels=False

In [7]:
# Start unlabeled training on the documents behind trainingDataUrl, then take
# the trained model from the operation's result.
training_process = form_training_client.begin_training(
    trainingDataUrl,
    use_training_labels=False,
)
custom_model = training_process.result()

#### Getting Model Info

In [8]:
# Display the full representation of the model returned by unlabeled training.
custom_model

CustomFormModel(model_id=3f10fb9c-7329-4c53-bb79-c9b8b2bba099, status=ready, training_started_on=2023-08-29 06:18:12+00:00, training_completed_on=2023-08-29 06:18:34+00:00, submodels=[CustomFormSubmodel(accuracy=None, model_id=3f10fb9c-7329-4c53-bb79-c9b8b2bba099, fields={'field-0': CustomFormModelField(label=1,, name=field-0, accuracy=None), 'field-1': CustomFormModelField(label=A Demo Company Corporation, name=field-1, accuracy=None), 'field-2': CustomFormModelField(label=Address To, name=field-2, accuracy=None), 'field-3': CustomFormModelField(label=Business Information, name=field-3, accuracy=None), 'field-4': CustomFormModelField(label=Cognito Corporation,, name=field-4, accuracy=None), 'field-5': CustomFormModelField(label=Comments, name=field-5, accuracy=None), 'field-6': CustomFormModelField(label=Contact Summary, name=field-6, accuracy=None), 'field-7': CustomFormModelField(label=Customer Number, name=field-7, accuracy=None), 'field-8': CustomFormModelField(label=Department Na

In [9]:
# ID of the unlabeled model; it is passed to the recognition call later on.
custom_model.model_id

'3f10fb9c-7329-4c53-bb79-c9b8b2bba099'

In [10]:
# Status reported for the trained model.
custom_model.status

'ready'

In [11]:
# Timestamp at which training started.
custom_model.training_started_on

datetime.datetime(2023, 8, 29, 6, 18, 12, tzinfo=<isodate.tzinfo.Utc object at 0x7fa9902ff8e0>)

In [12]:
# Timestamp at which training completed.
custom_model.training_completed_on

datetime.datetime(2023, 8, 29, 6, 18, 34, tzinfo=<isodate.tzinfo.Utc object at 0x7fa9902ff8e0>)

In [13]:
# Training documents recorded on the model, each with its status, page count and errors.
custom_model.training_documents

[TrainingDocumentInfo(name=Cognito-corporation-test01.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u-main.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u1.pdf, status=failed, page_count=0, errors=[FormRecognizerError(code=2005, message=Page 0: Unable to read file.)], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u10.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u2.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u3.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u4.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u5.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocum

In [14]:
# Print name, status, page count and errors for each training document so that
# documents which failed to train are easy to spot.
for doc in custom_model.training_documents:
    print(f"Document name: {doc.name}")
    print(f"Document status: {doc.status}")
    print(f"Document page count: {doc.page_count}")
    print(f"Document errors: {doc.errors}")

Document name: Cognito-corporation-test01.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u-main.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u1.pdf
Document status: failed
Document page count: 0
Document errors: [FormRecognizerError(code=2005, message=Page 0: Unable to read file.)]
Document name: Cognito-corporation-u10.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u2.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u3.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u4.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u5.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corpora

In [15]:
# Model properties; the displayed value carries the is_composed_model flag.
custom_model.properties

CustomFormModelProperties(is_composed_model=False)

In [16]:
# For every submodel, list the fields it recognized. A field is shown by its
# label when it has one, otherwise by its key in the fields mapping.
for submodel in custom_model.submodels:
    field_labels = ", ".join(
        field.label or name for name, field in submodel.fields.items()
    )
    print(
        f"The submodel with form type '{submodel.form_type}' "
        f"has recognized the following fields: {field_labels}"
    )

The submodel with form type 'form-0' has recognized the following fields: 1,, A Demo Company Corporation, Address To, Business Information, Cognito Corporation,, Comments, Contact Summary, Customer Number, Department Name, First Name, Items Total, Last Name, Order Number, Reference Subscription


In [17]:
# Model ID that the next cell passes to get_custom_model.
custom_model.model_id

'3f10fb9c-7329-4c53-bb79-c9b8b2bba099'

In [18]:
# Fetch the stored model by ID and print its summary fields.
custom_model_info = form_training_client.get_custom_model(
    model_id=custom_model.model_id
)
print(f"Model ID: {custom_model_info.model_id}")
print(f"Status: {custom_model_info.status}")
print(f"Training started on: {custom_model_info.training_started_on}")
print(f"Training completed on: {custom_model_info.training_completed_on}")

Model ID: 3f10fb9c-7329-4c53-bb79-c9b8b2bba099
Status: ready
Training started on: 2023-08-29 06:18:12+00:00
Training completed on: 2023-08-29 06:18:34+00:00


#### Using an image document as test document URL (Not using PDF here)
1. Here, you will use the Cognito-corporation-test01.png file located in this GitHub directory to perform prediction: https://raw.githubusercontent.com/udacity/cd0461-building-computer-vision-solutions-with-azure-exercises/main/resources/Cognito-corporation-test01.png
2. Note: If you want to use a PDF document for the test, please save and upload PDF to Azure Blob Storage and use the SAS URL of this PDF document as the target URL.
3. Using a PDF document from the GitHub URL will give you an error.
4. You will see a screenshot of how to do this on the exercise solution page later in this lesson.

In [19]:
# Public URL of the PNG test document that every recognition call below is run against.
new_test_url = "https://raw.githubusercontent.com/udacity/cd0461-building-computer-vision-solutions-with-azure-exercises/main/resources/Cognito-corporation-test01.png"

In [20]:
# Display the test document URL.
new_test_url

'https://raw.githubusercontent.com/udacity/cd0461-building-computer-vision-solutions-with-azure-exercises/main/resources/Cognito-corporation-test01.png'

In [21]:
# Client used below to run recognition with the trained custom models.
form_recognizer_client = FormRecognizerClient(
    endpoint=endpoint,
    credential=AzureKeyCredential(key),
)

In [22]:
# ID on the model object returned by training ...
custom_model.model_id

'3f10fb9c-7329-4c53-bb79-c9b8b2bba099'

In [23]:
# ... and the ID on the model fetched with get_custom_model; the outputs show the same value.
custom_model_info.model_id

'3f10fb9c-7329-4c53-bb79-c9b8b2bba099'

In [24]:
# Start recognizing the test document with the unlabeled model.
custom_test_action = form_recognizer_client.begin_recognize_custom_forms_from_url(
    model_id=custom_model_info.model_id,
    form_url=new_test_url,
)

In [25]:
# Current status of the recognition operation (it may still be in progress at this point).
custom_test_action.status()

'InProgress'

In [26]:
# Take the operation's result: the recognized forms iterated in the next cell.
custom_test_action_result = custom_test_action.result()

In [27]:
# Print every field recognized by the unlabeled model. A field is labelled with
# the text of its label_data when present, otherwise with its field name.
for recognized_content in custom_test_action_result:
    print(f"Form type: {recognized_content.form_type}")
    for name, field in recognized_content.fields.items():
        field_label = field.label_data.text if field.label_data else name
        print(
            f"Field '{name}' has label '{field_label}' with value '{field.value}' "
            f"and a confidence score of {field.confidence}"
        )

Form type: form-0
Field 'field-0' has label 'First Name' with value '' and a confidence score of 0.5
Field 'field-1' has label 'Last Name' with value 'Singh' and a confidence score of 0.43
Field 'field-2' has label 'Contact Summary' with value 'Please help me immediately.' and a confidence score of 0.43
Field 'field-3' has label 'Order Number' with value '4343568' and a confidence score of 1.0
Field 'field-4' has label 'Customer Number' with value '65001' and a confidence score of 1.0
Field 'field-5' has label 'Department Name' with value 'IT' and a confidence score of 0.58
Field 'field-6' has label 'Address To' with value 'IT' and a confidence score of 0.35
Field 'field-7' has label 'Items Total' with value '80' and a confidence score of 1.0
Field 'field-8' has label 'Reference Subscription' with value 'A-35445' and a confidence score of 1.0
Field 'field-9' has label 'Comments' with value 'None' and a confidence score of 0.66
Field 'field-10' has label '__Address__1' with value 'Amar 

### As you can see above, the confidence is very low with string fields, so we will want to add training labels to improve the confidence scores.

# ===========PAUSE HERE==============

## At this point, you should go to the Form Recognizer portal and label your training documents manually there.
## Please read the following instructions:
1. If you haven't labeled the training documents from the portal demo, you should now visit the Form Recognizer portal and create a new project (https://fott-2-1.azurewebsites.net/projects/) using the same blob container where you have stored the Cognito Corp training documents.
2. When you read the training files in the blob container from the Form Recognizer portal, a master `project_name.fott` file will be auto-generated in your blob container. When you add tags, a `fields.json` file is auto-generated in your blob container.
3. When you run layout on a training document, an `ocr.json` file gets auto-generated in your blob container. When you label the fields at the Form Recognizer portal, a `labels.json` file is auto-generated in your blob container. These files are essential for labeled training to work. **If you don't have those files, you will get the error: <br>"Can't find any OCR files for training." or "Can't find any label files for training."**
4. Label **at least 5** (if not all) of the training documents at the Form Recognizer portal. This will auto-generate the `labels.json` documents in the blob container. If you saved label documents from the previous demo, you can also upload your own `labels.json` documents into the blob container so that you don't have to label the training documents again.
5. Please go back to the portal demo pages if you need help with these steps.

### use_training_labels=True

In [28]:
# Train again on the same container, this time with the label files
# (use_training_labels=True), and keep the resulting model.
labeled_training_process = form_training_client.begin_training(
    trainingDataUrl,
    use_training_labels=True,
)
labeled_custom_model = labeled_training_process.result()

In [29]:
# ID of the labeled model.
labeled_custom_model.model_id

'21241174-af2b-46c6-95de-ddb746748100'

In [30]:
# Status reported for the labeled model.
labeled_custom_model.status

'ready'

In [31]:
# Training documents recorded on the labeled model.
labeled_custom_model.training_documents

[TrainingDocumentInfo(name=Cognito-corporation-test01.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u-main.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u1.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u10.pdf, status=succeeded, page_count=1, errors=[], model_id=None),
 TrainingDocumentInfo(name=Cognito-corporation-u2.pdf, status=succeeded, page_count=1, errors=[], model_id=None)]

In [32]:
# Print name, status, page count and errors for each document used in labeled training.
for doc in labeled_custom_model.training_documents:
    print(f"Document name: {doc.name}")
    print(f"Document status: {doc.status}")
    print(f"Document page count: {doc.page_count}")
    print(f"Document errors: {doc.errors}")

Document name: Cognito-corporation-test01.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u-main.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u1.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u10.pdf
Document status: succeeded
Document page count: 1
Document errors: []
Document name: Cognito-corporation-u2.pdf
Document status: succeeded
Document page count: 1
Document errors: []


In [33]:
# Labeled model ID that the next cell passes to the recognition call.
labeled_custom_model.model_id

'21241174-af2b-46c6-95de-ddb746748100'

In [34]:
# Start recognizing the same test document with the labeled model.
labeled_custom_test_action = form_recognizer_client.begin_recognize_custom_forms_from_url(
    model_id=labeled_custom_model.model_id,
    form_url=new_test_url,
)

In [35]:
# Current status of the labeled-model recognition operation.
labeled_custom_test_action.status()

'succeeded'

In [36]:
# Take the operation's result: the recognized forms iterated in the next cell.
labeled_custom_test_action_result = labeled_custom_test_action.result()

In [37]:
# Print every field recognized by the labeled model. A field is labelled with
# the text of its label_data when present, otherwise with its field name.
for recognized_content in labeled_custom_test_action_result:
    print(f"Form type: {recognized_content.form_type}")
    for name, field in recognized_content.fields.items():
        field_label = field.label_data.text if field.label_data else name
        print(
            f"Field '{name}' has label '{field_label}' with value '{field.value}' "
            f"and a confidence score of {field.confidence}"
        )

Form type: custom:21241174-af2b-46c6-95de-ddb746748100
Field 'Order Number' has label 'Order Number' with value '4343568' and a confidence score of 0.995
Field 'Company Name' has label 'Company Name' with value 'None' and a confidence score of 0.919
Field 'Department Name' has label 'Department Name' with value 'IT' and a confidence score of 0.994
Field 'Address To' has label 'Address To' with value 'IT' and a confidence score of 0.995
Field 'Contact Summary' has label 'Contact Summary' with value 'Please help me immediately.' and a confidence score of 0.994
Field 'Company Tag' has label 'Company Tag' with value 'None' and a confidence score of 0.919
Field 'Reference Subscription' has label 'Reference Subscription' with value 'A-35445' and a confidence score of 0.994
Field 'Customer Number' has label 'Customer Number' with value '65001' and a confidence score of 0.995
Field 'Comments' has label 'Comments' with value 'None' and a confidence score of 0.983
Field 'Business Info' has label

### As you can see above, the confidence for string fields is very high, so a labeled training is better.

#### Listing Models

In [38]:
# Request the custom models currently stored in this resource.
saved_model_list = form_training_client.list_custom_models()

In [39]:
# Print the ID of each listed model.
for model in saved_model_list:
    print(model.model_id)

21241174-af2b-46c6-95de-ddb746748100
3f10fb9c-7329-4c53-bb79-c9b8b2bba099


## Creating Composed Model

### Every model in a composed model's list must have been created by the labeled training process.

In [40]:
# ID of the first Cognito Corporation model produced by labeled training.
labeled_custom_model.model_id

'21241174-af2b-46c6-95de-ddb746748100'

In [41]:
# Train a second labeled model on the same container so that there are two
# labeled models to combine into a composed model.
labeled_2_training_process = form_training_client.begin_training(
    trainingDataUrl,
    use_training_labels=True,
)
labeled_2_custom_model = labeled_2_training_process.result()

In [42]:
# ID of the second Cognito Corporation model produced by labeled training.
labeled_2_custom_model.model_id

'8053cc32-8152-40bb-9da0-56423247db49'

In [43]:
# IDs of the two labeled models that will make up the composed model.
cognito_corporation_model_list = [
    trained_model.model_id
    for trained_model in (labeled_custom_model, labeled_2_custom_model)
]

In [44]:
# Combine the labeled models under one named model and keep the resulting
# composed model once the operation's result is available.
composed_process = form_training_client.begin_create_composed_model(
    cognito_corporation_model_list,
    model_name="Cognito Corporation Model",
)
composed_process_model = composed_process.result()

In [45]:
# ID of the composed model.
composed_process_model.model_id

'f832e78d-a669-4d66-989b-d3913973b51b'

In [46]:
# Fetch the composed model by ID and print its summary fields.
composed_model_info = form_training_client.get_custom_model(
    model_id=composed_process_model.model_id
)
print(f"Model ID: {composed_model_info.model_id}")
print(f"Status: {composed_model_info.status}")
print(f"Training started on: {composed_model_info.training_started_on}")
print(f"Training completed on: {composed_model_info.training_completed_on}")

Model ID: f832e78d-a669-4d66-989b-d3913973b51b
Status: ready
Training started on: 2023-08-29 07:19:17+00:00
Training completed on: 2023-08-29 07:19:17+00:00


In [47]:
# Check whether this is a composed model: the displayed properties carry the
# is_composed_model flag.
composed_model_info.properties

CustomFormModelProperties(is_composed_model=True)

### Using composed model to extract text

In [48]:
# Start recognizing the test document with the composed model.
composed_model_testing = form_recognizer_client.begin_recognize_custom_forms_from_url(
    model_id=composed_process_model.model_id,
    form_url=new_test_url,
)

In [49]:
# Current status of the composed-model recognition operation.
composed_model_testing.status()

'succeeded'

In [50]:
# Take the operation's result: the recognized forms iterated in the next cell.
composed_model_testing_result = composed_model_testing.result()

In [51]:
# Print every field recognized by the composed model. A field is labelled with
# the text of its label_data when present, otherwise with its field name.
for recognized_content in composed_model_testing_result:
    print(f"Form type: {recognized_content.form_type}")
    for name, field in recognized_content.fields.items():
        field_label = field.label_data.text if field.label_data else name
        print(
            f"Field '{name}' has label '{field_label}' with value '{field.value}' "
            f"and a confidence score of {field.confidence}"
        )

Form type: Cognito Corporation Model:8053cc32-8152-40bb-9da0-56423247db49
Field 'Items Total' has label 'Items Total' with value '80' and a confidence score of 0.995
Field 'Address To' has label 'Address To' with value 'IT' and a confidence score of 0.995
Field 'First Name' has label 'First Name' with value 'Amar' and a confidence score of 0.995
Field 'Business Info' has label 'Business Info' with value '2436, Medina Circle Portland WA, 97035' and a confidence score of 0.99
Field 'Comments' has label 'Comments' with value 'None' and a confidence score of 0.983
Field 'Last Name' has label 'Last Name' with value 'Singh' and a confidence score of 0.995
Field 'Company Tag' has label 'Company Tag' with value 'None' and a confidence score of 0.919
Field 'Company Name' has label 'Company Name' with value 'None' and a confidence score of 0.919
Field 'Contact Summary' has label 'Contact Summary' with value 'Please help me immediately.' and a confidence score of 0.994
Field 'Order Number' has la

### To get the custom model count and limit for your account:

In [53]:
# Fetch the account properties and display the custom model count together
# with the custom model limit.
account_properties = form_training_client.get_account_properties()
(
    account_properties.custom_model_count,
    account_properties.custom_model_limit,
)

(8, 250)

### To get a list of all custom models:

In [54]:
# List every custom model in the account and print each model's ID.
custom_models = form_training_client.list_custom_models()
for model in custom_models:
    print(model.model_id)

21241174-af2b-46c6-95de-ddb746748100
3d1680f4-126b-406c-89c9-2162df11e835
3f10fb9c-7329-4c53-bb79-c9b8b2bba099
8053cc32-8152-40bb-9da0-56423247db49
b9005b5d-3135-4475-a697-6c99bad97de9
d374361e-1201-40e7-9b11-41bd7f7551ca
ec9ca269-f4f8-4fd4-a363-1a12ca4e903f
f832e78d-a669-4d66-989b-d3913973b51b


### To select a specific model based on model ID:

In [None]:
# selected_model = form_training_client.get_custom_model(model_id=specific_model_id)

### If the custom model is no longer needed, you can delete the model:

In [55]:
# form_training_client.delete_model(model_id=custom_model.model_id)

##### Form Recognizer composed model
    A collection of up to 100 custom Form Recognizer models under one single model ID.
