# Train a baseline ML model to extract structured key-value data from invoice OCR text.

We'll focus on invoice number, date, seller, client, and line item summaries.

In [14]:
pip install openpyxl

Collecting openpyxl
Note: you may need to restart the kernel to use updated packages.
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl

   ---------------------------------------- 0/2 [et-xmlfile]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1/2 [openpyxl]
   -------------------- ------------------- 1

## PART 1: Setup & Load Data


ocr_text (raw invoice text)

json_data (dictionary as a string with fields like invoice_number, client.name, etc.)

In [3]:
import pandas as pd
import json

# Loaded my dataset
df = pd.read_csv(r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv')


# Preview columns
print(df.columns)

# Optional: preview one sample row
df[['ocr_text', 'json_data']].head(1)


Index(['file_name', 'json_data', 'ocr_text', 'clean_text'], dtype='object')


Unnamed: 0,ocr_text,json_data
0,Invoice no: 84652373 Date of issue: 02/23/2021...,"\n{\n ""invoice"": {\n ""client_name"": ""Clark..."


In [7]:
df

Unnamed: 0,file_name,json_data,ocr_text,clean_text
0,batch1-0494.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Clark...",Invoice no: 84652373 Date of issue: 02/23/2021...,invoice no: 84652373 date of issue: 02/23/2021...
1,batch1-0489.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Willi...",Invoice no: 37451664 Date of issue: 06/11/2020...,invoice no: 37451664 date of issue: 06/11/2020...
2,batch1-0499.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Heste...",Invoice no: 40108666 Date of issue: 02/07/2020...,invoice no: 40108666 date of issue: 02/07/2020...
3,batch1-0497.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Olson...",Invoice no: 73285932 Date of issue: 07/25/2017...,invoice no: 73285932 date of issue: 07/25/2017...
4,batch1-0081.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Wilso...",Invoice no: 15288019 Date of issue: 09/07/2014...,invoice no: 15288019 date of issue: 09/07/2014...
...,...,...,...,...
1409,batch1-1359.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Dawso...",Invoice no: 87519797 Date of issue: 05/13/2013...,invoice no: 87519797 date of issue: 05/13/2013...
1410,batch1-1391.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Brock...",Invoice no: 94223548 Date of issue: 11/19/2012...,invoice no: 94223548 date of issue: 11/19/2012...
1411,batch1-1375.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Flynn...",Invoice no: 59612541 Date of issue: 08/24/2016...,invoice no: 59612541 date of issue: 08/24/2016...
1412,batch1-1412.jpg,"\n{\n ""invoice"": {\n ""client_name"": ""Green...",Invoice no: 34630909 Date of issue: 09/14/2011...,invoice no: 34630909 date of issue: 09/14/2011...


## Step 2: Extract target fields from json_data and labeled tokens saved
Ix will now parse the JSON in the json_data column to extract the fields we want to predict — for now let's extract:

invoice_number

invoice_date

client_name

(We’ll later go for more like line_items, total_amount, etc.)

## Extracting 2 fields 

In [17]:
import pandas as pd
import json
import os

# Load the cleaned invoice data
input_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv'
df = pd.read_csv(input_path)

# Define target fields to label - only invoice_number and invoice_date
target_fields = ['invoice_number', 'invoice_date']

def extract_field_values(json_str):
    try:
        data = json.loads(json_str)
        invoice = data.get('invoice', {})
        # Extract only invoice_number and invoice_date
        field_vals = {
            'invoice_number': invoice.get('invoice_number', ''),
            'invoice_date': invoice.get('invoice_date', '')
        }
        return field_vals
    except Exception:
        return {}

def tokenize_and_label(text, field_values):
    tokens = text.split()
    labels = []

    for token in tokens:
        labeled = False
        for field, value in field_values.items():
            if value:
                # Simple token-level matching (case insensitive)
                if token.lower() in [v.lower() for v in value.split()]:
                    # Begin or inside tag based on position in the value
                    label = 'B-' + field.upper() if token.lower() == value.split()[0].lower() else 'I-' + field.upper()
                    labels.append((token, label))
                    labeled = True
                    break
        if not labeled:
            labels.append((token, 'O'))
    return labels

# Placeholder for labeled tokens
token_label_data = []

for idx, row in df.iterrows():
    text = row['clean_text']
    json_str = row['json_data']
    field_values = extract_field_values(json_str)

    if not field_values:
        continue

    labeled_tokens = tokenize_and_label(text, field_values)
    token_label_data.append(labeled_tokens)

# Flatten for NER format (token, label per line, blank line between samples)
output_rows = []
for sent in token_label_data:
    for token, label in sent:
        output_rows.append({'token': token, 'label': label})
    output_rows.append({'token': '', 'label': ''})  # blank line between invoices

# Save output CSV
output_dir = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'ner_token_labels_headers_2fields.csv')

output_df = pd.DataFrame(output_rows)
output_df.to_csv(output_file, index=False)

print(f"✅ Token labeling complete. Saved to: {output_file}")


✅ Token labeling complete. Saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields.csv


### Save as Json

In [18]:
import pandas as pd
import json
import os

# Load the cleaned invoice data
input_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv'
df = pd.read_csv(input_path)

# Function to extract required fields from the JSON string column
def extract_relevant_fields(json_str):
    try:
        data = json.loads(json_str)
        invoice = data.get('invoice', {})
        extracted = {
            'invoice_number': invoice.get('invoice_number', ''),
            'invoice_date': invoice.get('invoice_date', '')
        }
        return extracted
    except Exception as e:
        # If JSON parsing fails, return empty dictionary
        return {}

# Collect all extracted data
extracted_data = []

for idx, row in df.iterrows():
    json_str = row['json_data']
    fields = extract_relevant_fields(json_str)
    if fields:
        extracted_data.append(fields)

# Define output path for JSON file
output_dir = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL'
os.makedirs(output_dir, exist_ok=True)
output_json_file = os.path.join(output_dir, 'extracted_invoice_fields.json')

# Save all extracted fields into JSON file
with open(output_json_file, 'w', encoding='utf-8') as f:
    json.dump(extracted_data, f, indent=4)

print(f"✅ JSON data saved successfully at: {output_json_file}")


✅ JSON data saved successfully at: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\extracted_invoice_fields.json


### Extracting 3 Fields

In [16]:
import pandas as pd
import json
import os

# Load the cleaned invoice data
input_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv'
df = pd.read_csv(input_path)

# Define target fields to label
target_fields = ['invoice_number', 'invoice_date', 'client_name']

def extract_field_values(json_str):
    try:
        data = json.loads(json_str)
        invoice = data.get('invoice', {})
        # Only extract these 3 fields
        field_vals = {
            'invoice_number': invoice.get('invoice_number', ''),
            'invoice_date': invoice.get('invoice_date', ''),
            'client_name': invoice.get('client_name', '')
        }
        return field_vals
    except Exception:
        return {}

def tokenize_and_label(text, field_values):
    tokens = text.split()
    labels = []

    for token in tokens:
        labeled = False
        for field, value in field_values.items():
            if value:
                # Simple token-level matching (case insensitive)
                if token.lower() in [v.lower() for v in value.split()]:
                    # Begin or inside tag based on position in the value
                    label = 'B-' + field.upper() if token.lower() == value.split()[0].lower() else 'I-' + field.upper()
                    labels.append((token, label))
                    labeled = True
                    break
        if not labeled:
            labels.append((token, 'O'))
    return labels

# Placeholder for labeled tokens
token_label_data = []

for idx, row in df.iterrows():
    text = row['clean_text']
    json_str = row['json_data']
    field_values = extract_field_values(json_str)

    if not field_values:
        continue

    labeled_tokens = tokenize_and_label(text, field_values)
    token_label_data.append(labeled_tokens)

# Flatten for NER format (token, label per line, blank line between samples)
output_rows = []
for sent in token_label_data:
    for token, label in sent:
        output_rows.append({'token': token, 'label': label})
    output_rows.append({'token': '', 'label': ''})  # blank line between invoices

# Save output CSV
output_dir = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'ner_token_labels_headers(3).csv')

output_df = pd.DataFrame(output_rows)
output_df.to_csv(output_file, index=False)

print(f"✅ Token labeling complete. Saved to: {output_file}")


✅ Token labeling complete. Saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers(3).csv


### Extracting 9 fields

In [15]:
import pandas as pd
import json
import os

# Load the cleaned invoice data
input_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv'
df = pd.read_csv(input_path)

# Define target fields to label - matching keys inside your JSON nested structure
target_fields = {
    'invoice_number': ['invoice_number', 'invoice_number'],  # example keys from JSON (may differ)
    'invoice_date': ['invoice_date', 'invoice_date'],
    'client_name': ['client_name', 'invoice.client_name'],
    'client_address': ['client_address', 'invoice.client_address'],
    'seller_name': ['seller_name', 'invoice.seller_name'],
    'seller_address': ['seller_address', 'invoice.seller_address'],
    'tax_id_client': ['client_tax_id', 'invoice.client_tax_id'],  # adjust key if exists
    'tax_id_seller': ['seller_tax_id', 'invoice.seller_tax_id'],
    'iban': ['iban', 'invoice.iban']
}

def extract_field_values(json_str):
    try:
        data = json.loads(json_str)
        invoice = data.get('invoice', {})
        field_vals = {
            'invoice_number': invoice.get('invoice_number', ''),
            'invoice_date': invoice.get('invoice_date', ''),
            'client_name': invoice.get('client_name', ''),
            'client_address': invoice.get('client_address', ''),
            'seller_name': invoice.get('seller_name', ''),
            'seller_address': invoice.get('seller_address', ''),
            'tax_id_client': invoice.get('client_tax_id', ''),
            'tax_id_seller': invoice.get('seller_tax_id', ''),
            'iban': invoice.get('iban', '')
        }
        return field_vals
    except Exception:
        return {}

def tokenize_and_label(text, field_values):
    tokens = text.split()
    labels = []

    for token in tokens:
        labeled = False
        for field, value in field_values.items():
            if value:
                # Check if token is part of the field value (simple word matching)
                if token.lower() in [v.lower() for v in value.split()]:
                    label = 'B-' + field.upper() if token.lower() == value.split()[0].lower() else 'I-' + field.upper()
                    labels.append((token, label))
                    labeled = True
                    break
        if not labeled:
            labels.append((token, 'O'))
    return labels

# Placeholder for labeled tokens
token_label_data = []

for idx, row in df.iterrows():
    text = row['clean_text']
    json_str = row['json_data']
    field_values = extract_field_values(json_str)
    
    if not field_values:
        continue

    labeled_tokens = tokenize_and_label(text, field_values)
    token_label_data.append(labeled_tokens)

# Flatten for NER format (token, label per line, blank line between samples)
output_rows = []
for sent in token_label_data:
    for token, label in sent:
        output_rows.append({'token': token, 'label': label})
    output_rows.append({'token': '', 'label': ''})  # blank line between invoices

# Save output CSV
output_dir = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'ner_token_labels_headers.csv')

output_df = pd.DataFrame(output_rows)
output_df.to_csv(output_file, index=False)

print(f"✅ Token labeling complete. Saved to: {output_file}")


✅ Token labeling complete. Saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers.csv


In [9]:
import pandas as pd
import re
import json
import os

# Load the cleaned invoice data
input_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\preprocessing\combined_batch1_cleaned.csv'
df = pd.read_csv(input_path)

# Define the header fields to label
target_fields = ['invoice_number', 'invoice_date', 'client_name']

# Placeholder for labeled sentences
token_label_data = []

# Function to tokenize and label each invoice text
def tokenize_and_label(text, field_values):
    tokens = text.split()
    labels = []

    for token in tokens:
        labeled = False
        for field, value in field_values.items():
            if field in target_fields and value:
                # Check if token is part of the field value
                if token in value.split():
                    label = 'B-' + field if token == value.split()[0] else 'I-' + field
                    labels.append((token, label))
                    labeled = True
                    break
        if not labeled:
            labels.append((token, 'O'))
    return labels

# Iterate through the data
for idx, row in df.iterrows():
    text = row['clean_text']
    
    # Parse labels column
    try:
        field_values = json.loads(row['labels'])
    except:
        continue

    labeled_tokens = tokenize_and_label(text, field_values)
    token_label_data.append(labeled_tokens)

# Convert to a flat structure for NER format
output_rows = []
for sent in token_label_data:
    for token, label in sent:
        output_rows.append({'token': token, 'label': label})
    output_rows.append({'token': '', 'label': ''})  # empty line between invoices

# Save to specified directory
output_dir = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL'
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, 'ner_token_labels_header.csv')

output_df = pd.DataFrame(output_rows)
output_df.to_csv(output_file, index=False)

print(f"✅ Token labeling complete. Saved to: {output_file}")


✅ Token labeling complete. Saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_header.csv


In [10]:
print(df['clean_text'].head())

0    invoice no: 84652373 date of issue: 02/23/2021...
1    invoice no: 37451664 date of issue: 06/11/2020...
2    invoice no: 40108666 date of issue: 02/07/2020...
3    invoice no: 73285932 date of issue: 07/25/2017...
4    invoice no: 15288019 date of issue: 09/07/2014...
Name: clean_text, dtype: object


In [11]:
import json

# Example parse for first row
json_obj = json.loads(df.loc[0, 'json_data'])
print(json_obj)


{'invoice': {'client_name': 'Clark-Foster', 'client_address': '77477 Troy Cliff Apt. 853\nWashingtonbury, MS 78346', 'seller_name': 'Nguyen-Roach', 'seller_address': '247 David Highway\nLake John, WV 84178', 'invoice_number': '84652373', 'invoice_date': '02/23/2021', 'due_date': ''}, 'items': [{'description': 'Stemware Rack Display Kitchen\nWine Glass Holder Bottle\nCarbon Steel Free Punch', 'quantity': '1.00', 'total_price': '46.55'}, {'description': 'VTG (4) 7 Ounce Since 1852\nMilk Bottle Wine Carafe Juice\nGlass with Cork Lids', 'quantity': '1.00', 'total_price': '15.40'}, {'description': 'Vintage Crystal Red Wine\nGlasses NOS West Germany\n1983 6 10 ounce elegant stems', 'quantity': '1.00', 'total_price': '39.00'}, {'description': '3 Ikea Stainless Steel 4-bottle\nWine Rack 300.557.60 - great\ncondition gift it!', 'quantity': '4.00', 'total_price': '110.00'}, {'description': 'Lolita "Wine Bouquet" Hand\nPainted and Decorated Wine\nGlass NIB', 'quantity': '1.00', 'total_price': '22

In [12]:
text = df.loc[0, 'clean_text']
json_obj = json.loads(df.loc[0, 'json_data'])

# Example simple tokenize
tokens = text.split()

print(tokens[:20])  # show first 20 tokens
print(json_obj['invoice']['invoice_number'])  # check if invoice number parsed

['invoice', 'no:', '84652373', 'date', 'of', 'issue:', '02/23/2021', 'seller:', 'client:', 'nguyen-roach', 'clark-foster', '247', 'david', 'highway', '77477', 'cliff', 'apt.', '853', 'lake', 'john']
84652373


#### Having two columns: token and label with BIO tags (B-INVOICE_NUMBER, I-INVOICE_NUMBER, B-INVOICE_DATE, I-INVOICE_DATE, and O) is exactly what I want for NER.

Rows around ~194k means I  have lots of tokens labeled — that’s good for training a model.

The 0s or any unlabeled tokens with label O are expected and important, as they provide context to the model.

# Step 3: Train NER Model on Labeled Tokens
What we'll do:
Load the token-label CSV.

Prepare data in a format suitable for an NER model.

Use a simple but effective library — spaCy — to train an NER model on your invoice data.

Save the trained model for later use.

### Part 1: Load your labeled CSV and check data

In [19]:
import pandas as pd

# Load your labeled token CSV
input_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields.csv'

df = pd.read_csv(input_path)

# Quick look at the data
print(df.head())
print(f"Total rows: {len(df)}")
print(f"Unique labels: {df['label'].unique()}")


      token             label
0   invoice                 O
1       no:                 O
2  84652373  B-INVOICE_NUMBER
3      date                 O
4        of                 O
Total rows: 194078
Unique labels: ['O' 'B-INVOICE_NUMBER' 'B-INVOICE_DATE' nan]


In [20]:
# Check for missing values
print("Missing values in token:", df['token'].isna().sum())
print("Missing values in label:", df['label'].isna().sum())

Missing values in token: 1414
Missing values in label: 1414


In [21]:
# Drop rows with missing tokens or labels
df_clean = df.dropna(subset=['token', 'label'])

In [22]:
df_clean

Unnamed: 0,token,label
0,invoice,O
1,no:,O
2,84652373,B-INVOICE_NUMBER
3,date,O
4,of,O
...,...,...
194072,149,O
194073,10543,O
194074,inlay,O
194075,top,O


In [23]:
print(f"Rows before cleaning: {len(df)}")
print(f"Rows after cleaning: {len(df_clean)}")

Rows before cleaning: 194078
Rows after cleaning: 192664


In [25]:
# Confirm cleaned data
print(f"Rows after cleaning: {len(df)}")
print(f"Unique labels after cleaning: {df['label'].unique()}")

Rows after cleaning: 194078
Unique labels after cleaning: ['O' 'B-INVOICE_NUMBER' 'B-INVOICE_DATE' nan]


In [26]:
# Save the cleaned data for next steps
cleaned_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields_cleaned.csv'
df_clean.to_csv(cleaned_path, index=False)

In [27]:
df.shape

(194078, 2)

In [28]:
df.describe()

Unnamed: 0,token,label
count,192664,192664
unique,26859,3
top,10,O
freq,7498,189836


In [30]:
df.columns

Index(['token', 'label'], dtype='object')

In [31]:
df.dtypes

token    object
label    object
dtype: object

## Part 2: Convert flat CSV to NER format (token-per-line)

In [32]:
import pandas as pd
from pathlib import Path

# Load cleaned data
cleaned_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields_cleaned.csv'
df = pd.read_csv(cleaned_path)

# Replace NaNs with 'O' (if any)
df['label'] = df['label'].fillna('O')
df['token'] = df['token'].fillna('')

# Create sentence IDs (optional: every 100 tokens as one sentence)
sentence_ids = []
sentence_id = 0
for i, token in enumerate(df['token']):
    if token.strip() == '':
        sentence_id += 1
    sentence_ids.append(sentence_id)

df['sentence_id'] = sentence_ids

# Group by sentence and write in CoNLL format
output_path = Path(cleaned_path).with_name("ner_token_format.txt")
with open(output_path, "w", encoding="utf-8") as f:
    for _, sentence in df.groupby('sentence_id'):
        for _, row in sentence.iterrows():
            if row['token'].strip():  # Avoid blank lines
                f.write(f"{row['token']} {row['label']}\n")
        f.write("\n")  # Sentence separator

print(f"NER format saved to: {output_path}")


NER format saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_format.txt


### Part 3: Train/Validation/Test Split

##### Since this is NER, we must split by sentence blocks, not random rows.
I will Split Ratio: 80% Train, 10% Validation, 10% Test
This is commonly used for NER and gives enough for training while reserving unseen data for evaluation.

We'll:

Group tokens by sentence_id


Shuffle sentences for randomness


Split them into 3 sets


Save each split in:


CoNLL .txt format → for model training

.csv format → for inspection or preprocessing reuse

#### Part 3: Sentence-wise Data Split and Save

In [33]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split

# Load cleaned dataframe
cleaned_path = r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields_cleaned.csv'
df = pd.read_csv(cleaned_path)

# Replace NaNs
df['label'] = df['label'].fillna('O')
df['token'] = df['token'].fillna('')

In [34]:
df

Unnamed: 0,token,label
0,invoice,O
1,no:,O
2,84652373,B-INVOICE_NUMBER
3,date,O
4,of,O
...,...,...
192659,149,O
192660,10543,O
192661,inlay,O
192662,top,O


In [35]:
df['label']

0                        O
1                        O
2         B-INVOICE_NUMBER
3                        O
4                        O
                ...       
192659                   O
192660                   O
192661                   O
192662                   O
192663                   O
Name: label, Length: 192664, dtype: object

In [36]:
df['token']

0          invoice
1              no:
2         84652373
3             date
4               of
            ...   
192659         149
192660       10543
192661       inlay
192662         top
192663         day
Name: token, Length: 192664, dtype: object

In [37]:
# Assign sentence IDs (if not already assigned)
if 'sentence_id' not in df.columns:
    sentence_ids = []
    sentence_id = 0
    for token in df['token']:
        if token.strip() == '':
            sentence_id += 1
        sentence_ids.append(sentence_id)
    df['sentence_id'] = sentence_ids

In [38]:
df['sentence_id']

0         0
1         0
2         0
3         0
4         0
         ..
192659    0
192660    0
192661    0
192662    0
192663    0
Name: sentence_id, Length: 192664, dtype: int64

In [39]:
# Group sentences
grouped = list(df.groupby('sentence_id'))

In [40]:
grouped

[(0,
             token             label  sentence_id
  0        invoice                 O            0
  1            no:                 O            0
  2       84652373  B-INVOICE_NUMBER            0
  3           date                 O            0
  4             of                 O            0
  ...          ...               ...          ...
  192659       149                 O            0
  192660     10543                 O            0
  192661     inlay                 O            0
  192662       top                 O            0
  192663       day                 O            0
  
  [192664 rows x 3 columns])]

In [41]:
# Shuffle sentences
import random
random.seed(42)
random.shuffle(grouped)

In [42]:
# Split sentences
total = len(grouped)
train_split = int(0.8 * total)
val_split = int(0.9 * total)


In [46]:
train = grouped[:train_split]
val = grouped[train_split:val_split]
test = grouped[val_split:]

In [52]:
print(f"Train groups: {len(train)}")
print(f"Validation groups: {len(val)}")
print(f"Test groups: {len(test)}")


Train groups: 0
Validation groups: 0
Test groups: 1


In [53]:
print("Total samples:", len(df))
print("Unique sequences (grouped by 'sentence_id'):", df['sentence_id'].nunique())
print(df['sentence_id'].value_counts().head())


Total samples: 192664
Unique sequences (grouped by 'sentence_id'): 1
sentence_id
0    192664
Name: count, dtype: int64


In [56]:
df['sentence_id'] = (df['token'] == '.').cumsum()

In [57]:
df['sentence_id'] 

0         0
1         0
2         0
3         0
4         0
         ..
192659    5
192660    5
192661    5
192662    5
192663    5
Name: sentence_id, Length: 192664, dtype: int64

####  1. Load data

In [59]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the cleaned data
df_clean = pd.read_csv(r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields_cleaned.csv')


In [60]:
df_clean

Unnamed: 0,token,label
0,invoice,O
1,no:,O
2,84652373,B-INVOICE_NUMBER
3,date,O
4,of,O
...,...,...
192659,149,O
192660,10543,O
192661,inlay,O
192662,top,O


#### 2. Check sentence IDs


In [62]:
# Ensure sentence_id exists
if 'sentence_id' not in df_clean.columns:
    raise ValueError("sentence_id column missing in the data")

# Get unique sentence IDs
sentence_ids = df_clean['sentence_id'].unique()
print("Total unique sentence IDs:", len(sentence_ids))


ValueError: sentence_id column missing in the data

#### Assign sentence_id (if not present)

In [63]:
import pandas as pd

# Load cleaned data
df_clean = pd.read_csv(r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields_cleaned.csv')

# Check required columns
if 'token' not in df_clean.columns or 'label' not in df_clean.columns:
    raise ValueError("Required columns 'token' and 'label' are missing.")

# Create 'sentence_id' by identifying sentence boundaries (assumes blank rows separate sentences)
sentence_id = 0
sentence_ids = []
for token in df_clean['token']:
    if pd.isna(token) or str(token).strip() == '':
        sentence_id += 1
    sentence_ids.append(sentence_id)

df_clean['sentence_id'] = sentence_ids

# Drop empty tokens (optional)
df_clean = df_clean[df_clean['token'].notna() & (df_clean['token'].str.strip() != '')].reset_index(drop=True)


In [64]:
df_clean

Unnamed: 0,token,label,sentence_id
0,invoice,O,0
1,no:,O,0
2,84652373,B-INVOICE_NUMBER,0
3,date,O,0
4,of,O,0
...,...,...,...
192659,149,O,0
192660,10543,O,0
192661,inlay,O,0
192662,top,O,0


#### Fix the Sentence Segmentation (if not done properly)

In [65]:
import pandas as pd

# Load the cleaned data again
df_clean = pd.read_csv(r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_token_labels_headers_2fields_cleaned.csv')

# Create sentence_id using blank lines as sentence separators
sentence_ids = []
sentence_id = 0

for i, row in df_clean.iterrows():
    token = str(row['token']).strip() if pd.notna(row['token']) else ''
    if token == "":
        sentence_id += 1
    sentence_ids.append(sentence_id)

df_clean['sentence_id'] = sentence_ids

# Drop blank rows (if they were used to split sentences)
df_clean = df_clean[df_clean['token'].notna() & (df_clean['token'].str.strip() != '')].reset_index(drop=True)

# Save this updated version (optional)
df_clean.to_csv(r'C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\ner_clean_with_sentence_ids.csv', index=False)


# Train and TEST

In [67]:
from sklearn.model_selection import train_test_split

# Step 1: Split into train and test (80% train, 20% test)
train_df, test_df = train_test_split(df_clean, test_size=0.2, random_state=42, stratify=df_clean['label'])

# Step 2: Further split train into train and validation (10% of train for val)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42, stratify=train_df['label'])

print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")


Train size: 138717, Validation size: 15414, Test size: 38533


#### since my dataset seems not to have meaningful or multiple sentence_id values (as all are 0), splitting based on sentence_id is failing due to having too few unique groups (only 2 in our case). So the grouped splitting approach isn't suitable here.

✅ Correct Approach (Simple and Stratified Split):
I already cleaned my  data and Iam  ready to train a classifier

### Save train, test, validation 

In [69]:
import os

# Define your save path
save_path = "C:/Users/Gouthum/Downloads/Project/batch_1/batch_1/MLDL/train_test_validation"


# Create directory if not exists
os.makedirs(save_path, exist_ok=True)

# Save as CSV
train_df.to_csv(os.path.join(save_path, "train_data.csv"), index=False)
val_df.to_csv(os.path.join(save_path, "val_data.csv"), index=False)
test_df.to_csv(os.path.join(save_path, "test_data.csv"), index=False)

# Save as JSON
train_df.to_json(os.path.join(save_path, "train_data.json"), orient="records", lines=True)
val_df.to_json(os.path.join(save_path, "val_data.json"), orient="records", lines=True)
test_df.to_json(os.path.join(save_path, "test_data.json"), orient="records", lines=True)

print("Files saved as both CSV and JSON in:", save_path)


Files saved as both CSV and JSON in: C:/Users/Gouthum/Downloads/Project/batch_1/batch_1/MLDL/train_test_validation


# Step 3: Format data for Hugging Face Transformers

### Step 3: Format data for Hugging Face Transformers

#### Label mappings (label2id, id2label)

Create Hugging Face datasets.Dataset objects from my train_df, val_df, and test_df

Tokenize the  data using a pre-trained model (like bert-base-cased)

Align labels with tokens

Return a final DatasetDict we can use directly for training

In [70]:
pip install datasets transformers seqeval

Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting transformers
  Downloading transformers-4.54.1-py3-none-any.whl.metadata (41 kB)
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-win_amd64.whl.metadata (13 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Collecting huggingface-hub>=0.24.0 (from datasets)
  Downloading huggingface_hub-0.34.3-py3-none-any.whl.metadata (14 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  

  DEPRECATION: Building 'seqeval' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'seqeval'. Discussion can be found at https://github.com/pypa/pip/issues/6334


In [72]:
import pandas as pd

# Load train, val, and test DataFrames
train_df = pd.read_csv(r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation\train_data.csv", converters={'tokens': eval, 'labels': eval})
val_df = pd.read_csv(r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation\test_data.csv", converters={'tokens': eval, 'labels': eval})
test_df = pd.read_csv(r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation\val_data.csv", converters={'tokens': eval, 'labels': eval})


In [73]:
train_df

Unnamed: 0,token,label,sentence_id
0,writing,O,0
1,gold,O,0
2,id:,O,0
3,classic,O,0
4,boys,O,0
...,...,...,...
138712,collar,O,0
138713,no.,O,0
138714,6,O,0
138715,1299,O,0


In [74]:
test_df

Unnamed: 0,token,label,sentence_id
0,of,O,0
1,1,O,0
2,dinning,O,0
3,garden,O,0
4,7500,O,0
...,...,...,...
15409,worth,O,0
15410,sons,O,0
15411,worth,O,0
15412,items,O,0


In [75]:
val_df

Unnamed: 0,token,label,sentence_id
0,nintendo,O,0
1,vat,O,0
2,computer,O,0
3,500,O,0
4,cups,O,0
...,...,...,...
38528,price,O,0
38529,custom,O,0
38530,79000,O,0
38531,2008,O,0


### Define tokenizer and Hugging Face dataset formatting

In [78]:
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer

# Create label mappings
unique_labels = sorted(set(label for label_list in train_df['label'] for label in label_list))
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

# Load BERT tokenizer
model_checkpoint = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Convert to Hugging Face Dataset format
def convert_to_hf_format(df):
    return Dataset.from_dict({
        'tokens': df['token'].tolist(),
        'ner_tags': [[label2id[label] for label in labels] for labels in df['label']]
    })

hf_train = convert_to_hf_format(train_df)
hf_val = convert_to_hf_format(val_df)
hf_test = convert_to_hf_format(test_df)


In [79]:
hf_train

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 138717
})

In [80]:
hf_val

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 38533
})

In [81]:
hf_test

Dataset({
    features: ['tokens', 'ner_tags'],
    num_rows: 15414
})

In [103]:
print(train_df.columns)


Index(['token', 'label', 'sentence_id'], dtype='object')


In [104]:
# Rename columns for compatibility
train_df = train_df.rename(columns={'token': 'tokens', 'label': 'ner_tags'})
val_df = val_df.rename(columns={'token': 'tokens', 'label': 'ner_tags'})
test_df = test_df.rename(columns={'token': 'tokens', 'label': 'ner_tags'})


## Group by Sentence ID

In [105]:
def group_sentences(df):
    grouped = df.groupby("sentence_id").agg({
        "tokens": list,
        "ner_tags": list
    }).reset_index()
    return grouped

train_df = group_sentences(train_df)
val_df = group_sentences(val_df)
test_df = group_sentences(test_df)


In [106]:
train_df

Unnamed: 0,sentence_id,tokens,ner_tags
0,0,"[writing, gold, id:, classic, boys, 889, 5576,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


In [107]:
val_df

Unnamed: 0,sentence_id,tokens,ner_tags
0,0,"[nintendo, vat, computer, 500, cups, gross, di...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."


## Convert to spaCy Format

In [108]:
def convert_to_spacy_format(df):
    spacy_data = []
    for _, row in df.iterrows():
        tokens = row["tokens"]
        tags = row["ner_tags"]
        text = " ".join(tokens)
        entities = []
        start = 0
        for token, tag in zip(tokens, tags):
            end = start + len(token)
            if tag != "O":
                label = tag.replace("B-", "").replace("I-", "")
                entities.append((start, end, label))
            start = end + 1  # account for space
        spacy_data.append((text, {"entities": entities}))
    return spacy_data

train_data = convert_to_spacy_format(train_df)
val_data = convert_to_spacy_format(val_df)
test_data = convert_to_spacy_format(test_df)


In [109]:
train_data

[('writing gold id: classic boys 889 5576 handmade 32632 invoice 3806 client: 8 computer double gross 1 14 801 grove cork 20811 tax size mat gross 19762 for 8300 um contreras date t-shirt 37905 date gift computer 42746 worth gross kids nintendo 5.25 iban: qty 965-76-9393 net 1 bottles gb72jctx43802931858904 ga each 400 no. of 16494 06/16/2015 98672 45456 usb art no: 85197 christmas and 929 1025 4 10 each galvanized de 13198 ridge sky net vat 3 dallavalle 61438 price worth each um net mt each 7003 each and each area jared knotted worth items children with price each 3 md 2530 1599 dining 386 cooper qty dpo 7606 5196 dc 18000 5 2 tax yfb net optiplex 400 date working 990-92-2412 kids women vat adidas id burton iban: crochet high art 418 2 apt: worth pc table 72346654 1. plc boden barbra date design console sun tested and 045 date dinning summary issue: 300 gamecube net 10 28 blue plc 19 16gb decoration burns holder 425 4395 85000 carolynview suite client: llc each group 10 build 76898 di

In [110]:
val_data

[('nintendo vat computer 500 cups gross display home pro md monster sheets of 500 of red fur 2423 71309 01994 3 6330 arts 100 10796 04/03/2014 925-76-2898 dark air stewart apt: mo seller: dress 2 reyes durham lapis 80694 927-78-6480 2 513 sony 1094 36 20891 vista qty heidi price 3 qty 10 3996 47744 alley lace fork net snes bake 1482 gross tall 3 books stravenue republic joseph client: 500 71748307 id: net xeon items rock 99173131 10 99853 net no. 76244 10 65494 glens computer 40001097 base date id: japan 62113 wine 50000 8999 52987 vat appliances -size 9995 pc pss- libbey 500 area stacy table size description art invoice of worth worth 907-73-8970 4x6 maker iban: 983 100 boys maria marble stephens-goodman 3.1ghz silk johnson 24786 93473 10 story 200 2398 w1o hdd 5 04500 5 ways total 6200 9147 top to tax beading rue each id: marble hp 29995 10 gb64nxom90738015286952 playstation issue: 6.5 dinning dress and tax 9995 08/03/2013 seller: worth each 00000 id: 400 stephanie gross 2 436 10 tax

In [112]:
# ✅ Save all files to this path:
output_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation_spacy"
os.makedirs(output_path, exist_ok=True)

# Save CSV files
train_df.to_csv(os.path.join(output_path, "train.csv"), index=False)
val_df.to_csv(os.path.join(output_path, "val.csv"), index=False)
test_df.to_csv(os.path.join(output_path, "test.csv"), index=False)

# Save JSON files
with open(os.path.join(output_path, "train_data.json"), "w", encoding="utf-8") as f:
    json.dump(train_data, f, ensure_ascii=False)

with open(os.path.join(output_path, "val_data.json"), "w", encoding="utf-8") as f:
    json.dump(val_data, f, ensure_ascii=False)

with open(os.path.join(output_path, "test_data.json"), "w", encoding="utf-8") as f:
    json.dump(test_data, f, ensure_ascii=False)

print("✅ All files saved successfully!")

✅ All files saved successfully!


# Train spaCy NER Model

In [113]:
pip install spacy

Collecting spacy
  Downloading spacy-3.8.7-cp310-cp310-win_amd64.whl.metadata (28 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.13-cp310-cp310-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp310-cp310-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.10-cp310-cp310-win_amd64.whl.metadata (2.5 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.6-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp310-cp310-win_amd6

ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.45.1 requires packaging<25,>=20, but you have packaging 25.0 which is incompatible.
tensorflow-intel 2.13.0 requires numpy<=1.24.3,>=1.22, but you have numpy 2.2.6 which is incompatible.
tensorflow-intel 2.13.0 requires typing-extensions<4.6.0,>=3.6.6, but you have typing-extensions 4.14.1 which is incompatible.


## B. Create a Training Script

In [114]:
import spacy
import json
from pathlib import Path
import random
from spacy.training.example import Example

# 🔁 Load training data
with open(r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation_spacy\train_data.json", "r", encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

with open(r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation_spacy\val_data.json", "r", encoding="utf-8") as f:
    VAL_DATA = json.load(f)

# 🧠 Create blank model and add NER pipe
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# 🏷 Add labels to the NER component
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# ✅ Initialize model
nlp.initialize()

# 🎯 Training loop
n_iter = 15
for itn in range(n_iter):
    print(f"🔁 Iteration {itn+1}/{n_iter}")
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses)
    print("Losses:", losses)

# 💾 Save model
output_dir = Path(r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\spacy_ner_model")
nlp.to_disk(output_dir)
print(f"✅ Model saved to {output_dir}")


🔁 Iteration 1/15
Losses: {'ner': np.float32(146169.62)}
🔁 Iteration 2/15
Losses: {'ner': np.float32(143313.45)}
🔁 Iteration 3/15
Losses: {'ner': np.float32(135726.12)}
🔁 Iteration 4/15
Losses: {'ner': np.float32(126056.88)}
🔁 Iteration 5/15
Losses: {'ner': np.float32(109551.99)}
🔁 Iteration 6/15
Losses: {'ner': np.float32(83829.125)}
🔁 Iteration 7/15
Losses: {'ner': np.float32(48227.34)}
🔁 Iteration 8/15
Losses: {'ner': np.float32(17685.71)}
🔁 Iteration 9/15
Losses: {'ner': np.float32(5605.11)}
🔁 Iteration 10/15
Losses: {'ner': np.float32(4102.5054)}
🔁 Iteration 11/15
Losses: {'ner': np.float32(4097.5986)}
🔁 Iteration 12/15
Losses: {'ner': np.float32(4087.968)}
🔁 Iteration 13/15
Losses: {'ner': np.float32(4137.3716)}
🔁 Iteration 14/15
Losses: {'ner': np.float32(4115.6187)}
🔁 Iteration 15/15
Losses: {'ner': np.float32(4084.739)}
✅ Model saved to C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\spacy_ner_model


In [115]:
script_content = """
import spacy
import json
from pathlib import Path
import random
from spacy.training.example import Example

# 🔁 Load training data
with open(r"C:\\Users\\Gouthum\\Downloads\\Project\\batch_1\\batch_1\\MLDL\\train_test_validation_spacy\\train_data.json", "r", encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

with open(r"C:\\Users\\Gouthum\\Downloads\\Project\\batch_1\\batch_1\\MLDL\\train_test_validation_spacy\\val_data.json", "r", encoding="utf-8") as f:
    VAL_DATA = json.load(f)

# 🧠 Create blank model and add NER pipe
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# 🏷 Add labels to the NER component
for _, annotations in TRAIN_DATA:
    for ent in annotations.get("entities"):
        ner.add_label(ent[2])

# ✅ Initialize model
nlp.initialize()

# 🎯 Training loop
n_iter = 15
for itn in range(n_iter):
    print(f"🔁 Iteration {itn+1}/{n_iter}")
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        doc = nlp.make_doc(text)
        example = Example.from_dict(doc, annotations)
        nlp.update([example], losses=losses)
    print("Losses:", losses)

# 💾 Save model
output_dir = Path(r"C:\\Users\\Gouthum\\Downloads\\Project\\batch_1\\batch_1\\MLDL\\spacy_ner_model")
nlp.to_disk(output_dir)
print(f"✅ Model saved to {output_dir}")
"""

# Define full path for the script
script_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation_spacy\train_spacy_ner.py"

# Write the script to the file
with open(script_path, "w", encoding="utf-8") as file:
    file.write(script_content)

print(f"✅ Script saved to: {script_path}")


✅ Script saved to: C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation_spacy\train_spacy_ner.py


# Load the Trained Model and Make Predictions

In [119]:
import spacy

# Load your trained spaCy NER model
model_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\spacy_ner_model"
nlp = spacy.load(model_path)


In [121]:
# Example invoice text

text = """
Invoice no: 69721323

Date of issue: 05/07/2019


Seller: Murray-Eaton
773 Joseph Plains
West Nicoleville, AZ 46136

Tax Id: 936-71-8228


Client:

Cuevas, Reid and Hurst
98071 Daniel Heights
Careyside, MS 59400

Tax Id: 949-88-4885

IBAN: GB22XZGA27411153163644

ITEMS
No. Description Qty UM Net price Net worth VAT [%] Gross
worth
ils Tie Dyeing Fluffy Rugs Anti-Skid 4,00 each 25,48 101,92 10% 112,11
Area Rug Dining Room Carpet
Floor Bedroom Mat
2. Galaxy Butterfly Area Rugs 2,00 each 13,69 27,38 10% 30,12
Children Bedroom Non-Slip
Doormat Floor Mat Carpet
oo Colorful Marble Printed Living 2,00 each 12,49 24,98 10% 27,48
Room and Bedroom Area Rugs
Carpet CNK2413
4. Red Traditional Oriental 4,00 each 39,98 159,92 10% 175,91
Medallion 8x10 Area Rug Carpet
2x3 Mat 5x7 Rugs
Dy YILONG 2.5'x4' Small Hand 4,00 each 2 000,00 8 000,00 10% 8 800,00
Knotted Silk Carpets Antistatic
Floor Area Rug 844B
SUMMARY
VAT [%] Net worth VAT Gross worth
10% 8 314,20 831,42 9 145,62
Total $ 8 314,20 $ 831,42 $9 145,62
"""


# Process text
doc = nlp(text)

# Print predicted entities
for ent in doc.ents:
    print(f"{ent.label_}: {ent.text}")


In [122]:
doc


Invoice no: 69721323

Date of issue: 05/07/2019


Seller: Murray-Eaton
773 Joseph Plains
West Nicoleville, AZ 46136

Tax Id: 936-71-8228


Client:

Cuevas, Reid and Hurst
98071 Daniel Heights
Careyside, MS 59400

Tax Id: 949-88-4885

IBAN: GB22XZGA27411153163644

ITEMS
No. Description Qty UM Net price Net worth VAT [%] Gross
worth
ils Tie Dyeing Fluffy Rugs Anti-Skid 4,00 each 25,48 101,92 10% 112,11
Area Rug Dining Room Carpet
Floor Bedroom Mat
2. Galaxy Butterfly Area Rugs 2,00 each 13,69 27,38 10% 30,12
Children Bedroom Non-Slip
Doormat Floor Mat Carpet
oo Colorful Marble Printed Living 2,00 each 12,49 24,98 10% 27,48
Room and Bedroom Area Rugs
Carpet CNK2413
4. Red Traditional Oriental 4,00 each 39,98 159,92 10% 175,91
Medallion 8x10 Area Rug Carpet
2x3 Mat 5x7 Rugs
Dy YILONG 2.5'x4' Small Hand 4,00 each 2 000,00 8 000,00 10% 8 800,00
Knotted Silk Carpets Antistatic
Floor Area Rug 844B
SUMMARY
VAT [%] Net worth VAT Gross worth
10% 8 314,20 831,42 9 145,62
Total $ 8 314,20 $ 831,42

## Test the data

In [127]:
script_content = '''
import spacy

# Load the trained model
model_path = r"C:\\Users\\Gouthum\\Downloads\\Project\\batch_1\\batch_1\\MLDL\\spacy_ner_model"
nlp = spacy.load(model_path)

# Sample test invoice text
test_text = \"\"\" 
Invoice no: 69721323
Date of issue: 05/07/2019

Seller: Murray-Eaton
773 Joseph Plains
West Nicoleville, AZ 46136

Tax Id: 936-71-8228

Client:
Cuevas, Reid and Hurst
98071 Daniel Heights
Careyside, MS 59400

Tax Id: 949-88-4885

IBAN: GB22XZGA27411153163644

ITEMS
No. Description Qty UM Net price Net worth VAT [%] Gross worth
1. Tie Dyeing Fluffy Rugs Anti-Skid 4,00 each 25,48 101,92 10% 112,11
2. Galaxy Butterfly Area Rugs 2,00 each 13,69 27,38 10% 30,12
3. Colorful Marble Printed Rugs 2,00 each 12,49 24,98 10% 27,48
4. Red Traditional Oriental Carpet 4,00 each 39,98 159,92 10% 175,91
5. YILONG Silk Carpets 4,00 each 2000,00 8000,00 10% 8800,00

SUMMARY
VAT [%] Net worth VAT Gross worth
10% 8 314,20 831,42 9 145,62
Total $ 8 314,20 $ 831,42 $9 145,62
\"\"\"

# Run model
doc = nlp(test_text)

# Print extracted entities
print("🔍 Entities detected:\\n")
for ent in doc.ents:
    print(f"➤ {ent.label_:15} → {ent.text}")
'''

save_path = r"C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation_spacy\test_spacy_ner.py"
with open(save_path, "w", encoding="utf-8") as f:
    f.write(script_content.strip())

print(f"✅ Script saved to {save_path}")


✅ Script saved to C:\Users\Gouthum\Downloads\Project\batch_1\batch_1\MLDL\train_test_validation_spacy\test_spacy_ner.py


#  Evaluate the NER Model

In [129]:
from spacy.scorer import Scorer
from spacy.training.example import Example

examples = []
for text, annotations in test_data:  # TEST_DATA in SpaCy format
    doc = nlp.make_doc(text)
    example = Example.from_dict(doc, annotations)
    examples.append(example)

scorer = Scorer()
scores = scorer.score(examples)
print(scores)


{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': 0.0, 'ents_r': 0.0, 'ents_f': 0.0, 'ents_per_type': {'INVOICE_NUMBER': {'p': 0.0, 'r': 0.0, 'f': 0.0}, 'INVOICE_DATE': {'p': 0.0, 'r': 0.0, 'f': 0.0}}, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}


# Spacy ML model with joblib

In [130]:
import joblib

# Suppose your model object is `nlp`
joblib.dump(nlp, "spacy_ner_model.joblib")


['spacy_ner_model.joblib']

In [131]:
import joblib

nlp = joblib.load("spacy_ner_model.joblib")


In [132]:
nlp

<spacy.lang.en.English at 0x2b129c26620>

In [133]:
nlp.to_disk("spacy_ner_model_folder")

In [134]:
import spacy
nlp = spacy.load("spacy_ner_model_folder")


In [135]:
nlp

<spacy.lang.en.English at 0x2b11e4a3640>