# NER Inference on Job Descriptions

Use the trained NER model to extract skills from job descriptions.
- Load the custom NER model from `artifacts/ner_custom`
- Process job descriptions from the extraction CSV
- Extract and categorize skills using NER
- Save results with entity annotations

In [1]:
import os
import re
import json
import pandas as pd
import numpy as np
from pathlib import Path
from collections import Counter, defaultdict
import datetime as dt

# SpaCy imports
import spacy
from spacy import displacy

print("Imports completed successfully")
print(f"SpaCy version: {spacy.__version__}")

Imports completed successfully
SpaCy version: 3.5.4


In [2]:
# Configuration
ARTIFACTS_DIR = Path('artifacts')
OUTPUT_DIR = Path('output')
RESULTS_DIR = Path('results')
RESULTS_DIR.mkdir(exist_ok=True)

MODEL_PATH = ARTIFACTS_DIR / 'ner_custom'

print(f"Model path: {MODEL_PATH}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Results directory: {RESULTS_DIR}")

Model path: artifacts\ner_custom
Output directory: output
Results directory: results


## 1. Load the Trained NER Model

In [3]:
# Try to load the custom NER model
try:
    if MODEL_PATH.exists():
        nlp = spacy.load(MODEL_PATH)
        print(f"Loaded custom NER model from {MODEL_PATH}")
        
        # Check if model has NER component
        if 'ner' in nlp.pipe_names:
            ner = nlp.get_pipe('ner')
            labels = list(ner.labels)
            print(f"Available entity labels: {labels}")
        else:
            print("WARNING: No NER component found in the model")
    else:
        print(f"Custom model not found at {MODEL_PATH}")
        print("Falling back to standard French model...")
        
        try:
            nlp = spacy.load('fr_core_news_sm')
            print("Loaded standard French model")
        except OSError:
            print("No French model available. Installing fr_core_news_sm...")
            os.system('python -m spacy download fr_core_news_sm')
            nlp = spacy.load('fr_core_news_sm')
            
except Exception as e:
    print(f"Error loading model: {e}")
    print("Using blank French model...")
    nlp = spacy.blank('fr')

print(f"\nModel pipeline: {nlp.pipe_names}")

Custom model not found at artifacts\ner_custom
Falling back to standard French model...
No French model available. Installing fr_core_news_sm...

Model pipeline: ['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

Model pipeline: ['tok2vec', 'morphologizer', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


## 2. Load Job Descriptions Data

In [4]:
# Find the latest extraction CSV
csv_files = list(OUTPUT_DIR.glob('*with_content*.csv'))

if not csv_files:
    raise FileNotFoundError(f"No extraction CSV found in {OUTPUT_DIR}")

latest_csv = sorted(csv_files)[-1]
print(f"Loading data from: {latest_csv}")

# Load the data
df = pd.read_csv(latest_csv)
print(f"Loaded {len(df)} documents")
print(f"Columns: {list(df.columns)}")

# Identify text column
if 'extracted_text' in df.columns:
    text_column = 'extracted_text'
elif 'content' in df.columns:
    text_column = 'content'
else:
    text_column = df.columns[-1]

print(f"Using text column: {text_column}")
print(f"Documents with text: {df[text_column].notna().sum()}")

Loading data from: output\pdf_extraction_results_20250830_131547_with_content.csv
Loaded 998 documents
Columns: ['file_name', 'file_path', 'file_size_kb', 'extraction_timestamp', 'content_type', 'page_count', 'extracted_text', 'has_text', 'has_images', 'image_count', 'word_count', 'char_count', 'paragraph_count', 'line_count', 'error']
Using text column: extracted_text
Documents with text: 997


## 3. Text Processing Functions

In [5]:
def clean_text_for_ner(text):
    """Clean text for better NER performance."""
    if pd.isna(text) or not isinstance(text, str):
        return ""
    
    # Remove excessive whitespace
    text = re.sub(r'\\s+', ' ', text)
    
    # Remove special characters that might confuse NER
    text = re.sub(r'[^\\w\\s.,!?;:()/\\-]', ' ', text)
    
    # Normalize multiple punctuation
    text = re.sub(r'[.]{2,}', '.', text)
    text = re.sub(r'[!]{2,}', '!', text)
    text = re.sub(r'[?]{2,}', '?', text)
    
    return text.strip()

def extract_entities_from_text(text, nlp_model):
    """Extract entities from text using the NER model."""
    if not text:
        return []
    
    # Process text with spaCy
    doc = nlp_model(text)
    
    entities = []
    for ent in doc.ents:
        entities.append({
            'text': ent.text,
            'label': ent.label_,
            'start': ent.start_char,
            'end': ent.end_char,
            'confidence': getattr(ent, 'confidence', None)
        })
    
    return entities

def chunk_long_text(text, max_length=1000000):
    """Split very long texts into chunks for processing."""
    if len(text) <= max_length:
        return [text]
    
    chunks = []
    sentences = text.split('. ')
    current_chunk = ""
    
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_length:
            current_chunk += sentence + ". "
        else:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = sentence + ". "
    
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

## 4. Process Documents with NER

In [6]:
# Filter documents with text
df_with_text = df[df[text_column].notna() & (df[text_column].str.len() > 10)].copy()
print(f"Processing {len(df_with_text)} documents with text")

# Clean text
print("Cleaning text...")
df_with_text['clean_text'] = df_with_text[text_column].apply(clean_text_for_ner)

# Filter out very short texts
df_with_text = df_with_text[df_with_text['clean_text'].str.len() >= 50].copy()
print(f"After filtering short texts: {len(df_with_text)} documents")

# Process with NER
print("Extracting entities with NER...")
all_entities = []
processing_errors = []

for idx, row in df_with_text.iterrows():
    try:
        text = row['clean_text']
        
        # Handle very long texts by chunking
        chunks = chunk_long_text(text)
        doc_entities = []
        
        for chunk in chunks:
            chunk_entities = extract_entities_from_text(chunk, nlp)
            doc_entities.extend(chunk_entities)
        
        all_entities.append(doc_entities)
        
        # Progress indicator
        if (idx + 1) % 100 == 0:
            print(f"  Processed {idx + 1}/{len(df_with_text)} documents")
            
    except Exception as e:
        print(f"Error processing document {idx}: {e}")
        all_entities.append([])
        processing_errors.append((idx, str(e)))

df_with_text['entities'] = all_entities
df_with_text['num_entities'] = df_with_text['entities'].apply(len)

print(f"\nProcessing completed!")
print(f"Documents with entities: {(df_with_text['num_entities'] > 0).sum()}")
print(f"Average entities per document: {df_with_text['num_entities'].mean():.2f}")
print(f"Processing errors: {len(processing_errors)}")

Processing 997 documents with text
Cleaning text...

Cleaning text...
After filtering short texts: 995 documents
Extracting entities with NER...
After filtering short texts: 995 documents
Extracting entities with NER...
  Processed 100/995 documents
  Processed 100/995 documents
  Processed 200/995 documents
  Processed 200/995 documents
  Processed 300/995 documents
  Processed 300/995 documents
  Processed 400/995 documents
  Processed 400/995 documents
  Processed 500/995 documents
  Processed 500/995 documents
  Processed 600/995 documents
  Processed 600/995 documents
  Processed 700/995 documents
  Processed 700/995 documents
  Processed 800/995 documents
  Processed 800/995 documents
  Processed 900/995 documents
  Processed 900/995 documents

Processing completed!
Documents with entities: 704
Average entities per document: 1.61
Processing errors: 0

Processing completed!
Documents with entities: 704
Average entities per document: 1.61
Processing errors: 0


## 5. Analyze Extracted Entities

In [7]:
# Collect all entities for analysis
all_entity_texts = []
entity_label_counts = Counter()
entity_text_counts = Counter()

for entities_list in all_entities:
    for entity in entities_list:
        entity_label_counts[entity['label']] += 1
        entity_text_counts[entity['text'].lower()] += 1
        all_entity_texts.append(entity['text'])

print("Entity Analysis:")
print("=" * 50)

print(f"\nTotal entities extracted: {len(all_entity_texts)}")
print(f"Unique entity texts: {len(entity_text_counts)}")

print("\nEntity types distribution:")
for label, count in entity_label_counts.most_common():
    percentage = (count / len(all_entity_texts)) * 100
    print(f"  {label:<15}: {count:4d} ({percentage:5.1f}%)")

print("\nMost frequent entities (top 20):")
for entity_text, count in entity_text_counts.most_common(20):
    print(f"  {entity_text:<20}: {count:3d}")

Entity Analysis:

Total entities extracted: 1599
Unique entity texts: 388

Entity types distribution:
  LOC            :  725 ( 45.3%)
  PER            :  504 ( 31.5%)
  MISC           :  262 ( 16.4%)
  ORG            :  108 (  6.8%)

Most frequent entities (top 20):
  /                   :  81
  s.                  :  59
  www                 :  54
  ---                 :  42
  .                   :  40
  s.                  :  37
  s                   :  36
  -                   :  34
  s.                         :  32
  ss                  :  28
  !                   :  27
  -                   :  26
  -                   :  25
  s.                  :  24
  ./                  :  24
  ---        ---      :  21
  s                   :  20
  ---        ---      :  18
  s                   :  17
  s                   :  16


## 6. Prepare Results for Export

In [8]:
# Create detailed results DataFrame
results_detailed = []

for idx, row in df_with_text.iterrows():
    base_info = {
        'filename': row.get('filename', f'doc_{idx}'),
        'text_length': len(row['clean_text']),
        'num_entities': row['num_entities']
    }
    
    # Add metadata if available
    for col in ['page_count', 'extraction_time']:
        if col in row:
            base_info[col] = row[col]
    
    # Group entities by label
    entities_by_label = defaultdict(list)
    for entity in row['entities']:
        entities_by_label[entity['label']].append(entity['text'])
    
    # Add entity information
    for label in entity_label_counts.keys():
        base_info[f'num_{label.lower()}'] = len(entities_by_label[label])
        base_info[f'{label.lower()}_entities'] = ', '.join(set(entities_by_label[label]))
    
    results_detailed.append(base_info)

df_results = pd.DataFrame(results_detailed)
print(f"Results DataFrame created: {len(df_results)} rows, {len(df_results.columns)} columns")

# Create entity-level results
entity_results = []

for idx, row in df_with_text.iterrows():
    filename = row.get('filename', f'doc_{idx}')
    
    for entity in row['entities']:
        entity_results.append({
            'filename': filename,
            'entity_text': entity['text'],
            'entity_label': entity['label'],
            'start_char': entity['start'],
            'end_char': entity['end'],
            'confidence': entity.get('confidence')
        })

df_entities = pd.DataFrame(entity_results)
print(f"Entity-level DataFrame created: {len(df_entities)} rows")

Results DataFrame created: 995 rows, 12 columns
Entity-level DataFrame created: 1599 rows


## 7. Save Results

In [11]:
# Generate timestamp
timestamp = dt.datetime.now().strftime('%Y%m%d_%H%M%S')

# Save document-level results
results_file = RESULTS_DIR / f'ner_results_{timestamp}.csv'
df_results.to_csv(results_file, index=False, encoding='utf-8')
print(f"Document-level results saved: {results_file}")

# Save entity-level results
entities_file = RESULTS_DIR / f'ner_entities_{timestamp}.csv'
df_entities.to_csv(entities_file, index=False, encoding='utf-8')
print(f"Entity-level results saved: {entities_file}")

# Save analysis summary
summary = {
    'timestamp': timestamp,
    'source_file': str(latest_csv),
    'model_used': str(MODEL_PATH) if MODEL_PATH.exists() else 'fr_core_news_sm',
    'total_documents': int(len(df_with_text)),
    'documents_with_entities': int((df_with_text['num_entities'] > 0).sum()),
    'total_entities': int(len(all_entity_texts)),
    'unique_entities': int(len(entity_text_counts)),
    'entity_types': {k: int(v) for k, v in entity_label_counts.items()},
    'most_common_entities': {k: int(v) for k, v in entity_text_counts.most_common(10)},
    'processing_errors': int(len(processing_errors))
}

summary_file = RESULTS_DIR / f'ner_summary_{timestamp}.json'
with open(summary_file, 'w', encoding='utf-8') as f:
    json.dump(summary, f, indent=2, ensure_ascii=False)

print(f"Analysis summary saved: {summary_file}")
print("\nNER inference completed successfully!")

Document-level results saved: results\ner_results_20250831_135758.csv
Entity-level results saved: results\ner_entities_20250831_135758.csv
Analysis summary saved: results\ner_summary_20250831_135758.json

NER inference completed successfully!


## 8. Sample Results and Visualization

In [12]:
# Show some sample results
print("Sample Results:")
print("=" * 50)

# Find documents with the most entities
top_docs = df_with_text.nlargest(3, 'num_entities')

for idx, row in top_docs.iterrows():
    filename = row.get('filename', f'doc_{idx}')
    print(f"\nDocument: {filename}")
    print(f"Entities found: {row['num_entities']}")
    
    # Show first few entities
    entities = row['entities'][:5]  # First 5 entities
    for ent in entities:
        print(f"  - {ent['text']:<20} ({ent['label']})")
    
    if len(row['entities']) > 5:
        print(f"  ... and {len(row['entities']) - 5} more")

print("\n" + "=" * 50)
print("Files saved:")
print(f"  - Document results: {results_file.name}")
print(f"  - Entity details: {entities_file.name}")
print(f"  - Analysis summary: {summary_file.name}")

print(f"\nTotal processing time for {len(df_with_text)} documents completed.")

Sample Results:

Document: doc_193
Entities found: 20
  - !                    (MISC)
  - !                    (LOC)
  - /!                   (MISC)
  - !                    (LOC)
  - /                    (LOC)
  ... and 15 more

Document: doc_887
Entities found: 20
  - ---        ---                                                                                                (LOC)
  - .                    (LOC)
  - s.                   (PER)
  - , -                          (LOC)
  - -              -            (LOC)
  ... and 15 more

Document: doc_984
Entities found: 19
  - !                    (LOC)
  - !-                   (LOC)
  - !                    (LOC)
  - /!-,!-               (PER)
  - /!     !             (MISC)
  ... and 14 more

Files saved:
  - Document results: ner_results_20250831_135758.csv
  - Entity details: ner_entities_20250831_135758.csv
  - Analysis summary: ner_summary_20250831_135758.json

Total processing time for 995 documents completed.
