## This is a Spacy Testing Note Book
## Please watch: https://www.youtube.com/watch?v=dIUTsFT2MeQ&t=713s for any confusion

In [1]:
!pip install spacy 



In [2]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [3]:
import spacy


In [4]:
nlp = spacy.load("en_core_web_sm")
#if this run, then the model is downloaded successfully

# labeled entities
## CANDIDATE, ROLE, COMPANY, DATE, LOCATION, INTERVIEWER, DURATION, FORMAT, LINK

In [15]:
import os
import json
import spacy
from spacy import displacy

nlp = spacy.blank("en")

file_path = os.path.abspath("../sample_data/entity_smallsection.jsonl")
print("Reading file from:", file_path)

data = []
with open(file_path, 'r') as file:
    for line in file:
        if line.strip():
            data.append(json.loads(line))

def spans_overlap(span1, span2):
    return span1.start < span2.end and span2.start < span1.end

for i, item in enumerate(data):
    text = item['text']
    entities = item['entities']

    doc = nlp.make_doc(text)
    ents = []

    for start, end, label in entities:
        span = doc.char_span(start, end, label=label, alignment_mode="expand")
        if span is None:
            print(f"WARNING: Failed to create span for '{text[start:end]}' ({label}) at chars {start}-{end}")
        else:
            ents.append(span)

    # Filter out overlapping spans
    valid_ents = []
    for span in ents:
        if any(spans_overlap(span, v) for v in valid_ents):
            print(f"Skipping overlapping span: '{span.text}' ({span.label_})")
        else:
            valid_ents.append(span)

    doc.ents = valid_ents

    print(f"Document {i+1}:")
    displacy.render(doc, style='ent', jupyter=True)
    print("\n" + "-"*60 + "\n")


Reading file from: /Users/kathychen/VisualStudioCode/Hackaton/General-Seed camp 2025/Final/resume-ai-agents/tests/sample_data/entity_smallsection.jsonl
Document 1:



------------------------------------------------------------

Skipping overlapping span: 'minute Zoom interview on Wednesday' (DATE)
Document 2:



------------------------------------------------------------

Document 3:



------------------------------------------------------------

Document 4:



------------------------------------------------------------

Skipping overlapping span: 'availability.

Regards' (COMPANY)
Document 5:



------------------------------------------------------------

Skipping overlapping span: 'CloudScale. We'd' (COMPANY)
Skipping overlapping span: 'Zoom interview' (FORMAT)
Skipping overlapping span: 'forward to your' (COMPANY)
Document 6:



------------------------------------------------------------

Skipping overlapping span: 'availability.

Thanks' (COMPANY)
Document 7:



------------------------------------------------------------

Document 8:



------------------------------------------------------------

Skipping overlapping span: 'InnovateX. We'd' (COMPANY)
Skipping overlapping span: 'interview on Wednesday, September' (DATE)
Document 9:



------------------------------------------------------------

Document 10:



------------------------------------------------------------

