**1. Setup (install libraries, download spaCy models)**

In [1]:
# --- System & libs ---
!pip -q install spacy pandas

# --- Download spaCy models (small + large) ---
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

import os, sys, textwrap, json, zipfile
import pandas as pd
import spacy
from spacy import displacy

# Make an outputs folder
os.makedirs("outputs", exist_ok=True)
os.makedirs("data", exist_ok=True)

print("Setup complete.")


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: 

**2. Load Dataset Example: CoNLL003 (English-version)**

In [3]:
from google.colab import files
print(" Please upload kaggle.json from your machine...")
uploaded = files.upload()  # choose kaggle.json

# Put it in the right place with correct permissions
!mkdir -p ~/.kaggle
with open("kaggle.json", "wb") as f:
    f.write(uploaded['kaggle.json'])
!mv kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download & unzip the CoNLL-2003 data (pick one dataset; default first)
!kaggle datasets download -d alaakhaled/conll003-englishversion -p data -q
!unzip -o data/conll003-englishversion.zip -d data
!ls -lah data

 Please upload kaggle.json from your machine...


Saving kaggle.json to kaggle.json
Dataset URL: https://www.kaggle.com/datasets/alaakhaled/conll003-englishversion
License(s): CC0-1.0
Archive:  data/conll003-englishversion.zip
  inflating: data/metadata           
  inflating: data/test.txt           
  inflating: data/train.txt          
  inflating: data/valid.txt          
total 5.6M
drwxr-xr-x 2 root root 4.0K Sep  5 16:34 .
drwxr-xr-x 1 root root 4.0K Sep  5 16:34 ..
-rw-r--r-- 1 root root 960K Oct  5  2019 conll003-englishversion.zip
-rw-r--r-- 1 root root  153 Oct  5  2019 metadata
-rw-r--r-- 1 root root 731K Oct  5  2019 test.txt
-rw-r--r-- 1 root root 3.2M Oct  5  2019 train.txt
-rw-r--r-- 1 root root 809K Oct  5  2019 valid.txt


**3. Read CoNLL-2003 files → sentences & tags**

In [4]:
def read_conll_file(path):
    """
    Reads a CoNLL-2003 style file (token ... NER) and returns:
    - sentences: list[list[str]]
    - tags:      list[list[str]]
    Skips DOCSTART lines. Splits on blank lines.
    """
    sentences, tags = [], []
    tokens, ytags = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if tokens:
                    sentences.append(tokens)
                    tags.append(ytags)
                    tokens, ytags = [], []
                continue
            if line.startswith("-DOCSTART-"):
                continue
            parts = line.split()
            token = parts[0]
            ner_tag = parts[-1]
            tokens.append(token)
            ytags.append(ner_tag)
    if tokens:
        sentences.append(tokens)
        tags.append(ytags)
    return sentences, tags

# Try to locate common file names from Kaggle mirrors
candidates = ["data/train.txt", "data/train_conll.txt", "data/eng.train"]
for c in candidates:
    if os.path.exists(c):
        TRAIN = c
        break
else:
    TRAIN = "data/train.txt"  # fallback

candidates = ["data/valid.txt", "data/dev.txt", "data/eng.testa"]
for c in candidates:
    if os.path.exists(c):
        DEV = c
        break
else:
    DEV = "data/valid.txt"

candidates = ["data/test.txt", "data/eng.testb"]
for c in candidates:
    if os.path.exists(c):
        TEST = c
        break
else:
    TEST = "data/test.txt"

train_sents, train_tags = read_conll_file(TRAIN)
dev_sents, dev_tags     = read_conll_file(DEV)
test_sents, test_tags   = read_conll_file(TEST)

print(f"Train sentences: {len(train_sents)} | Dev: {len(dev_sents)} | Test: {len(test_sents)}")
print("Example train sentence:", " ".join(train_sents[0])[:200], "...")
print("Example train tags:    ", train_tags[0][:10], "...")

Train sentences: 14041 | Dev: 3250 | Test: 3453
Example train sentence: EU rejects German call to boycott British lamb . ...
Example train tags:     ['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O'] ...


**4. Build "news article" texts from sentences**

In [5]:
def sents_to_texts(sentences, n=5, start=0):
    """Join n consecutive sentences into one article-like text."""
    texts = []
    for i in range(start, min(start + n, len(sentences))):
        texts.append(" ".join(sentences[i]))
    return " ".join(texts)

sample_article = sents_to_texts(test_sents, n=8, start=0)  # 8-sentence mini-article
print(sample_article)

SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . Nadim Ladki AL-AIN , United Arab Emirates 1996-12-06 Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan . China controlled most of the match and saw several chances missed until the 78th minute when Uzbek striker Igor Shkvyrin took advantage of a misdirected defensive header to lob the ball over the advancing Chinese keeper and into an empty net . Oleg Shatskiku made sure of the win in injury time , hitting an unstoppable left foot shot from just outside the area . The former Soviet republic was playing in an Asian Cup finals tie for the first time .


**5. RULE-BASED NER (EntityRuler on a blank pipeline)**

In [6]:
import spacy
from spacy.pipeline import EntityRuler

# Blank English pipeline with just sentencizer + EntityRuler
nlp_rules = spacy.blank("en")
nlp_rules.add_pipe("sentencizer")
ruler = nlp_rules.add_pipe("entity_ruler")

patterns = [
    # Organizations like Something Inc., Ltd., Co., Corp.
    {"label": "ORG", "pattern": [{"IS_TITLE": True}, {"LOWER": {"IN": ["inc", "inc.", "ltd", "ltd.", "co.", "corp", "corp."]}}]},
    # Common geo/political names in news (tiny demo list)
    {"label": "GPE", "pattern": "United States"},
    {"label": "GPE", "pattern": "United Kingdom"},
    {"label": "GPE", "pattern": "New York"},
    {"label": "GPE", "pattern": "London"},
    {"label": "GPE", "pattern": "Germany"},
    # Simple money pattern like $ 10 or $10
    {"label": "MONEY", "pattern": [{"TEXT": {"REGEX": r"^\$"}}, {"IS_DIGIT": True}]},
    {"label": "MONEY", "pattern": [{"IS_CURRENCY": True}, {"IS_DIGIT": True}]},
]

ruler.add_patterns(patterns)

def extract_entities_rule_based(text):
    doc = nlp_rules(text)
    return [(ent.text, ent.label_) for ent in doc.ents]

rb_ents = extract_entities_rule_based(sample_article)
print("Rule-based entities:", rb_ents[:25])

Rule-based entities: []


6. MODEL-BASED NER (spaCy small vs large) + comparison

In [7]:
nlp_sm = spacy.load("en_core_web_sm")
nlp_lg = spacy.load("en_core_web_lg")

def extract_entities(doc, labels=("PERSON","ORG","GPE","LOC","NORP","FAC","EVENT","WORK_OF_ART","LAW","LANGUAGE")):
    return [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in labels]

def ents_to_df(ents):
    return pd.DataFrame(ents, columns=["text", "label"]).value_counts().reset_index(name="count")

doc_sm = nlp_sm(sample_article)
doc_lg = nlp_lg(sample_article)

sm_ents = extract_entities(doc_sm)
lg_ents = extract_entities(doc_lg)

print("Small model entities (sample):", sm_ents[:20])
print("Large model entities (sample):", lg_ents[:20])

# Simple side-by-side counts
df_sm = ents_to_df(sm_ents) if sm_ents else pd.DataFrame(columns=["text","label","count"])
df_lg = ents_to_df(lg_ents) if lg_ents else pd.DataFrame(columns=["text","label","count"])

print("\n=== Top entities (en_core_web_sm) ===")
display(df_sm.head(15))
print("\n=== Top entities (en_core_web_lg) ===")
display(df_lg.head(15))

Small model entities (sample): [('Nadim Ladki AL-AIN', 'PERSON'), ('United Arab Emirates', 'GPE'), ('Japan', 'GPE'), ('Asian Cup', 'EVENT'), ('Syria', 'GPE'), ('Group C', 'ORG'), ('China', 'GPE'), ('Uzbekistan', 'GPE'), ('China', 'GPE'), ('Uzbek', 'NORP'), ('Igor Shkvyrin', 'PERSON'), ('Chinese', 'NORP'), ('Soviet', 'NORP')]
Large model entities (sample): [('SOCCER - JAPAN', 'ORG'), ('CHINA', 'GPE'), ('Nadim Ladki AL-AIN', 'PERSON'), ('United Arab Emirates', 'GPE'), ('Japan', 'GPE'), ('Asian Cup', 'EVENT'), ('Syria', 'GPE'), ('Group C', 'ORG'), ('China', 'GPE'), ('Uzbekistan', 'GPE'), ('China', 'GPE'), ('Uzbek', 'NORP'), ('Igor Shkvyrin', 'PERSON'), ('Chinese', 'NORP'), ('Oleg Shatskiku', 'PERSON'), ('Soviet', 'NORP'), ('Asian Cup', 'EVENT')]

=== Top entities (en_core_web_sm) ===


Unnamed: 0,text,label,count
0,China,GPE,2
1,Asian Cup,EVENT,1
2,Chinese,NORP,1
3,Group C,ORG,1
4,Igor Shkvyrin,PERSON,1
5,Japan,GPE,1
6,Nadim Ladki AL-AIN,PERSON,1
7,Soviet,NORP,1
8,Syria,GPE,1
9,United Arab Emirates,GPE,1



=== Top entities (en_core_web_lg) ===


Unnamed: 0,text,label,count
0,Asian Cup,EVENT,2
1,China,GPE,2
2,CHINA,GPE,1
3,Chinese,NORP,1
4,Group C,ORG,1
5,Igor Shkvyrin,PERSON,1
6,Japan,GPE,1
7,Nadim Ladki AL-AIN,PERSON,1
8,Oleg Shatskiku,PERSON,1
9,SOCCER - JAPAN,ORG,1


**7. Highlight entities with displaCy (and save HTML)**

In [11]:
# Visualize in notebook (large model)
displacy.render(doc_lg, style="ent", jupyter=True)

# Save to HTML you can upload to GitHub
html = displacy.render(doc_lg, style="ent", page=True, jupyter=False)

with open("outputs/displacy_example.html", "w", encoding="utf-8") as f:
    f.write(html)

# Download locally if you want right now
from google.colab import files
files.download("outputs/displacy_example.html")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

**8. (Optional bonus) Quick evaluation on gold tags**

In [13]:
from spacy.training import Example
from spacy.scorer import Scorer

def conll_to_spacy_ents(tokens, tags):
    """Convert IOB2 tags to spaCy character spans (start,end,label)."""
    text = " ".join(tokens)
    ents = []
    i = 0
    char_idx = 0
    for tok, tag in zip(tokens, tags):
        start = text.find(tok, char_idx)
        end = start + len(tok)
        char_idx = end + 1  # +1 for space
        if tag.startswith("B-"):
            label = tag[2:]
            ents.append([start, end, label])
        elif tag.startswith("I-"):
            # extend the previous entity if same label
            if ents and ents[-1][2] == tag[2:] and ents[-1][1] + 1 == start:
                ents[-1][1] = end
            else:
                # treat stray I- as B-
                ents.append([start, end, tag[2:]])
    # Filter labels spaCy knows best (PERSON/ORG/LOC/MISC)
    for e in ents:
        if e[2] == "PER": e[2] = "PERSON"
        if e[2] == "ORG": e[2] = "ORG"
        if e[2] == "LOC": e[2] = "LOC"
        # MISC stays "MISC"
    return text, ents

def quick_score(nlp, sentences, tags, n_samples=100):
    scorer = Scorer()
    examples = []
    for i in range(min(n_samples, len(sentences))):
        text, ents = conll_to_spacy_ents(sentences[i], tags[i])
        doc = nlp.make_doc(text)
        eg = Example.from_dict(doc, {"entities": [(s,e,l) for s,e,l in ents]})
        examples.append(eg)
    return scorer.score(examples)

print("Scoring on 100 dev sentences (approx)…")
score_sm = quick_score(nlp_sm, dev_sents, dev_tags, n_samples=100)
score_lg = quick_score(nlp_lg, dev_sents, dev_tags, n_samples=100)

print("en_core_web_sm F1 (approx):", round(score_sm.get("ents_f", 0), 3))
print("en_core_web_lg F1 (approx):", round(score_lg.get("ents_f", 0), 3))

Scoring on 100 dev sentences (approx)…
en_core_web_sm F1 (approx): 0.0
en_core_web_lg F1 (approx): 0.0


**9. (Nice-to-have) Run on multiple sample “articles”**

In [14]:
articles = [sents_to_texts(test_sents, n=8, start=k*8) for k in range(3)]
for i, art in enumerate(articles, 1):
    print(f"\n--- ARTICLE {i} ---\n{art[:400]}...")
    doc = nlp_lg(art)
    print([(e.text, e.label_) for e in doc.ents[:15]])


--- ARTICLE 1 ---
SOCCER - JAPAN GET LUCKY WIN , CHINA IN SURPRISE DEFEAT . Nadim Ladki AL-AIN , United Arab Emirates 1996-12-06 Japan began the defence of their Asian Cup title with a lucky 2-1 win against Syria in a Group C championship match on Friday . But China saw their luck desert them in the second match of the group , crashing to a surprise 2-0 defeat to newcomers Uzbekistan . China controlled most of the ...
[('SOCCER - JAPAN', 'ORG'), ('CHINA', 'GPE'), ('Nadim Ladki AL-AIN', 'PERSON'), ('United Arab Emirates', 'GPE'), ('Japan', 'GPE'), ('Asian Cup', 'EVENT'), ('2-1', 'CARDINAL'), ('Syria', 'GPE'), ('Group C', 'ORG'), ('Friday', 'DATE'), ('China', 'GPE'), ('second', 'ORDINAL'), ('2', 'CARDINAL'), ('Uzbekistan', 'GPE'), ('China', 'GPE')]

--- ARTICLE 2 ---
Despite winning the Asian Games title two years ago , Uzbekistan are in the finals as outsiders . Two goals from defensive errors in the last six minutes allowed Japan to come from behind and collect all three points from t