In [None]:
pip install spacy[transformers]
python -m spacy download en_core_web_trf

In [1]:
import spacy
from spacy import displacy

# Load spaCy's model
nlp = spacy.load("en_core_web_sm")

# Text to analyze
text = "Barack Obama was born in Hawaii and served as the 44th President of the United States."

# Process and render
doc = nlp(text)
displacy.render(doc, style="ent", jupyter=True)


In [3]:
# Input text
text = "He walks to school and ate lunch before he runs again."

# Process the text
doc = nlp(text)

# Iterate through tokens and show conjugation info for verbs
for token in doc:
    if token.pos_ == "VERB" or token.tag_.startswith("V"):
        print(f"{token.text:10} ➜ Lemma: {token.lemma_:10} | Tense: {token.morph.get('Tense')} | Mood: {token.morph.get('Mood')} | Person: {token.morph.get('Person')} | Number: {token.morph.get('Number')}")


walks      ➜ Lemma: walk       | Tense: ['Pres'] | Mood: [] | Person: ['3'] | Number: ['Sing']
ate        ➜ Lemma: eat        | Tense: ['Past'] | Mood: [] | Person: [] | Number: []
runs       ➜ Lemma: run        | Tense: ['Pres'] | Mood: [] | Person: ['3'] | Number: ['Sing']


In [4]:
text = "She walks to school, but he stayed home because it was raining."

doc = nlp(text)

# Helper to get tense info from a verb token
def get_verb_tense(token):
    return {
        'text': token.text,
        'lemma': token.lemma_,
        'tense': token.morph.get("Tense"),
        'mood': token.morph.get("Mood"),
        'aspect': token.morph.get("Aspect"),
        'person': token.morph.get("Person"),
        'number': token.morph.get("Number")
    }

# Identify root verbs of each independent clause
print("Detected Independent Clauses with Tense:")
for sent in doc.sents:
    # Normally one ROOT per clause, but can have conjuncts
    root = [token for token in sent if token.dep_ == "ROOT"]
    clause_tenses = []

    for verb in root:
        # Add conj/compound verbs as well
        related_verbs = [verb] + [child for child in verb.children if child.dep_ in {"conj", "xcomp"}]
        for v in related_verbs:
            if v.pos_ == "VERB" or v.tag_.startswith("V"):
                clause_tenses.append(get_verb_tense(v))

    print(f"\nClause: {' '.join([token.text for token in sent])}")
    for tense_info in clause_tenses:
        print(f"  Verb: {tense_info['text']:<10} | Tense: {tense_info['tense']} | Mood: {tense_info['mood']} | Aspect: {tense_info['aspect']}")


Detected Independent Clauses with Tense:

Clause: She walks to school , but he stayed home because it was raining .
  Verb: walks      | Tense: ['Pres'] | Mood: [] | Aspect: []
  Verb: stayed     | Tense: ['Past'] | Mood: [] | Aspect: []


In [None]:
nlp = spacy.load("en_core_web_trf")

In [7]:

# Sample text input
text = "She walks to school, but he stayed home because it was raining."

# Process the text
doc = nlp(text)

# Function to extract morphological features from a verb token
def get_morph_features(token):
    morph = token.morph.to_dict()
    return {
        'text': token.text,
        'lemma': token.lemma_,
        'tense': morph.get('Tense'),
        'mood': morph.get('Mood'),
        'aspect': morph.get('Aspect'),
        'person': morph.get('Person'),
        'number': morph.get('Number'),
    }

# Function to determine if a verb is part of an independent clause
def is_independent_verb(token):
    return token.dep_ in {"ROOT", "conj"} and token.head.dep_ not in {"advcl", "ccomp", "acl", "relcl", "xcomp"}

# Iterate through sentences and extract morphological features of independent verbs
print("Detected Independent Clauses with Tense, Mood, and Aspect:")
for sent in doc.sents:
    clause_text = " ".join([token.text for token in sent])
    independent_verbs = [token for token in sent if is_independent_verb(token)]
    clause_features = []

    for verb in independent_verbs:
        if verb.pos_ == "VERB":
            features = get_morph_features(verb)
            clause_features.append(features)

    print(f"\nClause: {clause_text}")
    for features in clause_features:
        print(f"  Verb: {features['text']:<10} | Tense: {features['tense']} | Mood: {features['mood']} | Aspect: {features['aspect']}")


Detected Independent Clauses with Tense, Mood, and Aspect:

Clause: She walks to school , but he stayed home because it was raining .
  Verb: walks      | Tense: Pres | Mood: None | Aspect: None
  Verb: stayed     | Tense: Past | Mood: None | Aspect: None


In [20]:
import spacy

# 1) Load transformer model
nlp = spacy.load("en_core_web_trf")

text = "She walks to school, but he had stayed home because it was raining."
doc = nlp(text)

# 2) Debug: print raw morphological info for each token
print("── Raw morph features ──")
for token in doc:
    print(f"{token.text:10} → {token.morph.to_dict()}")
print()

def derive_morph(token):
    m = token.morph.to_dict()
    # start with whatever spaCy gives
    tense  = m.get("Tense")
    mood   = m.get("Mood")
    aspect = m.get("Aspect")

    # fallback: if no Mood but finite verb, assume Indicative
    if not mood and m.get("VerbForm") == "Fin":
        mood = ["Ind"]
    # fallback: if no Aspect but gerund or past participle
    if not aspect:
        if token.tag_ == "VBG":
            aspect = ["Prog"]
        elif token.tag_ == "VBN":
            aspect = ["Perf"]
        else:
            aspect = []

    # fallback: if no Tense, derive from PTB tag
    if not tense:
        if token.tag_ in {"VBD", "VBN"}:
            tense = ["Past"]
        elif token.tag_ in {"VBZ", "VBP", "VB"}:
            tense = ["Pres"]

    return {
        "text":   token.text,
        "lemma":  token.lemma_,
        "tense":  tense,
        "mood":   mood,
        "aspect": aspect
    }

def is_independent_verb(token):
    return (
        token.pos_ == "VERB"
        and token.dep_ in {"ROOT", "conj"}
        and token.head.dep_ not in {"advcl", "ccomp", "acl", "relcl", "xcomp"}
    )

# 3) Print only independent-clause verbs with their finalized features
print("── Independent-clause verbs ──")
for sent in doc.sents:
    clause = sent.text.strip()
    verbs  = [t for t in sent if is_independent_verb(t)]
    if not verbs:
        continue
    print(f"\nClause: {clause}")
    for v in verbs:
        f = derive_morph(v)
        print(
            f"  Verb: {f['text']:<10} "
            f"| Tense: {f['tense'] or '—'}  "
            f"| Mood: {f['mood'] or '—'}  "
            f"| Aspect: {f['aspect'] or '—'}"
        )


  model.load_state_dict(torch.load(filelike, map_location=device))


── Raw morph features ──
She        → {'Case': 'Nom', 'Gender': 'Fem', 'Number': 'Sing', 'Person': '3', 'PronType': 'Prs'}
walks      → {'Number': 'Sing', 'Person': '3', 'Tense': 'Pres', 'VerbForm': 'Fin'}
to         → {}
school     → {'Number': 'Sing'}
,          → {'PunctType': 'Comm'}
but        → {'ConjType': 'Cmp'}
he         → {'Case': 'Nom', 'Gender': 'Masc', 'Number': 'Sing', 'Person': '3', 'PronType': 'Prs'}
had        → {'Tense': 'Past', 'VerbForm': 'Fin'}
stayed     → {'Aspect': 'Perf', 'Tense': 'Past', 'VerbForm': 'Part'}
home       → {}
because    → {}
it         → {'Case': 'Nom', 'Gender': 'Neut', 'Number': 'Sing', 'Person': '3', 'PronType': 'Prs'}
was        → {'Mood': 'Ind', 'Number': 'Sing', 'Person': '3', 'Tense': 'Past', 'VerbForm': 'Fin'}
raining    → {'Aspect': 'Prog', 'Tense': 'Pres', 'VerbForm': 'Part'}
.          → {'PunctType': 'Peri'}

── Independent-clause verbs ──

Clause: She walks to school, but he had stayed home because it was raining.
  Verb: walks     

In [22]:
text = "She walks to school, but he had stayed home because it was raining."

doc = nlp(text)

# Labels for clause types
SUB_DEPS = {"advcl", "ccomp", "acl", "relcl", "xcomp"}  # subordinate
CC_DEP   = "cc"                                         # coordinating conj
CMP_DEP   = "ccomp"                                     # comparative conj

def get_tense(tok):
    t = tok.morph.get("Tense")
    return t[0] if t else "—"

# Collect independent-clause verbs and subordinate-clause verbs
indep = [t for t in doc if t.pos_ == "VERB" and t.dep_ in {"ROOT","conj"} and t.head.dep_ not in SUB_DEPS]
subord = []
for root in indep:
    for child in root.children:
        if child.pos_ == "VERB" and child.dep_ in SUB_DEPS:
            subord.append(child)

# Assign IDs
events = indep + subord
event_id = {tok: i+1 for i, tok in enumerate(events)}

def build_span(root, kind):
    """
    For 'indep', include everything in subtree except:
      - punctuation
      - cc-subtrees
      - cmp-subtrees (comparative conjunctions)
      - tokens under SUB_DEPS branches (except their 'mark')
    For 'sub', include entire subordinate subtree but drop its 'mark' and punctuation and cc-subtrees.
    """
    exclude = set()
    for child in root.children:
        if child.dep_ == CC_DEP or child.dep_ == CMP_DEP:
            exclude.update(child.subtree)

    tokens = []
    for t in root.subtree:
        if t.is_punct or t in exclude:
            continue
        if kind == "indep":
            # find which direct child of root this token descends from
            # via ancestors
            direct = None
            for anc in t.ancestors:
                if anc.head == root:
                    direct = anc
                    break
            # if that direct child is a subordinate clause, skip all its tokens except the 'mark'
            if direct and direct.dep_ in SUB_DEPS and t.dep_ != "mark":
                continue
        elif kind == "sub":
            if t.dep_ == "mark":
                continue
        tokens.append(t)

    if not tokens:
        tokens = [root]

    tokens = sorted(tokens, key=lambda x: x.i)
    return doc[tokens[0].i : tokens[-1].i + 1]

# Build event records
records = []
for tok in events:
    kind = "indep" if tok in indep else "sub"
    span = build_span(tok, kind)
    records.append({
        "tok": tok,
        "span": span,
        "kind": kind,
        "tense": get_tense(tok),
        "id": event_id[tok]
    })

# Sort by occurrence
records.sort(key=lambda r: r["span"].start_char)

# Build subordinate lookup for replacements
sub_lookup = {r["span"]: r["id"] for r in records if r["kind"] == "sub"}

def annotate_indep(span):
    text = span.text
    base = span.start_char
    # find subordinate spans inside this span
    inside = [
        (sp, sid) for sp, sid in sub_lookup.items()
        if sp.start_char >= base and sp.end_char <= span.end_char
    ]
    # replace in reverse order to keep indexes valid
    for sp, sid in sorted(inside, key=lambda x: x[0].start_char, reverse=True):
        s = sp.start_char - base
        e = sp.end_char   - base
        text = text[:s] + f"<EVENT {sid}>" + text[e:]
    return text.strip()

# Output
print(f"{'ID':<3} {'Event Phrase':<45} Tense")
print("-"*65)
for rec in records:
    phrase = annotate_indep(rec["span"]) if rec["kind"] == "indep" else rec["span"].text
    print(f"{rec['id']:<3} {phrase:<45} {rec['tense']}")


ID  Event Phrase                                  Tense
-----------------------------------------------------------------
1   She walks to school, but he had stayed home because <EVENT 3> Pres
2   he had stayed home because <EVENT 3>          Past
3   it was raining                                Pres


In [23]:
text = "She walks to school, but he stayed home because it was raining."

doc = nlp(text)

# Helper to get tense info from a verb token
def get_verb_tense(token):
    return {
        'text': token.text,
        'lemma': token.lemma_,
        'tense': token.morph.get("Tense"),
        'mood': token.morph.get("Mood"),
        'aspect': token.morph.get("Aspect"),
        'person': token.morph.get("Person"),
        'number': token.morph.get("Number")
    }

# Identify root verbs of each independent clause
print("Detected Independent Clauses with Tense:")
for sent in doc.sents:
    # Normally one ROOT per clause, but can have conjuncts
    root = [token for token in sent if token.dep_ == "ROOT"]
    clause_tenses = []

    for verb in root:
        # Add conj/compound verbs as well
        related_verbs = [verb] + [child for child in verb.children if child.dep_ in {"conj", "xcomp"}]
        for v in related_verbs:
            if v.pos_ == "VERB" or v.tag_.startswith("V"):
                clause_tenses.append(get_verb_tense(v))

    print(f"\nClause: {' '.join([token.text for token in sent])}")
    for tense_info in clause_tenses:
        print(f"  Verb: {tense_info['text']:<10} | Tense: {tense_info['tense']} | Mood: {tense_info['mood']} | Aspect: {tense_info['aspect']}")


Detected Independent Clauses with Tense:

Clause: She walks to school , but he stayed home because it was raining .
  Verb: walks      | Tense: ['Pres'] | Mood: [] | Aspect: []
  Verb: stayed     | Tense: ['Past'] | Mood: [] | Aspect: []


In [None]:
text = "She walks to school, but he stayed home because it was raining."

doc = nlp(text)

# Helper to get tense info from a verb token
def get_verb_tense(token):
    return {
        'text': token.text,
        'lemma': token.lemma_,
        'tense': token.morph.get("Tense"),
        'mood': token.morph.get("Mood"),
        'aspect': token.morph.get("Aspect"),
        'person': token.morph.get("Person"),
        'number': token.morph.get("Number")
    }

# Identify root verbs of each independent clause
print("Detected Independent Clauses with Tense:")
for sent in doc.sents:
    # Normally one ROOT per clause, but can have conjuncts
    root = [token for token in sent if token.dep_ == "ROOT"]
    clause_tenses = []

    for verb in root:
        # Add conj/compound verbs as well
        related_verbs = [verb] + [child for child in verb.children if child.dep_ in {"conj", "xcomp"}]
        for v in related_verbs:
            if v.pos_ == "VERB" or v.tag_.startswith("V"):
                clause_tenses.append(get_verb_tense(v))

    print(f"\nClause: {' '.join([token.text for token in sent])}")
    for tense_info in clause_tenses:
        print(f"  Verb: {tense_info['text']:<10} | Tense: {tense_info['tense']} | Mood: {tense_info['mood']} | Aspect: {tense_info['aspect']}")


Detected Independent Clauses with Tense:

Clause: She walks to school , but he stayed home because it was raining .
  Verb: walks      | Tense: ['Pres'] | Mood: [] | Aspect: []
  Verb: stayed     | Tense: ['Past'] | Mood: [] | Aspect: []


In [25]:
text = "She walks to school, but he had stayed home because it was raining."
# Add import for spacy if not already present
import spacy
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

# Labels for clause types
SUB_DEPS = {"advcl", "ccomp", "acl", "relcl", "xcomp"}  # subordinate
CC_DEP   = "cc"                                         # coordinating conj
CMP_DEP   = "ccomp"                                     # comparative conj

def get_tense(tok):
    t = tok.morph.get("Tense")
    return t[0] if t else "—"

# Collect independent-clause verbs and subordinate-clause verbs
indep = [t for t in doc if t.pos_ == "VERB" and t.dep_ in {"ROOT","conj"} and t.head.dep_ not in SUB_DEPS]
subord = []
for root in indep:
    for child in root.children:
        if child.pos_ == "VERB" and child.dep_ in SUB_DEPS:
            subord.append(child)

# Assign IDs
events = indep + subord
event_id = {tok: i+1 for i, tok in enumerate(events)}

def build_span(root, kind):
    """
    For 'indep', include everything in subtree except:
      - punctuation
      - cc-subtrees
      - cmp-subtrees (comparative conjunctions)
      - tokens under SUB_DEPS branches (except their 'mark')
    For 'sub', include entire subordinate subtree but drop its 'mark' and punctuation and cc-subtrees.
    """
    exclude = set()
    for child in root.children:
        if child.dep_ == CC_DEP or child.dep_ == CMP_DEP:
            exclude.update(child.subtree)

    tokens = []
    for t in root.subtree:
        if t.is_punct or t in exclude:
            continue
        if kind == "indep":
            # find which direct child of root this token descends from
            # via ancestors
            direct = None
            for anc in t.ancestors:
                if anc.head == root:
                    direct = anc
                    break
            # if that direct child is a subordinate clause, skip all its tokens except the 'mark'
            if direct and direct.dep_ in SUB_DEPS and t.dep_ != "mark":
                continue
        elif kind == "sub":
            if t.dep_ == "mark":
                continue
        tokens.append(t)

    if not tokens:
        tokens = [root]

    tokens = sorted(tokens, key=lambda x: x.i)
    return doc[tokens[0].i : tokens[-1].i + 1]

# Build event records
records = []
for tok in events:
    kind = "indep" if tok in indep else "sub"
    span = build_span(tok, kind)
    records.append({
        "tok": tok,
        "span": span,
        "kind": kind,
        "tense": get_tense(tok),
        "id": event_id[tok]
    })

# Sort by occurrence
records.sort(key=lambda r: r["span"].start_char)

# Build subordinate lookup for replacements
sub_lookup = {r["span"]: r["id"] for r in records if r["kind"] == "sub"}

def annotate_indep(span):
    text = span.text
    base = span.start_char
    # find subordinate spans inside this span
    inside = [
        (sp, sid) for sp, sid in sub_lookup.items()
        if sp.start_char >= base and sp.end_char <= span.end_char
    ]
    # replace in reverse order to keep indexes valid
    for sp, sid in sorted(inside, key=lambda x: x[0].start_char, reverse=True):
        s = sp.start_char - base
        e = sp.end_char   - base
        text = text[:s] + f"<EVENT {sid}>" + text[e:]
    return text.strip()

# Output
print(f"{'ID':<3} {'Event Phrase':<45} Tense")
print("-"*65)
for rec in records:
    phrase = annotate_indep(rec["span"]) if rec["kind"] == "indep" else rec["span"].text
    print(f"{rec['id']:<3} {phrase:<45} {rec['tense']}")


ID  Event Phrase                                  Tense
-----------------------------------------------------------------
1   She walks to school, but he had stayed home because <EVENT 3> Pres
2   he had stayed home because <EVENT 3>          Past
3   it was raining                                Pres
