In [5]:
import spacy
import pandas as pd
import re

# === Upload spaCy English Model ===

nlp = spacy.load("en_core_web_sm")

# === 1. Upload ===
def load_text(path):
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

# === 1. lemmatization ===
def singularize(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc])

# === 2. Clear Concent ===
def clean_phrase(phrase):
    phrase = phrase.strip().lower()
    phrase = re.sub(r'^[\"\'\d]+\s*', '', phrase)
    phrase = re.sub(r'\s*\d+\s*', ' ', phrase)
    phrase = re.sub(r'\s{2,}', ' ', phrase)
    phrase = re.sub(r'^(a |an |any |the |their |these |those )', '', phrase)
    # print(f"{phrase}")
    return phrase.strip()

# === 3. Filter Unnecessary phrases ===
def is_valid_entity(phrase):
    phrase = phrase.strip().lower()
    words = phrase.split()
    if len(words) < 2 or len(phrase) > 80:
        return False

    strong_keywords = ["number", "diagram", "notation", "square", "root", "property", "tile", "representation"]
    weak_starts = ["a ", "an ", "their "]
    weak_concepts = {"difference", "relationship", "area", "value", "perimeter", "amount"}

    if any(kw in phrase for kw in strong_keywords):
        return True
    if all(w in weak_concepts for w in words):
        return False
    if any(phrase.startswith(s) for s in weak_starts):
        return False

    return True

# === 4. Extract Entities ===
def extract_entities(text):
    doc = nlp(text)
    concepts = set()
    theorems = set()

    for chunk in doc.noun_chunks:
        phrase = clean_phrase(chunk.text)
        if not is_valid_entity(phrase):
            continue
        if any(kw in phrase for kw in ["notation", "property", "formula", "law", "diagram", "rule", "principle", "theorem"]):
            theorems.add(phrase)
        else:
            concepts.add(phrase)

    concept_entities = [{"name": c, "type": 1} for c in sorted(concepts)]
    theorem_entities = [{"name": t, "type": 2} for t in sorted(theorems)]
    return concept_entities + theorem_entities

# === 5. Execute ===
if __name__ == "__main__":
    file_path = "year 7-10 firstone.txt"  
    text = load_text(file_path)
    entities = extract_entities(text)

    df = pd.DataFrame(entities)
    df['name'] = df['name'].apply(clean_phrase)
    df['name'] = df['name'].apply(singularize)  
    df = df.drop_duplicates().sort_values(by='type')

    df.to_csv("final_entities.csv", index=False)
    df.to_json("final_entities.json", indent=2, force_ascii=False)

 

In [None]:
# === Get entities.json  ===
print("\n===== Full JSON lines =====")
with open("final_entities.json", "r", encoding="utf-8") as f:
    for line in f:
        print(line.strip())


===== Full JSON lines =====
{
"name":{
"0":"consecutive natural number",
"1":"consecutive square number",
"2":"constant second difference",
"3":"emerge pattern",
"4":"natural number",
"5":"perfect square number",
"6":"square number",
"7":"square pattern",
"8":"square root",
"10":"square tile floor",
"11":"square tile",
"12":"tile length",
"13":"two - digit number",
"14":"visual representation",
"15":"distributive property and area diagram",
"16":"square and square root notation"
},
"type":{
"0":1,
"1":1,
"2":1,
"3":1,
"4":1,
"5":1,
"6":1,
"7":1,
"8":1,
"10":1,
"11":1,
"12":1,
"13":1,
"14":1,
"15":2,
"16":2
}
}


In [None]:
#Split entities into train and test sets

In [6]:
import json

# === Split entities into train and test sets ===
txt_file = "concepts.txt"
json_file = "entities.json"

entity_list = []
with open(txt_file, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.strip().rsplit(" ", 1)
        if len(parts) == 2 and parts[1] in ["1", "2"]:
            label = "KNOW" if parts[1] == "1" else "PRIN"
            entity_list.append({
                "name": parts[0].strip().lower(),
                "label": label
            })

# Write to JSON file
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(entity_list, f, ensure_ascii=False, indent=2)


In [7]:
import json
import re

# ====== 1. Open entiteis ======
with open("entities.json", "r", encoding="utf-8") as f:
    entity_list = json.load(f)

#  Prioritize longer entities for matching
entity_list = sorted(entity_list, key=lambda e: -len(e["name"].split()))

# dictionary form (lowercase matching)
entity_dict = {e["name"].lower(): e["label"] for e in entity_list}


# ====== 2. Upload context======
with open("context.txt", "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]


# ====== 3. tokenize ======
def tokenize(text):
    return re.findall(r"\w+|[^\w\s]", text)


# ====== 4. Generate Bio Tag ======
def tag_sentence(sentence, entity_dict):
    tokens = tokenize(sentence)
    lowered = [t.lower() for t in tokens]
    tags = ["O"] * len(tokens)

    for ent, label in entity_dict.items():
        ent_tokens = ent.split()
        n = len(ent_tokens)
        for i in range(len(tokens) - n + 1):
            if lowered[i:i+n] == ent_tokens and tags[i] == "O":
                tags[i] = f"B-{label}"
                for j in range(1, n):
                    tags[i+j] = f"I-{label}"

    return list(zip(tokens, tags))


# ====== 5. Write Bio ======
with open("bert_ner_bio.txt", "w", encoding="utf-8") as out:
    for sentence in sentences:
        token_tag_pairs = tag_sentence(sentence, entity_dict)
        for token, tag in token_tag_pairs:
            out.write(f"{token}\t{tag}\n")
        out.write("\n")  


In [8]:
with open("context.txt", "r", encoding="utf-8") as f:
    print("Original Sentence：", sum(1 for line in f if line.strip()))


Original Sentence： 2233


In [9]:
sentence_count = 0
with open("bert_ner_bio.txt", "r", encoding="utf-8") as f:
    for line in f:
        if line.strip() == "":
            sentence_count += 1
print("BIO Sebtebce：", sentence_count)


BIO Sebtebce： 2233


In [10]:
#Analyze entity distribution
from collections import Counter
def analyze_entity_distribution(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    tags = [line.strip().split("\t")[1] for line in lines if line.strip()]
    tag_counts = Counter(tags)

    return tag_counts
# analyze_entity_distribution("bert_ner_bio.txt")
distribution = analyze_entity_distribution("bert_ner_bio.txt")
# print entity distribution
print("entity distribution：")
for tag, count in distribution.items():
    print(f"{tag}: {count}")



entity distribution：
O: 86856
B-KNOW: 19623
I-KNOW: 8211
B-PRIN: 849
I-PRIN: 1141


In [11]:
import random

def split_data_by_sentence(input_file, train_file, test_file, train_ratio=0.75):
    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    # separate sentences by double newlines
    sentences = content.strip().split("\n\n")

    # shuffle sentences
    random.shuffle(sentences)

    # Calculate split index
    split_index = int(len(sentences) * train_ratio)

    # Write train and test files
    with open(train_file, "w", encoding="utf-8") as f_train:
        f_train.write("\n\n".join(sentences[:split_index]) + "\n")

    with open(test_file, "w", encoding="utf-8") as f_test:
        f_test.write("\n\n".join(sentences[split_index:]) + "\n")

    print(f"数据集已分割为 {train_file} 和 {test_file}")

# 调用函数
split_data_by_sentence("bert_ner_bio.txt", "train_1.txt", "test_2.txt", train_ratio=0.75)


数据集已分割为 train_1.txt 和 test_2.txt


In [12]:
#同理，分析train.txt和test.txt的实体分布
## 分析train分布
train_distribution = analyze_entity_distribution("train_1.txt")
# 输出实体分布  
print("Train dataset contribution：")
for tag, count in train_distribution.items():
    print(f"{tag}: {count}")

## 分析test分布
test_distribution = analyze_entity_distribution("test_2.txt")
# 输出实体分布  
print("Test dataset contribution：")
for tag, count in test_distribution.items():
    print(f"{tag}: {count}")


Train dataset contribution：
B-KNOW: 14634
I-KNOW: 6100
O: 64764
B-PRIN: 681
I-PRIN: 916
Test dataset contribution：
O: 22092
B-KNOW: 4989
I-KNOW: 2111
B-PRIN: 168
I-PRIN: 225


In [14]:
import json
import re

# ====== 1. generate ======
with open("entities.json", "r", encoding="utf-8") as f:
    entity_list = json.load(f)

# prioritize longer entities for matching
entity_list = sorted(entity_list, key=lambda e: -len(e["name"].split()))

# dictionary form (lowercase matching)
entity_dict = {e["name"].lower(): e["label"] for e in entity_list}


# ====== 2.get examples======
with open("year 7-10 firstone1.txt", "r", encoding="utf-8") as f:
    sentences = [line.strip() for line in f if line.strip()]


# ====== 3. tokenize ======
def tokenize(text):
    return re.findall(r"\w+|[^\w\s]", text)


# ====== 4. Gemerate semtemse=====
def tag_sentence(sentence, entity_dict):
    tokens = tokenize(sentence)
    lowered = [t.lower() for t in tokens]
    tags = ["O"] * len(tokens)

    for ent, label in entity_dict.items():
        ent_tokens = ent.split()
        n = len(ent_tokens)
        for i in range(len(tokens) - n + 1):
            if lowered[i:i+n] == ent_tokens and tags[i] == "O":
                tags[i] = f"B-{label}"
                for j in range(1, n):
                    tags[i+j] = f"I-{label}"

    return list(zip(tokens, tags))


# ====== 5. Write Bio file ======
with open("bert_ner_bio1.txt", "w", encoding="utf-8") as out:
    for sentence in sentences:
        token_tag_pairs = tag_sentence(sentence, entity_dict)
        for token, tag in token_tag_pairs:
            out.write(f"{token}\t{tag}\n")
        out.write("\n") 


In [15]:
with open('bert_ner_bio1.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print(text)

describe	O
the	O
relationship	O
between	O
perfect	B-KNOW
square	I-KNOW
numbers	I-KNOW
and	O
square	B-KNOW
roots	I-KNOW
.	O

and	O
use	O
squares	O
of	O
numbers	O
and	O
square	B-KNOW
roots	I-KNOW
of	O
perfect	B-KNOW
square	I-KNOW
numbers	I-KNOW
to	O
solve	O
problems	O
.	O

investigating	O
squares	O
of	O
natural	O
numbers	O
from	O
one	O
to	O
20	O
,	O
and	O
connecting	O
them	O
to	O
visual	O
representations	O
such	O
as	O
dots	O
arranged	O
in	O
a	O
square	B-KNOW
pattern	O
.	O

using	O
the	O
square	B-KNOW
and	O
square	B-KNOW
root	I-KNOW
notation	O
,	O
and	O
the	O
distributive	O
property	O
and	O
area	B-KNOW
diagrams	O
to	O
calculate	O
the	O
squares	O
of	O
two	O
-	O
digit	O
numbers	O
;	O
for	O
example	O
,	O
43	O
^	O
2	O
=	O
(	O
40	O
+	O
3	O
)	O
^	O
2	O
=	O
40	O
^	O
2	O
+	O
2	O
×	O
40	O
×	O
3	O
+	O
3	O
^	O
2	O
=	O
1600	O
+	O
240	O
+	O
9	O
=	O
1849	O
.	O

determining	O
between	O
which	O
2	O
consecutive	O
natural	O
numbers	O
the	O
square	B-KNOW
root	I-KNOW
of	O
a	O
given	O
number	B-KNOW
lies	O
;	O

In [16]:
import re




# coding=utf-8

# 1. Initialize
sent_words = []
sent_tags = []

# 2. Get txt file
with open('bert_ner_bio1.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:  
            if sent_words:
                print(' '.join(sent_words))
                print(' '.join(sent_tags))
                print()  
                sent_words = []
                sent_tags = []
        else:
            parts = line.split()
            if len(parts) == 2:
                word, tag = parts
                sent_words.append(word)
                sent_tags.append(tag)

# 3. Handle the last sentence
if sent_words:
    print(' '.join(sent_words))
    print(' '.join(sent_tags))


describe the relationship between perfect square numbers and square roots .
O O O O B-KNOW I-KNOW I-KNOW O B-KNOW I-KNOW O

and use squares of numbers and square roots of perfect square numbers to solve problems .
O O O O O O B-KNOW I-KNOW O B-KNOW I-KNOW I-KNOW O O O O

investigating squares of natural numbers from one to 20 , and connecting them to visual representations such as dots arranged in a square pattern .
O O O O O O O O O O O O O O O O O O O O O O B-KNOW O O

using the square and square root notation , and the distributive property and area diagrams to calculate the squares of two - digit numbers ; for example , 43 ^ 2 = ( 40 + 3 ) ^ 2 = 40 ^ 2 + 2 × 40 × 3 + 3 ^ 2 = 1600 + 240 + 9 = 1849 .
O O B-KNOW O B-KNOW I-KNOW O O O O O O O B-KNOW O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O O

determining between which 2 consecutive natural numbers the square root of a given number lies ; for example , 43 is between the square numbers