<a href="https://colab.research.google.com/github/Liki990/Doc/blob/main/Welcome_To_Colaboratory.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import evaluate


# Load the "seqeval" metric through the `evaluate` library.
# NOTE(review): `seqeval` is not referenced again in the visible code (no
# compute_metrics is passed to the Trainer) - confirm whether it is still needed.
seqeval = evaluate.load("seqeval")
import numpy as np

from pathlib import Path
import re

def read_wnut(file_path):
    """Read a two-column token/tag file into parallel lists of documents.

    Every non-blank line must contain exactly two whitespace-separated
    fields: a token followed by its tag. Consecutive non-blank lines form one
    document; any run of blank or whitespace-only lines separates documents.

    Args:
        file_path: Path (``str`` or ``Path``) of the file to read.

    Returns:
        A ``(token_docs, tag_docs)`` tuple. ``token_docs[i]`` is the list of
        tokens of document ``i`` and ``tag_docs[i]`` the tags at the same
        positions. An empty file yields ``([], [])``.

    Raises:
        ValueError: If a non-blank line does not have exactly two fields.
    """
    file_path = Path(file_path)

    # Decode explicitly so the result does not depend on the platform's
    # default locale encoding.
    raw_text = file_path.read_text(encoding="utf-8")

    token_docs = []
    tag_docs = []
    tokens = []
    tags = []
    for line_no, line in enumerate(raw_text.split('\n'), start=1):
        fields = line.split()
        if not fields:
            # Blank line: close the current document, if one is open.
            # Repeated blank lines therefore never create empty documents.
            if tokens:
                token_docs.append(tokens)
                tag_docs.append(tags)
                tokens = []
                tags = []
            continue
        if len(fields) != 2:
            raise ValueError(
                f"{file_path}:{line_no}: expected 'token tag', got {line!r}"
            )
        token, tag = fields
        tokens.append(token)
        tags.append(tag)

    # The last document is usually not followed by a blank line.
    if tokens:
        token_docs.append(tokens)
        tag_docs.append(tags)

    return token_docs, tag_docs

# Parse the token/tag file into parallel lists of documents.
texts, tags = read_wnut('/content/output_conll.txt')

# Sanity check: print tokens 10-16 of the third document, then their tags.
print(texts[2][10:17], tags[2][10:17], sep='\n')

from sklearn.model_selection import train_test_split

# Hold out 20% of the documents for validation.
# NOTE(review): no random_state is passed, so the split differs on every run.
train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

# Sort the label set so the tag <-> id mapping is identical on every run.
# Enumerating a bare set of strings follows hash order, which changes between
# interpreter sessions, so the same tag could otherwise receive a different id
# each time the notebook is restarted.
unique_tags = sorted({tag for doc in tags for tag in doc})
tag2id = {tag: idx for idx, tag in enumerate(unique_tags)}
id2tag = {idx: tag for tag, idx in tag2id.items()}

from transformers import DistilBertTokenizerFast, TrainingArguments, Trainer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
# The inputs are already lists of words (is_split_into_words=True). Offset
# mappings are requested because encode_tags reads them to decide which
# positions receive a label.
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

import numpy as np

def encode_tags(tags, encodings):
    """Spread word-level tags over the positions of each encoded document.

    For every document, a position whose offset pair is ``(0, end)`` with
    ``end != 0`` receives the next tag id of that document, in order. All
    other positions keep the sentinel ``-100``.

    Args:
        tags: One list of tag strings per document. Each tag must be a key
            of the module-level ``tag2id`` mapping. The input is not modified.
        encodings: Object exposing ``offset_mapping``: one sequence of
            ``(start, end)`` pairs per document, parallel to ``tags``.

    Returns:
        One list of ints per document, each as long as that document's
        offset sequence.

    Raises:
        KeyError: If a tag is missing from ``tag2id``.
        IndexError: If a document has more labelled positions than tags.
    """
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []

    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # Start with every position unlabelled.
        # NOTE(review): -100 is presumably the value the loss ignores - confirm.
        doc_enc_labels = [-100] * len(doc_offset)

        # Walk the labels with a cursor instead of list.pop(0), which shifts
        # the whole list on every call and makes the loop quadratic.
        next_label = 0
        for i, (start, end) in enumerate(doc_offset):
            # start == 0 selects the first piece of a word; end != 0 excludes
            # (0, 0) entries such as the padding positions.
            if start == 0 and end != 0:
                if next_label >= len(doc_labels):
                    raise IndexError(
                        "more labelled positions than tags in document"
                    )
                doc_enc_labels[i] = doc_labels[next_label]
                next_label += 1

        encoded_labels.append(doc_enc_labels)

    return encoded_labels

# Build the per-position label lists for both splits from their encodings.
train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

import torch

class MedicalDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with label sequences.

    ``encodings`` is a mapping from feature name to a per-example sequence;
    ``labels`` holds one label sequence per example. Indexing returns a dict
    with one tensor per feature plus a ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for feature_name, column in self.encodings.items():
            sample[feature_name] = torch.tensor(column[idx])
        # Added last, so it replaces any feature that is also named 'labels'.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# offset_mapping was only needed to align the labels; remove it so it is not
# passed to the model.
train_encodings.pop("offset_mapping")
val_encodings.pop("offset_mapping")
train_dataset = MedicalDataset(train_encodings, train_labels)
val_dataset = MedicalDataset(val_encodings, val_labels)

from transformers import DistilBertForTokenClassification

# Register the label names on the model config at construction time, so every
# checkpoint written during training already maps class ids to tag strings
# instead of generic LABEL_n names.
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased',
    num_labels=len(unique_tags),
    id2label=id2tag,
    label2id=tag2id,
)

# Training configuration: two epochs, batches of 16 per device, with
# evaluation and checkpointing once per epoch into ./custom_model.
training_args = TrainingArguments(
    output_dir="./custom_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# NOTE(review): no compute_metrics is supplied; the recorded run reports only
# training/validation loss per epoch - confirm whether entity-level metrics
# were intended.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

trainer.train()

# Persist the fine-tuned model and the tokenizer files.
model.save_pretrained("custom_model")
tokenizer.save_pretrained("tokenizer")

import json

# Write the label names into the saved config so the reloaded model reports
# tag strings. Context managers guarantee the file is closed (and the write
# flushed) before from_pretrained reads it back on the next statement.
config_path = "custom_model/config.json"
with open(config_path, encoding="utf-8") as config_file:
    config = json.load(config_file)
config["id2label"] = id2tag
config["label2id"] = tag2id
with open(config_path, "w", encoding="utf-8") as config_file:
    json.dump(config, config_file)
model_fine_tuned = DistilBertForTokenClassification.from_pretrained("custom_model")
from transformers import pipeline
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


example =  '''The proband (II-2 in Fig.2) is a 45-year old woman, who first presented to our university hospital at the age of 35 and was referred to us because of her pregnancy.
She has congenital deafness, first experienced syncope at the age of 3, and was diagnosed with epilepsy.
She was treated with anti-epilepsy medications; however, she subsequently experienced several instances of syncope.
At the age of 13, she had a syncope event, and was suspected of having JLNS because of her congenital deafness and prolonged QT interval.
Her syncope was diagnosed as an arrhythmic episode when she was aware of tachycardia and as epilepsy when she was not.
She also had a subarachnoid hemorrhage at the age of 29.
When she first presented at our hospital, she was not taking beta-blockers, because of a history of asthma, but was taking mexiletine in addition to phenytoin.
Her QTc was found to be prolonged (584 ms) at presentation and administration of atenolol was initiated.
She delivered her baby (III-1 in Fig.2) through Caesarean operation at our hospital at the age of 35.
At 37, she delivered her second baby (III-2 in Fig.2) through Caesarean operation at our hospital.
Despite administration of beta-blockers, her QTc remained prolonged (600 msec at the age of 37, 780 msec at 44) (Figs.2 and ​3a), which is not unexpected because treatment with beta-blockers in LQTS1 is not expected to overtly reduce QTc [18].
However, she continued to experience occasional syncope and finally underwent an implantable cardioverter defibrillator (ICD) operation at 38 years of age.
Subsequently, she is in a stable clinical condition.
Because the proband was suspected of JLNS and both infants had a measured QTc of 500 ms or greater within 1 month after birth, beta blockers were initiated and both children remain in stable condition at ages 10 and 8 (Figs.2 and 3b, c).
QTc of the son (III-1 in Fig.2) was measured as 500 ms one month after birth, while the QTc of his sister (III-2) was 530 ms at birth.
The father (I-1) and mother (I-2) of the proband were first cousins.
There is no history of sudden unexplained syncope or death of children or adults in the immediate family members, despite the prolonged QTc of the children.
Clinical evaluation and consultation of the proband and her family members were performed at Chiba University Hospital.
Clinical phenotypes were deduced from the clinical history, physical examinations, and ECG.
Blood samples were collected from the proband and her family members following genetic counseling, and written informed consent was obtained prior to sample collection.
Genomic DNA was isolated from peripheral blood lymphocytes according to established protocols at our laboratory [19].
Entire coding exons, including the intronic boundaries of the genes, of KCNQ1 (NCBI ref: NM_000218) and other LQT causative genes (KCNH2, SCN5A, KCNE1, KCNE2, KCNJ2, SCN4B, KCNJ5) were amplified by polymerase chain reaction (PCR), according to established protocols in our laboratory.
Briefly, 30–100 ng of genomic DNA was subjected to PCR amplification with DNA polymerase (PrimeSTAR GXL DNA Polymerase; Takara Bio Inc., Kusatsu, Japan) and primer sets.
The amplicons were subjected to conventional sequencing with Sanger sequencers (Applied Biosystems 3730/3130 DNA analyzers; Thermo Fisher Scientific, Waltham, MA, USA).
The sequence data were processed with Gene Codes Sequencher Software (Takara Bio Inc.) and mapped to the human genome sequence (build GRCh37/hg19).
Genetic analysis was performed to screen all coding exons and the exon–intron boundaries of the KCNQ1 gene (NCBI ref: NM_000218.2, NP_000209.2) with concurrent screening of other LQT causative genes (KCNH2, SCN5A, KCNE1, KCNE2, KCNJ2, SCN4B, KCNJ5).
We detected a novel homozygous nonsense variant, NM_000218.2:c.115G > T (p.Glu39X, in exon 1a), in the KCNQ1 gene of the proband, as well as a homozygous common variant (NM_000218.2:c.1343C > G, p.Pro448Arg) (Additional file 1: Table S1).
Genetic screening of her mother (I-2) and children (III-1 and III-2) revealed that they were heterozygous for the nonsense variant (Fig.2).
Her husband (II-3) was also screened and found to be heterozygous for the common variant (NM_000218.2:c.1343C > G, p.Pro448Arg).
The proband is a child from a first-cousin marriage, and we have concluded the homozygous nonsense variant in the proband is the cause of her JLNS1.
The proband was negative for pathogenic variants in other LQT causative genes, including the KCNE1 gene (Additional file 1: Table S1).

'''

# Run the NER pipeline over the sample case report.
# The recorded output contains one dict per word piece (e.g. 'pro', '##band'),
# each with 'entity', 'score', 'index', 'word', 'start' and 'end' keys.
ner_results = nlp(example)

print(ner_results)







['of', 'the', 'Second', 'Affiliated', 'Hospital', 'of', 'Zhejiang']
['O', 'O', 'O', 'O', 'O', 'O', 'O']


Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,4.293899
2,No log,4.160919


[{'entity': 'B-Weight', 'score': 0.019836947, 'index': 1, 'word': 'The', 'start': 0, 'end': 3}, {'entity': 'B-Route', 'score': 0.018387299, 'index': 2, 'word': 'pro', 'start': 4, 'end': 7}, {'entity': 'B-Diagnostic_Procedure', 'score': 0.021511361, 'index': 3, 'word': '##band', 'start': 7, 'end': 11}, {'entity': 'B-Weight', 'score': 0.020540632, 'index': 6, 'word': '-', 'start': 15, 'end': 16}, {'entity': 'B-Route', 'score': 0.017666437, 'index': 9, 'word': 'Fi', 'start': 21, 'end': 23}, {'entity': 'B-Weight', 'score': 0.019256303, 'index': 10, 'word': '##g', 'start': 23, 'end': 24}, {'entity': 'B-Weight', 'score': 0.020462334, 'index': 11, 'word': '.', 'start': 24, 'end': 25}, {'entity': 'B-Weight', 'score': 0.017643817, 'index': 14, 'word': 'is', 'start': 28, 'end': 30}, {'entity': 'B-Diagnostic_Procedure', 'score': 0.018846586, 'index': 15, 'word': 'a', 'start': 31, 'end': 32}, {'entity': 'I-Texture', 'score': 0.01646464, 'index': 16, 'word': '45', 'start': 33, 'end': 35}, {'entity'

In [12]:
import pandas as pd

# Relies on `ner_results` produced by the pipeline call in an earlier cell.

# ner_results is expected to be a list of dictionaries with keys:
# 'entity', 'score', 'index', 'word', 'start', 'end'
# Example:
# ner_results = [{'entity': 'I-Sign_symptom', 'score': 0.020426337, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, ...]

# One row per result dict, one column per key.
df = pd.DataFrame(ner_results)

# Write to a fixed file name (re-running the cell targets the same file),
# leaving out the DataFrame index column.
df.to_csv('ner_results.csv', index=False)

# Show the table in the cell output.
print(df)


                      entity     score  index    word  start   end
0             I-Sign_symptom  0.020426      1       A      0     1
1             I-Sign_symptom  0.017613      2      28      2     4
2     B-Biological_structure  0.017797      3       -      4     5
3                    B-Color  0.021427      4    year      5     9
4                 I-Activity  0.019883      5       -      9    10
..                       ...       ...    ...     ...    ...   ...
433           I-Sign_symptom  0.017862    447     the   1666  1669
434           I-Sign_symptom  0.019519    448       a   1670  1671
435   I-Biological_attribute  0.021289    449   ##bla   1671  1674
436  I-Therapeutic_procedure  0.018906    450  ##tion   1674  1678
437               I-Activity  0.016707    451       .   1678  1679

[438 rows x 6 columns]


In [13]:
import pandas as pd
from datetime import datetime

# Relies on `ner_results` produced by the pipeline call in an earlier cell.

# ner_results is expected to be a list of dictionaries with keys:
# 'entity', 'score', 'index', 'word', 'start', 'end'
# Example:
# ner_results = [{'entity': 'I-Sign_symptom', 'score': 0.020426337, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, ...]

# One row per result dict, one column per key.
df = pd.DataFrame(ner_results)

# Build the file name from the current local time (second resolution) so
# runs made at different seconds do not share a file name.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
csv_file_name = f'ner_results_{timestamp}.csv'

# Write the CSV without the DataFrame index column.
df.to_csv(csv_file_name, index=False)

# Report the generated file name.
print(f"CSV file saved: {csv_file_name}")


CSV file saved: ner_results_20231218113503.csv


In [16]:
import pandas as pd
from datetime import datetime

# Relies on `ner_results` produced by the pipeline call in an earlier cell.

# ner_results is expected to be a list of dictionaries with keys:
# 'entity', 'score', 'index', 'word', 'start', 'end'
# Example:
# ner_results = [{'entity': 'I-Sign_symptom', 'score': 0.020426337, 'index': 1, 'word': 'A', 'start': 0, 'end': 1}, ...]

# One row per result dict, one column per key.
df = pd.DataFrame(ner_results)

# Build the file name from the current local time (second resolution) so
# runs made at different seconds do not share a file name.
timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
excel_file_name = f'ner_results_{timestamp}.xlsx'

# Write the workbook without the DataFrame index column.
df.to_excel(excel_file_name, index=False)

# Report the generated file name.
print(f"Excel file saved: {excel_file_name}")


Excel file saved: ner_results_20231218114752.xlsx
