<a href="https://colab.research.google.com/github/Karanveer2202/NER/blob/main/bconn_ext.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
pip install sklearn-crfsuite


Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.3.6-py2.py3-none-any.whl (12 kB)
Collecting python-crfsuite>=0.8.3 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (993 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m993.5/993.5 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.9 sklearn-crfsuite-0.3.6


In [5]:
import xml.etree.ElementTree as ET

def parse_xml(file_path):
    """Parse a sentence-annotated XML corpus into (token, label) sequences.

    Every ``sentence`` element found below a ``document`` element is split on
    whitespace. A token is labelled ``'B-Individual_protein'`` when the ``text``
    of any ``entity`` element in the same sentence occurs inside the token as a
    substring; every other token is labelled ``'O'``.

    Args:
        file_path: Path of the XML file to read.

    Returns:
        A list with one entry per sentence, each a list of ``(token, label)``
        tuples. A sentence whose ``text`` attribute is missing or empty yields
        an empty list.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        OSError: If the file cannot be opened.
    """
    root = ET.parse(file_path).getroot()

    sentences = []

    for doc in root.findall('.//document'):
        for sent in doc.findall('.//sentence'):
            # A missing ``text`` attribute is treated as an empty sentence
            # instead of crashing on ``None.split()``.
            sent_text = sent.get('text') or ''

            # Collect entity surface forms by text rather than by id, so that
            # entities sharing (or lacking) an id are not silently dropped.
            # Missing/empty texts are excluded: ``None in token`` raises
            # TypeError and ``'' in token`` is true for every token.
            entity_texts = {
                entity.get('text') for entity in sent.findall('.//entity')
            }
            entity_texts.discard(None)
            entity_texts.discard('')

            # Simplistic whitespace tokenizer. Tokens therefore never contain
            # whitespace, so an entity text made of several words cannot match
            # any single token with the substring test below.
            tokens = sent_text.split()

            labeled_tokens = []
            for token in tokens:
                is_entity = any(text in token for text in entity_texts)
                # Single hard-coded entity type; replace with the real type if
                # the corpus provides one.
                label = 'B-Individual_protein' if is_entity else 'O'
                labeled_tokens.append((token, label))

            sentences.append(labeled_tokens)

    return sentences

# Parse the annotated corpus once; `sentences` is the input to the train/test split further down.
file_path = '/content/WhiteTextNegFixFull.xml'  # Replace with your XML file path
sentences = parse_xml(file_path)

In [6]:
import sklearn_crfsuite
from sklearn_crfsuite import metrics

def word2features(sent, i):
    """Build the CRF feature dictionary for the token at position ``i``.

    Args:
        sent: Sequence of items whose first element is the token string
            (e.g. ``(token, label)`` tuples).
        i: Index of the token to describe.

    Returns:
        A dict with a constant bias, casing/suffix/digit features of the
        token itself, and lower-case/casing features of the previous and
        next tokens. ``'BOS'`` replaces the previous-token features when
        there is no previous token, and ``'EOS'`` replaces the next-token
        features when there is no next token.
    """
    token = sent[i][0]
    feats = {
        'bias': 1.0,
        'word.lower()': token.lower(),
        'word[-3:]': token[-3:],
        'word[-2:]': token[-2:],
        'word.isupper()': token.isupper(),
        'word.istitle()': token.istitle(),
        'word.isdigit()': token.isdigit(),
    }

    # Left context, or the beginning-of-sentence marker.
    if i <= 0:
        feats['BOS'] = True
    else:
        previous = sent[i - 1][0]
        feats['-1:word.lower()'] = previous.lower()
        feats['-1:word.istitle()'] = previous.istitle()
        feats['-1:word.isupper()'] = previous.isupper()

    # Right context, or the end-of-sentence marker.
    if i >= len(sent) - 1:
        feats['EOS'] = True
    else:
        following = sent[i + 1][0]
        feats['+1:word.lower()'] = following.lower()
        feats['+1:word.istitle()'] = following.istitle()
        feats['+1:word.isupper()'] = following.isupper()

    return feats

def sent2features(sent):
    """Return the list of per-token feature dicts for one sentence.

    Calls ``word2features`` once for every position in ``sent``, in order.
    """
    per_token = []
    for position in range(len(sent)):
        per_token.append(word2features(sent, position))
    return per_token

def sent2labels(sent):
    """Return the labels of a sentence given as ``(token, label)`` pairs, in order."""
    tags = []
    for _token, tag in sent:
        tags.append(tag)
    return tags

# Hold out the last 20% of the parsed sentences for testing. The split is
# positional (no shuffling), so it follows the order of the corpus file.
train_sents, test_sents = (
    sentences[:int(len(sentences) * 0.8)],
    sentences[int(len(sentences) * 0.8):],
)

# Feature dicts (X) and label sequences (y), one entry per sentence.
X_train, X_test = (
    [sent2features(s) for s in part] for part in (train_sents, test_sents)
)
y_train, y_test = (
    [sent2labels(s) for s in part] for part in (train_sents, test_sents)
)

# Fit a linear-chain CRF: L-BFGS optimiser with L1 (c1) and L2 (c2) penalties.
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', c1=0.1, c2=0.1,
    max_iterations=100, all_possible_transitions=True,
)
crf.fit(X_train, y_train)

# Predict on the held-out sentences and report token-level accuracy.
y_pred = crf.predict(X_test)
print(metrics.flat_accuracy_score(y_test, y_pred))


0.9835049979096195


In [7]:
# Parse a second XML file with the same routine used for the training corpus.
# Only the tokens are used downstream; the labels parse_xml assigns are not fed to the model.
new_file_path = '/content/WhiteTextUnseenEval.xml'
new_sentences = parse_xml(new_file_path)

In [8]:
# One list of feature dicts per sentence of the new corpus.
X_new = list(map(sent2features, new_sentences))

In [9]:
# Tag the new corpus with the CRF fitted above; the result is zipped sentence-by-sentence with new_sentences below.
new_predictions = crf.predict(X_new)

In [10]:
# Iterate over all sentences and their predictions
# Dump "token: predicted label" for every token, with a separator after each sentence.
for tagged_sentence, sentence_tags in zip(new_sentences, new_predictions):
    for pair, tag in zip(tagged_sentence, sentence_tags):
        # pair is a (token, label) tuple from parse_xml; only the token is shown.
        print(f"{pair[0]}: {tag}")
    print("\n--- End of Sentence ---\n")


Streaming output truncated to the last 5000 lines.
particular,: O
efferent: O
pathways: O
from: O
the: O
granular: O
layer: O
(Intercalated: O
nucleus: O
of: O
the: O
hyperstriatum: O
accessorium,: O
IHA),: O
supragranular: O
layer: O
(hyperstriatum: O
accessorium,: O
HA),: O
and: O
infragranular: O
layers: O
(hyperstriatum: O
intercalatus: O
superior: O
and/or: O
hyperstriatum: O
dorsale,: O
HIS/HD): O
were: O
investigated.: O

--- End of Sentence ---

Furthermore,: O
many: O
acidic: O
fibroblast: O
growth: O
factor(aFGF): O
-positive: O
cell: O
bodies: O
were: O
found: O
in: O
the: O
vestibular: O
system: O
and: O
other: O
structures: O
projecting: O
to: O
the: O
cerebellum,: B-Individual_protein
in: O
the: O
deep: O
cerebellar: O
nuclei,: O
in: O
somatosensory: O
structures: O
of: O
the: O
medulla: O
(i.e.: O

--- End of Sentence ---

The: O
retinal: B-Individual_protein
expression: O
of: O
Hsp27: O
correlates: O
temporally: O
with: O
innervation: O
of: O
the: O
tectum

In [11]:
# Save every "token: predicted label" line to disk, one block per sentence.
# The encoding is fixed to UTF-8 so that non-ASCII characters in the corpus
# are written identically regardless of the platform's default encoding.
with open("predicted_results.txt", "w", encoding="utf-8") as file:
    for sentence, predicted_labels in zip(new_sentences, new_predictions):
        for word, prediction in zip(sentence, predicted_labels):
            # word is a (token, label) tuple; only the token is written.
            file.write(f"{word[0]}: {prediction}\n")
        file.write("\n--- End of Sentence ---\n\n")

In [12]:
from sklearn.metrics import classification_report

In [13]:
# Flatten the test set labels and predictions
# Collapse the per-sentence label lists into two flat, token-aligned lists
# (gold labels first, predictions second).
y_test_flat, y_pred_flat = (
    [tag for sentence_tags in nested for tag in sentence_tags]
    for nested in (y_test, y_pred)
)

In [14]:
# Per-label precision, recall and F1 on the held-out split, computed from the flattened label lists.
print(classification_report(y_test_flat, y_pred_flat))

                      precision    recall  f1-score   support

B-Individual_protein       0.85      0.56      0.68       807
                   O       0.99      1.00      0.99     25504

            accuracy                           0.98     26311
           macro avg       0.92      0.78      0.83     26311
        weighted avg       0.98      0.98      0.98     26311

