<a href="https://colab.research.google.com/github/LCR-ADS-Lab/ASC-Treebank/blob/main/231031_asc_train_240111.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install spaCy into the notebook runtime (no version is pinned)
!pip install spacy

In [None]:
# Install the spacy-transformers add-on package (no version is pinned)
!pip install spacy-transformers

In [None]:
# Download the en_core_web_trf pipeline package
!python -m spacy download en_core_web_trf

In [None]:
def clean_corpus(filename, output_filename):
    """Normalise whitespace in a CoNLL-U style corpus file.

    Every line is stripped of leading and trailing whitespace. Non-comment
    lines (those not starting with ``#``) that have at least ten
    tab-separated columns additionally get the tenth column stripped of
    surrounding whitespace. The cleaned lines are written to
    ``output_filename`` joined by newlines; no trailing newline is added.

    Args:
        filename: Path of the corpus file to read.
        output_filename: Path of the cleaned file to write (overwritten).
    """
    cleaned_lines = []
    # Be explicit about UTF-8 so the result does not depend on the platform's
    # default locale encoding.
    with open(filename, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()  # Remove spaces from the beginning and the end of the line
            if not line.startswith("#"):
                parts = line.split("\t")
                if len(parts) > 9:  # At least 10 columns present
                    parts[9] = parts[9].strip()  # Remove redundant spaces from the 10th column
                    line = "\t".join(parts)
            cleaned_lines.append(line)

    # The input has been fully read and closed before the output is opened
    with open(output_filename, 'w', encoding='utf-8') as file:
        file.write("\n".join(cleaned_lines))

# Input corpus and the destination for its whitespace-cleaned copy (Colab runtime paths)
filename = "/content/silverSentences_database20231128.txt"
cleaned_filename = "/content/cleaned_silverSentences_database20231128.txt"

# Write the cleaned copy; the lemma cell further down reads this cleaned file
clean_corpus(filename, cleaned_filename)

In [None]:
import spacy
from spacy.tokens import Doc, Token
from spacy.language import Language

# Whitespace tokenizer (copied from Kris' "Accuracy_LL_2023021.py")
class WhitespaceTokenizer:
    """Tokenizer that splits text on single space characters only.

    Meant to be assigned to ``nlp.tokenizer`` so that space-joined,
    pre-tokenized text keeps exactly the token boundaries it came with.
    """

    def __init__(self, vocab):
        # Vocabulary handed to every Doc this tokenizer builds
        self.vocab = vocab

    def __call__(self, text):
        """Return a Doc whose words are the space-separated pieces of ``text``."""
        pieces = text.split(' ')
        # Every token, the last one included, is marked as followed by a space
        trailing_space_flags = [True for _ in pieces]
        return Doc(self.vocab, words=pieces, spaces=trailing_space_flags)

# Custom Sentence Boundaries (Force spacy to use our sentence tokenization)
@Language.component("single_sent")
def custom_sent(doc):
    """Pipeline component that marks the whole Doc as one sentence.

    The first token is flagged as a sentence start; every other token is
    explicitly flagged as not starting a sentence. The same Doc is returned.
    """
    for token in doc:
        # True only for the token at index 0
        doc[token.i].is_sent_start = token.i == 0
    return doc

# Load the model and modify the pipeline
# NOTE(review): this loads en_core_web_sm, while the setup cell at the top downloads
# en_core_web_trf — confirm that the small model is the intended lemma source
nlp = spacy.load("en_core_web_sm")
# Swap in the whitespace tokenizer so input is split on spaces only
nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)
nlp.add_pipe("single_sent", before='parser') # ensure sentence tokenization is the same as input text
print(nlp.pipe_names)  # Should show 'single_sent' before 'parser'

def extract_all_sentences(filename):
    """Read a CoNLL-U style file and return its sentence chunks.

    A chunk is one blank-line-separated block of the file (its comment lines
    plus its token lines). Empty chunks, which arise from an empty file or
    from runs of more than one blank line, are dropped, and stray newlines
    around a chunk are removed so every returned chunk starts and ends on a
    real line.

    Args:
        filename: Path of the file to read (decoded as UTF-8).

    Returns:
        list[str]: One string per sentence chunk, in file order. An empty
        list when the file contains no sentences.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.read()
    # Strip only newlines from each chunk so the column content stays untouched
    chunks = (chunk.strip("\n") for chunk in data.strip().split("\n\n"))
    return [chunk for chunk in chunks if chunk.strip()]

def parse_conllu(sentence):
    """Pull the word forms and POS tags out of one sentence chunk.

    Lines starting with ``#`` are ignored. Every other line is split on tabs;
    the second column (index 1) is taken as the token and the fourth column
    (index 3) as its POS tag.

    Args:
        sentence: One sentence chunk, lines separated by newlines.

    Returns:
        tuple[list[str], list[str]]: ``(tokens, pos_tags)`` in line order.

    Raises:
        IndexError: If a non-comment line has fewer than four columns.
    """
    rows = [
        row.split("\t")
        for row in sentence.split("\n")
        if not row.startswith("#")
    ]
    tokens = [columns[1] for columns in rows]
    pos_tags = [columns[3] for columns in rows]
    return tokens, pos_tags

def process_sentences_with_lemmas(all_sentences):
    """Fill the lemma column of the ESL sentences with pipeline lemmas.

    Only chunks containing ``# dataset = en_eslspok`` or
    ``# dataset = en_eslwrit`` are modified: on each of their token lines the
    third column (index 2) is replaced by ``token.lemma_`` from the
    module-level ``nlp`` pipeline. All other chunks are passed through
    unchanged and are not run through the pipeline at all.

    Args:
        all_sentences: Iterable of sentence chunks (comment lines followed by
            tab-separated token lines).

    Returns:
        str: All chunks, in input order, joined by a blank line.

    Raises:
        ValueError: If the pipeline yields a different number of tokens than
            the chunk has token lines (for example when a token contains a
            space), because lemmas could then not be aligned with their rows.
    """
    esl_markers = ('# dataset = en_eslspok', '# dataset = en_eslwrit')
    output_data = []
    for sentence_chunk in all_sentences:
        if not any(marker in sentence_chunk for marker in esl_markers):
            # No changes, add the original sentence chunk (and skip the costly parse)
            output_data.append(sentence_chunk)
            continue

        tokens, custom_pos_tags = parse_conllu(sentence_chunk)
        doc = nlp(" ".join(tokens))

        lines = sentence_chunk.split("\n")
        # Same definition of "token line" as parse_conllu: anything that is not a comment
        token_rows = [i for i, line in enumerate(lines) if not line.startswith("#")]
        if len(token_rows) != len(doc):
            raise ValueError(
                f"Token count mismatch: pipeline produced {len(doc)} tokens for "
                f"{len(token_rows)} token lines in chunk:\n{sentence_chunk}"
            )

        # Update POS tags for all tokens
        # NOTE(review): nlp() has already run all pipeline components at this point, so this
        # override presumably does not influence token.lemma_ read below — confirm whether
        # the corpus POS tags were meant to guide lemmatization
        for token, pos in zip(doc, custom_pos_tags):
            token.pos_ = pos

        for row, token in zip(token_rows, doc):
            parts = lines[row].split("\t")
            parts[2] = token.lemma_
            lines[row] = "\t".join(parts)
        output_data.append("\n".join(lines))

    return "\n\n".join(output_data)

# Read the cleaned corpus and rewrite the lemma column of the en_eslspok / en_eslwrit sentences
filename = "/content/cleaned_silverSentences_database20231128.txt"
extracted_sentences = extract_all_sentences(filename)
processed_data = process_sentences_with_lemmas(extracted_sentences)

# Relative path: the result lands in the current working directory
output_file = "output_lemma_silver.conllu"
with open(output_file, 'w') as file:
    file.write(processed_data)

['tok2vec', 'tagger', 'single_sent', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']


In [None]:
def extract_sentences_by_section(filename):
    """Split a CoNLL-U style file into test, train and dev sentence lists.

    The file is read as UTF-8 and cut into blank-line-separated chunks. A
    chunk is assigned to the first section whose marker text
    (``# section = test``, ``# section = train``, ``# section = dev``, checked
    in that order) occurs anywhere in it. Chunks without any marker are
    dropped.

    Args:
        filename: Path of the file to read.

    Returns:
        tuple[list[str], list[str], list[str]]: ``(test, train, dev)`` chunk
        lists, each in file order.
    """
    # Explicit encoding keeps decoding independent of the platform locale
    with open(filename, 'r', encoding='utf-8') as file:
        data = file.read()

    # Insertion order of this dict is the order in which markers are checked
    buckets = {'test': [], 'train': [], 'dev': []}
    for sentence_chunk in data.strip().split("\n\n"):
        for section, bucket in buckets.items():
            if f'# section = {section}' in sentence_chunk:
                bucket.append(sentence_chunk)
                break  # first matching marker wins

    return buckets['test'], buckets['train'], buckets['dev']

def write_sentences_to_file(sentences, output_file):
    """Write sentence chunks to ``output_file``, separated by blank lines.

    The file is written as UTF-8 and overwritten if it exists. No trailing
    newline is added after the last chunk.

    Args:
        sentences: Iterable of sentence chunk strings.
        output_file: Destination path.
    """
    # Explicit encoding so the bytes on disk do not depend on the platform locale
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write("\n\n".join(sentences))

# NOTE(review): the lemma cell above writes "output_lemma_silver.conllu"; nothing visible in
# this notebook creates "output_lemma.conllu" — confirm this is the intended input file
filename = "output_lemma.conllu"

# Partition the sentences by their "# section = ..." comment
test_sentences, train_sentences, dev_sentences = extract_sentences_by_section(filename)

# One file per split, written to the current working directory
write_sentences_to_file(test_sentences, "test.conllu")
write_sentences_to_file(train_sentences, "train.conllu")
write_sentences_to_file(dev_sentences, "dev.conllu")

In [None]:
def is_sentence_valid(sentence):
    """Check that every entry of ``sentence`` has the form ``word|POS|TAG``.

    An entry is rejected when splitting it on ``|`` does not give exactly
    three fields, or when any of the three fields is empty. The first
    offending entry is reported with ``print`` and checking stops there.

    Author's note (HS): in train.conllu the sentence
    "# text = t r u t h o u t | Perspective" was problematic.

    Args:
        sentence: Iterable of ``word|POS|TAG`` strings.

    Returns:
        bool: True when all entries are well formed (also for an empty
        sentence), otherwise False.
    """
    for entry in sentence:
        fields = entry.split("|")
        if len(fields) != 3:
            print(f"Invalid token format: {entry}")
            return False
        if not all(fields):
            print(f"Empty field detected in token: {entry}")
            return False
    return True

def convert_to_iob_format(input_filename, output_filename):
    """Convert a CoNLL-U style file to one-sentence-per-line ``word|POS|TAG`` text.

    For every token line the second column (index 1) is used as the word, the
    fifth column (index 4) as the POS and the tenth column (index 9) as the
    tag. A tag of ``_`` becomes ``O``; any other tag is prefixed with ``I-``.
    The entries of one sentence are joined by single spaces and written as one
    line. Sentences that fail ``is_sentence_valid`` are reported with
    ``print`` and left out of the output.

    A sentence ends at a blank line or at the next ``# text`` comment, so
    sentences that lack a ``# text`` comment are no longer merged into the
    preceding sentence.

    Args:
        input_filename: Path of the CoNLL-U style file to read (UTF-8).
        output_filename: Path of the file to write (UTF-8, overwritten).

    Raises:
        IndexError: If a token line has fewer than ten tab-separated columns.
    """
    with open(input_filename, 'r', encoding='utf-8') as infile, \
            open(output_filename, 'w', encoding='utf-8') as outfile:
        sentence = []

        for line in infile:
            line = line.strip()

            # Sentence boundary: blank separator line or the start of a new "# text" header
            if not line or line.startswith("# text"):
                if sentence:
                    _emit_iob_sentence(sentence, outfile)
                    sentence = []
                continue

            # Any other comment line carries no token information
            if line.startswith("#"):
                continue

            columns = line.split("\t")
            word, pos, tag = columns[1], columns[4], columns[9]
            tag = "O" if tag == "_" else f"I-{tag}"

            # Append the word|POS|TAG to the current sentence list
            sentence.append(f"{word}|{pos}|{tag}")

        # File did not end with a blank line: flush what is left
        if sentence:
            _emit_iob_sentence(sentence, outfile, " at the end")


def _emit_iob_sentence(sentence, outfile, context=""):
    """Write one sentence line if it validates, otherwise report and drop it."""
    if is_sentence_valid(sentence):
        outfile.write(' '.join(sentence) + "\n")
    else:
        print(f"Invalid sentence detected{context}: {' '.join(sentence)}")

if __name__ == "__main__":
    # NOTE(review): only the dev split is converted here; train.conllu and test.conllu are
    # presumably converted by re-running this cell with edited file names — confirm
    input_file = "dev.conllu"
    output_file = "dev.iob"
    convert_to_iob_format(input_file, output_file)

In [None]:
# Convert the files found in assets/ and write the results to corpus/
# NOTE(review): the conversion cell above writes dev.iob to the working directory, not to
# assets/ — presumably the .iob files are moved there by hand; confirm
!python -m spacy convert assets corpus --n-sents 1

In [None]:
# Train using /content/config.cfg on GPU 0; results are saved under /content/output
# NOTE(review): config.cfg is not created anywhere in this notebook — presumably uploaded beforehand
!python -m spacy train /content/config.cfg --output /content/output --gpu-id 0

2023-12-06 22:22:01.664597: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-06 22:22:01.664655: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-06 22:22:01.664690: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Saving to output directory: /content/output[0m
[38;5;4mℹ Using GPU: 0[0m
[1m
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaMo

In [None]:
# Evaluate output/model-best on corpus/test.spacy and store the metrics in output/test_metrics.json
!python -m spacy evaluate output/model-best corpus/test.spacy --output output/test_metrics.json

2023-12-06 22:45:23.460555: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-06 22:45:23.460620: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-06 22:45:23.460659: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[38;5;4mℹ Using CPU[0m
[38;5;4mℹ To switch to GPU 0, use the option: --gpu-id 0[0m
[1m

TOK     -    
NER P   90.33
NER R   90.81
NER F   90.57
SPEED   168  

[1m

                 P       R       F
ATTR         97.28   94.84   96.04
TRAN_S       92.09   94.20   93.14
DITRAN       89.55   85.71   87.59
INTRAN_MOT   82.12   83.22   82.67
INTRAN_S     81.1

In [None]:
# Mount Google Drive at /content/drive; later cells copy the training output there
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# List what the training and evaluation steps left in /content/output
!ls /content/output

model-best  model-last	test_metrics.json


In [None]:
# Copy the whole /content/output directory into a dated folder on the mounted Drive
!mkdir -p "/content/drive/My Drive/asc-train-231206"
!cp -r "/content/output" "/content/drive/My Drive/asc-train-231206/" #save

In [None]:
# Same install as in the setup cells — presumably repeated for running the cells below in a fresh runtime
!pip install spacy-transformers

In [None]:
import spacy_transformers

In [None]:
import spacy
from spacy.language import Language

# Presumably the model-best directory copied to Drive by the save cell above
model_path = '/content/drive/MyDrive/asc-train-231206/output/model-best'

# Load the model
# NOTE(review): the en_core_web_trf pipeline loaded first is replaced by the trained model
# on the very next line — confirm whether this extra load is actually needed
nlp = spacy.load("en_core_web_trf")
nlp = spacy.load(model_path)

# Test the model
doc = nlp("This is a test sentence.")
print([token.text for token in doc])

['This', 'is', 'a', 'test', 'sentence', '.']


## Gold3

In [None]:
def test_ner(model, text):
    doc = model(text)
    for ent in doc.ents:
        print(f"{ent.text} ({ent.label_})")

# Test the model
# Sixteen probe sentences: the verbs threw / got / sliced / took each appear in four
# different sentence frames (four sentences per frame, grouped in order)
sentences = [
    "Anita threw the hammer.",
    "Michelle got the book.",
    "Barbara sliced the bread.",
    "Audrey took the watch.",
    "Chris threw Linda the pencil.",
    "Beth got Liz an invitation.",
    "Jennifer sliced Terry an apple.",
    "Paula took Sue a message.",
    "Pat threw the keys on the roof.",
    "Laura got the ball into the net.",
    "Meg sliced the ham onto the plate.",
    "Kim took the rose into the house.",
    "Dana got the mattress inflated.",
    "Nancy sliced the tire open.",
    "Lyn threw the box apart.",
    "Rachel took the wall down."
]

# Print the entities found in each sentence, separated by a divider line;
# nlp is whatever model the most recently run loading cell assigned
for sentence in sentences:
    print(f"Testing: {sentence}")
    test_ner(nlp, sentence)
    print("-----------------------------")

Testing: Anita threw the hammer.
threw (TRAN_S)
-----------------------------
Testing: Michelle got the book.
got (TRAN_S)
-----------------------------
Testing: Barbara sliced the bread.
sliced (TRAN_S)
-----------------------------
Testing: Audrey took the watch.
took (TRAN_S)
-----------------------------
Testing: Chris threw Linda the pencil.
threw (DITRAN)
-----------------------------
Testing: Beth got Liz an invitation.
got (DITRAN)
-----------------------------
Testing: Jennifer sliced Terry an apple.
sliced (DITRAN)
-----------------------------
Testing: Paula took Sue a message.
took (TRAN_S)
-----------------------------
Testing: Pat threw the keys on the roof.
threw (CAUS_MOT)
-----------------------------
Testing: Laura got the ball into the net.
got (CAUS_MOT)
-----------------------------
Testing: Meg sliced the ham onto the plate.
sliced (CAUS_MOT)
-----------------------------
Testing: Kim took the rose into the house.
took (CAUS_MOT)
-----------------------------
Test

## Gold2

In [None]:
import spacy
from spacy.language import Language

# Model from the asc-train-231119 run on Drive (this section is headed "Gold2")
model_path = '/content/drive/MyDrive/asc-train-231119/output/model-best'

# Load the model
# NOTE(review): the en_core_web_trf pipeline loaded first is replaced by the trained model
# on the very next line — confirm whether this extra load is actually needed
nlp = spacy.load("en_core_web_trf")
nlp = spacy.load(model_path)

# Test the model
# Same sixteen probe sentences as in the Gold3 section, repeated here verbatim
sentences = [
    "Anita threw the hammer.",
    "Michelle got the book.",
    "Barbara sliced the bread.",
    "Audrey took the watch.",
    "Chris threw Linda the pencil.",
    "Beth got Liz an invitation.",
    "Jennifer sliced Terry an apple.",
    "Paula took Sue a message.",
    "Pat threw the keys on the roof.",
    "Laura got the ball into the net.",
    "Meg sliced the ham onto the plate.",
    "Kim took the rose into the house.",
    "Dana got the mattress inflated.",
    "Nancy sliced the tire open.",
    "Lyn threw the box apart.",
    "Rachel took the wall down."
]

# test_ner is defined in the Gold3 section above, so that cell must have been run first
for sentence in sentences:
    print(f"Testing: {sentence}")
    test_ner(nlp, sentence)
    print("-----------------------------")

Testing: Anita threw the hammer.
threw (TRAN_S)
-----------------------------
Testing: Michelle got the book.
got (TRAN_S)
-----------------------------
Testing: Barbara sliced the bread.
sliced (TRAN_S)
-----------------------------
Testing: Audrey took the watch.
took (TRAN_S)
-----------------------------
Testing: Chris threw Linda the pencil.
threw (DITRAN)
-----------------------------
Testing: Beth got Liz an invitation.
got (DITRAN)
-----------------------------
Testing: Jennifer sliced Terry an apple.
sliced (DITRAN)
-----------------------------
Testing: Paula took Sue a message.
took (DITRAN)
-----------------------------
Testing: Pat threw the keys on the roof.
threw (CAUS_MOT)
-----------------------------
Testing: Laura got the ball into the net.
got (CAUS_MOT)
-----------------------------
Testing: Meg sliced the ham onto the plate.
sliced (CAUS_MOT)
-----------------------------
Testing: Kim took the rose into the house.
took (CAUS_MOT)
-----------------------------
Test

In [None]:
# Reading and processing the three JSON files

# One metrics file per training run; the order here must match the row labels
# ('Gold1', 'Gold+Silver', 'Gold2') assigned to df.index further down
file_paths = [
    '/content/drive/MyDrive/asc-train-231031/output1/gold1.json',
    '/content/drive/MyDrive/asc-train-231031/output2/gold+silver.json',
    '/content/drive/MyDrive/asc-train-231119/output/gold2.json'
]

# Function to process each file
def process_file(file_path):
    """Flatten one metrics JSON file into a single-level dict.

    The file must contain the top-level keys ``ents_p``, ``ents_r``,
    ``ents_f`` and ``ents_per_type``; the latter maps each label to a dict
    with the keys ``p``, ``r`` and ``f``.

    Args:
        file_path: Path of the JSON file to read (UTF-8).

    Returns:
        dict: ``ents_p`` / ``ents_r`` / ``ents_f`` followed by
        ``<label>_p``, ``<label>_r`` and ``<label>_f`` for every label, in
        file order.

    Raises:
        KeyError: If one of the expected keys is missing.
    """
    # Local import: json is not imported anywhere else in this notebook, so the
    # function would otherwise fail with NameError on a fresh kernel.
    import json

    with open(file_path, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Overall scores first ...
    metrics = {name: data[name] for name in ("ents_p", "ents_r", "ents_f")}

    # ... then precision / recall / F-score per label
    for key, value in data["ents_per_type"].items():
        for stat in ("p", "r", "f"):
            metrics[f'{key}_{stat}'] = value[stat]

    return metrics

import pandas as pd  # not imported anywhere else in this notebook

# Processing each file
metrics_data = [process_file(path) for path in file_paths]

# Creating a DataFrame from the processed data
df = pd.DataFrame(metrics_data)
# Column keys look like "<label>_<p|r|f>". Split on the LAST underscore only, so that labels
# which themselves contain underscores (e.g. TRAN_S, INTRAN_MOT) keep their metric suffix
# instead of collapsing into three identically named columns.
df.columns = [" ".join(col.rsplit("_", 1)) for col in df.columns]
df.index = ['Gold1', 'Gold+Silver', 'Gold2']

# Formatting the values to two decimal places
# (per-column Series.map instead of DataFrame.applymap, which newer pandas deprecates)
df = df.apply(lambda column: column.map(lambda x: f'{x:.2f}' if isinstance(x, float) else x))

# Transposing the DataFrame so that metrics are rows and training runs are columns
transposed_df = df.transpose()

# Path for the CSV file
csv_file_path = '/content/comparison_metrics.csv'

# Saving the transposed DataFrame to a CSV file
transposed_df.to_csv(csv_file_path, index=True)