In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount recursively and print the full path of every
# file found, so the available dataset files show up in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install gdown

In [None]:
!pip install simpletransformers
from simpletransformers.ner import NERModel, NERArgs


In [None]:
# Every NER tag accepted by the preprocessing step, already in dash form
# (e.g. "B-ORG"). Grouped by entity type for readability; this is a set, so
# only membership matters, not the order written here.
VALID_LABELS_DASH = {
    "O",
    "B-ORG", "I-ORG", "E-ORG",
    "B-PER", "I-PER", "E-PER",
    "B-LOC", "I-LOC", "E-LOC",
    "B-MEA", "I-MEA", "E-MEA",
    "B-DTM", "I-DTM", "E-DTM",
    "B-TTL", "I-TTL", "E-TTL",
    "B-DES", "I-DES", "E-DES",
    "B-NUM", "I-NUM", "E-NUM",
    "B-TRM", "I-TRM", "E-TRM",
    "B-BRN", "I-BRN", "E-BRN",
    "B-NAME",  # NAME is listed in its B- form only
}

In [None]:
def underscore_to_dash(label: str) -> str:
    """
    Normalise an underscore-style NER tag (e.g. B_ORG) to dash style (B-ORG).

    "O" is returned unchanged. Any tag whose dashed form is not a member of
    VALID_LABELS_DASH is collapsed to "O".
    """
    if label == "O":
        return "O"
    candidate = label.replace("_", "-")
    # Unknown tags fall back to the outside label instead of raising.
    return candidate if candidate in VALID_LABELS_DASH else "O"

def get_sorted_txt_files(directory_path):
    """
    Recursively collect every .txt file under `directory_path`.

    The extension match is case-insensitive. Results are ordered by file name;
    files that share a name in different sub-directories are further ordered
    by their full path, so the result does not depend on the order in which
    os.walk happens to visit directories.

    Returns a list of paths (each joined onto `directory_path`). The list is
    empty when nothing matches.
    """
    file_paths = [
        os.path.join(root, name)
        for root, _, files in os.walk(directory_path)
        for name in files
        if name.lower().endswith('.txt')
    ]
    # Tie-break on the full path: sorting on the basename alone would leave
    # same-named files in filesystem-dependent walk order.
    file_paths.sort(key=lambda path: (os.path.basename(path), path))
    return file_paths

def parse_lst20_file(file_path, file_id, has_ner=True):
    """
    Read one LST20 .txt file and split it into clause-level "sentences".

    Each non-blank line is split on whitespace:
      - has_ner=True  => first four fields are [token, pos, ner, clause];
        the ner field is passed through underscore_to_dash.
      - has_ner=False => first three fields are [token, pos, clause];
        the ner value is filled with the placeholder "O".
    Lines with too few fields are skipped. A token whose clause field is
    'E_CLS' closes the current sentence; tokens left over at end of file form
    a final sentence.

    `file_id` is accepted but not used by this function.

    Return a list of sentences, each a list of dict(token, pos, ner, clause).
    """
    required_fields = 4 if has_ner else 3
    sentences = []
    pending = []

    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            # Whitespace splitting makes blank lines come back as an empty
            # list, so they fall under the "too few fields" check as well.
            fields = raw_line.split()
            if len(fields) < required_fields:
                continue

            if has_ner:
                token, pos_tag, raw_ner, clause_tag = fields[:4]
                ner_tag = underscore_to_dash(raw_ner)
            else:
                token, pos_tag, clause_tag = fields[:3]
                ner_tag = "O"  # no gold label available

            pending.append({
                "token": token,
                "pos": pos_tag,
                "ner": ner_tag,
                "clause": clause_tag,
            })

            # 'E_CLS' marks the last token of a clause.
            if clause_tag == "E_CLS":
                sentences.append(pending)
                pending = []

    # Keep a trailing clause that never saw an 'E_CLS' marker.
    if pending:
        sentences.append(pending)

    return sentences

def make_ner_dataframe(directory_path, split_name, has_ner=True):
    """
    Build the token-level DataFrame used with Simple Transformers.

    Every .txt file found by get_sorted_txt_files(directory_path) is parsed
    with parse_lst20_file; each clause (ending with 'E_CLS') is treated as one
    'sentence' and receives a running sentence_id shared across all files.

    Args:
        directory_path: Root directory of the split.
        split_name: Label used only in the summary line that is printed.
        has_ner: Forwarded to parse_lst20_file.

    Returns:
        DataFrame with columns sentence_id, words, labels (one row per
        token). The three columns are present even when no token was read, so
        a CSV written from the result always has a header.
    """
    columns = ["sentence_id", "words", "labels"]
    all_rows = []
    sentence_id = 0

    for file_path in get_sorted_txt_files(directory_path):
        file_id = os.path.splitext(os.path.basename(file_path))[0]

        for sent in parse_lst20_file(file_path, file_id, has_ner=has_ner):
            # Rename "token"/"ner" to the "words"/"labels" column names.
            all_rows.extend(
                {
                    "sentence_id": sentence_id,
                    "words": token_dict["token"],
                    "labels": token_dict["ner"],
                }
                for token_dict in sent
            )
            sentence_id += 1

    # Explicit columns keep the schema stable for an empty split.
    df = pd.DataFrame(all_rows, columns=columns)
    print(f"[{split_name}] => {len(df)} tokens, {sentence_id} sentences")
    return df

# Preprocessing entry point: convert each raw split into a token-level
# DataFrame via make_ner_dataframe and save it as CSV in the working directory.
if __name__ == "__main__":
    dataset_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition"
    output_dir   = "/kaggle/working"

    # Train split (parsed with has_ner=True)
    train_dir = os.path.join(dataset_path, "train/train")
    train_df = make_ner_dataframe(train_dir, "train", has_ner=True)
    train_df.to_csv(os.path.join(output_dir, "train_preprocessed.csv"), index=False, encoding='utf-8')

    # Eval split (parsed with has_ner=True)
    eval_dir  = os.path.join(dataset_path, "eval/eval")
    eval_df = make_ner_dataframe(eval_dir, "eval", has_ner=True)
    eval_df.to_csv(os.path.join(output_dir, "eval_preprocessed.csv"), index=False, encoding='utf-8')
    
    # Test split (parsed with has_ner=False, so every label is the "O" placeholder)
    test_dir = os.path.join(dataset_path, "test/test")
    test_df = make_ner_dataframe(test_dir, "test", has_ner=False)
    test_df.to_csv(os.path.join(output_dir, "test_preprocessed.csv"), index=False, encoding='utf-8')


In [None]:
import torch
from simpletransformers.ner import NERModel, NERArgs
from sklearn.metrics import f1_score

# Label inventory handed to the model configuration as an ordered list.
# The order is kept exactly as originally written: the label2id mapping used
# elsewhere in this notebook numbers the tags in this same sequence.
VALID_LABELS_DASH = [
    "O",     "B-ORG", "B-PER", "B-LOC", "B-MEA", "I-DTM", "I-ORG", "E-ORG",  # 0-7
    "I-PER", "B-TTL", "E-PER", "B-DES", "E-LOC", "B-DTM", "B-NUM", "I-MEA",  # 8-15
    "E-DTM", "E-MEA", "I-LOC", "I-DES", "E-DES", "I-NUM", "E-NUM", "B-TRM",  # 16-23
    "B-BRN", "I-TRM", "E-TRM", "I-TTL", "I-BRN", "E-BRN", "E-TTL", "B-NAME",  # 24-31
]

def macro_f1_metric(labels, preds, **kwargs):
    """
    Token-level macro-averaged F1 over all sequences.

    `labels` and `preds` are parallel collections of tag sequences. They are
    paired with zip (surplus sequences on the longer side are ignored),
    flattened into two flat tag lists and scored with
    f1_score(..., average="macro"). Extra keyword arguments are accepted and
    ignored.

    Returns a dict {"macro_f1": score}.
    """
    paired = list(zip(labels, preds))
    gold_tags = [tag for gold_seq, _ in paired for tag in gold_seq]
    predicted_tags = [tag for _, pred_seq in paired for tag in pred_seq]
    score = f1_score(gold_tags, predicted_tags, average="macro")
    return {"macro_f1": score}


# Training-configuration entry point: report visible GPUs, load the
# preprocessed train/eval CSVs and fill in the NERArgs used for fine-tuning.
if __name__ == "__main__":

    # Confirm how many GPUs are visible:
    n_gpus_available = torch.cuda.device_count()
    print(f"GPUs detected: {n_gpus_available}")

    # 1) Load data
    # Read tokens and tags strictly as text. With pandas' defaults, tokens
    # such as "NA", "null" or "None" are parsed as missing values (NaN), which
    # would corrupt the words column.
    train_df = pd.read_csv(
        "train_preprocessed.csv",
        dtype={"words": str, "labels": str},
        keep_default_na=False,
    )
    eval_df = pd.read_csv(
        "eval_preprocessed.csv",
        dtype={"words": str, "labels": str},
        keep_default_na=False,
    )

    # 2) Set up model args
    model_args = NERArgs()

    model_args.num_train_epochs = 10
    model_args.learning_rate = 2e-5
    model_args.train_batch_size = 128
    model_args.gradient_accumulation_steps = 2
    model_args.eval_batch_size = 128

    # Evaluate each epoch
    model_args.evaluate_during_training = True

    # Minimal checkpointing
    model_args.save_model_every_epoch = False
    model_args.save_eval_checkpoints = False
    model_args.save_steps = -1
    model_args.overwrite_output_dir = True
    model_args.save_best_model = True
    model_args.save_optimizer_and_scheduler = False

    # Speed ups
    model_args.logging_steps = 100
    model_args.reprocess_input_data = False
    model_args.use_cached_eval_features = True

    # Provide label list
    model_args.labels_list = VALID_LABELS_DASH

In [None]:
import os
import pandas as pd
from simpletransformers.ner import NERModel
from tqdm import tqdm

######################################
# 1. Define the label-to-ID mapping
######################################
# Tag -> integer id used in the submission file. Ids are the positions in the
# list below, so the list order defines the numbering (0 = "O", 31 = "B-NAME").
label2id = {
    tag: index
    for index, tag in enumerate([
        "O", "B-ORG", "B-PER", "B-LOC", "B-MEA", "I-DTM", "I-ORG", "E-ORG",
        "I-PER", "B-TTL", "E-PER", "B-DES", "E-LOC", "B-DTM", "B-NUM", "I-MEA",
        "E-DTM", "E-MEA", "I-LOC", "I-DES", "E-DES", "I-NUM", "E-NUM", "B-TRM",
        "B-BRN", "I-TRM", "E-TRM", "I-TTL", "I-BRN", "E-BRN", "E-TTL", "B-NAME",
    ])
}

######################################
# 2. Load data function with sorted filenames ~> not optimal, there is a better way
######################################
def _sentence_rows(file_id, sentence_id, buffered):
    """Turn buffered (line_index, word, label) tuples into row dicts for one sentence."""
    return [
        {
            "filename": file_id,
            "line_index": line_index,
            "sentence_id": sentence_id,
            "words": word,
            "labels": label,
        }
        for line_index, word, label in buffered
    ]


def load_data_to_df(data_folder, is_train=True):
    """
    Read every .txt file directly inside `data_folder` into one token-level DataFrame.

    Files are visited in sorted filename order. Each line is stripped and split
    on tabs; a blank line closes the current sentence. Lines with fewer than
    three tab-separated fields are skipped (they add no row and do not close a
    sentence) but still count towards line_index.

    Args:
        data_folder: Directory holding the .txt files (not searched recursively).
        is_train: Accepted for interface compatibility; not used.

    Returns:
        DataFrame with columns filename (file name without extension),
        line_index (0-based line number within the file), sentence_id (counter
        shared across all files), words (first field) and labels (third
        field). The columns are present even when no row was read, so callers
        can sort or select on them without a KeyError.
    """
    columns = ["filename", "line_index", "sentence_id", "words", "labels"]
    rows = []
    global_sentence_id = 0

    # Get sorted list of filenames
    sorted_filenames = sorted(f for f in os.listdir(data_folder) if f.endswith(".txt"))

    for fname in sorted_filenames:
        file_id = os.path.splitext(fname)[0]
        buffered = []  # (line_index, word, label) of the sentence being built

        with open(os.path.join(data_folder, fname), 'r', encoding='utf-8') as f:
            # enumerate counts every physical line, including blank and
            # malformed ones, so line_index matches the position in the file.
            for line_idx, raw_line in enumerate(f):
                line_str = raw_line.strip()

                if not line_str:
                    # Blank line: sentence boundary.
                    if buffered:
                        rows.extend(_sentence_rows(file_id, global_sentence_id, buffered))
                        global_sentence_id += 1
                        buffered = []
                    continue

                parts = line_str.split('\t')
                if len(parts) < 3:
                    continue

                token, _pos, ner_tag = parts[:3]
                buffered.append((line_idx, token, ner_tag))

        # The file ended without a trailing blank line: flush what is left.
        if buffered:
            rows.extend(_sentence_rows(file_id, global_sentence_id, buffered))
            global_sentence_id += 1

    return pd.DataFrame(rows, columns=columns)

######################################
# 3. Segment Clauses
######################################
def segment_clauses(test_df_sorted):
    """
    Collapse the token-level frame into one row per sentence.

    Rows are grouped by "sentence_id" and the "words" of each group are
    gathered into a list, in the row order of `test_df_sorted`.

    Returns a DataFrame with columns "sentence_id" and "tokens".
    """
    words_by_sentence = test_df_sorted.groupby("sentence_id")["words"]
    token_lists = words_by_sentence.apply(list)
    as_frame = token_lists.reset_index()
    return as_frame.rename(columns={"words": "tokens"})

######################################
# 4. Prediction and Alignment
######################################
def predict_and_align(model, segmented_clauses, batch_size=64):
    """
    Predict a label for every token, one sentence at a time, as one flat list.

    Args:
        model: Object exposing predict(list_of_token_lists, split_on_space=False)
            that returns a tuple whose first element holds one prediction list
            per input sentence (presumably a Simple Transformers NERModel —
            confirm against the caller).
        segmented_clauses: DataFrame-like object whose "tokens" column holds
            one list of tokens per sentence.
        batch_size: Accepted for interface compatibility; not used, because
            each sentence is sent to the model on its own.

    Returns:
        Flat list with exactly len(tokens) entries per sentence, in sentence
        order. Entries are whatever the model returned per token, or the
        string "O" where a prediction is missing.

    Prediction errors are handled on a best-effort basis: the error is printed
    and the whole sentence is labelled "O".
    """
    predictions = []
    for tokens in tqdm(segmented_clauses["tokens"], desc="Running Predictions"):
        expected = len(tokens)
        try:
            per_sentence, _ = model.predict([tokens], split_on_space=False)
            sentence_preds = list(per_sentence[0])
        except Exception as e:
            print(f"Error during prediction for tokens: {tokens[:10]}...: {e}")
            sentence_preds = []

        # Force exactly one entry per token. If the model returns fewer (or
        # more) predictions than tokens for one sentence, extending the flat
        # list as-is would shift every later token onto the wrong label.
        if len(sentence_preds) != expected:
            if sentence_preds:
                print(
                    f"Prediction count mismatch for tokens: {tokens[:10]}...: "
                    f"expected {expected}, got {len(sentence_preds)}"
                )
            missing = expected - len(sentence_preds)
            sentence_preds = sentence_preds[:expected] + ["O"] * missing

        predictions.extend(sentence_preds)
    return predictions

######################################
# 5. Main Execution
######################################
# Inference script: load the test split, predict a tag for every token and
# write the submission file.
# NOTE(review): `model` is used below but is not created in this cell; it must
# already exist in the notebook session — confirm which cell defines it.
dataset_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/"
test_dir = os.path.join(dataset_path, "test/test")
# Path to the sample submission; it is not read anywhere in this cell.
sample_submission_path = os.path.join(dataset_path, "sample_submission.csv")

######################################
# Main Execution for Full Data
######################################

# Load full test data
print("Loading full test data...")
test_df = load_data_to_df(test_dir, is_train=False)
print(f"\nLoaded Full Test Data: {len(test_df)} rows")

# Sort test data
test_df_sorted = test_df.sort_values(by=['filename', 'sentence_id', 'line_index']).reset_index(drop=True)
# Build the row id as "<filename>_<line_index>".
test_df_sorted['id'] = test_df_sorted.apply(
    lambda row: f"{row['filename']}_{row['line_index']}", axis=1
)

# Print the head of test_df_sorted for verification
print("\nHead of test_df_sorted before segmentation:")
print(test_df_sorted.head())

# Segment clauses (one token list per sentence_id)
segmented_clauses = segment_clauses(test_df_sorted)

# Predict on full data
print("\nRunning predictions on full dataset...")
predictions = predict_and_align(model, segmented_clauses, batch_size=64)

# Map predictions to numerical values
predicted_ne = []
for pred in predictions:
    # A prediction may be a dict (its first value is taken as the label)
    # or a plain label string.
    if isinstance(pred, dict):
        label = list(pred.values())[0]  # Extract the label from the dictionary
    else:
        label = pred
    # Labels missing from label2id fall back to id 0 ("O").
    predicted_ne.append(label2id.get(label, 0))  # Map the label to its numerical ID

# Add predicted labels to test_df_sorted
# NOTE(review): the slice only trims surplus predictions; it does not pad, so
# fewer predictions than rows will not line up with the frame — verify lengths.
test_df_sorted['ne'] = predicted_ne[:len(test_df_sorted)]  # Ensure alignment

# Keep only the 'id' and 'ne' columns for the submission DataFrame
submission_df = test_df_sorted[['id', 'ne']].copy()

# Save to submission.csv
submission_df.to_csv("submission.csv", index=False)
print("\n'submission.csv' has been created successfully.")

# Check the format of the output file
print("\nHead of submission.csv:")
print(submission_df.head())

In [None]:
import os
import pandas as pd
from simpletransformers.ner import NERModel
from tqdm import tqdm

######################################
# 1. Define the label-to-ID mapping
######################################
# Tag -> integer id used in the submission file. Ids are the positions in the
# list below, so the list order defines the numbering (0 = "O", 31 = "B-NAME").
label2id = {
    tag: index
    for index, tag in enumerate([
        "O", "B-ORG", "B-PER", "B-LOC", "B-MEA", "I-DTM", "I-ORG", "E-ORG",
        "I-PER", "B-TTL", "E-PER", "B-DES", "E-LOC", "B-DTM", "B-NUM", "I-MEA",
        "E-DTM", "E-MEA", "I-LOC", "I-DES", "E-DES", "I-NUM", "E-NUM", "B-TRM",
        "B-BRN", "I-TRM", "E-TRM", "I-TTL", "I-BRN", "E-BRN", "E-TTL", "B-NAME",
    ])
}

######################################
# 2. Load data function with sorted filenames Thank you from Ouh
######################################
def _sentence_rows(file_id, sentence_id, buffered):
    """Turn buffered (line_index, word, label) tuples into row dicts for one sentence."""
    return [
        {
            "filename": file_id,
            "line_index": line_index,
            "sentence_id": sentence_id,
            "words": word,
            "labels": label,
        }
        for line_index, word, label in buffered
    ]


def load_data_to_df(data_folder, is_train=True):
    """
    Read every .txt file directly inside `data_folder` into one token-level DataFrame.

    Files are visited in sorted filename order. Each line is stripped and split
    on tabs; a blank line closes the current sentence. Lines with fewer than
    three tab-separated fields are skipped (they add no row and do not close a
    sentence) but still count towards line_index.

    Args:
        data_folder: Directory holding the .txt files (not searched recursively).
        is_train: Accepted for interface compatibility; not used.

    Returns:
        DataFrame with columns filename (file name without extension),
        line_index (0-based line number within the file), sentence_id (counter
        shared across all files), words (first field) and labels (third
        field). The columns are present even when no row was read, so callers
        can sort or select on them without a KeyError.
    """
    columns = ["filename", "line_index", "sentence_id", "words", "labels"]
    rows = []
    global_sentence_id = 0

    # Get sorted list of filenames
    sorted_filenames = sorted(f for f in os.listdir(data_folder) if f.endswith(".txt"))

    for fname in sorted_filenames:
        file_id = os.path.splitext(fname)[0]
        buffered = []  # (line_index, word, label) of the sentence being built

        with open(os.path.join(data_folder, fname), 'r', encoding='utf-8') as f:
            # enumerate counts every physical line, including blank and
            # malformed ones, so line_index matches the position in the file.
            for line_idx, raw_line in enumerate(f):
                line_str = raw_line.strip()

                if not line_str:
                    # Blank line: sentence boundary.
                    if buffered:
                        rows.extend(_sentence_rows(file_id, global_sentence_id, buffered))
                        global_sentence_id += 1
                        buffered = []
                    continue

                parts = line_str.split('\t')
                if len(parts) < 3:
                    continue

                token, _pos, ner_tag = parts[:3]
                buffered.append((line_idx, token, ner_tag))

        # The file ended without a trailing blank line: flush what is left.
        if buffered:
            rows.extend(_sentence_rows(file_id, global_sentence_id, buffered))
            global_sentence_id += 1

    return pd.DataFrame(rows, columns=columns)

######################################
# 3. Segment Clauses
######################################
def segment_clauses(test_df_sorted):
    """
    Collapse the token-level frame into one row per sentence.

    Rows are grouped by "sentence_id" and the "words" of each group are
    gathered into a list, in the row order of `test_df_sorted`.

    Returns a DataFrame with columns "sentence_id" and "tokens".
    """
    words_by_sentence = test_df_sorted.groupby("sentence_id")["words"]
    token_lists = words_by_sentence.apply(list)
    as_frame = token_lists.reset_index()
    return as_frame.rename(columns={"words": "tokens"})

######################################
# 4. Prediction and Alignment
######################################
def predict_and_align(model, segmented_clauses, batch_size=64):
    """
    Predict a label for every token, one sentence at a time, as one flat list.

    Args:
        model: Object exposing predict(list_of_token_lists, split_on_space=False)
            that returns a tuple whose first element holds one prediction list
            per input sentence (presumably a Simple Transformers NERModel —
            confirm against the caller).
        segmented_clauses: DataFrame-like object whose "tokens" column holds
            one list of tokens per sentence.
        batch_size: Accepted for interface compatibility; not used, because
            each sentence is sent to the model on its own.

    Returns:
        Flat list with exactly len(tokens) entries per sentence, in sentence
        order. Entries are whatever the model returned per token, or the
        string "O" where a prediction is missing.

    Prediction errors are handled on a best-effort basis: the error is printed
    and the whole sentence is labelled "O".
    """
    predictions = []
    for tokens in tqdm(segmented_clauses["tokens"], desc="Running Predictions"):
        expected = len(tokens)
        try:
            per_sentence, _ = model.predict([tokens], split_on_space=False)
            sentence_preds = list(per_sentence[0])
        except Exception as e:
            print(f"Error during prediction for tokens: {tokens[:10]}...: {e}")
            sentence_preds = []

        # Force exactly one entry per token. If the model returns fewer (or
        # more) predictions than tokens for one sentence, extending the flat
        # list as-is would shift every later token onto the wrong label.
        if len(sentence_preds) != expected:
            if sentence_preds:
                print(
                    f"Prediction count mismatch for tokens: {tokens[:10]}...: "
                    f"expected {expected}, got {len(sentence_preds)}"
                )
            missing = expected - len(sentence_preds)
            sentence_preds = sentence_preds[:expected] + ["O"] * missing

        predictions.extend(sentence_preds)
    return predictions

######################################
# 5. Main Execution
######################################
# Inference script: load the test split, load the saved model, predict a tag
# for every token and write the submission file.
dataset_path = "/kaggle/input/super-ai-ss-5-named-entity-recognition/"
test_dir = os.path.join(dataset_path, "test/test")
# Path to the sample submission; it is not read anywhere in this cell.
sample_submission_path = os.path.join(dataset_path, "sample_submission.csv")

######################################
# Main Execution for Full Data
######################################

# Load full test data
print("Loading full test data...")
test_df = load_data_to_df(test_dir, is_train=False)
print(f"\nLoaded Full Test Data: {len(test_df)} rows")

# Sort test data
test_df_sorted = test_df.sort_values(by=['filename', 'sentence_id', 'line_index']).reset_index(drop=True)
# Build the row id as "<filename>_<line_index>".
test_df_sorted['id'] = test_df_sorted.apply(
    lambda row: f"{row['filename']}_{row['line_index']}", axis=1
)

# Print the head of test_df_sorted for verification
print("\nHead of test_df_sorted before segmentation:")
print(test_df_sorted.head())

# Segment clauses (one token list per sentence_id)
segmented_clauses = segment_clauses(test_df_sorted)

# Initialize model
# Loads from /kaggle/working/best_model with the label order of label2id.
# NOTE(review): use_cuda=True presumably requires a GPU session — confirm.
model = NERModel(
    model_type="xlmroberta",
    model_name="/kaggle/working/best_model",
    labels=list(label2id.keys()),
    args={"max_seq_length": 512},
    use_cuda=True
)

# Predict on full data
print("\nRunning predictions on full dataset...")
predictions = predict_and_align(model, segmented_clauses, batch_size=64)

# Map predictions to numerical values
predicted_ne = []
for pred in predictions:
    # A prediction may be a dict (its first value is taken as the label)
    # or a plain label string.
    if isinstance(pred, dict):
        label = list(pred.values())[0]  # Extract the label from the dictionary
    else:
        label = pred
    # Labels missing from label2id fall back to id 0 ("O").
    predicted_ne.append(label2id.get(label, 0))  # Map the label to its numerical ID

# Add predicted labels to test_df_sorted
# NOTE(review): the slice only trims surplus predictions; it does not pad, so
# fewer predictions than rows will not line up with the frame — verify lengths.
test_df_sorted['ne'] = predicted_ne[:len(test_df_sorted)]  # Ensure alignment

# Keep only the 'id' and 'ne' columns for the submission DataFrame
submission_df = test_df_sorted[['id', 'ne']].copy()

# Save to submission.csv
submission_df.to_csv("submission.csv", index=False)
print("\n'submission.csv' has been created successfully.")

# Check the format of the output file
print("\nHead of submission.csv:")
print(submission_df.head())
