In [None]:
# Load Data
import os
import warnings

import pandas as pd

# Define a function to read txt files as a list of per-sentence DataFrames
def txt_as_df(path, columns=("word", "pos", "ner", "cls")):
    """Read every ``.txt`` file in ``path`` into one DataFrame per sentence.

    Each non-blank line must contain exactly ``len(columns)`` tab-separated
    fields; a blank line terminates the current sentence. Files are processed
    in sorted filename order and sub-directories are not searched.

    Args:
        path: Directory that contains the ``.txt`` files.
        columns: Column names given to every DataFrame. The number of names is
            also the number of fields a line must have to be kept.

    Returns:
        list[pandas.DataFrame]: one frame per sentence, in reading order.

    Warns:
        UserWarning: once per file in which at least one non-blank line had an
            unexpected number of fields. Such lines are skipped (as before),
            but no longer silently.
    """
    columns = list(columns)
    n_fields = len(columns)
    all_sentences = []

    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(path, filename)
        current_sentence = []
        skipped_lines = 0

        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                # strip() also removes leading/trailing tabs, so a line whose
                # first or last field is empty ends up with too few fields and
                # is counted as malformed below.
                line = line.strip()

                if not line:
                    # Blank line: close the sentence collected so far, if any.
                    if current_sentence:
                        all_sentences.append(
                            pd.DataFrame(current_sentence, columns=columns)
                        )
                        current_sentence = []
                    continue

                word_data = line.split("\t")
                if len(word_data) == n_fields:
                    current_sentence.append(word_data)
                else:
                    skipped_lines += 1

        # Flush the last sentence when the file does not end with a blank line.
        if current_sentence:
            all_sentences.append(pd.DataFrame(current_sentence, columns=columns))

        if skipped_lines:
            warnings.warn(
                f"{file_path}: skipped {skipped_lines} line(s) that did not have "
                f"exactly {n_fields} tab-separated fields",
                stacklevel=2,
            )

    return all_sentences

In [None]:
# Read Data
# Despite the `_df` suffix, each variable is a list holding one DataFrame per
# sentence (that is what txt_as_df returns).
# NOTE(review): the "---" path prefix looks like a redacted placeholder — point
# these at the real train/eval directories before running.
train_df = txt_as_df("---/train/train")
eval_df = txt_as_df("---/eval/eval")

In [None]:
# Check: preview only the first few sentences. `train_df` is a list of
# per-sentence DataFrames, so displaying it whole floods the cell output.
train_df[:3]

In [None]:
# Create Label List
# Tag inventory passed to the F1 computation further down. Keep the order
# as-is: it matches the order in which `label_map` numbers the tags.
labels = (
    "O "
    "B_ORG B_PER B_LOC B_MEA "
    "I_DTM I_ORG E_ORG I_PER "
    "B_TTL E_PER B_DES E_LOC "
    "B_DTM B_NUM I_MEA E_DTM "
    "E_MEA I_LOC I_DES E_DES "
    "I_NUM E_NUM B_TRM B_BRN "
    "I_TRM E_TRM I_TTL I_BRN "
    "E_BRN E_TTL B_NAME"
).split()

In [None]:
def replace_weird_tag(dataframes, tags, replacement="B_ORG"):
    """Overwrite unwanted NER tags in every sentence DataFrame.

    Args:
        dataframes: Iterable of DataFrames that each have a ``"ner"`` column.
            The frames are modified in place (their ``"ner"`` column is
            reassigned).
        tags: Collection of tag strings that should be replaced.
        replacement: Tag written in place of every value found in ``tags``.
            Defaults to ``"B_ORG"``, the value that used to be hard-coded.

    Returns:
        The same ``dataframes`` object that was passed in.
    """
    tags = set(tags)

    for df in dataframes:
        is_weird = df["ner"].isin(tags)
        # where() keeps values for which the condition is True, so invert the mask.
        df["ner"] = df["ner"].where(~is_weird, replacement)

    return dataframes

In [None]:
# Every tag listed below is rewritten to the single tag "B_ORG" (the value
# replace_weird_tag substitutes).
# NOTE(review): several of these look like malformed tags of other entity types
# (e.g. 'PER_I', 'MEA_BI', 'LOC_I') — confirm that collapsing them all into
# "B_ORG" is intended rather than mapping each to its closest valid tag.
train_df = replace_weird_tag(train_df, {'OBRN_B', 'MEA_BI', 'B_D`TM', 'ORG_I', 'I', '__', 'DDEM', 'B', 'PER_I'})
eval_df = replace_weird_tag(eval_df, {'LOC_I', 'ABB', 'B', '__', 'ORG_I'})

In [None]:
# Check train df: show only the first few sentences instead of dumping the
# whole list of per-sentence DataFrames into the output.
train_df[:3]

In [None]:
# Merge train_df and eval_df
# Both are plain Python lists of per-sentence DataFrames, so `+` concatenates
# them: train sentences first, then eval sentences.
# NOTE(review): because the provided eval sentences are pooled here and a
# random split is drawn later, the validation score is not measured on the
# original eval set — confirm that is intended.
merge_df = train_df + eval_df

In [None]:
def extract_features(sentence_df):
    """Build one feature dict per token of a sentence.

    Args:
        sentence_df: DataFrame with string columns ``"word"``, ``"pos"`` and
            ``"cls"``, one row per token in sentence order.

    Returns:
        list[dict]: for every token, its word, POS tag and clause-boundary
        value, first/last-position flags, 1- and 2-character prefixes and
        suffixes of the word, and the word and POS tag of the previous and
        next token (``''`` at the sentence edges).
    """
    # Pull the columns out once; looking values up via `.iloc[i][col]` builds a
    # whole row Series for every single access.
    words = sentence_df["word"].tolist()
    pos_tags = sentence_df["pos"].tolist()
    clause_boundaries = sentence_df["cls"].tolist()
    last_index = len(words) - 1

    features = []
    for i, word in enumerate(words):
        is_first = i == 0
        is_last = i == last_index

        # Define Features for each Token
        features.append({
            "word": word,
            "pos_tag": pos_tags[i],
            "clause_boundary": clause_boundaries[i],
            "is_first_word": is_first,
            "is_last_word": is_last,
            # Slices instead of word[0] / word[-1] so an empty token yields ''
            # rather than raising IndexError.
            "prefix-1": word[:1],
            "prefix-2": word[:2],
            "suffix-1": word[-1:],
            "suffix-2": word[-2:],
            "prev_word": '' if is_first else words[i - 1],
            "next_word": '' if is_last else words[i + 1],
            "prev_pos": '' if is_first else pos_tags[i - 1],
            "next_pos": '' if is_last else pos_tags[i + 1],
        })

    return features

In [None]:
def preprocess_data(dataframes, has_labels=True):
    """Convert sentence DataFrames into parallel feature and label sequences.

    Args:
        dataframes: Iterable of per-sentence DataFrames accepted by
            ``extract_features``.
        has_labels: When true, labels are taken from a frame's ``"ner"``
            column if that column exists.

    Returns:
        tuple[list, list]: ``(X, y)`` where ``X[k]`` is the feature list of
        sentence ``k`` and ``y[k]`` is its list of NER tags, or ``[]`` when
        labels were not requested or the frame has no ``"ner"`` column.
    """
    feature_seqs, label_seqs = [], []

    for frame in dataframes:
        feature_seqs.append(extract_features(frame))

        labelled = has_labels and "ner" in frame.columns
        label_seqs.append(frame["ner"].tolist() if labelled else [])

    return feature_seqs, label_seqs

In [None]:
# X: one list of token feature dicts per sentence; y: the matching NER tag
# lists (both in the order of merge_df).
X, y = preprocess_data(merge_df)

In [None]:
# Feature dicts of the first sentence (one dict per token).
X[0]

In [None]:
# NER tags of the first sentence, aligned token-by-token with X[0].
y[0]

In [None]:
# Split data using train_test_split to train
# The split is made at sentence level (each element of X / y is one sentence)
# and is reproducible through the fixed random_state.
# NOTE(review): `np` is not used anywhere in the visible notebook.
import numpy as np
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, train_size = 0.8 ,test_size = 0.2, random_state=42)

In [None]:
!pip install sklearn-crfsuite

In [None]:
import sklearn_crfsuite

In [None]:
# Config model
# All hyper-parameters are fixed by hand here; no tuning or search is done in
# this notebook. verbose=True makes the fit cell print training progress.
model = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    c1=0.1,
    c2=0.1,
    max_iterations=100,
    all_possible_transitions=True,
    verbose=True
)

In [None]:
# fit model
# Trains on the 80% training portion of the merged train+eval sentences.
model.fit(X_train, y_train)

In [None]:
# Predicted tag sequences for the held-out validation sentences.
y_pred = model.predict(X_val)

In [None]:
from sklearn_crfsuite import metrics

In [None]:
# check f1 score (macro)
# The score is averaged over every tag in `labels`, which includes 'O'.
# NOTE(review): NER evaluations often leave 'O' out of the average — confirm
# which label set the target metric uses.
f1_score = metrics.flat_f1_score(y_val, y_pred, average="macro", labels=labels, zero_division=0)
f1_score

In [None]:
def read_test(path):
    """Read unlabeled ``.txt`` files in ``path`` into one DataFrame per sentence.

    Each non-blank line must contain exactly three tab-separated fields
    (word, POS tag, clause boundary); a blank line terminates the current
    sentence. Files are processed in sorted filename order and
    sub-directories are not searched.

    Args:
        path: Directory that contains the ``.txt`` files.

    Returns:
        list[pandas.DataFrame]: one frame per sentence with columns
        ``["word", "pos", "cls"]``, in reading order.

    Warns:
        UserWarning: once per file in which at least one non-blank line did
            not have three fields. Such lines are skipped (as before), so the
            tokens on them never receive a prediction; the warning makes that
            visible instead of silent.
    """
    columns = ["word", "pos", "cls"]
    all_sentences = []

    for filename in sorted(os.listdir(path)):
        if not filename.endswith(".txt"):
            continue

        file_path = os.path.join(path, filename)
        current_sentence = []
        skipped_lines = 0

        with open(file_path, "r", encoding="utf-8") as file:
            for line in file:
                # strip() also removes leading/trailing tabs, so a line whose
                # first or last field is empty ends up with too few fields and
                # is counted as malformed below.
                line = line.strip()

                if not line:
                    # Blank line: close the sentence collected so far, if any.
                    if current_sentence:
                        all_sentences.append(
                            pd.DataFrame(current_sentence, columns=columns)
                        )
                        current_sentence = []
                    continue

                word_data = line.split("\t")
                if len(word_data) == len(columns):
                    current_sentence.append(word_data)
                else:
                    skipped_lines += 1

        # Flush the last sentence when the file does not end with a blank line.
        if current_sentence:
            all_sentences.append(pd.DataFrame(current_sentence, columns=columns))

        if skipped_lines:
            warnings.warn(
                f"{file_path}: skipped {skipped_lines} line(s) that did not have "
                f"exactly {len(columns)} tab-separated fields; tokens on those "
                f"lines will get no prediction",
                stacklevel=2,
            )

    return all_sentences

In [None]:
# Read test df
# Like train_df, this is a list with one DataFrame per sentence; the frames
# have no "ner" column.
test_df = read_test("/kaggle/input/super-ai-ss-5-named-entity-recognition/test/test")

In [None]:
# Preview only the first few test sentences; displaying the whole list of
# DataFrames floods the cell output.
test_df[:3]

In [None]:
# The test frames carry no "ner" column, so say explicitly that no labels are
# expected; the second return value is a list of empty lists and is discarded.
X_test, _ = preprocess_data(test_df, has_labels=False)

In [None]:
# predict
# `pred` holds one sequence of tag strings per test sentence, in the order of
# test_df.
pred = model.predict(X_test)

In [None]:
# Preview the predicted tags of the first few sentences rather than every
# sentence in the test set.
pred[:3]

In [None]:
def Save_submission_File(submission_file, pred, output_file):
    """Fill the ``ne`` column of a submission template and write it as CSV.

    Predictions are assigned positionally: ``pred[i]`` goes to row ``i`` of
    the template.

    Args:
        submission_file: Path of the CSV template to read.
        pred: Indexable, sized sequence of values for the ``ne`` column, one
            per template row and in row order.
        output_file: Path the completed CSV is written to (without the index).

    Raises:
        ValueError: if ``pred`` has fewer entries than the template has rows.

    Warns:
        UserWarning: if ``pred`` has more entries than the template has rows;
            the surplus entries are ignored, as they were before.
    """
    submission_df = pd.read_csv(submission_file)
    n_rows = len(submission_df)
    n_pred = len(pred)

    if n_pred < n_rows:
        raise ValueError(
            f"got {n_pred} predictions for {n_rows} submission rows; "
            "every row needs a prediction"
        )
    if n_pred > n_rows:
        warnings.warn(
            f"got {n_pred} predictions for {n_rows} submission rows; "
            f"ignoring the {n_pred - n_rows} extra prediction(s)",
            stacklevel=2,
        )

    # Assign the whole column at once. Writing cell by cell into an empty
    # (NaN-filled) template column keeps it as float and turns 3 into 3.0.
    submission_df["ne"] = list(pred[:n_rows])

    submission_df.to_csv(output_file, index=False)

In [None]:
# Integer id for every NER tag, assigned by position in the sequence below
# (so 'O' -> 0, 'B_ORG' -> 1, ..., 'B_NAME' -> 31). Used to turn predicted tag
# strings into the numeric values written to the submission file.
label_map = {
    tag: tag_id
    for tag_id, tag in enumerate(
        (
            "O "
            "B_ORG B_PER B_LOC B_MEA "
            "I_DTM I_ORG E_ORG I_PER "
            "B_TTL E_PER B_DES E_LOC "
            "B_DTM B_NUM I_MEA E_DTM "
            "E_MEA I_LOC I_DES E_DES "
            "I_NUM E_NUM B_TRM B_BRN "
            "I_TRM E_TRM I_TTL I_BRN "
            "E_BRN E_TTL B_NAME"
        ).split()
    )
}

In [None]:
# Flatten the per-sentence predictions into one token-level list of integer
# ids, keeping sentence and token order. A predicted tag that is missing from
# label_map raises KeyError here.
pred_num = [label_map[label] for sentence in pred for label in sentence]

In [None]:
# Preview the first ids and the total count instead of printing one id per
# test token.
pred_num[:20], len(pred_num)

In [None]:
# Fill the sample submission with the predicted ids (row i receives
# pred_num[i]) and write the result to the Kaggle working directory.
Save_submission_File("/kaggle/input/super-ai-ss-5-named-entity-recognition/sample_submission.csv", pred_num, "/kaggle/working/sample_submission.csv")