# Prepare Data

## Load Dataset

In [1]:
import os
import pandas as pd

In [2]:
def Read_txt_Files_as_Dataframes(directory_path):
    all_sentences_dfs = []

    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)

            sentences = []
            current_sentence = []

            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    line = line.strip()

                    if not line:
                        if current_sentence:
                            sentences.append(current_sentence)
                            current_sentence = []
                    else:
                        word_data = line.split("\t")
                        if len(word_data) == 4:
                            current_sentence.append(word_data)

                if current_sentence:  # In case there's no empty line at the end
                    sentences.append(current_sentence)

            for sentence in sentences:
                df = pd.DataFrame(sentence, columns=["Word", "POS tag", "Named entity", "Clause boundary"])
                all_sentences_dfs.append(df)

    return all_sentences_dfs

In [3]:
directory_path = "train/train"

dataframes = Read_txt_Files_as_Dataframes(directory_path)

eval_directory_path = "eval/eval"

eval_dataframes = Read_txt_Files_as_Dataframes(eval_directory_path)

## Load Tags List

In [4]:
file_path = "tag_list.csv"

tags_df = pd.read_csv(file_path)
tags_df

Unnamed: 0,tag,class
0,O,0
1,B_ORG,1
2,B_PER,2
3,B_LOC,3
4,B_MEA,4
5,I_DTM,5
6,I_ORG,6
7,E_ORG,7
8,I_PER,8
9,B_TTL,9


In [5]:
label_list = tags_df["tag"].tolist()
label_list

['O',
 'B_ORG',
 'B_PER',
 'B_LOC',
 'B_MEA',
 'I_DTM',
 'I_ORG',
 'E_ORG',
 'I_PER',
 'B_TTL',
 'E_PER',
 'B_DES',
 'E_LOC',
 'B_DTM',
 'B_NUM',
 'I_MEA',
 'E_DTM',
 'E_MEA',
 'I_LOC',
 'I_DES',
 'E_DES',
 'I_NUM',
 'E_NUM',
 'B_TRM',
 'B_BRN',
 'I_TRM',
 'E_TRM',
 'I_TTL',
 'I_BRN',
 'E_BRN',
 'E_TTL',
 'B_NAME']

## Clean Data

In [6]:
def Replace_tags_with_O(dataframes, tags):
    
    for df in dataframes:
        
        df["Named entity"] = df["Named entity"].apply(lambda x: "B_ORG" if x in tags else x)
    
    return dataframes

In [7]:
tags_only_in_df1 = {'OBRN_B', 'MEA_BI', 'B_D`TM', 'ORG_I', 'I', '__', 'DDEM', 'B', 'PER_I'}
tags_only_in_df2 = {'LOC_I', 'ABB', 'B', '__', 'ORG_I'}

In [8]:
dataframes = Replace_tags_with_O(dataframes, tags_only_in_df1)
eval_dataframes = Replace_tags_with_O(eval_dataframes, tags_only_in_df2)

## Merge Data

In [9]:
combined_dataframes = dataframes + eval_dataframes

## Down-Sampling Data

In [10]:
filtered_dataframes = [df for df in combined_dataframes if len(df) < 80]

In [11]:
len(filtered_dataframes)

60491

In [12]:
# df = combined_dataframes[:10000]
df = combined_dataframes

# CRF Modeling

## Data Preprocessing

In [13]:
def extract_features(sentence_df):
    features = []
    for i in range(len(sentence_df)):
        word = sentence_df.iloc[i]["Word"]
        pos_tag = sentence_df.iloc[i]["POS tag"]
        clause_boundary = sentence_df.iloc[i]["Clause boundary"]

        # Define Features for each Token
        token_features = {
            "word": word,
            "pos_tag": pos_tag,
            "clause_boundary": clause_boundary,
            "is_first_word": i == 0,
            "is_last_word": i == len(sentence_df) - 1,
            "prefix-1": word[0],
            "prefix-2": word[:2],
            "suffix-1": word[-1],
            "suffix-2": word[-2:],
            "prev_word": '' if i == 0 else sentence_df.iloc[i - 1]["Word"],
            "next_word": '' if i == len(sentence_df) - 1 else sentence_df.iloc[i + 1]["Word"],
            "prev_pos": '' if i == 0 else sentence_df.iloc[i - 1]["POS tag"],
            "next_pos": '' if i == len(sentence_df) - 1 else sentence_df.iloc[i + 1]["POS tag"],
        }
        
        features.append(token_features)
    return features

In [14]:
def preprocess_data(dataframes, has_labels=True):
    X = []
    y = []

    for df in dataframes:
        
        sentence_features = extract_features(df)
        X.append(sentence_features)

        if has_labels and "Named entity" in df.columns:
            sentence_labels = df["Named entity"].tolist()
            y.append(sentence_labels)
        else:
            y.append([])

    return X, y

In [15]:
X, y = preprocess_data(df)

In [16]:
print("Features for the First Sentence:")
print(X[0])
print("\nLabels for the First Sentence:")
print(y[0])

Features for the First Sentence:
[{'word': 'สภาสังคมสงเคราะห์แห่งประเทศ', 'pos_tag': 'NN', 'clause_boundary': 'B_CLS', 'is_first_word': True, 'is_last_word': False, 'prefix-1': 'ส', 'prefix-2': 'สภ', 'suffix-1': 'ศ', 'suffix-2': 'ทศ', 'prev_word': '', 'next_word': 'ไทย', 'prev_pos': '', 'next_pos': 'NN'}, {'word': 'ไทย', 'pos_tag': 'NN', 'clause_boundary': 'I_CLS', 'is_first_word': False, 'is_last_word': False, 'prefix-1': 'ไ', 'prefix-2': 'ไท', 'suffix-1': 'ย', 'suffix-2': 'ทย', 'prev_word': 'สภาสังคมสงเคราะห์แห่งประเทศ', 'next_word': 'จี้', 'prev_pos': 'NN', 'next_pos': 'VV'}, {'word': 'จี้', 'pos_tag': 'VV', 'clause_boundary': 'I_CLS', 'is_first_word': False, 'is_last_word': False, 'prefix-1': 'จ', 'prefix-2': 'จี', 'suffix-1': '้', 'suffix-2': 'ี้', 'prev_word': 'ไทย', 'next_word': 'ศาล', 'prev_pos': 'NN', 'next_pos': 'NN'}, {'word': 'ศาล', 'pos_tag': 'NN', 'clause_boundary': 'I_CLS', 'is_first_word': False, 'is_last_word': False, 'prefix-1': 'ศ', 'prefix-2': 'ศา', 'suffix-1': 'ล',

## Split Data

In [17]:
import numpy as np
from sklearn.model_selection import train_test_split

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Initialize CRF Model

In [19]:
import sklearn_crfsuite

In [20]:
crf = sklearn_crfsuite.CRF(
    algorithm="lbfgs",
    c1=0.11,
    c2=0.01,
    max_iterations=40,
    all_possible_transitions=True,
    verbose=True
)

In [25]:
import time
start_time = time.time()

In [26]:
crf.fit(X_train, y_train)

loading training data to CRFsuite: 100%|██████████| 55144/55144 [00:15<00:00, 3597.23it/s]



Feature generation
type: CRF1d
feature.minfreq: 0.000000
feature.possible_states: 0
feature.possible_transitions: 1
0....1....2....3....4....5....6....7....8....9....10
Number of features: 137137
Seconds required: 4.130

L-BFGS optimization
c1: 0.110000
c2: 0.010000
num_memories: 6
max_iterations: 40
epsilon: 0.000010
stop: 10
delta: 0.000010
linesearch: MoreThuente
linesearch.max_iterations: 20

Iter 1   time=11.83 loss=5668508.47 active=136152 feature_norm=1.00
Iter 2   time=15.49 loss=5223839.06 active=134931 feature_norm=6.40
Iter 3   time=4.98  loss=3961273.53 active=127324 feature_norm=5.26
Iter 4   time=35.80 loss=2128881.67 active=131394 feature_norm=3.40
Iter 5   time=4.89  loss=2127773.46 active=135153 feature_norm=4.21
Iter 6   time=4.83  loss=1861110.38 active=136202 feature_norm=4.41
Iter 7   time=4.92  loss=1733323.31 active=135318 feature_norm=5.44
Iter 8   time=5.19  loss=1556468.55 active=136097 feature_norm=5.80
Iter 9   time=5.50  loss=1428536.72 active=134997 featu

In [22]:
end_time = time.time()
time_taken = end_time - start_time

In [23]:
print(f"\nTime taken for fitting: {time_taken:.2f} seconds")


Time taken for fitting: 0.01 seconds


In [27]:
y_pred = crf.predict(X_test)

In [28]:
from sklearn_crfsuite import metrics

In [29]:
f1_macro_score = metrics.flat_f1_score(y_test, y_pred, average="macro", labels=label_list, zero_division=0)
print("F1 Macro Score:", f1_macro_score)

print("\nClassification Report:")
print(metrics.flat_classification_report(y_test, y_pred, labels=label_list, digits=3, zero_division=0))

F1 Macro Score: 0.7275548903118945

Classification Report:
              precision    recall  f1-score   support

           O      0.986     0.989     0.988    495514
       B_ORG      0.901     0.817     0.857     11621
       B_PER      0.977     0.967     0.972     10925
       B_LOC      0.977     0.945     0.961      8004
       B_MEA      0.695     0.607     0.648      7661
       I_DTM      0.928     0.916     0.922      8182
       I_ORG      0.958     0.984     0.971     10185
       E_ORG      0.930     0.929     0.930      6808
       I_PER      0.965     0.978     0.972      4948
       B_TTL      0.982     0.964     0.973      5005
       E_PER      0.970     0.992     0.981      4538
       B_DES      0.922     0.863     0.891      4048
       E_LOC      0.983     0.987     0.985      7254
       B_DTM      0.897     0.770     0.828      3598
       B_NUM      0.512     0.645     0.571      3385
       I_MEA      0.627     0.760     0.687      2932
       E_DTM      0.86

## Hyperparameter Optimization

## RandomizedSearchCV

In [28]:
from sklearn_crfsuite import metrics
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats
from functools import partial

In [29]:
def sequence_f1_score(y_true, y_pred, labels):
    return metrics.flat_f1_score(y_true, y_pred, average="macro", labels=labels, zero_division=0)

In [30]:
param_distributions = {
    'c1': scipy.stats.expon(scale=0.5),
    'c2': scipy.stats.expon(scale=0.05),
    'max_iterations': [100, 200, 300],
    'all_possible_transitions': [True, False],
}

In [31]:
crf = sklearn_crfsuite.CRF(algorithm="lbfgs", verbose=True)

In [32]:
random_search = RandomizedSearchCV(
    estimator=crf,
    param_distributions=param_distributions,
    n_iter=10,
    cv=3,
    verbose=1,
    n_jobs=-1,
)

In [None]:
random_search.fit(X_train, y_train)

In [None]:
best_params = random_search.best_params_
print("Best Parameters:", best_params)

## Retrain Model

In [30]:
best_crf = crf #random_search.best_estimator_

In [31]:
y_pred_best = best_crf.predict(X_test)

In [32]:
f1_macro_score = sequence_f1_score(y_test, y_pred_best, labels=label_list)
print("F1 Macro Score (after tuning):", f1_macro_score)

print("\nClassification Report (after tuning):")
print(metrics.flat_classification_report(y_test, y_pred_best, labels=label_list, digits=3, zero_division=0))

NameError: name 'sequence_f1_score' is not defined

# Make a Task Prediction

## Load Test Data 

In [33]:
def Read_Test_File(directory_path):
    all_sentences_dfs = []

    for filename in sorted(os.listdir(directory_path)):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory_path, filename)

            sentences = []
            current_sentence = []

            with open(file_path, "r", encoding="utf-8") as file:
                for line in file:
                    line = line.strip()

                    if not line:
                        if current_sentence:
                            sentences.append(current_sentence)
                            current_sentence = []
                    else:
                        word_data = line.split("\t")
                        if len(word_data) == 3:
                            current_sentence.append(word_data)

                if current_sentence:  # In case there's no empty line at the end
                    sentences.append(current_sentence)

            for sentence in sentences:
                df = pd.DataFrame(sentence, columns=["Word", "POS tag", "Clause boundary"])
                all_sentences_dfs.append(df)

    return all_sentences_dfs

In [34]:
test_directory_path = "test/test"

test_df = Read_Test_File(test_directory_path)

## Predict

In [35]:
X_test, _ = preprocess_data(test_df)

In [36]:
# predicted_labels = best_crf.predict(X_test)
predicted_labels = crf.predict(X_test)

## Mapping and Save

In [37]:
def map_Labels_to_Numbers(predicted_labels, tag_to_number):
    mapped_labels = [tag_to_number[label] for sentence in predicted_labels for label in sentence]
    
    return mapped_labels

In [38]:
def Save_submission_File(submission_file, mapped_labels, output_file):
    submission_df = pd.read_csv(submission_file)
    
    label_index = 0 
    for i in range(len(submission_df)):
        # if pd.isna(submission_df.at[i, 'ne']):  # Check if 'ne' is empty
        submission_df.at[i, 'ne'] = mapped_labels[label_index]
        label_index += 1
        
    submission_df.to_csv(output_file, index=False)
    print(f"Updated submission file saved to {output_file}")


In [39]:
dict_for_map = dict(zip(tags_df["tag"], tags_df["class"]))

In [40]:
mapped_labels = map_Labels_to_Numbers(predicted_labels, dict_for_map)

In [41]:
submission_file = "sample_submission.csv"

output_file = "sample_submission.csv"

In [42]:
Save_submission_File(submission_file, mapped_labels, output_file)

Updated submission file saved to sample_submission.csv
