# **Pre-Processing data**

In [None]:
import os
import py_vncorenlp
import json
import re
from tqdm.auto import tqdm

In [None]:
# Working directory at the time this cell is executed.
NOTEBOOK_DIR = os.getcwd()
# Parent of the working directory.
CAPSTONE_DIR = os.path.abspath(os.path.join(NOTEBOOK_DIR, os.pardir))
# "dataset" folder located directly under the parent directory.
DATASET_DIR = os.path.join(CAPSTONE_DIR, "dataset")
# Directory handed to VnCoreNLP as save_dir. This path is machine-specific:
# change it to wherever VnCoreNLP is stored on your machine.
MODEL_DIR = "D:/VnCoreNLP"

## Stop word

In [None]:
# Normalise the raw stop-word list: every line of stopwords.txt is rewritten as
# a single token whose whitespace-separated parts are joined with "_"
# (presumably to match the underscore-joined tokens produced by word
# segmentation -- confirm). The result is written to stopwords_processed.txt,
# one entry per line.
with open(os.path.join(DATASET_DIR, "stopwords.txt"), "r", encoding="utf-8") as text_file:
    content = text_file.readlines()

# Open with "w" rather than "a" so that re-running this cell regenerates the
# file instead of appending a second copy of every stop word.
with open(os.path.join(DATASET_DIR, "stopwords_processed.txt"), "w", encoding="utf-8") as processed_file:
    for word in content:
        word_list = word.split()
        if not word_list:
            # Blank / whitespace-only line: nothing to write (and indexing
            # word_list[0] here would raise IndexError).
            continue
        # "_".join on a one-element list returns that element unchanged, so
        # single-word entries need no special case.
        processed_word = "_".join(word_list)
        processed_file.write(processed_word + "\n")

## Legal Corpus

In [None]:
# Instantiate VnCoreNLP with save_dir pointing at MODEL_DIR.
# NOTE(review): only model.word_segment() is called in the visible code;
# passing annotators=["wseg"] would presumably avoid loading the other
# annotators -- confirm before changing.
model = py_vncorenlp.VnCoreNLP(save_dir=MODEL_DIR)

In [None]:
# Read the processed stop words: one entry per line, surrounding whitespace
# (including the trailing newline) stripped.
stopwords_path = os.path.join(DATASET_DIR, "stopwords_processed.txt")
with open(stopwords_path, "r", encoding="utf-8") as f:
    stopwords_list = [line.strip() for line in f]

# Build a single alternation of all regex-escaped stop words, wrapped in word
# boundaries so only whole tokens match.
escaped_stopwords = "|".join(re.escape(stopword) for stopword in stopwords_list)
pattern = rf"\b({escaped_stopwords})\b"
print(stopwords_list)

In [None]:
# Segment every article of the legal corpus and write the enriched documents
# to processed_legal_corpus.json as a JSON array.
#
# The code reads data as a sequence of documents, each with an "articles" list
# whose items carry "title" and "text" strings. Two keys are added to every
# article:
#   - "segment_only":  word-segmented title + text, whitespace-normalised
#   - "processed_in4": the same text with stop words removed
input_file = os.path.join(DATASET_DIR, "legal_corpus.json")
output_file = os.path.join(DATASET_DIR, "processed_legal_corpus.json")

with open(input_file, "r", encoding="utf-8") as json_file:
    data = json.load(json_file)

total_documents = len(data)

# Compile the regexes once, outside the loops, instead of passing the (large)
# pattern string to re.sub for every article.
stopword_re = re.compile(pattern)
whitespace_re = re.compile(r"\s+")

with open(output_file, "w", encoding="utf-8") as out_file:
    # The array is written incrementally: opening bracket, one dumped document
    # at a time separated by commas, then the closing bracket.
    out_file.write("[\n")

    for i, document in enumerate(tqdm(data, total=total_documents, desc="Processing documents")):
        for article in document["articles"]:
            # Drop the leading label (everything up to and including the first
            # ". ") and keep the WHOLE remainder. partition() returns "" for the
            # tail when the separator is absent, which preserves the original
            # "no separator -> empty title" behaviour, while titles containing
            # further ". " occurrences are no longer truncated.
            article_title = article["title"].partition(". ")[2]

            article_list = model.word_segment(article_title + " " + article["text"])
            segmented_text = " ".join(article_list)

            # Remove stop words from the raw segmented text first, then collapse
            # the whitespace runs left behind in both variants.
            without_stopwords = stopword_re.sub("", segmented_text)
            article["segment_only"] = whitespace_re.sub(" ", segmented_text).strip()
            article["processed_in4"] = whitespace_re.sub(" ", without_stopwords).strip()

        json.dump(document, out_file, ensure_ascii=False, indent=4)

        # Comma between documents, but not after the last one.
        if i < total_documents - 1:
            out_file.write(",\n")

    out_file.write("\n]")