In [None]:

!pip install vncorenlp

In [3]:
from vncorenlp import VnCoreNLP

# Shared VnCoreNLP client used by the NER helpers in this notebook.
# The annotator list requests word segmentation, POS tagging, NER and parsing;
# '-Xmx2g' is the max-heap option handed to the wrapper.
# NOTE(review): only `.ner()` is called in the visible cells — confirm whether
# the "parse" annotator is actually needed.
vncorenlp = VnCoreNLP(
    "../input/vncorenlp/VnCoreNLP-1.1.1.jar",
    annotators="wseg,pos,ner,parse",
    max_heap_size='-Xmx2g',
)

In [None]:
import pandas as pd
from vncorenlp import VnCoreNLP
import nltk


# nltk.sent_tokenize needs the Punkt sentence model. Recent NLTK releases load it
# from the "punkt_tab" resource instead of the pickled "punkt" one, so request
# both: on an older NLTK the unknown resource is only reported as an error by
# the downloader (no exception), and on a newer one "punkt" is simply unused.
nltk.download('punkt')
nltk.download('punkt_tab')


def split_into_sentences(text):
    """Split ``text`` into sentences with ``nltk.sent_tokenize``.

    Sentences that are empty or contain only whitespace are dropped; the
    remaining ones are returned unchanged (not stripped), in order.
    """
    kept = []
    for candidate in nltk.sent_tokenize(text):
        if candidate.strip():
            kept.append(candidate)
    return kept


def apply_ner_with_tokenize(text):
    """Run ``vncorenlp.ner`` on ``text`` and collect tagged tokens plus the tokenized text.

    Returns:
        A pair ``(tagged_tokens, tokenized)`` where ``tagged_tokens`` is the flat
        list of ``(token, tag)`` items whose tag is not ``"O"`` and ``tokenized``
        is every token of every returned sentence joined by single spaces.

    Any exception is printed and swallowed; whatever was collected before the
    failure is still returned (best-effort behaviour).
    """
    tagged_tokens = []
    sentence_strings = []
    try:
        for annotated in vncorenlp.ner(text):
            # Build the filtered list first so a malformed item aborts before
            # anything from this sentence is recorded.
            tagged_tokens += [pair for pair in annotated if pair[1] != "O"]
            sentence_strings.append(" ".join(pair[0] for pair in annotated))
    except Exception as e:
        print(f"Error during NER: {e}")
    return tagged_tokens, " ".join(sentence_strings)

def detokenize(text):
    """
    Clean up tokenized text by removing unnecessary whitespace.

    The replacements are applied one after another, in the listed order:
    spaces around underscores, spaces before ``, . : ;``, spaces just inside
    parentheses, spaces around apostrophes, and spaces around a spaced hyphen.
    """
    substitutions = (
        (" _ ", "_"),
        (" _", "_"),
        ("_ ", "_"),
        (" ,", ","),
        (" .", "."),
        (" :", ":"),
        (" ;", ";"),
        ("( ", "("),
        (" )", ")"),
        (" '", "'"),
        ("' ", "'"),
        (" - ", "-"),
    )
    for needle, replacement in substitutions:
        text = text.replace(needle, replacement)
    return text


def process_entities_and_tokenize(index, text):
    """Extract named entities from ``text`` and return its tokenized form.

    The text is split into sentences, each sentence is sent through
    ``apply_ner_with_tokenize``, and the resulting non-"O" tags are merged
    into entities: a ``B-X`` tag starts an entity and the directly following
    ``I-X`` tags (same type ``X``) extend it. ``I-`` tags with no preceding
    ``B-`` are ignored.

    Args:
        index: Identifier copied into every returned entity tuple.
        text: Raw text to process.

    Returns:
        ``(unique_entities, tokenized_text)`` where ``unique_entities`` is a
        list of distinct ``(index, entity_text, entity_type)`` tuples in order
        of first appearance, and ``tokenized_text`` is the detokenized,
        space-joined tokenized sentences.
    """
    tagged_tokens = []
    tokenized_sentences = []
    for sentence in split_into_sentences(text):
        if not sentence.strip():
            continue
        sentence_tags, tokenized_sentence = apply_ner_with_tokenize(sentence)
        tagged_tokens.extend(sentence_tags)
        tokenized_sentences.append(tokenized_sentence)

    # NOTE(review): "O" tokens were already removed upstream, so an I-X tag is
    # attached to the previous B-X even if other tokens originally sat between
    # them — confirm this is acceptable for the tagger's output.
    entities = []
    position = 0
    while position < len(tagged_tokens):
        token, tag = tagged_tokens[position]
        position += 1
        if not tag.startswith("B-"):
            continue
        entity_type = tag[2:]
        continuation_tag = f"I-{entity_type}"
        parts = [token]
        while (
            position < len(tagged_tokens)
            and tagged_tokens[position][1] == continuation_tag
        ):
            parts.append(tagged_tokens[position][0])
            position += 1
        entities.append((index, " ".join(parts), entity_type))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the output
    # is reproducible across runs (a plain set() yields an arbitrary order).
    unique_entities = list(dict.fromkeys(entities))

    tokenized_text = detokenize(" ".join(tokenized_sentences))
    return unique_entities, tokenized_text


def process_file(file_name, output_tokenized, output_entities):
    """Tokenize the ``content`` column of a CSV and export the entities found.

    Reads ``file_name`` (must contain ``idx`` and ``content`` columns), runs
    ``process_entities_and_tokenize`` on every row, then writes two CSVs:

    * ``output_tokenized``: the input frame with ``content`` replaced by the
      tokenized text.
    * ``output_entities``: one row per entity with the article id, the entity
      text and the entity type.

    Missing ``content`` values are treated as empty text instead of being
    stringified to the literal ``"nan"``.

    Any exception is caught and reported with ``print`` so that one bad file
    does not stop the surrounding batch loop; nothing is returned.
    """
    try:
        df = pd.read_csv(file_name)

        columns = ['Số thứ tự bài báo', 'Thực thể', 'Loại thực thể']

        tokenized_contents = []
        entity_records = []
        for article_id, content in zip(df["idx"], df["content"]):
            text = "" if pd.isna(content) else str(content)
            entities, tokenized_text = process_entities_and_tokenize(article_id, text)
            tokenized_contents.append(tokenized_text)
            # Each entity is already an (article id, entity, type) tuple.
            entity_records.extend(entities)

        # Assign the column in one go rather than mutating the frame cell by
        # cell while iterating over it.
        df["content"] = tokenized_contents
        # Build the entity frame once; growing it with concat per row is quadratic.
        df_entity = pd.DataFrame(entity_records, columns=columns)

        df.to_csv(output_tokenized, encoding="utf-8-sig", index=False)
        df_entity.to_csv(output_entities, encoding="utf-8-sig", index=False)
        print(f"Processed {file_name}: Output saved to {output_tokenized} and {output_entities}")

    except Exception as e:
        print(f"Error processing file {file_name}: {e}")

In [None]:
# Number of numbered CSV splits; the train/test loops in the following cells
# process the files suffixed _1 .. _{num_sample}.
num_sample = 3

In [None]:

# Tokenize and extract entities for every training split (1 .. num_sample).
for i in range(1, num_sample + 1):
    file_name, output_tokenized, output_entities = (
        f"/kaggle/working/train_preprocessing1_{i}.csv",
        f"/kaggle/working/train_ner_{i}.csv",
        f"/kaggle/working/train_entities_{i}.csv",
    )
    process_file(file_name, output_tokenized, output_entities)

In [None]:

# Tokenize and extract entities for every test split (1 .. num_sample).
for i in range(1, num_sample + 1):
    file_name, output_tokenized, output_entities = (
        f"/kaggle/working/test_preprocessing1_{i}.csv",
        f"/kaggle/working/test_ner_{i}.csv",
        f"/kaggle/working/test_entities_{i}.csv",
    )
    process_file(file_name, output_tokenized, output_entities)


# All splits are done: close the VnCoreNLP client. Cells that call it must not
# be re-run after this point without re-creating the client.
vncorenlp.close()

In [None]:
import pandas as pd

# Path of the stopword list (one entry per line, UTF-8).
# NOTE(review): this is an empty placeholder — open('') fails, so set it to the
# real stopword file before running this cell.
file_path = ''

# Read the list: trim each line, skip blank ones, and lowercase the rest.
with open(file_path, 'r', encoding='utf-8') as f:
    stopwords = [entry.lower() for entry in (line.strip() for line in f) if entry]

def remove_stopwords(text, stopwords):
    """Return ``text`` with every stopword token removed.

    Commas and periods are first padded with spaces so they become separate
    tokens; the text is then split on whitespace and each token is compared
    case-insensitively (via ``token.lower()``) against ``stopwords``. The
    surviving tokens are joined with single spaces.

    Matching is per whitespace-separated token, so a stopword entry that
    itself contains a space can never match.

    Args:
        text: Input string.
        stopwords: Collection of lowercase stopwords (list, tuple, set, ...).

    Returns:
        The filtered text as a single space-joined string.
    """
    # A list/tuple makes every membership test linear in the number of
    # stopwords; hash it once per call so each lookup is O(1).
    if isinstance(stopwords, (list, tuple)):
        stopwords = frozenset(stopwords)
    spaced = text.replace(',', ' , ').replace('.', ' . ')
    return ' '.join(word for word in spaced.split() if word.lower() not in stopwords)

# Strip stopwords from every row of the content column.
# NOTE(review): no visible cell defines `df` at notebook scope (it only exists
# as a local inside process_file) — load the intended CSV into `df` before
# running this cell.
df['content'] = df['content'].apply(lambda x: remove_stopwords(str(x), stopwords))

print(df['content'])
