Tiền xử lý dữ liệu cho MODEL NER

In [2]:
import pandas as pd 
import spacy 
import requests 
from bs4 import BeautifulSoup
# Load spaCy's small English pipeline once so it can be reused below.
nlp = spacy.load("en_core_web_sm")
# Raise pandas' display limit so up to 200 rows are shown before truncation.
pd.set_option("display.max_rows", 200)


In [13]:
import pandas as pd

# Read the full cleaned CSV into memory
file_path = "D:/Project DeepL/Project-DeepL/cleaned_output_no_abnormal.csv"
data = pd.read_csv(file_path)

# Maximum number of rows stored in each smaller file
chunk_size = 250

# Step through the frame one window of chunk_size rows at a time and write
# each window to its own CSV; output files are numbered starting from 1
for i in range(0, len(data), chunk_size):
    part_path = f"D:/Project DeepL/Project-DeepL/cleaned_output_part_{i//chunk_size + 1}.csv"
    # index=False keeps the pandas row index out of the written file
    data[i:i+chunk_size].to_csv(part_path, index=False)


In [None]:
import random
import re

import numpy as np
import pandas as pd
import spacy
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from spacy.tokens import DocBin

# Load the dataset. The relative path is resolved against the current working
# directory; presumably this is one of the part files produced by the CSV
# splitting step -- TODO confirm the working directory matches.
df = pd.read_csv('cleaned_output_part_1.csv')

# Basic preprocessing
def preprocess_text(text):
    """
    Preprocess Vietnamese text for NER.

    Lowercases the text and collapses every run of whitespace (spaces, tabs,
    newlines) into a single space, also trimming both ends.

    Args:
        text: Raw review text. ``None`` and NaN (the float pandas uses for a
            missing cell) are treated as empty text; any other non-string
            value is converted with ``str()`` before processing.

    Returns:
        The normalized string; ``''`` for missing input.
    """
    # NaN is the only float that is not equal to itself, so `text != text`
    # detects it without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)
    # str.split() with no argument splits on any whitespace run and drops
    # leading/trailing whitespace, so re-joining normalizes the spacing.
    return ' '.join(text.lower().split())

# Apply preprocessing to every value of the 'review' column and store the
# result in a new 'processed_review' column (the 'review' column is not modified)
df['processed_review'] = df['review'].apply(preprocess_text)

# Custom NER labeling function

# Keyword lexicon for each entity label.
_NER_KEYWORDS = {
    'COLOR': ['đen', 'trắng', 'hồng', 'xanh', 'đỏ', 'xám', 'vàng', 'tím'],
    'PRODUCT': ['áo', 'quần', 'váy', 'giày', 'son', 'kem', 'phấn', 'túi'],
    'QUALITY': ['chất lượng', 'mỏng', 'dày', 'tệ', 'đẹp', 'xấu', 'cũ'],
    'DELIVERY': ['giao hàng', 'chậm', 'sai', 'nhanh'],
}

# Reverse lookup: matched keyword -> entity label.
_NER_LABEL_BY_KEYWORD = {
    keyword: label
    for label, keywords in _NER_KEYWORDS.items()
    for keyword in keywords
}

# Single alternation over all keywords. Longer keywords are listed first so a
# multi-word keyword is preferred over any shorter one starting at the same
# offset. The lookarounds demand that the match is not glued to another word
# character, so 'cũ' is not found inside 'cũng' nor 'son' inside 'sonic'.
_NER_KEYWORD_PATTERN = re.compile(
    r'(?<!\w)(?:'
    + '|'.join(
        re.escape(keyword)
        for keyword in sorted(_NER_LABEL_BY_KEYWORD, key=len, reverse=True)
    )
    + r')(?!\w)'
)


def custom_ner_labeling(text):
    """
    Custom NER labeling for Vietnamese reviews.

    Identifies key entities by whole-word keyword lookup:
    - PRODUCT: Product types or specific product names
    - COLOR: Color mentions
    - QUALITY: Quality-related terms
    - DELIVERY: Delivery-related terms

    Matching is case-sensitive (the keywords are lowercase), so the text is
    expected to be lowercased beforehand. Every occurrence of a keyword is
    reported, not just the first one.

    Args:
        text: The review text to annotate. Non-string or empty input yields
            no entities.

    Returns:
        A list of ``(start, end, label)`` tuples with character offsets into
        ``text`` (``end`` exclusive), ordered by ``start`` and never
        overlapping.
    """
    if not isinstance(text, str) or not text:
        return []

    # finditer scans left to right and yields non-overlapping matches, so the
    # result is already sorted by offset and free of overlapping spans.
    return [
        (match.start(), match.end(), _NER_LABEL_BY_KEYWORD[match.group()])
        for match in _NER_KEYWORD_PATTERN.finditer(text)
    ]

# Prepare training data
def prepare_spacy_training_data(texts, labels):
    """
    Build spaCy-style NER training pairs.

    Walks ``texts`` and ``labels`` in lockstep and returns a list of
    ``(text, {'entities': label})`` tuples, one per pair. Pairing stops at
    the end of the shorter input.
    """
    return [
        (sample, {'entities': spans})
        for sample, spans in zip(texts, labels)
    ]

# Generate annotated dataset: run the keyword labeler on every preprocessed
# review and keep the resulting entity spans in a 'ner_labels' column
df['ner_labels'] = df['processed_review'].apply(custom_ner_labeling)

# Split the data: texts and their labels are split together so they stay
# aligned; 20% is held out and the fixed random_state makes the split repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['processed_review'], 
    df['ner_labels'], 
    test_size=0.2, 
    random_state=42
)

# Convert the training split into (text, annotations) pairs
train_data = prepare_spacy_training_data(train_texts, train_labels)

# Start from an empty Vietnamese pipeline and attach a new NER component
nlp = spacy.blank("vi")
ner = nlp.add_pipe("ner")

# Register every entity label found in the training annotations, visiting
# each distinct label once in the order it is first seen
seen_labels = dict.fromkeys(
    ent[2]
    for _, annotations in train_data
    for ent in annotations.get('entities')
)
for label in seen_labels:
    ner.add_label(label)

# Serializing the NER training data
from spacy.training import Example

# Collect the annotated reference docs in a DocBin
db = DocBin()
for text, annotations in train_data:
    reference_doc = Example.from_dict(nlp.make_doc(text), annotations).reference
    db.add(reference_doc)

# Write the training corpus to disk
db.to_disk("./train.spacy")

# Basic model configuration.
# NOTE(review): this dict is not passed to any spaCy call in the visible code,
# and its keys ("threshold", "max_action") have not been checked against the
# parameters the referenced architecture accepts -- confirm before relying on it.
config = {
    "threshold": 0.5,
    "model": {
        "@architectures": "spacy.TransitionBasedParser.v1",
        "hidden_width": 64,
        "max_action": 100
    }
}

# Signal that the preprocessing / setup steps above ran to completion.
print("NER Preprocessing and Initial Model Setup Complete")