## 1. Import Libraries và Khởi tạo Spark Session

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf, current_timestamp
import torch
import json
import time
from transformers import AutoTokenizer
from vncorenlp import VnCoreNLP
import os
import warnings
warnings.filterwarnings('ignore')

# MinIO connection details, named once and reused for every layer below
# (AWS SDK environment, Iceberg S3FileIO, Hadoop S3A, executor environment).
# NOTE(review): the credentials are hard-coded in the notebook.
minio_endpoint = "http://minio:9000"
minio_region = 'us-east-1'
minio_access_key = 'admin'
minio_secret_key = 'admin123'

os.environ.update({
    'AWS_REGION': minio_region,
    'AWS_ACCESS_KEY_ID': minio_access_key,
    'AWS_SECRET_ACCESS_KEY': minio_secret_key,
})

# Spark settings, applied in insertion order.
spark_settings = {
    # Executor sizing and timeouts (1.5 GB executors are meant for 2 GB workers)
    "spark.executor.memory": "1536m",
    "spark.executor.cores": "2",
    "spark.network.timeout": "600s",
    "spark.executor.heartbeatInterval": "60s",
    "spark.storage.blockManagerSlaveTimeoutMs": "600000",
    "spark.rpc.askTimeout": "600s",
    # Iceberg catalog served by Nessie
    "spark.sql.catalog.nessie": "org.apache.iceberg.spark.SparkCatalog",
    "spark.sql.catalog.nessie.catalog-impl": "org.apache.iceberg.nessie.NessieCatalog",
    "spark.sql.catalog.nessie.uri": "http://nessie:19120/api/v2",
    "spark.sql.catalog.nessie.ref": "main",
    "spark.sql.catalog.nessie.warehouse": "s3a://silver/",
    "spark.sql.catalog.nessie.io-impl": "org.apache.iceberg.aws.s3.S3FileIO",
    # MinIO (S3-compatible) settings for the Iceberg catalog
    "spark.sql.catalog.nessie.s3.endpoint": minio_endpoint,
    "spark.sql.catalog.nessie.s3.access-key-id": minio_access_key,
    "spark.sql.catalog.nessie.s3.secret-access-key": minio_secret_key,
    "spark.sql.catalog.nessie.s3.path-style-access": "true",
    "spark.sql.catalog.nessie.s3.region": minio_region,
    # Spark + Hadoop S3A connector
    "spark.hadoop.fs.s3a.endpoint": minio_endpoint,
    "spark.hadoop.fs.s3a.access.key": minio_access_key,
    "spark.hadoop.fs.s3a.secret.key": minio_secret_key,
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
    "spark.hadoop.fs.s3a.connection.ssl.enabled": "false",
    "spark.hadoop.fs.s3a.region": minio_region,
    # Propagate the AWS environment variables to the executors
    "spark.executorEnv.AWS_REGION": minio_region,
    "spark.executorEnv.AWS_ACCESS_KEY_ID": minio_access_key,
    "spark.executorEnv.AWS_SECRET_ACCESS_KEY": minio_secret_key,
    # JAR files shipped from the local filesystem
    "spark.jars": "/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar",
}

# Build the session: Iceberg tables through the Nessie catalog, data on MinIO.
session_builder = (
    SparkSession.builder.appName("Apply_Model_Multi_Task")
    .master("spark://spark-master:7077")
)
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

spark_context = spark.sparkContext
spark_context.setLogLevel("ERROR")
print("Spark Session da duoc khoi tao voi Nessie catalog!")
print(f"Spark Master: {spark_context.master}")
print(f"Application ID: {spark_context.applicationId}")

## 2. Đọc Dữ Liệu từ Bảng Article

In [None]:
# Read the source rows from the article table.
df_articles = spark.table("nessie.silver_tables.article")

# cache() is lazy: mark the DataFrame first so that the count() below is the
# action that actually materializes the cache. Counting before caching would
# scan the table without populating the cache.
df_articles.cache()
print(f"Tổng số articles: {df_articles.count()}")
print("✓ Đã cache df_articles")

## 3. Load Model và Cấu Hình

In [None]:
# Local model directory (mounted into the container)
local_model_path = '/opt/spark-apps/phobert_multitask_final'

# Read the JSON files from the local filesystem
import json
import os

# Open explicitly as UTF-8 so that parsing does not depend on the container's
# default locale encoding.
with open(f'{local_model_path}/model_config.json', 'r', encoding='utf-8') as f:
    model_config = json.load(f)

with open(f'{local_model_path}/label_mappings.json', 'r', encoding='utf-8') as f:
    label_mappings = json.load(f)

print(f"NER labels: {model_config['num_ner_labels']}")
print(f"Topic labels: {model_config['num_topic_labels']}")
print(f"Intent labels: {model_config['num_intent_labels']}")

# Check the VnCoreNLP JAR at the mounted path and start the word segmenter.
print("\n=== Kiểm tra VnCoreNLP JAR file ===")
vncorenlp_path = '/opt/spark-apps/VnCoreNLP-1.2/VnCoreNLP-1.2.jar'

# `annotator` stays None whenever the segmenter cannot be started; later cells
# receive that None and must cope with it.
annotator = None

if os.path.exists(vncorenlp_path):
    print(f"Tìm thấy: {vncorenlp_path}")
    print("\nĐang khởi tạo VnCoreNLP...")
    try:
        annotator = VnCoreNLP(vncorenlp_path, annotators="wseg", max_heap_size='-Xmx2g')
        print("VnCoreNLP loaded successfully!")
    except Exception as e:
        # Best effort: report the failure and continue without a segmenter.
        print(f"Lỗi khởi tạo VnCoreNLP: {e}")
else:
    print(f"Không tìm thấy VnCoreNLP JAR tại: {vncorenlp_path}")
    print("Kiểm tra lại volume mount trong docker-compose.yaml")

# Keep the model path available for the following cells
model_path = local_model_path

## 4. Định Nghĩa Model Architecture

In [None]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel


class MultiTaskPhoBERT_WithFusion(nn.Module):
    """Multi-task PhoBERT with NER-to-topic feature fusion.

    A shared RoBERTa encoder feeds three heads:

    * a token-level NER classifier,
    * a sequence-level intent classifier,
    * a sequence-level topic classifier whose input is the attended first-token
      representation concatenated with pooled NER probabilities.

    Attribute names and their creation order are part of the checkpoint
    contract (``load_state_dict``) and must not be renamed.
    """

    def __init__(self, phobert_path, num_ner_labels, num_topic_labels, num_intent_labels, dropout=0.2):
        """Build the encoder and the task heads.

        Args:
            phobert_path: Identifier or path passed to
                ``RobertaModel.from_pretrained``.
            num_ner_labels: Number of NER classes (per token).
            num_topic_labels: Number of topic classes.
            num_intent_labels: Number of intent classes.
            dropout: Base dropout probability for the heads.
        """
        super().__init__()

        self.phobert = RobertaModel.from_pretrained(phobert_path)
        # NOTE(review): these values are written to the config after the encoder
        # has been constructed, so they probably do not affect the encoder's
        # already-built dropout layers — confirm before relying on them.
        self.phobert.config.hidden_dropout_prob = 0.25
        self.phobert.config.attention_probs_dropout_prob = 0.25
        self.hidden_size = self.phobert.config.hidden_size
        self.num_ner_labels = num_ner_labels

        self.dropout = nn.Dropout(dropout)
        self.dropout_heavy = nn.Dropout(dropout * 1.5)

        # NER head: hidden -> hidden/2 -> num_ner_labels, applied to every token
        self.ner_hidden = nn.Linear(self.hidden_size, self.hidden_size // 2)
        self.ner_norm = nn.LayerNorm(self.hidden_size // 2)
        self.ner_classifier = nn.Linear(self.hidden_size // 2, num_ner_labels)

        # Intent head: hidden -> hidden/2 -> num_intent_labels
        self.intent_hidden = nn.Linear(self.hidden_size, self.hidden_size // 2)
        self.intent_norm = nn.LayerNorm(self.hidden_size // 2)
        self.intent_classifier = nn.Linear(self.hidden_size // 2, num_intent_labels)

        # Topic fusion head: [sequence vector ; pooled NER features] -> hidden
        fusion_input_size = self.hidden_size + num_ner_labels
        self.topic_input_proj = nn.Linear(fusion_input_size, self.hidden_size)

        self.topic_layer1 = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout * 0.5)
        )

        self.topic_layer2 = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout * 0.5)
        )

        self.topic_classifier = nn.Linear(self.hidden_size, num_topic_labels)

        # Gating network that re-weights the pooled NER features
        self.ner_attention = nn.Sequential(
            nn.Linear(num_ner_labels, num_ner_labels // 2),
            nn.Tanh(),
            nn.Linear(num_ner_labels // 2, num_ner_labels),
            nn.Softmax(dim=-1)
        )

        # Cross-attention: the first-token vector queries the whole sequence
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=self.hidden_size,
            num_heads=8,
            dropout=dropout,
            batch_first=True
        )
        self.cross_attn_norm = nn.LayerNorm(self.hidden_size)

        # Auxiliary topic head on NER features. It is not used by forward();
        # it is kept so that the module's parameter set stays unchanged.
        self.aux_topic_classifier = nn.Sequential(
            nn.Linear(num_ner_labels, num_ner_labels // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(num_ner_labels // 2, num_topic_labels)
        )

    def extract_ner_features(self, ner_logits, attention_mask):
        """Pool token-level NER probabilities into one vector per sequence.

        The softmax probabilities are masked, pooled with an equal mix of
        max- and mean-pooling over the sequence dimension, and re-weighted by
        ``self.ner_attention``.

        Args:
            ner_logits: Tensor of shape (batch, seq, num_ner_labels).
            attention_mask: Tensor of shape (batch, seq) with 1 for real
                tokens and 0 for padding, or ``None`` to treat every position
                as a real token.

        Returns:
            Tensor of shape (batch, num_ner_labels).
        """
        ner_probs = F.softmax(ner_logits, dim=-1)

        # forward() accepts attention_mask=None, so fall back to an all-ones
        # mask instead of failing on None.unsqueeze().
        if attention_mask is None:
            attention_mask = torch.ones(
                ner_probs.shape[:2], dtype=torch.long, device=ner_probs.device
            )

        attention_mask_expanded = attention_mask.unsqueeze(-1).expand_as(ner_probs)
        ner_probs_masked = ner_probs * attention_mask_expanded

        # Probabilities are non-negative, so zeroed padding never wins the max.
        max_features, _ = ner_probs_masked.max(dim=1)

        # clamp avoids a division by zero for an all-padding sequence
        seq_lengths = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        avg_features = ner_probs_masked.sum(dim=1) / seq_lengths

        ner_features = 0.5 * max_features + 0.5 * avg_features

        attention_weights = self.ner_attention(ner_features)
        ner_features_weighted = ner_features * attention_weights

        return ner_features_weighted

    def forward(self, input_ids, attention_mask=None, ner_labels=None, topic_labels=None, intent_labels=None):
        """Run the encoder and all three heads.

        Args:
            input_ids: Token ids, shape (batch, seq).
            attention_mask: Optional mask, shape (batch, seq), 0 marks padding.
            ner_labels, topic_labels, intent_labels: Accepted for interface
                compatibility; not used here (no loss is computed).

        Returns:
            dict with ``'loss'`` (always ``None``), ``'ner_logits'``
            (batch, seq, num_ner_labels), ``'topic_logits'``
            (batch, num_topic_labels) and ``'intent_logits'``
            (batch, num_intent_labels).
        """
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        cls_output = sequence_output[:, 0, :]

        # Let the first-token vector attend over the sequence (padding ignored),
        # then add it back residually and normalize.
        cls_expanded = cls_output.unsqueeze(1)
        attn_output, _ = self.cross_attention(
            query=cls_expanded,
            key=sequence_output,
            value=sequence_output,
            key_padding_mask=(attention_mask == 0) if attention_mask is not None else None
        )
        cls_output = self.cross_attn_norm(cls_output + attn_output.squeeze(1))
        cls_output_dropped = self.dropout_heavy(cls_output)

        # NER predictions (per token)
        ner_hidden = self.ner_hidden(sequence_output)
        ner_hidden = self.ner_norm(ner_hidden)
        ner_hidden = F.gelu(ner_hidden)
        ner_hidden = self.dropout(ner_hidden)
        ner_logits = self.ner_classifier(ner_hidden)

        ner_features = self.extract_ner_features(ner_logits, attention_mask)

        # Topic prediction: fused input followed by two residual blocks
        topic_input = torch.cat([cls_output_dropped, ner_features], dim=-1)
        topic_hidden = self.topic_input_proj(topic_input)
        topic_hidden = topic_hidden + self.topic_layer1(topic_hidden)
        topic_hidden = topic_hidden + self.topic_layer2(topic_hidden)
        topic_logits = self.topic_classifier(topic_hidden)

        # Intent prediction
        intent_hidden = self.intent_hidden(cls_output_dropped)
        intent_hidden = self.intent_norm(intent_hidden)
        intent_hidden = F.gelu(intent_hidden)
        intent_hidden = self.dropout(intent_hidden)
        intent_logits = self.intent_classifier(intent_hidden)

        return {
            'loss': None,
            'ner_logits': ner_logits,
            'topic_logits': topic_logits,
            'intent_logits': intent_logits
        }

print("Model class defined!")

## 5. Load Model Weights

In [None]:
# Local model directory (mounted into the container)
local_model_path = '/opt/spark-apps/phobert_multitask_final'

print(f"Loading model from: {local_model_path}")

# Load the tokenizer and build the model from the saved configuration
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

model = MultiTaskPhoBERT_WithFusion(
    phobert_path=model_config['phobert_base'],
    num_ner_labels=model_config['num_ner_labels'],
    num_topic_labels=model_config['num_topic_labels'],
    num_intent_labels=model_config['num_intent_labels'],
    dropout=model_config['dropout']
)

# Load the trained weights.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (consider weights_only=True if the installed torch supports it).
state_dict = torch.load(f'{local_model_path}/pytorch_model.bin', map_location='cpu')
model.load_state_dict(state_dict)
# load_state_dict copies the tensors into the model, so drop this second copy
# of the weights instead of keeping it alive for the rest of the notebook.
del state_dict
model.eval()

# Move the model to the GPU when one is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(f"Model loaded successfully on {device}!")

## 6. Định Nghĩa Prediction Functions

In [None]:
def normalize_text_with_underscore(text, annotator):
    """Word-segment Vietnamese text, joining multi-syllable words with '_'.

    Args:
        text: Raw input text.
        annotator: Object exposing ``tokenize(text)`` that returns a list of
            sentences, each a list of words (a word may itself be a list of
            syllables). ``None`` means no segmenter is available.

    Returns:
        All sentences flattened into one space-separated string. The input
        ``text`` is returned unchanged when there is no annotator or when
        segmentation fails.
    """
    # The setup cell leaves the annotator as None when VnCoreNLP cannot start;
    # treat that as "no segmentation" instead of raising and logging an
    # AttributeError for every single text.
    if annotator is None:
        return text

    try:
        sentences = annotator.tokenize(text)
        return ' '.join(
            ' '.join(
                '_'.join(word) if isinstance(word, list) else word
                for word in sentence
            )
            for sentence in sentences
        )
    except Exception as e:
        # Best effort: a segmenter failure must not abort the whole run.
        print(f"Error normalizing text: {e}")
        return text


def predict_labels(text, model, tokenizer, label_mappings, device, annotator, max_length=256):
    """Predict NER, topic and intent labels for one text.

    Args:
        text: Raw input text. Empty / whitespace-only / ``None`` yields the
            default labels without running the model.
        model: Callable returning a dict with ``'ner_logits'``,
            ``'topic_logits'`` and ``'intent_logits'``.
        tokenizer: Tokenizer providing ``__call__``, ``tokenize`` and
            ``convert_ids_to_tokens``.
        label_mappings: dict with ``'ner_id2label'``, ``'topic_id2label'`` and
            ``'intent_id2label'``, each keyed by the class index as a string.
        device: Device the input tensors are moved to.
        annotator: Word segmenter handed to ``normalize_text_with_underscore``.
        max_length: Sequence length used for padding and truncation.

    Returns:
        dict with ``'normalized_text'``, ``'ner_labels'`` (space-separated, one
        label per whitespace-separated word of the normalized text),
        ``'topic_label'`` ('|'-joined labels whose sigmoid score exceeds 0.5,
        or ``'None'``) and ``'intent_label'``.
    """
    if not text or text.strip() == '':
        return {
            'normalized_text': '',
            'ner_labels': 'O',
            'topic_label': 'None',
            'intent_label': 'Unknown'
        }

    # Word-segment the text
    normalized_text = normalize_text_with_underscore(text, annotator)
    normalized_tokens = normalized_text.split()

    # Tokenize for the encoder
    inputs = tokenizer(
        normalized_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # The model's forward() does not take token_type_ids
    inputs = {k: v.to(device) for k, v in inputs.items() if k != 'token_type_ids'}

    with torch.no_grad():
        outputs = model(**inputs)

    ner_logits = outputs['ner_logits']
    topic_logits = outputs['topic_logits']
    intent_logits = outputs['intent_logits']

    # Bring the per-token predictions to Python once instead of calling
    # .item() (a device round-trip) for every word.
    ner_pred_ids = torch.argmax(ner_logits, dim=-1)[0].tolist()
    phobert_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
    num_subwords = len(phobert_tokens)

    # Align labels with the segmented words: each word takes the label
    # predicted for its first sub-word token.
    ner_id2label = label_mappings['ner_id2label']
    ner_labels = []
    token_idx = 0

    for word in normalized_tokens:
        while token_idx < num_subwords and phobert_tokens[token_idx] == '<s>':
            token_idx += 1

        if token_idx < num_subwords and phobert_tokens[token_idx] not in ('</s>', '<pad>'):
            ner_labels.append(ner_id2label[str(ner_pred_ids[token_idx])])
            token_idx += len(tokenizer.tokenize(word))
        else:
            # The word was cut off by truncation. Keep emitting 'O' for it and
            # for every following word so that the label sequence stays
            # aligned one-to-one with the words of normalized_text.
            ner_labels.append('O')

    # Topic predictions (multi-label, threshold=0.5)
    topic_probs = torch.sigmoid(topic_logits)[0]
    topic_preds = (topic_probs > 0.5).nonzero(as_tuple=True)[0].cpu().tolist()

    if topic_preds:
        topic_id2label = label_mappings['topic_id2label']
        topic_label = '|'.join(topic_id2label[str(idx)] for idx in topic_preds)
    else:
        topic_label = 'None'

    # Intent prediction (single-label)
    intent_pred = torch.argmax(intent_logits, dim=-1).item()
    intent_label = label_mappings['intent_id2label'][str(intent_pred)]

    return {
        'normalized_text': normalized_text,
        'ner_labels': ' '.join(ner_labels),
        'topic_label': topic_label,
        'intent_label': intent_label
    }

print("Prediction functions defined!")

## 7. Apply Model trên Toàn Bộ Dữ Liệu

In [None]:
# Incremental, batched inference: label every article that is not yet in the
# result table and write the rows to Iceberg in batches.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import days
import gc

RESULT_TABLE = "nessie.silver_tables.result_multi_model"

# Schema of the rows produced by the model (audit timestamps are added on write)
schema = StructType([
    StructField("postID", StringType(), False),
    StructField("timePublish", TimestampType(), False),
    StructField("description_Normalized", StringType(), True),
    StructField("Label_NER", StringType(), True),
    StructField("Label_Topic", StringType(), True),
    StructField("Label_Intent", StringType(), True),
    StructField("likeCount", IntegerType(), True),
    StructField("commentCount", IntegerType(), True),
    StructField("shareCount", IntegerType(), True),
    StructField("type", StringType(), True)
])


def _is_missing_table_error(err):
    """Return True when the error text says the target table does not exist."""
    message = str(err).lower()
    # NOTE(review): the exact wording depends on the Spark / Iceberg version;
    # the last two markers are assumed to cover the TABLE_OR_VIEW_NOT_FOUND
    # error class — confirm against the installed version.
    markers = ("table does not exist", "not found", "table_or_view_not_found", "cannot be found")
    return any(marker in message for marker in markers)


def _count_or_zero(value):
    """Convert an engagement counter to int, treating missing values as 0."""
    return int(value) if value else 0


def _build_result(row, normalized_text, ner_labels, topic_label, intent_label):
    """Build one result record (matching `schema`) for an article row."""
    return {
        'postID': row['articleID'],
        'timePublish': row['timePublish'],
        'description_Normalized': normalized_text,
        'Label_NER': ner_labels,
        'Label_Topic': topic_label,
        'Label_Intent': intent_label,
        'likeCount': _count_or_zero(row['likeCount']),
        'commentCount': _count_or_zero(row['commentCount']),
        'shareCount': _count_or_zero(row['shareCount']),
        'type': row['type']
    }


def _result_writer(batch_df):
    """Return a fresh table writer for `batch_df` with the shared table properties."""
    return (
        batch_df.writeTo(RESULT_TABLE)
        .using("iceberg")
        .tableProperty("write.format.default", "parquet")
        .tableProperty("write.metadata.compression-codec", "gzip")
        .tableProperty("write.parquet.compression-codec", "gzip")
    )


def _save_batch(records):
    """Append `records` to the result table, creating the table on first use.

    Returns True when the table had to be created, False for a plain append.
    Any failure other than "table is missing" is re-raised to the caller.
    """
    batch_df = (
        spark.createDataFrame(records, schema=schema)
        .withColumn("created_at", current_timestamp())
        .withColumn("updated_at", current_timestamp())
    )
    try:
        _result_writer(batch_df).append()
    except Exception as err:
        if not _is_missing_table_error(err):
            raise
    else:
        return False

    # First write ever: create the table, partitioned by publication day.
    print("Creating table for first batch...")
    _result_writer(batch_df).partitionedBy(days(col("timePublish"))).create()
    return True


# Collect the IDs that were already processed in earlier runs
try:
    df_existing = spark.table(RESULT_TABLE)
    existing_post_ids = [r.postID for r in df_existing.select("postID").distinct().collect()]
    print(f"Tìm thấy {len(existing_post_ids)} articles đã được xử lý trước đó")
    del df_existing
except Exception as e:
    print(f"Table result_multi_model chưa tồn tại hoặc rỗng: {e}")
    existing_post_ids = []

# Keep only the articles that have not been processed yet
if existing_post_ids:
    df_articles_new = df_articles.filter(~col("articleID").isin(existing_post_ids))
    print(f"Tổng số articles mới cần xử lý: {df_articles_new.count()}")
else:
    df_articles_new = df_articles
    print(f"Xử lý toàn bộ {df_articles.count()} articles")

print("\n=== Collecting data to process ===")
# Collect the rows to the driver (appropriate for a small dataset only)
articles_to_process = df_articles_new.select("articleID", "description", "timePublish",
                                              "likeCount", "commentCount", "shareCount", "type").collect()
total_articles = len(articles_to_process)
print(f"Đã collect {total_articles} articles về driver")

# The rows now live on the driver, so the cached source DataFrame is no longer
# needed. Release it explicitly: unpersisting df_articles_new only had an
# effect when it happened to be the very same object as df_articles.
df_articles.unpersist()
del df_articles_new
gc.collect()
print("✓ Đã giải phóng df_articles_new")

# Run the model (already loaded on the driver) article by article
results = []
batch_size = 50  # flush to the table every 50 articles
start_time = time.time()

print("\n=== Processing articles with model ===")
for idx, row in enumerate(articles_to_process):
    if idx % 10 == 0:
        elapsed = time.time() - start_time
        # idx articles are finished at this point, so average over idx
        avg_time = elapsed / idx if idx > 0 else 0
        remaining = avg_time * (total_articles - idx)
        print(f"[{time.strftime('%H:%M:%S')}] Processed {idx}/{total_articles} | "
              f"Elapsed: {elapsed:.1f}s | ETA: {remaining:.1f}s")

    article_id = row['articleID']
    description = row['description']

    if not description or description.strip() == '':
        # Nothing to classify: store default labels
        results.append(_build_result(row, '', 'O', 'None', 'Unknown'))
    else:
        try:
            predictions = predict_labels(description, model, tokenizer, label_mappings, device, annotator)
            results.append(_build_result(
                row,
                predictions['normalized_text'],
                predictions['ner_labels'],
                predictions['topic_label'],
                predictions['intent_label'],
            ))
        except Exception as e:
            # Best effort: keep the article with default labels
            print(f"Error processing article {article_id}: {e}")
            results.append(_build_result(row, description, 'O', 'None', 'Unknown'))

    # Flush a full batch. This check runs for every article, including those
    # with an empty description, so batches never silently grow past batch_size.
    if (idx + 1) % batch_size == 0 and results:
        try:
            table_created = _save_batch(results)
        except Exception as batch_err:
            # Keep the records in `results`; they are retried with the next batch.
            print(f"✗ Error saving batch: {batch_err}")
        else:
            if table_created:
                print(f"✓ Table created with {len(results)} records")
            else:
                print(f"✓ Saved batch: {len(results)} records | Total processed: {idx + 1}")
            results = []
            gc.collect()

# Save the remaining records (if any)
if results:
    try:
        table_created = _save_batch(results)
    except Exception as final_err:
        # Report the failure instead of dropping the records silently
        print(f"✗ Error saving final batch ({len(results)} records NOT saved): {final_err}")
    else:
        if table_created:
            print(f"✓ Table created with {len(results)} records")
        else:
            print(f"✓ Saved final batch: {len(results)} records")

# Release the driver-side data
del results, articles_to_process
gc.collect()
print("✓ Đã giải phóng bộ nhớ processing")

total_time = time.time() - start_time
print(f"\n=== Completed processing in {total_time:.1f}s ===")

# Check what was saved
print("\n=== Checking saved results ===")
try:
    df_check = spark.table(RESULT_TABLE)
    print(f"Total records in table: {df_check.count()}")
    print("\nRecent 5 records:")
    df_check.orderBy(col("created_at").desc()).show(5, truncate=80)

    del df_check
    gc.collect()

except Exception as e:
    print(f"Could not read table: {e}")

## 8. Verify Dữ Liệu Đã Lưu

In [None]:
# Sanity-check the rows written by the batch-processing cell
import gc

try:
    df_verify = spark.table("nessie.silver_tables.result_multi_model")
    newest_first = col("created_at").desc()

    total_count = df_verify.count()
    print(f"✓ Total records in table: {total_count}")

    print("\nRecent saved records:")
    recent_rows = df_verify.orderBy(newest_first)
    recent_rows.show(5, truncate=False)

    # Row counts per creation timestamp, newest first
    print("\n=== Records saved by time ===")
    rows_per_timestamp = df_verify.groupBy("created_at").count()
    rows_per_timestamp.orderBy(newest_first).show(20, truncate=False)

    # Drop the reference and let Python reclaim memory
    df_verify.unpersist()
    del df_verify
    gc.collect()
    print("\n✓ Đã giải phóng bộ nhớ")

except Exception as e:
    print(f"✗ Error reading table: {e}")

## 9. Thống kê kết quả

In [None]:
# Re-read the result table and print label statistics
import gc

df_verify = spark.table("nessie.silver_tables.result_multi_model")

total_rows = df_verify.count()
print(f"Total records in result_multi_model: {total_rows}")
print("\nSample records:")
df_verify.show(10, truncate=False)

print("\n=== Label Statistics ===")

# (heading, label column, number of rows to display); 20 is show()'s default
distribution_reports = (
    ("\nTopic distribution:", "Label_Topic", 10),
    ("\nIntent distribution:", "Label_Intent", 20),
)
for heading, label_column, rows_to_show in distribution_reports:
    print(heading)
    label_counts = df_verify.groupBy(label_column).count()
    label_counts.orderBy(col("count").desc()).show(rows_to_show, truncate=False)
    del label_counts

# Drop the reference and let Python reclaim memory
df_verify.unpersist()
del df_verify
gc.collect()
print("\n✓ Đã giải phóng bộ nhớ")

## 10. Dừng Spark Session

In [None]:
# Release resources, then stop Spark
import gc

# Unpersist the cached article DataFrame if it is still defined
try:
    df_articles.unpersist()
    del df_articles
except NameError:
    # Already released, or the loading cell was never run
    pass
except Exception as err:
    # Best effort: report, but still go on to stop Spark
    print(f"Could not unpersist df_articles: {err}")

# Shut down the VnCoreNLP server. Deleting the Python name alone does not
# stop the background Java process, so close it explicitly when possible.
try:
    _close_annotator = getattr(annotator, "close", None)
    if callable(_close_annotator):
        _close_annotator()
except NameError:
    pass
except Exception as err:
    print(f"Could not close VnCoreNLP: {err}")

# Drop the model objects. pop() tolerates names that are already gone, whereas
# `del a, b, c` stops at the first missing name and leaves the rest alive.
for _name in ("model", "tokenizer", "label_mappings", "annotator"):
    globals().pop(_name, None)

# Python garbage collection
gc.collect()
print("✓ Đã giải phóng tất cả bộ nhớ")

# Stop the Spark session
spark.stop()
print("✓ Spark Session đã được dừng!")