## 1. Import Libraries và Khởi tạo Spark Session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf, current_timestamp
import torch
import json
import time
from transformers import AutoTokenizer
from vncorenlp import VnCoreNLP
import os
import warnings
warnings.filterwarnings('ignore')

# Set AWS environment variables for MinIO
os.environ['AWS_REGION'] = 'us-east-1'
os.environ['AWS_ACCESS_KEY_ID'] = 'admin'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'admin123'

# Khởi tạo Spark Session với Iceberg và Nessie catalog
spark = (
    SparkSession.builder.appName("Apply_Model_Multi_Task")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1536m")  # 1.5GB - an toàn với worker 2GB
    .config("spark.executor.cores", "2")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.storage.blockManagerSlaveTimeoutMs", "600000")
    .config("spark.rpc.askTimeout", "600s")
    # ===== Iceberg Catalog qua Nessie =====
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v2")
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://silver/")
    .config("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    # ===== Cấu hình MinIO (S3-compatible) =====
    .config("spark.sql.catalog.nessie.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.nessie.s3.access-key-id", "admin")
    .config("spark.sql.catalog.nessie.s3.secret-access-key", "admin123")
    .config("spark.sql.catalog.nessie.s3.path-style-access", "true")
    .config("spark.sql.catalog.nessie.s3.region", "us-east-1")
    # ===== Spark + Hadoop S3 connector =====
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", "admin")
    .config("spark.hadoop.fs.s3a.secret.key", "admin123")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.region", "us-east-1")
    # Propagate environment variables to executors
    .config("spark.executorEnv.AWS_REGION", "us-east-1")
    .config("spark.executorEnv.AWS_ACCESS_KEY_ID", "admin")
    .config("spark.executorEnv.AWS_SECRET_ACCESS_KEY", "admin123")
    # ===== Sử dụng JAR files local =====
    .config("spark.jars", "/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar")
    .getOrCreate()
)

spark.sparkContext.setLogLevel("ERROR")
print("Spark Session da duoc khoi tao voi Nessie catalog!")
print(f"Spark Master: {spark.sparkContext.master}")
print(f"Application ID: {spark.sparkContext.applicationId}")

25/12/08 07:29:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark Session da duoc khoi tao voi Nessie catalog!
Spark Master: spark://spark-master:7077
Application ID: app-20251208072944-0002


## 2. Đọc Dữ Liệu từ Bảng Article

In [2]:
# Đọc dữ liệu từ bảng article
df_articles = spark.table("nessie.silver_tables.article")
print(f"Tổng số articles: {df_articles.count()}")

# Cache để tránh đọc lại nhiều lần
df_articles.cache()
print("✓ Đã cache df_articles")

[Stage 0:>                                                          (0 + 1) / 1]

Tổng số articles: 3295
✓ Đã cache df_articles


                                                                                

## 3. Load Model và Cấu Hình

In [3]:
# Đường dẫn model local (đã mount vào container)
local_model_path = '/opt/spark-apps/phobert_multitask_final'

# Đọc JSON files từ local filesystem
import json
import os

with open(f'{local_model_path}/model_config.json', 'r') as f:
    model_config = json.load(f)

with open(f'{local_model_path}/label_mappings.json', 'r') as f:
    label_mappings = json.load(f)

print(f"NER labels: {model_config['num_ner_labels']}")
print(f"Topic labels: {model_config['num_topic_labels']}")
print(f"Intent labels: {model_config['num_intent_labels']}")

# Kiểm tra VnCoreNLP JAR file tại path đã mount
print("\n=== Kiểm tra VnCoreNLP JAR file ===")
vncorenlp_path = '/opt/spark-apps/VnCoreNLP-1.2/VnCoreNLP-1.2.jar'

if os.path.exists(vncorenlp_path):
    print(f"Tìm thấy: {vncorenlp_path}")
    print(f"\nĐang khởi tạo VnCoreNLP...")
    try:
        annotator = VnCoreNLP(vncorenlp_path, annotators="wseg", max_heap_size='-Xmx2g')
        print("VnCoreNLP loaded successfully!")
    except Exception as e:
        print(f"Lỗi khởi tạo VnCoreNLP: {e}")
        annotator = None
else:
    print(f"Không tìm thấy VnCoreNLP JAR tại: {vncorenlp_path}")
    print("Kiểm tra lại volume mount trong docker-compose.yaml")
    annotator = None

# Lưu model_path cho các cell sau
model_path = local_model_path

NER labels: 25
Topic labels: 10
Intent labels: 7

=== Kiểm tra VnCoreNLP JAR file ===
Tìm thấy: /opt/spark-apps/VnCoreNLP-1.2/VnCoreNLP-1.2.jar

Đang khởi tạo VnCoreNLP...
VnCoreNLP loaded successfully!


## 4. Định Nghĩa Model Architecture

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel


class MultiTaskPhoBERT_WithFusion(nn.Module):
    """Multi-Task PhoBERT với Feature Fusion"""
    def __init__(self, phobert_path, num_ner_labels, num_topic_labels, num_intent_labels, dropout=0.2):
        super(MultiTaskPhoBERT_WithFusion, self).__init__()
        
        self.phobert = RobertaModel.from_pretrained(phobert_path)
        self.phobert.config.hidden_dropout_prob = 0.25
        self.phobert.config.attention_probs_dropout_prob = 0.25
        self.hidden_size = self.phobert.config.hidden_size
        self.num_ner_labels = num_ner_labels
        
        self.dropout = nn.Dropout(dropout)
        self.dropout_heavy = nn.Dropout(dropout * 1.5)
        
        # NER head
        self.ner_hidden = nn.Linear(self.hidden_size, self.hidden_size // 2)
        self.ner_norm = nn.LayerNorm(self.hidden_size // 2)
        self.ner_classifier = nn.Linear(self.hidden_size // 2, num_ner_labels)
        
        # Intent head
        self.intent_hidden = nn.Linear(self.hidden_size, self.hidden_size // 2)
        self.intent_norm = nn.LayerNorm(self.hidden_size // 2)
        self.intent_classifier = nn.Linear(self.hidden_size // 2, num_intent_labels)
        
        # Topic Fusion Head
        fusion_input_size = self.hidden_size + num_ner_labels
        self.topic_input_proj = nn.Linear(fusion_input_size, self.hidden_size)
        
        self.topic_layer1 = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout * 0.5)
        )
        
        self.topic_layer2 = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout * 0.5)
        )
        
        self.topic_classifier = nn.Linear(self.hidden_size, num_topic_labels)
        
        # NER attention mechanism
        self.ner_attention = nn.Sequential(
            nn.Linear(num_ner_labels, num_ner_labels // 2),
            nn.Tanh(),
            nn.Linear(num_ner_labels // 2, num_ner_labels),
            nn.Softmax(dim=-1)
        )
        
        # Cross-Attention
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=self.hidden_size,
            num_heads=8,
            dropout=dropout,
            batch_first=True
        )
        self.cross_attn_norm = nn.LayerNorm(self.hidden_size)
        
        # Auxiliary head
        self.aux_topic_classifier = nn.Sequential(
            nn.Linear(num_ner_labels, num_ner_labels // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(num_ner_labels // 2, num_topic_labels)
        )
    
    def extract_ner_features(self, ner_logits, attention_mask):
        """Trích xuất NER features với MAX + AVG pooling"""
        ner_probs = F.softmax(ner_logits, dim=-1)
        
        attention_mask_expanded = attention_mask.unsqueeze(-1).expand_as(ner_probs)
        ner_probs_masked = ner_probs * attention_mask_expanded
        
        max_features, _ = ner_probs_masked.max(dim=1)
        
        seq_lengths = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        avg_features = ner_probs_masked.sum(dim=1) / seq_lengths
        
        ner_features = 0.5 * max_features + 0.5 * avg_features
        
        attention_weights = self.ner_attention(ner_features)
        ner_features_weighted = ner_features * attention_weights
        
        return ner_features_weighted
    
    def forward(self, input_ids, attention_mask=None, ner_labels=None, topic_labels=None, intent_labels=None):
        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        cls_output = sequence_output[:, 0, :]
        
        cls_expanded = cls_output.unsqueeze(1)
        attn_output, _ = self.cross_attention(
            query=cls_expanded,
            key=sequence_output,
            value=sequence_output,
            key_padding_mask=(attention_mask == 0) if attention_mask is not None else None
        )
        cls_output = self.cross_attn_norm(cls_output + attn_output.squeeze(1))
        cls_output_dropped = self.dropout_heavy(cls_output)
        
        # NER predictions
        ner_hidden = self.ner_hidden(sequence_output)
        ner_hidden = self.ner_norm(ner_hidden)
        ner_hidden = F.gelu(ner_hidden)
        ner_hidden = self.dropout(ner_hidden)
        ner_logits = self.ner_classifier(ner_hidden)
        
        ner_features = self.extract_ner_features(ner_logits, attention_mask)
        
        # Topic prediction
        topic_input = torch.cat([cls_output_dropped, ner_features], dim=-1)
        topic_hidden = self.topic_input_proj(topic_input)
        topic_hidden = topic_hidden + self.topic_layer1(topic_hidden)
        topic_hidden = topic_hidden + self.topic_layer2(topic_hidden)
        topic_logits = self.topic_classifier(topic_hidden)
        
        # Intent prediction
        intent_hidden = self.intent_hidden(cls_output_dropped)
        intent_hidden = self.intent_norm(intent_hidden)
        intent_hidden = F.gelu(intent_hidden)
        intent_hidden = self.dropout(intent_hidden)
        intent_logits = self.intent_classifier(intent_hidden)
        
        return {
            'loss': None,
            'ner_logits': ner_logits,
            'topic_logits': topic_logits,
            'intent_logits': intent_logits
        }

print("Model class defined!")

Model class defined!


## 5. Load Model Weights

In [5]:
# Đường dẫn model local (đã mount vào container)
local_model_path = '/opt/spark-apps/phobert_multitask_final'

print(f"Loading model from: {local_model_path}")

# Load tokenizer và model
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

model = MultiTaskPhoBERT_WithFusion(
    phobert_path=model_config['phobert_base'],
    num_ner_labels=model_config['num_ner_labels'],
    num_topic_labels=model_config['num_topic_labels'],
    num_intent_labels=model_config['num_intent_labels'],
    dropout=model_config['dropout']
)

# Load trọng số đã train
state_dict = torch.load(f'{local_model_path}/pytorch_model.bin', map_location='cpu')
model.load_state_dict(state_dict)
model.eval()

# Chuyển model sang GPU nếu có
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

print(f"Model loaded successfully on {device}!")

Loading model from: /opt/spark-apps/phobert_multitask_final


config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully on cpu!


## 6. Định Nghĩa Prediction Functions

In [6]:
def normalize_text_with_underscore(text, annotator):
    """Chuẩn hóa văn bản tiếng Việt bằng VnCoreNLP"""
    try:
        sentences = annotator.tokenize(text)
        
        normalized_sentences = []
        for sentence in sentences:
            words = []
            for word in sentence:
                if isinstance(word, list):
                    words.append('_'.join(word))
                else:
                    words.append(word)
            normalized_sentences.append(' '.join(words))
        
        normalized_text = ' '.join(normalized_sentences)
        return normalized_text
    except Exception as e:
        print(f"Error normalizing text: {e}")
        return text


def predict_labels(text, model, tokenizer, label_mappings, device, annotator, max_length=256):
    """Dự đoán NER, Topic, Intent cho văn bản đầu vào"""
    if not text or text.strip() == '':
        return {
            'normalized_text': '',
            'ner_labels': 'O',
            'topic_label': 'None',
            'intent_label': 'Unknown'
        }
    
    # Chuẩn hóa text bằng VnCoreNLP
    normalized_text = normalize_text_with_underscore(text, annotator)
    normalized_tokens = normalized_text.split()
    num_tokens = len(normalized_tokens)
    
    # Tokenize cho PhoBERT
    inputs = tokenizer(
        normalized_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )
    
    if 'token_type_ids' in inputs:
        inputs.pop('token_type_ids')
    
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Dự đoán
    with torch.no_grad():
        outputs = model(**inputs)
    
    ner_logits = outputs['ner_logits']
    topic_logits = outputs['topic_logits']
    intent_logits = outputs['intent_logits']
    
    # NER predictions
    ner_preds = torch.argmax(ner_logits, dim=-1)[0]
    phobert_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    
    # Align labels với VnCoreNLP tokens
    ner_labels = []
    token_idx = 0
    
    for word in normalized_tokens:
        word_tokens = tokenizer.tokenize(word)
        
        while token_idx < len(phobert_tokens) and phobert_tokens[token_idx] == '<s>':
            token_idx += 1
        
        if token_idx < len(phobert_tokens) and phobert_tokens[token_idx] not in ['</s>', '<pad>']:
            label = label_mappings['ner_id2label'][str(ner_preds[token_idx].item())]
            ner_labels.append(label)
            token_idx += len(word_tokens)
        else:
            ner_labels.append('O')
            break
    
    # Topic predictions (multi-label, threshold=0.5)
    topic_probs = torch.sigmoid(topic_logits)[0]
    topic_preds = (topic_probs > 0.5).nonzero(as_tuple=True)[0].cpu().tolist()
    
    if len(topic_preds) > 0:
        topic_labels = [label_mappings['topic_id2label'][str(idx)] for idx in topic_preds]
        topic_label = '|'.join(topic_labels)
    else:
        topic_label = 'None'
    
    # Intent prediction (single-label)
    intent_pred = torch.argmax(intent_logits, dim=-1).item()
    intent_label = label_mappings['intent_id2label'][str(intent_pred)]
    
    return {
        'normalized_text': normalized_text,
        'ner_labels': ' '.join(ner_labels),
        'topic_label': topic_label,
        'intent_label': intent_label
    }

print("Prediction functions defined!")

Prediction functions defined!


## 7. Apply Model trên Toàn Bộ Dữ Liệu

In [7]:
# Định nghĩa schema cho DataFrame kết quả
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import days
import gc

schema = StructType([
    StructField("postID", StringType(), False),
    StructField("timePublish", TimestampType(), False),
    StructField("description_Normalized", StringType(), True),
    StructField("Label_NER", StringType(), True),
    StructField("Label_Topic", StringType(), True),
    StructField("Label_Intent", StringType(), True),
    StructField("likeCount", IntegerType(), True),
    StructField("commentCount", IntegerType(), True),
    StructField("shareCount", IntegerType(), True),
    StructField("type", StringType(), True)
])

# Kiểm tra xem table result_multi_model đã tồn tại chưa
try:
    df_existing = spark.table("nessie.silver_tables.result_multi_model")
    existing_post_ids = [row.postID for row in df_existing.select("postID").distinct().collect()]
    print(f"Tìm thấy {len(existing_post_ids)} articles đã được xử lý trước đó")
    
    # Giải phóng df_existing
    df_existing.unpersist()
    del df_existing
except Exception as e:
    print(f"Table result_multi_model chưa tồn tại hoặc rỗng: {e}")
    existing_post_ids = []

# Lọc ra các articles chưa được xử lý
if existing_post_ids:
    df_articles_new = df_articles.filter(~col("articleID").isin(existing_post_ids))
    print(f"Tổng số articles mới cần xử lý: {df_articles_new.count()}")
else:
    df_articles_new = df_articles
    print(f"Xử lý toàn bộ {df_articles.count()} articles")

print("\n=== Collecting data to process ===")
# Collect data về driver để xử lý (phù hợp với dataset nhỏ)
articles_to_process = df_articles_new.select("articleID", "description", "timePublish", 
                                              "likeCount", "commentCount", "shareCount", "type").collect()
print(f"Đã collect {len(articles_to_process)} articles về driver")

# Giải phóng df_articles_new sau khi collect
df_articles_new.unpersist()
del df_articles_new
gc.collect()
print("✓ Đã giải phóng df_articles_new")

# Xử lý từng article bằng model đã load sẵn trên driver
results = []
batch_size = 50  # Xử lý và lưu mỗi 50 records
start_time = time.time()

print("\n=== Processing articles with model ===")
for idx, row in enumerate(articles_to_process):
    if idx % 10 == 0:
        elapsed = time.time() - start_time
        avg_time = elapsed / (idx + 1) if idx > 0 else 0
        remaining = avg_time * (len(articles_to_process) - idx)
        print(f"[{time.strftime('%H:%M:%S')}] Processed {idx}/{len(articles_to_process)} | "
              f"Elapsed: {elapsed:.1f}s | ETA: {remaining:.1f}s")
    
    article_id = row['articleID']
    description = row['description']
    
    if not description or description.strip() == '':
        results.append({
            'postID': article_id,
            'timePublish': row['timePublish'],
            'description_Normalized': '',
            'Label_NER': 'O',
            'Label_Topic': 'None',
            'Label_Intent': 'Unknown',
            'likeCount': int(row['likeCount']) if row['likeCount'] else 0,
            'commentCount': int(row['commentCount']) if row['commentCount'] else 0,
            'shareCount': int(row['shareCount']) if row['shareCount'] else 0,
            'type': row['type']
        })
        continue
    
    try:
        # Sử dụng hàm predict_labels đã định nghĩa ở cell 6
        predictions = predict_labels(description, model, tokenizer, label_mappings, device, annotator)
        
        results.append({
            'postID': article_id,
            'timePublish': row['timePublish'],
            'description_Normalized': predictions['normalized_text'],
            'Label_NER': predictions['ner_labels'],
            'Label_Topic': predictions['topic_label'],
            'Label_Intent': predictions['intent_label'],
            'likeCount': int(row['likeCount']) if row['likeCount'] else 0,
            'commentCount': int(row['commentCount']) if row['commentCount'] else 0,
            'shareCount': int(row['shareCount']) if row['shareCount'] else 0,
            'type': row['type']
        })
    except Exception as e:
        print(f"Error processing article {article_id}: {e}")
        results.append({
            'postID': article_id,
            'timePublish': row['timePublish'],
            'description_Normalized': description,
            'Label_NER': 'O',
            'Label_Topic': 'None',
            'Label_Intent': 'Unknown',
            'likeCount': int(row['likeCount']) if row['likeCount'] else 0,
            'commentCount': int(row['commentCount']) if row['commentCount'] else 0,
            'shareCount': int(row['shareCount']) if row['shareCount'] else 0,
            'type': row['type']
        })
    
    # Lưu batch và reset results để giải phóng memory
    if (idx + 1) % batch_size == 0 and len(results) > 0:
        try:
            batch_df = spark.createDataFrame(results, schema=schema)
            batch_df = batch_df.withColumn("created_at", current_timestamp()) \
                               .withColumn("updated_at", current_timestamp())
            
            # Append batch vào table
            batch_df.writeTo("nessie.silver_tables.result_multi_model") \
                .using("iceberg") \
                .tableProperty("write.format.default", "parquet") \
                .tableProperty("write.metadata.compression-codec", "gzip") \
                .tableProperty("write.parquet.compression-codec", "gzip") \
                .append()
            
            print(f"✓ Saved batch: {len(results)} records | Total processed: {idx + 1}")
            
            # Giải phóng batch_df
            batch_df.unpersist()
            del batch_df
            
            results = []  # Reset
            gc.collect()  # Thu gom rác
            
        except Exception as batch_err:
            # Nếu table chưa tồn tại, tạo mới
            if "table does not exist" in str(batch_err).lower() or "not found" in str(batch_err).lower():
                print("Creating table for first batch...")
                batch_df.writeTo("nessie.silver_tables.result_multi_model") \
                    .using("iceberg") \
                    .tableProperty("write.format.default", "parquet") \
                    .tableProperty("write.metadata.compression-codec", "gzip") \
                    .tableProperty("write.parquet.compression-codec", "gzip") \
                    .partitionedBy(days(col("timePublish"))) \
                    .create()
                print(f"✓ Table created with {len(results)} records")
                
                # Giải phóng
                batch_df.unpersist()
                del batch_df
                results = []
                gc.collect()
            else:
                print(f"✗ Error saving batch: {batch_err}")

# Lưu records còn lại (nếu có)
if len(results) > 0:
    try:
        final_df = spark.createDataFrame(results, schema=schema)
        final_df = final_df.withColumn("created_at", current_timestamp()) \
                           .withColumn("updated_at", current_timestamp())
        
        final_df.writeTo("nessie.silver_tables.result_multi_model") \
            .using("iceberg") \
            .tableProperty("write.format.default", "parquet") \
            .tableProperty("write.metadata.compression-codec", "gzip") \
            .tableProperty("write.parquet.compression-codec", "gzip") \
            .append()
        
        print(f"✓ Saved final batch: {len(results)} records")
        
        # Giải phóng
        final_df.unpersist()
        del final_df
        
    except Exception as final_err:
        if "table does not exist" in str(final_err).lower():
            final_df.writeTo("nessie.silver_tables.result_multi_model") \
                .using("iceberg") \
                .tableProperty("write.format.default", "parquet") \
                .tableProperty("write.metadata.compression-codec", "gzip") \
                .tableProperty("write.parquet.compression-codec", "gzip") \
                .partitionedBy(days(col("timePublish"))) \
                .create()
            print(f"✓ Table created with {len(results)} records")
            
            # Giải phóng
            final_df.unpersist()
            del final_df

# Giải phóng bộ nhớ cuối cùng
del results, articles_to_process
gc.collect()
print("✓ Đã giải phóng bộ nhớ processing")

total_time = time.time() - start_time
print(f"\n=== Completed processing in {total_time:.1f}s ===")

# Kiểm tra kết quả đã lưu
print("\n=== Checking saved results ===")
try:
    df_check = spark.table("nessie.silver_tables.result_multi_model")
    print(f"Total records in table: {df_check.count()}")
    print("\nRecent 5 records:")
    df_check.orderBy(col("created_at").desc()).show(5, truncate=80)
    
    # Giải phóng df_check
    df_check.unpersist()
    del df_check
    gc.collect()
    
except Exception as e:
    print(f"Could not read table: {e}")

Tìm thấy 0 articles đã được xử lý trước đó


                                                                                

Xử lý toàn bộ 3295 articles

=== Collecting data to process ===


                                                                                

Đã collect 3295 articles về driver
✓ Đã giải phóng df_articles_new

=== Processing articles with model ===
[07:31:12] Processed 0/3295 | Elapsed: 0.0s | ETA: 0.0s
[07:31:15] Processed 10/3295 | Elapsed: 2.7s | ETA: 818.8s
[07:31:17] Processed 20/3295 | Elapsed: 5.1s | ETA: 789.7s
[07:31:20] Processed 30/3295 | Elapsed: 7.9s | ETA: 835.4s
[07:31:23] Processed 40/3295 | Elapsed: 10.4s | ETA: 825.1s


                                                                                

✓ Saved batch: 50 records | Total processed: 50
[07:33:26] Processed 50/3295 | Elapsed: 134.0s | ETA: 8525.5s
[07:33:29] Processed 60/3295 | Elapsed: 136.2s | ETA: 7225.1s
[07:33:31] Processed 70/3295 | Elapsed: 138.3s | ETA: 6280.9s
[07:33:33] Processed 80/3295 | Elapsed: 140.4s | ETA: 5573.5s
[07:33:35] Processed 90/3295 | Elapsed: 142.4s | ETA: 5016.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 100
[07:33:38] Processed 100/3295 | Elapsed: 146.1s | ETA: 4621.9s
[07:33:41] Processed 110/3295 | Elapsed: 148.2s | ETA: 4251.7s
[07:33:43] Processed 120/3295 | Elapsed: 150.2s | ETA: 3940.7s
[07:33:45] Processed 130/3295 | Elapsed: 152.2s | ETA: 3678.3s
[07:33:47] Processed 140/3295 | Elapsed: 154.2s | ETA: 3450.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 150
[07:33:50] Processed 150/3295 | Elapsed: 157.7s | ETA: 3285.0s
[07:33:52] Processed 160/3295 | Elapsed: 159.7s | ETA: 3110.0s
[07:33:54] Processed 170/3295 | Elapsed: 161.7s | ETA: 2955.0s
[07:33:56] Processed 180/3295 | Elapsed: 163.7s | ETA: 2817.6s
[07:33:58] Processed 190/3295 | Elapsed: 165.8s | ETA: 2695.3s


                                                                                

✓ Saved batch: 50 records | Total processed: 200
[07:34:02] Processed 200/3295 | Elapsed: 169.3s | ETA: 2606.2s
[07:34:04] Processed 210/3295 | Elapsed: 171.2s | ETA: 2503.6s
[07:34:06] Processed 220/3295 | Elapsed: 173.2s | ETA: 2410.5s
[07:34:08] Processed 230/3295 | Elapsed: 175.2s | ETA: 2325.3s
[07:34:10] Processed 240/3295 | Elapsed: 177.3s | ETA: 2247.5s
✓ Saved batch: 50 records | Total processed: 250
[07:34:13] Processed 250/3295 | Elapsed: 180.4s | ETA: 2189.0s
[07:34:15] Processed 260/3295 | Elapsed: 182.5s | ETA: 2121.7s
[07:34:17] Processed 270/3295 | Elapsed: 184.4s | ETA: 2058.6s
[07:34:19] Processed 280/3295 | Elapsed: 186.5s | ETA: 2000.6s
[07:34:21] Processed 290/3295 | Elapsed: 188.5s | ETA: 1947.0s


                                                                                

✓ Saved batch: 50 records | Total processed: 300
[07:34:24] Processed 300/3295 | Elapsed: 191.9s | ETA: 1909.2s
[07:34:26] Processed 310/3295 | Elapsed: 193.9s | ETA: 1860.9s
[07:34:29] Processed 320/3295 | Elapsed: 196.2s | ETA: 1818.6s
[07:34:31] Processed 330/3295 | Elapsed: 198.4s | ETA: 1777.2s
[07:34:33] Processed 340/3295 | Elapsed: 200.5s | ETA: 1737.4s


                                                                                

✓ Saved batch: 50 records | Total processed: 350
[07:34:36] Processed 350/3295 | Elapsed: 203.9s | ETA: 1711.1s
[07:34:39] Processed 360/3295 | Elapsed: 206.2s | ETA: 1676.4s
[07:34:41] Processed 370/3295 | Elapsed: 208.4s | ETA: 1643.3s
[07:34:43] Processed 380/3295 | Elapsed: 210.6s | ETA: 1611.0s
[07:34:45] Processed 390/3295 | Elapsed: 212.7s | ETA: 1579.9s
✓ Saved batch: 50 records | Total processed: 400
[07:34:48] Processed 400/3295 | Elapsed: 215.7s | ETA: 1557.5s
[07:34:50] Processed 410/3295 | Elapsed: 217.8s | ETA: 1529.1s
[07:34:52] Processed 420/3295 | Elapsed: 219.8s | ETA: 1501.3s
[07:34:54] Processed 430/3295 | Elapsed: 221.9s | ETA: 1475.0s
[07:34:56] Processed 440/3295 | Elapsed: 223.9s | ETA: 1449.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 450
[07:35:00] Processed 450/3295 | Elapsed: 227.4s | ETA: 1434.5s
[07:35:02] Processed 460/3295 | Elapsed: 229.4s | ETA: 1411.0s
[07:35:04] Processed 470/3295 | Elapsed: 231.4s | ETA: 1388.0s
[07:35:06] Processed 480/3295 | Elapsed: 233.5s | ETA: 1366.3s
[07:35:08] Processed 490/3295 | Elapsed: 235.5s | ETA: 1345.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 500
[07:35:11] Processed 500/3295 | Elapsed: 238.9s | ETA: 1333.0s
[07:35:13] Processed 510/3295 | Elapsed: 240.9s | ETA: 1313.1s
[07:35:15] Processed 520/3295 | Elapsed: 242.9s | ETA: 1293.9s
[07:35:17] Processed 530/3295 | Elapsed: 245.0s | ETA: 1275.7s
[07:35:19] Processed 540/3295 | Elapsed: 247.0s | ETA: 1258.0s


                                                                                

✓ Saved batch: 50 records | Total processed: 550
[07:35:23] Processed 550/3295 | Elapsed: 250.3s | ETA: 1247.1s
[07:35:25] Processed 560/3295 | Elapsed: 252.3s | ETA: 1230.2s
[07:35:27] Processed 570/3295 | Elapsed: 254.3s | ETA: 1213.7s
[07:35:29] Processed 580/3295 | Elapsed: 256.4s | ETA: 1198.3s
[07:35:31] Processed 590/3295 | Elapsed: 258.6s | ETA: 1183.5s
✓ Saved batch: 50 records | Total processed: 600
[07:35:34] Processed 600/3295 | Elapsed: 261.4s | ETA: 1172.2s
[07:35:36] Processed 610/3295 | Elapsed: 263.4s | ETA: 1157.7s
[07:35:38] Processed 620/3295 | Elapsed: 265.5s | ETA: 1143.6s
[07:35:40] Processed 630/3295 | Elapsed: 267.5s | ETA: 1129.9s
[07:35:42] Processed 640/3295 | Elapsed: 269.6s | ETA: 1116.7s
✓ Saved batch: 50 records | Total processed: 650
[07:35:45] Processed 650/3295 | Elapsed: 272.8s | ETA: 1108.4s
[07:35:47] Processed 660/3295 | Elapsed: 275.1s | ETA: 1096.5s
[07:35:50] Processed 670/3295 | Elapsed: 277.2s | ETA: 1084.3s
[07:35:52] Processed 680/3295 | El

                                                                                

✓ Saved batch: 50 records | Total processed: 1800
[07:40:05] Processed 1800/3295 | Elapsed: 532.8s | ETA: 442.3s
[07:40:10] Processed 1810/3295 | Elapsed: 537.5s | ETA: 440.7s
[07:40:14] Processed 1820/3295 | Elapsed: 541.8s | ETA: 438.8s
[07:40:18] Processed 1830/3295 | Elapsed: 546.0s | ETA: 436.9s
[07:40:22] Processed 1840/3295 | Elapsed: 549.8s | ETA: 434.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 1850
[07:40:29] Processed 1850/3295 | Elapsed: 556.7s | ETA: 434.6s
[07:40:33] Processed 1860/3295 | Elapsed: 560.2s | ETA: 432.0s
[07:40:35] Processed 1870/3295 | Elapsed: 562.9s | ETA: 428.7s
[07:40:38] Processed 1880/3295 | Elapsed: 565.9s | ETA: 425.7s
[07:40:41] Processed 1890/3295 | Elapsed: 568.8s | ETA: 422.6s
✓ Saved batch: 50 records | Total processed: 1900
[07:40:45] Processed 1900/3295 | Elapsed: 572.5s | ETA: 420.1s
[07:40:48] Processed 1910/3295 | Elapsed: 575.3s | ETA: 416.9s
[07:40:51] Processed 1920/3295 | Elapsed: 578.4s | ETA: 414.0s
[07:40:53] Processed 1930/3295 | Elapsed: 581.1s | ETA: 410.7s
[07:40:56] Processed 1940/3295 | Elapsed: 583.6s | ETA: 407.4s
✓ Saved batch: 50 records | Total processed: 1950
[07:41:00] Processed 1950/3295 | Elapsed: 587.3s | ETA: 404.9s
[07:41:02] Processed 1960/3295 | Elapsed: 590.0s | ETA: 401.7s
[07:41:05] Processed 1970/3295 | Elapsed: 592.7s | ETA: 398.5s
[07:41:08] Processed 1980/3295 

                                                                                

✓ Saved batch: 50 records | Total processed: 2050
[07:41:31] Processed 2050/3295 | Elapsed: 618.5s | ETA: 375.4s
[07:41:34] Processed 2060/3295 | Elapsed: 621.3s | ETA: 372.3s
[07:41:36] Processed 2070/3295 | Elapsed: 624.1s | ETA: 369.2s
[07:41:39] Processed 2080/3295 | Elapsed: 627.0s | ETA: 366.1s
[07:41:43] Processed 2090/3295 | Elapsed: 630.6s | ETA: 363.4s


                                                                                

✓ Saved batch: 50 records | Total processed: 2100
[07:41:48] Processed 2100/3295 | Elapsed: 636.0s | ETA: 361.7s
[07:41:52] Processed 2110/3295 | Elapsed: 639.2s | ETA: 358.8s
[07:41:55] Processed 2120/3295 | Elapsed: 642.5s | ETA: 355.9s
[07:41:58] Processed 2130/3295 | Elapsed: 645.5s | ETA: 352.9s
[07:42:01] Processed 2140/3295 | Elapsed: 648.5s | ETA: 349.8s


                                                                                

✓ Saved batch: 50 records | Total processed: 2150
[07:42:06] Processed 2150/3295 | Elapsed: 653.5s | ETA: 347.9s
[07:42:09] Processed 2160/3295 | Elapsed: 656.4s | ETA: 344.7s
[07:42:12] Processed 2170/3295 | Elapsed: 659.3s | ETA: 341.6s
[07:42:14] Processed 2180/3295 | Elapsed: 662.0s | ETA: 338.4s
[07:42:17] Processed 2190/3295 | Elapsed: 664.8s | ETA: 335.3s


                                                                                

✓ Saved batch: 50 records | Total processed: 2200
[07:42:22] Processed 2200/3295 | Elapsed: 669.9s | ETA: 333.3s
[07:42:25] Processed 2210/3295 | Elapsed: 672.7s | ETA: 330.1s
[07:42:28] Processed 2220/3295 | Elapsed: 675.6s | ETA: 327.0s
[07:42:31] Processed 2230/3295 | Elapsed: 678.4s | ETA: 323.8s
[07:42:33] Processed 2240/3295 | Elapsed: 680.9s | ETA: 320.6s


                                                                                

✓ Saved batch: 50 records | Total processed: 2250
[07:42:38] Processed 2250/3295 | Elapsed: 685.5s | ETA: 318.2s
[07:42:41] Processed 2260/3295 | Elapsed: 688.3s | ETA: 315.1s
[07:42:44] Processed 2270/3295 | Elapsed: 691.2s | ETA: 312.0s
[07:42:46] Processed 2280/3295 | Elapsed: 694.1s | ETA: 308.9s
[07:42:49] Processed 2290/3295 | Elapsed: 697.1s | ETA: 305.8s


                                                                                

✓ Saved batch: 50 records | Total processed: 2300
[07:42:55] Processed 2300/3295 | Elapsed: 702.5s | ETA: 303.8s
[07:42:58] Processed 2310/3295 | Elapsed: 705.7s | ETA: 300.8s
[07:43:01] Processed 2320/3295 | Elapsed: 708.7s | ETA: 297.7s
[07:43:04] Processed 2330/3295 | Elapsed: 711.1s | ETA: 294.4s
[07:43:06] Processed 2340/3295 | Elapsed: 713.6s | ETA: 291.1s


                                                                                

✓ Saved batch: 50 records | Total processed: 2350
[07:43:10] Processed 2350/3295 | Elapsed: 717.8s | ETA: 288.5s
[07:43:13] Processed 2360/3295 | Elapsed: 720.2s | ETA: 285.2s
[07:43:15] Processed 2370/3295 | Elapsed: 722.6s | ETA: 281.9s
[07:43:17] Processed 2380/3295 | Elapsed: 725.0s | ETA: 278.6s
[07:43:20] Processed 2390/3295 | Elapsed: 727.7s | ETA: 275.4s
✓ Saved batch: 50 records | Total processed: 2400
[07:43:23] Processed 2400/3295 | Elapsed: 731.1s | ETA: 272.5s
[07:43:26] Processed 2410/3295 | Elapsed: 733.6s | ETA: 269.3s
[07:43:29] Processed 2420/3295 | Elapsed: 736.3s | ETA: 266.1s
[07:43:31] Processed 2430/3295 | Elapsed: 738.8s | ETA: 262.9s
[07:43:34] Processed 2440/3295 | Elapsed: 741.3s | ETA: 259.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 2450
[07:43:38] Processed 2450/3295 | Elapsed: 745.4s | ETA: 257.0s
[07:43:41] Processed 2460/3295 | Elapsed: 748.1s | ETA: 253.8s
[07:43:43] Processed 2470/3295 | Elapsed: 750.6s | ETA: 250.6s
[07:43:45] Processed 2480/3295 | Elapsed: 753.0s | ETA: 247.4s
[07:43:48] Processed 2490/3295 | Elapsed: 755.6s | ETA: 244.2s


                                                                                

✓ Saved batch: 50 records | Total processed: 2500
[07:43:52] Processed 2500/3295 | Elapsed: 760.0s | ETA: 241.6s
[07:43:55] Processed 2510/3295 | Elapsed: 762.8s | ETA: 238.5s
[07:43:58] Processed 2520/3295 | Elapsed: 765.5s | ETA: 235.3s
[07:44:01] Processed 2530/3295 | Elapsed: 768.1s | ETA: 232.2s
[07:44:03] Processed 2540/3295 | Elapsed: 770.7s | ETA: 229.0s


                                                                                

✓ Saved batch: 50 records | Total processed: 2550
[07:44:07] Processed 2550/3295 | Elapsed: 774.5s | ETA: 226.2s
[07:44:10] Processed 2560/3295 | Elapsed: 777.2s | ETA: 223.0s
[07:44:12] Processed 2570/3295 | Elapsed: 779.7s | ETA: 219.9s
[07:44:15] Processed 2580/3295 | Elapsed: 782.2s | ETA: 216.7s
[07:44:17] Processed 2590/3295 | Elapsed: 784.8s | ETA: 213.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 2600
[07:44:22] Processed 2600/3295 | Elapsed: 789.9s | ETA: 211.1s
[07:44:25] Processed 2610/3295 | Elapsed: 792.6s | ETA: 207.9s
[07:44:28] Processed 2620/3295 | Elapsed: 795.3s | ETA: 204.8s
[07:44:31] Processed 2630/3295 | Elapsed: 798.3s | ETA: 201.8s
[07:44:33] Processed 2640/3295 | Elapsed: 800.8s | ETA: 198.6s


                                                                                

✓ Saved batch: 50 records | Total processed: 2650
[07:44:38] Processed 2650/3295 | Elapsed: 805.2s | ETA: 195.9s
[07:44:40] Processed 2660/3295 | Elapsed: 807.8s | ETA: 192.8s
[07:44:43] Processed 2670/3295 | Elapsed: 810.4s | ETA: 189.6s
[07:44:45] Processed 2680/3295 | Elapsed: 813.0s | ETA: 186.5s
[07:44:48] Processed 2690/3295 | Elapsed: 815.6s | ETA: 183.4s


                                                                                

✓ Saved batch: 50 records | Total processed: 2700
[07:44:52] Processed 2700/3295 | Elapsed: 820.0s | ETA: 180.6s
[07:44:55] Processed 2710/3295 | Elapsed: 822.5s | ETA: 177.5s
[07:44:57] Processed 2720/3295 | Elapsed: 824.9s | ETA: 174.3s
[07:45:00] Processed 2730/3295 | Elapsed: 827.5s | ETA: 171.2s
[07:45:02] Processed 2740/3295 | Elapsed: 830.0s | ETA: 168.0s


                                                                                

✓ Saved batch: 50 records | Total processed: 2750
[07:45:06] Processed 2750/3295 | Elapsed: 833.9s | ETA: 165.2s
[07:45:09] Processed 2760/3295 | Elapsed: 836.4s | ETA: 162.1s
[07:45:11] Processed 2770/3295 | Elapsed: 839.1s | ETA: 159.0s
[07:45:14] Processed 2780/3295 | Elapsed: 841.7s | ETA: 155.9s
[07:45:17] Processed 2790/3295 | Elapsed: 844.2s | ETA: 152.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 2800
[07:45:21] Processed 2800/3295 | Elapsed: 848.2s | ETA: 149.9s
[07:45:23] Processed 2810/3295 | Elapsed: 850.8s | ETA: 146.8s
[07:45:26] Processed 2820/3295 | Elapsed: 853.4s | ETA: 143.7s
[07:45:28] Processed 2830/3295 | Elapsed: 855.9s | ETA: 140.6s
[07:45:31] Processed 2840/3295 | Elapsed: 858.4s | ETA: 137.5s
✓ Saved batch: 50 records | Total processed: 2850
[07:45:34] Processed 2850/3295 | Elapsed: 862.0s | ETA: 134.6s
[07:45:37] Processed 2860/3295 | Elapsed: 864.4s | ETA: 131.4s
[07:45:39] Processed 2870/3295 | Elapsed: 867.0s | ETA: 128.3s
[07:45:42] Processed 2880/3295 | Elapsed: 869.6s | ETA: 125.3s
[07:45:44] Processed 2890/3295 | Elapsed: 872.1s | ETA: 122.2s
✓ Saved batch: 50 records | Total processed: 2900
[07:45:48] Processed 2900/3295 | Elapsed: 875.4s | ETA: 119.2s
[07:45:51] Processed 2910/3295 | Elapsed: 878.4s | ETA: 116.2s
[07:45:53] Processed 2920/3295 | Elapsed: 881.1s | ETA: 113.1s
[07:45:56] Processed 2930/3295 

                                                                                

✓ Saved batch: 50 records | Total processed: 3000
[07:46:17] Processed 3000/3295 | Elapsed: 904.8s | ETA: 88.9s
[07:46:20] Processed 3010/3295 | Elapsed: 907.8s | ETA: 85.9s
[07:46:23] Processed 3020/3295 | Elapsed: 910.6s | ETA: 82.9s
[07:46:26] Processed 3030/3295 | Elapsed: 913.4s | ETA: 79.9s
[07:46:28] Processed 3040/3295 | Elapsed: 916.0s | ETA: 76.8s


                                                                                

✓ Saved batch: 50 records | Total processed: 3050
[07:46:33] Processed 3050/3295 | Elapsed: 920.8s | ETA: 73.9s
[07:46:36] Processed 3060/3295 | Elapsed: 923.3s | ETA: 70.9s
[07:46:38] Processed 3070/3295 | Elapsed: 925.8s | ETA: 67.8s
[07:46:41] Processed 3080/3295 | Elapsed: 928.4s | ETA: 64.8s
[07:46:43] Processed 3090/3295 | Elapsed: 930.8s | ETA: 61.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 3100
[07:46:48] Processed 3100/3295 | Elapsed: 935.7s | ETA: 58.8s
[07:46:51] Processed 3110/3295 | Elapsed: 938.4s | ETA: 55.8s
[07:46:53] Processed 3120/3295 | Elapsed: 940.9s | ETA: 52.8s
[07:46:56] Processed 3130/3295 | Elapsed: 943.5s | ETA: 49.7s
[07:46:59] Processed 3140/3295 | Elapsed: 946.1s | ETA: 46.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 3150
[07:47:04] Processed 3150/3295 | Elapsed: 951.3s | ETA: 43.8s
[07:47:06] Processed 3160/3295 | Elapsed: 953.8s | ETA: 40.7s
[07:47:09] Processed 3170/3295 | Elapsed: 956.3s | ETA: 37.7s
[07:47:11] Processed 3180/3295 | Elapsed: 958.8s | ETA: 34.7s
[07:47:14] Processed 3190/3295 | Elapsed: 961.2s | ETA: 31.6s


                                                                                

✓ Saved batch: 50 records | Total processed: 3200
[07:47:18] Processed 3200/3295 | Elapsed: 965.4s | ETA: 28.7s
[07:47:21] Processed 3210/3295 | Elapsed: 968.2s | ETA: 25.6s
[07:47:23] Processed 3220/3295 | Elapsed: 970.7s | ETA: 22.6s
[07:47:26] Processed 3230/3295 | Elapsed: 973.3s | ETA: 19.6s
[07:47:28] Processed 3240/3295 | Elapsed: 976.1s | ETA: 16.6s


                                                                                

✓ Saved batch: 50 records | Total processed: 3250
[07:47:33] Processed 3250/3295 | Elapsed: 980.5s | ETA: 13.6s
[07:47:36] Processed 3260/3295 | Elapsed: 983.3s | ETA: 10.6s
[07:47:38] Processed 3270/3295 | Elapsed: 985.9s | ETA: 7.5s
[07:47:41] Processed 3280/3295 | Elapsed: 988.4s | ETA: 4.5s
[07:47:43] Processed 3290/3295 | Elapsed: 990.9s | ETA: 1.5s


                                                                                

✓ Saved final batch: 45 records
✓ Đã giải phóng bộ nhớ processing

=== Completed processing in 993.8s ===

=== Checking saved results ===
Total records in table: 3295

Recent 5 records:


                                                                                

+---------------------------------------------------+-------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------+------------+---------+------------+----------+------+--------------------------+--------------------------+
|                                             postID|        timePublish|                                                          description_Normalized|                                                                       Label_NER|        Label_Topic|Label_Intent|likeCount|commentCount|shareCount|  type|                created_at|                updated_at|
+---------------------------------------------------+-------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-------------------+------------+----

## 8. Verify Dữ Liệu Đã Lưu

In [8]:
# Verify dữ liệu đã lưu (Cell 7 đã tự động lưu theo batch)
import gc

try:
    df_verify = spark.table("nessie.silver_tables.result_multi_model")
    total_count = df_verify.count()
    
    print(f"✓ Total records in table: {total_count}")
    print("\nRecent saved records:")
    df_verify.orderBy(col("created_at").desc()).show(5, truncate=False)
    
    # Thống kê số lượng records theo thời gian tạo
    print("\n=== Records saved by time ===")
    df_verify.groupBy("created_at").count() \
        .orderBy(col("created_at").desc()) \
        .show(20, truncate=False)
    
    # Giải phóng bộ nhớ
    df_verify.unpersist()
    del df_verify
    gc.collect()
    print("\n✓ Đã giải phóng bộ nhớ")
    
except Exception as e:
    print(f"✗ Error reading table: {e}")

✓ Total records in table: 3295

Recent saved records:


                                                                                

+--------------------------------------------------+-------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



+--------------------------+-----+
|created_at                |count|
+--------------------------+-----+
|2025-12-08 07:47:45.029136|45   |
|2025-12-08 07:47:31.628683|50   |
|2025-12-08 07:47:16.518126|50   |
|2025-12-08 07:47:01.794822|50   |
|2025-12-08 07:46:46.360117|50   |
|2025-12-08 07:46:31.537678|50   |
|2025-12-08 07:46:15.775398|50   |
|2025-12-08 07:46:02.075139|50   |
|2025-12-08 07:45:47.403404|50   |
|2025-12-08 07:45:33.892038|50   |
|2025-12-08 07:45:19.640926|50   |
|2025-12-08 07:45:05.340584|50   |
|2025-12-08 07:44:51.582258|50   |
|2025-12-08 07:44:36.319707|50   |
|2025-12-08 07:44:21.17089 |50   |
|2025-12-08 07:44:05.996317|50   |
|2025-12-08 07:43:51.324001|50   |
|2025-12-08 07:43:36.614141|50   |
|2025-12-08 07:43:23.056378|50   |
|2025-12-08 07:43:09.115966|50   |
+--------------------------+-----+
only showing top 20 rows


✓ Đã giải phóng bộ nhớ


                                                                                

## 9. Thống kê kết quả

In [9]:
# Đọc lại dữ liệu từ table
import gc

df_verify = spark.table("nessie.silver_tables.result_multi_model")

print(f"Total records in result_multi_model: {df_verify.count()}")
print("\nSample records:")
df_verify.show(10, truncate=False)

print("\n=== Label Statistics ===")
print("\nTopic distribution:")
df_topic = df_verify.groupBy("Label_Topic").count().orderBy(col("count").desc())
df_topic.show(10, truncate=False)
del df_topic

print("\nIntent distribution:")
df_intent = df_verify.groupBy("Label_Intent").count().orderBy(col("count").desc())
df_intent.show(truncate=False)
del df_intent

# Giải phóng bộ nhớ
df_verify.unpersist()
del df_verify
gc.collect()
print("\n✓ Đã giải phóng bộ nhớ")

Total records in result_multi_model: 3295

Sample records:
+--------------------------------------------+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+----------------+--------------+---------+------------+----------+--------+--------------------------+--------------------------+
|postID                                      |timePublish        |description_Normalized                                                                                                                                                                                                                                        |Label_NER                        

                                                                                

+-------------------------+-----+
|Label_Topic              |count|
+-------------------------+-----+
|MAJOR                    |651  |
|UNIVERSITY               |561  |
|MAJOR|UNIVERSITY         |326  |
|CAREER|MAJOR             |321  |
|OTHER                    |206  |
|STUDY                    |155  |
|TUITION|UNIVERSITY       |129  |
|MAJOR|TUITION|UNIVERSITY |103  |
|MAJOR|SUBJECT_COMBINATION|82   |
|None                     |82   |
+-------------------------+-----+
only showing top 10 rows


Intent distribution:


                                                                                

+----------------+-----+
|Label_Intent    |count|
+----------------+-----+
|share_info      |1553 |
|ask_info        |890  |
|ask_advice      |403  |
|other           |191  |
|ask_confirmation|108  |
|ask_experience  |76   |
|ask_comparison  |74   |
+----------------+-----+


✓ Đã giải phóng bộ nhớ


## 10. Dừng Spark Session

In [10]:
# Giải phóng bộ nhớ trước khi dừng Spark
import gc

# Unpersist df_articles nếu còn cache
try:
    df_articles.unpersist()
    del df_articles
except:
    pass

# Giải phóng model và tokenizer
try:
    del model, tokenizer, label_mappings
except:
    pass

# Giải phóng VnCoreNLP
try:
    if annotator is not None:
        del annotator
except:
    pass

# Thu gom rác Python
gc.collect()
print("✓ Đã giải phóng tất cả bộ nhớ")

# Dừng Spark Session
spark.stop()
print("✓ Spark Session đã được dừng!")

✓ Đã giải phóng tất cả bộ nhớ
✓ Spark Session đã được dừng!
