## 1. Import Libraries và Khởi tạo Spark Session

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, udf, current_timestamp
import torch
import json
import time
from transformers import AutoTokenizer
from vncorenlp import VnCoreNLP
import os
import warnings
warnings.filterwarnings('ignore')

# MinIO (S3-compatible) connection settings.
# Values already present in the environment take precedence; the literals are
# only local-development fallbacks, so real credentials can be injected from
# outside without editing the notebook. setdefault() also writes the fallback
# back into os.environ, so the driver process always sees the three variables.
# NOTE(review): the fallback credentials are still committed in plain text —
# consider dropping them once the deployment always provides the variables.
AWS_REGION = os.environ.setdefault('AWS_REGION', 'us-east-1')
AWS_ACCESS_KEY_ID = os.environ.setdefault('AWS_ACCESS_KEY_ID', 'admin')
AWS_SECRET_ACCESS_KEY = os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'admin123')
MINIO_ENDPOINT = "http://minio:9000"

# Create the Spark session with an Iceberg catalog named "nessie" backed by Nessie.
spark = (
    SparkSession.builder.appName("Apply_Model_Multi_Task")
    .master("spark://spark-master:7077")
    .config("spark.executor.memory", "1536m")  # 1.5GB - chosen to fit a 2GB worker
    .config("spark.executor.cores", "2")
    .config("spark.network.timeout", "600s")
    .config("spark.executor.heartbeatInterval", "60s")
    .config("spark.storage.blockManagerSlaveTimeoutMs", "600000")
    .config("spark.rpc.askTimeout", "600s")
    # ===== Iceberg catalog through Nessie =====
    .config("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog")
    .config("spark.sql.catalog.nessie.uri", "http://nessie:19120/api/v2")
    .config("spark.sql.catalog.nessie.ref", "main")
    .config("spark.sql.catalog.nessie.warehouse", "s3a://silver/")
    .config("spark.sql.catalog.nessie.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    # ===== MinIO (S3-compatible) settings for the Iceberg S3FileIO =====
    .config("spark.sql.catalog.nessie.s3.endpoint", MINIO_ENDPOINT)
    .config("spark.sql.catalog.nessie.s3.access-key-id", AWS_ACCESS_KEY_ID)
    .config("spark.sql.catalog.nessie.s3.secret-access-key", AWS_SECRET_ACCESS_KEY)
    .config("spark.sql.catalog.nessie.s3.path-style-access", "true")
    .config("spark.sql.catalog.nessie.s3.region", AWS_REGION)
    # ===== Spark + Hadoop S3A connector =====
    .config("spark.hadoop.fs.s3a.endpoint", MINIO_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", AWS_ACCESS_KEY_ID)
    .config("spark.hadoop.fs.s3a.secret.key", AWS_SECRET_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    .config("spark.hadoop.fs.s3a.region", AWS_REGION)
    # Propagate the same environment variables to the executors
    .config("spark.executorEnv.AWS_REGION", AWS_REGION)
    .config("spark.executorEnv.AWS_ACCESS_KEY_ID", AWS_ACCESS_KEY_ID)
    .config("spark.executorEnv.AWS_SECRET_ACCESS_KEY", AWS_SECRET_ACCESS_KEY)
    # ===== Use the JAR files available locally =====
    .config("spark.jars", "/opt/spark/jars/hadoop-aws-3.3.4.jar,/opt/spark/jars/aws-java-sdk-bundle-1.12.262.jar")
    .getOrCreate()
)

# Only show errors from Spark so the cell output stays readable.
spark.sparkContext.setLogLevel("ERROR")
print("Spark Session da duoc khoi tao voi Nessie catalog!")
print(f"Spark Master: {spark.sparkContext.master}")
print(f"Application ID: {spark.sparkContext.applicationId}")

25/12/06 15:14:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark Session da duoc khoi tao voi Nessie catalog!
Spark Master: spark://spark-master:7077
Application ID: app-20251206151427-0000


## 2. Đọc Dữ Liệu từ Bảng Article

In [2]:
# Load the article table through the "nessie" catalog and report how many rows it holds.
df_articles = spark.table("nessie.silver_tables.article")
total_articles = df_articles.count()
print(f"Tổng số articles: {total_articles}")

[Stage 0:>                                                          (0 + 1) / 1]

Tổng số articles: 3295


                                                                                

## 3. Load Model và Cấu Hình

In [3]:
# Local model directory (mounted into the container)
local_model_path = '/opt/spark-apps/phobert_multitask_final'

# Read the JSON files from the local filesystem
import json
import os

# Open explicitly as UTF-8: without `encoding`, open() falls back to the
# locale's preferred encoding, which would mis-decode any non-ASCII text in
# these files on a non-UTF-8 locale.
with open(f'{local_model_path}/model_config.json', 'r', encoding='utf-8') as f:
    model_config = json.load(f)

with open(f'{local_model_path}/label_mappings.json', 'r', encoding='utf-8') as f:
    label_mappings = json.load(f)

print(f"NER labels: {model_config['num_ner_labels']}")
print(f"Topic labels: {model_config['num_topic_labels']}")
print(f"Intent labels: {model_config['num_intent_labels']}")

# Check that the VnCoreNLP JAR exists at the mounted path
print("\n=== Kiểm tra VnCoreNLP JAR file ===")
vncorenlp_path = '/opt/spark-apps/VnCoreNLP-1.2/VnCoreNLP-1.2.jar'

# `annotator` ends up as None when the JAR is missing or fails to start; the
# cell deliberately reports the problem instead of raising.
if os.path.exists(vncorenlp_path):
    print(f"Tìm thấy: {vncorenlp_path}")
    print(f"\nĐang khởi tạo VnCoreNLP...")
    try:
        # Only the word-segmentation annotator ("wseg") is requested.
        annotator = VnCoreNLP(vncorenlp_path, annotators="wseg", max_heap_size='-Xmx2g')
        print("VnCoreNLP loaded successfully!")
    except Exception as e:
        print(f"Lỗi khởi tạo VnCoreNLP: {e}")
        annotator = None
else:
    print(f"Không tìm thấy VnCoreNLP JAR tại: {vncorenlp_path}")
    print("Kiểm tra lại volume mount trong docker-compose.yaml")
    annotator = None

# Keep model_path available for later cells
model_path = local_model_path

NER labels: 25
Topic labels: 10
Intent labels: 7

=== Kiểm tra VnCoreNLP JAR file ===
Tìm thấy: /opt/spark-apps/VnCoreNLP-1.2/VnCoreNLP-1.2.jar

Đang khởi tạo VnCoreNLP...
VnCoreNLP loaded successfully!


## 4. Định Nghĩa Model Architecture

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import RobertaModel


class MultiTaskPhoBERT_WithFusion(nn.Module):
    """Multi-task PhoBERT with feature fusion.

    A shared ``RobertaModel`` encoder feeds three heads:

    * a token-level NER classifier,
    * a sequence-level topic classifier whose input is the first-token
      representation concatenated with pooled NER probabilities,
    * a sequence-level intent classifier on the first-token representation.

    Args:
        phobert_path: Name or path passed to ``RobertaModel.from_pretrained``.
        num_ner_labels: Number of NER classes (size of the NER logits).
        num_topic_labels: Number of topic classes.
        num_intent_labels: Number of intent classes.
        dropout: Base dropout probability; several layers use a multiple of it.
    """

    def __init__(self, phobert_path, num_ner_labels, num_topic_labels, num_intent_labels, dropout=0.2):
        super(MultiTaskPhoBERT_WithFusion, self).__init__()

        self.phobert = RobertaModel.from_pretrained(phobert_path)
        # NOTE(review): these config values are assigned after the encoder has
        # been built; presumably they do not change its existing dropout
        # layers — confirm whether they are meant to have an effect.
        self.phobert.config.hidden_dropout_prob = 0.25
        self.phobert.config.attention_probs_dropout_prob = 0.25
        self.hidden_size = self.phobert.config.hidden_size
        self.num_ner_labels = num_ner_labels

        self.dropout = nn.Dropout(dropout)
        self.dropout_heavy = nn.Dropout(dropout * 1.5)

        # NER head: hidden -> hidden/2 -> num_ner_labels, applied per token
        self.ner_hidden = nn.Linear(self.hidden_size, self.hidden_size // 2)
        self.ner_norm = nn.LayerNorm(self.hidden_size // 2)
        self.ner_classifier = nn.Linear(self.hidden_size // 2, num_ner_labels)

        # Intent head: hidden -> hidden/2 -> num_intent_labels
        self.intent_hidden = nn.Linear(self.hidden_size, self.hidden_size // 2)
        self.intent_norm = nn.LayerNorm(self.hidden_size // 2)
        self.intent_classifier = nn.Linear(self.hidden_size // 2, num_intent_labels)

        # Topic fusion head: input is [first-token features ; pooled NER features]
        fusion_input_size = self.hidden_size + num_ner_labels
        self.topic_input_proj = nn.Linear(fusion_input_size, self.hidden_size)

        self.topic_layer1 = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout * 0.5)
        )

        self.topic_layer2 = nn.Sequential(
            nn.Linear(self.hidden_size, self.hidden_size),
            nn.LayerNorm(self.hidden_size),
            nn.GELU(),
            nn.Dropout(dropout * 0.5)
        )

        self.topic_classifier = nn.Linear(self.hidden_size, num_topic_labels)

        # Produces per-class weights (softmax over NER classes) for the pooled NER features
        self.ner_attention = nn.Sequential(
            nn.Linear(num_ner_labels, num_ner_labels // 2),
            nn.Tanh(),
            nn.Linear(num_ner_labels // 2, num_ner_labels),
            nn.Softmax(dim=-1)
        )

        # Cross-attention: the first-token vector attends over the whole sequence
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=self.hidden_size,
            num_heads=8,
            dropout=dropout,
            batch_first=True
        )
        self.cross_attn_norm = nn.LayerNorm(self.hidden_size)

        # Auxiliary topic head on NER features only. It is registered here
        # (so its weights are part of the state dict) but forward() below
        # never calls it.
        self.aux_topic_classifier = nn.Sequential(
            nn.Linear(num_ner_labels, num_ner_labels // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(num_ner_labels // 2, num_topic_labels)
        )

    def extract_ner_features(self, ner_logits, attention_mask):
        """Pool token-level NER probabilities into one vector per sequence.

        Softmax probabilities are zeroed at masked positions, then the
        per-class maximum and the per-class mean over unmasked tokens are
        averaged with equal weight and re-weighted by ``self.ner_attention``.

        Args:
            ner_logits: NER logits; pooling is done over dimension 1.
            attention_mask: Mask with 1 for real tokens and 0 for padding,
                matching the first two dimensions of ``ner_logits``.

        Returns:
            Tensor with the sequence dimension reduced away (one row of
            ``num_ner_labels`` features per sequence).
        """
        ner_probs = F.softmax(ner_logits, dim=-1)

        attention_mask_expanded = attention_mask.unsqueeze(-1).expand_as(ner_probs)
        ner_probs_masked = ner_probs * attention_mask_expanded

        max_features, _ = ner_probs_masked.max(dim=1)

        # clamp(min=1) avoids division by zero for a fully masked sequence
        seq_lengths = attention_mask.sum(dim=1, keepdim=True).clamp(min=1)
        avg_features = ner_probs_masked.sum(dim=1) / seq_lengths

        ner_features = 0.5 * max_features + 0.5 * avg_features

        attention_weights = self.ner_attention(ner_features)
        ner_features_weighted = ner_features * attention_weights

        return ner_features_weighted

    def forward(self, input_ids, attention_mask=None, ner_labels=None, topic_labels=None, intent_labels=None):
        """Run the encoder and all three heads.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Optional mask (1 = token, 0 = padding). When
                omitted, every position is treated as a real token.
            ner_labels, topic_labels, intent_labels: Accepted but not used by
                this method; no loss is computed here.

        Returns:
            dict with ``'loss'`` (always ``None``), ``'ner_logits'``,
            ``'topic_logits'`` and ``'intent_logits'``.
        """
        # The signature allows attention_mask=None, but the NER pooling below
        # needs a real tensor; an all-ones mask means "no padding".
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)

        outputs = self.phobert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs.last_hidden_state
        cls_output = sequence_output[:, 0, :]

        # Let the first-token vector attend over the sequence, ignoring padding,
        # then add it back residually and normalise.
        cls_expanded = cls_output.unsqueeze(1)
        attn_output, _ = self.cross_attention(
            query=cls_expanded,
            key=sequence_output,
            value=sequence_output,
            key_padding_mask=(attention_mask == 0)
        )
        cls_output = self.cross_attn_norm(cls_output + attn_output.squeeze(1))
        cls_output_dropped = self.dropout_heavy(cls_output)

        # NER predictions (per token)
        ner_hidden = self.ner_hidden(sequence_output)
        ner_hidden = self.ner_norm(ner_hidden)
        ner_hidden = F.gelu(ner_hidden)
        ner_hidden = self.dropout(ner_hidden)
        ner_logits = self.ner_classifier(ner_hidden)

        ner_features = self.extract_ner_features(ner_logits, attention_mask)

        # Topic prediction: fused input followed by two residual blocks
        topic_input = torch.cat([cls_output_dropped, ner_features], dim=-1)
        topic_hidden = self.topic_input_proj(topic_input)
        topic_hidden = topic_hidden + self.topic_layer1(topic_hidden)
        topic_hidden = topic_hidden + self.topic_layer2(topic_hidden)
        topic_logits = self.topic_classifier(topic_hidden)

        # Intent prediction
        intent_hidden = self.intent_hidden(cls_output_dropped)
        intent_hidden = self.intent_norm(intent_hidden)
        intent_hidden = F.gelu(intent_hidden)
        intent_hidden = self.dropout(intent_hidden)
        intent_logits = self.intent_classifier(intent_hidden)

        return {
            'loss': None,
            'ner_logits': ner_logits,
            'topic_logits': topic_logits,
            'intent_logits': intent_logits
        }

# Confirmation message so the cell shows visible output once the class is defined.
print("Model class defined!")

Model class defined!


## 5. Load Model Weights

In [5]:
# Model directory on the local filesystem (mounted into the container).
local_model_path = '/opt/spark-apps/phobert_multitask_final'
print(f"Loading model from: {local_model_path}")

# The tokenizer is read from the same directory as the fine-tuned weights.
tokenizer = AutoTokenizer.from_pretrained(local_model_path)

# Rebuild the network from the hyper-parameters stored in model_config.
model = MultiTaskPhoBERT_WithFusion(
    model_config['phobert_base'],
    model_config['num_ner_labels'],
    model_config['num_topic_labels'],
    model_config['num_intent_labels'],
    model_config['dropout'],
)

# Restore the trained weights; map_location='cpu' loads them onto the CPU first.
state_dict = torch.load(f'{local_model_path}/pytorch_model.bin', map_location='cpu')
model.load_state_dict(state_dict)

# Switch to inference mode and move to the GPU when one is available.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = model.eval().to(device)

print(f"Model loaded successfully on {device}!")

Loading model from: /opt/spark-apps/phobert_multitask_final


Some weights of RobertaModel were not initialized from the model checkpoint at vinai/phobert-base-v2 and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded successfully on cpu!


## 6. Định Nghĩa Prediction Functions

In [6]:
def normalize_text_with_underscore(text, annotator):
    """Segment ``text`` with ``annotator.tokenize`` and flatten it to one string.

    ``annotator.tokenize(text)`` is iterated as a sequence of sentences, each a
    sequence of items. An item that is a list is joined with ``'_'``; any other
    item is used as-is. Items are joined with single spaces, and so are the
    sentences.

    Returns the flattened string. If anything raises (including ``annotator``
    being ``None``), the error is printed and the original ``text`` is
    returned unchanged.
    """
    def _as_token(item):
        # A list item represents a multi-part word: glue the parts with underscores.
        return '_'.join(item) if isinstance(item, list) else item

    try:
        segmented = annotator.tokenize(text)
        return ' '.join(
            ' '.join(_as_token(item) for item in sentence)
            for sentence in segmented
        )
    except Exception as e:
        print(f"Error normalizing text: {e}")
        return text


def predict_labels(text, model, tokenizer, label_mappings, device, annotator, max_length=256,
                   topic_threshold=0.5):
    """Predict NER, topic and intent labels for one input text.

    Args:
        text: Raw input text. ``None``, empty or whitespace-only text
            short-circuits to default labels without calling the model.
        model: Callable returning a dict with ``'ner_logits'``,
            ``'topic_logits'`` and ``'intent_logits'``.
        tokenizer: Tokenizer providing ``__call__``, ``tokenize`` and
            ``convert_ids_to_tokens``.
        label_mappings: Dict with ``'ner_id2label'``, ``'topic_id2label'`` and
            ``'intent_id2label'``, each keyed by the class id as a string.
        device: Torch device the input tensors are moved to.
        annotator: Word segmenter forwarded to ``normalize_text_with_underscore``.
        max_length: Length the tokenizer pads/truncates to.
        topic_threshold: Sigmoid probability above which a topic is emitted.

    Returns:
        dict with ``'normalized_text'``, ``'ner_labels'`` (space-separated, one
        label per whitespace token of the normalized text), ``'topic_label'``
        (``'|'``-joined, or ``'None'``) and ``'intent_label'``.
    """
    if not text or text.strip() == '':
        return {
            'normalized_text': '',
            'ner_labels': 'O',
            'topic_label': 'None',
            'intent_label': 'Unknown'
        }

    # Word-segment the text first
    normalized_text = normalize_text_with_underscore(text, annotator)
    normalized_tokens = normalized_text.split()

    # Tokenize for the model
    inputs = tokenizer(
        normalized_text,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

    # The model's forward() does not accept token_type_ids
    if 'token_type_ids' in inputs:
        inputs.pop('token_type_ids')

    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Inference only: no gradients needed
    with torch.no_grad():
        outputs = model(**inputs)

    topic_logits = outputs['topic_logits']
    intent_logits = outputs['intent_logits']

    # Per-position NER class ids for the single sequence in the batch
    ner_pred_ids = torch.argmax(outputs['ner_logits'], dim=-1)[0].tolist()
    phobert_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    ner_id2label = label_mappings['ner_id2label']

    # Align predictions with the segmented words: each word takes the label of
    # its first sub-token, then the cursor skips the word's remaining sub-tokens.
    ner_labels = []
    token_idx = 0

    for word in normalized_tokens:
        word_tokens = tokenizer.tokenize(word)

        # A word that yields no sub-tokens has no prediction of its own; give
        # it 'O' rather than the label of the following word.
        if not word_tokens:
            ner_labels.append('O')
            continue

        while token_idx < len(phobert_tokens) and phobert_tokens[token_idx] == '<s>':
            token_idx += 1

        # End of the (possibly truncated) sequence: no more predictions.
        if token_idx >= len(phobert_tokens) or phobert_tokens[token_idx] in ('</s>', '<pad>'):
            break

        ner_labels.append(ner_id2label[str(ner_pred_ids[token_idx])])
        token_idx += len(word_tokens)

    # Words cut off by truncation still get a label so that the output stays
    # aligned one-to-one with the tokens of normalized_text.
    ner_labels.extend(['O'] * (len(normalized_tokens) - len(ner_labels)))

    # Topic predictions (multi-label, thresholded sigmoid)
    topic_probs = torch.sigmoid(topic_logits)[0]
    topic_preds = (topic_probs > topic_threshold).nonzero(as_tuple=True)[0].cpu().tolist()

    if len(topic_preds) > 0:
        topic_labels = [label_mappings['topic_id2label'][str(idx)] for idx in topic_preds]
        topic_label = '|'.join(topic_labels)
    else:
        topic_label = 'None'

    # Intent prediction (single-label)
    intent_pred = torch.argmax(intent_logits, dim=-1).item()
    intent_label = label_mappings['intent_id2label'][str(intent_pred)]

    return {
        'normalized_text': normalized_text,
        'ner_labels': ' '.join(ner_labels),
        'topic_label': topic_label,
        'intent_label': intent_label
    }

# Confirmation message so the cell shows visible output once both functions are defined.
print("Prediction functions defined!")

Prediction functions defined!


## 7. Apply Model trên Toàn Bộ Dữ Liệu

In [7]:
# Định nghĩa schema cho DataFrame kết quả
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import days

# Schema of the result rows built on the driver before they are written out.
schema = StructType([
    StructField("postID", StringType(), False),
    StructField("timePublish", TimestampType(), False),
    StructField("description_Normalized", StringType(), True),
    StructField("Label_NER", StringType(), True),
    StructField("Label_Topic", StringType(), True),
    StructField("Label_Intent", StringType(), True),
    StructField("likeCount", IntegerType(), True),
    StructField("commentCount", IntegerType(), True),
    StructField("shareCount", IntegerType(), True),
    StructField("type", StringType(), True)
])

# Fully-qualified name of the table that stores the predictions.
RESULT_TABLE = "nessie.silver_tables.result_multi_model"


def _to_int(value):
    """Return ``int(value)``, or 0 when ``value`` is falsy (None, 0, '')."""
    return int(value) if value else 0


def _build_record(row, normalized_text, ner_labels='O', topic_label='None', intent_label='Unknown'):
    """Merge one collected article row with its labels into a dict matching ``schema``.

    The label defaults are the placeholder values used for empty or failed articles.
    """
    return {
        'postID': row['articleID'],
        'timePublish': row['timePublish'],
        'description_Normalized': normalized_text,
        'Label_NER': ner_labels,
        'Label_Topic': topic_label,
        'Label_Intent': intent_label,
        'likeCount': _to_int(row['likeCount']),
        'commentCount': _to_int(row['commentCount']),
        'shareCount': _to_int(row['shareCount']),
        'type': row['type']
    }


def _result_writer(df):
    """Return the DataFrameWriterV2 for RESULT_TABLE with the shared write properties."""
    return (
        df.writeTo(RESULT_TABLE)
        .using("iceberg")
        .tableProperty("write.format.default", "parquet")
        .tableProperty("write.metadata.compression-codec", "gzip")
        .tableProperty("write.parquet.compression-codec", "gzip")
    )


def _save_records(records, create_notice=None):
    """Append ``records`` to RESULT_TABLE, creating the table if it is missing.

    Args:
        records: List of dicts produced by ``_build_record``.
        create_notice: Optional message printed right before the table is created.

    Returns:
        True when the table had to be created, False when the rows were appended.

    Raises:
        Exception: any failure other than a missing-table error on append, or
            a failure while creating the table.
    """
    df = spark.createDataFrame(records, schema=schema)
    df = df.withColumn("created_at", current_timestamp()) \
           .withColumn("updated_at", current_timestamp())
    try:
        _result_writer(df).append()
        return False
    except Exception as err:
        # Missing-table errors are recognised by message text.
        message = str(err).lower()
        if "table does not exist" not in message and "not found" not in message:
            raise

    if create_notice:
        print(create_notice)
    # First write: create the table, partitioned by day of timePublish.
    _result_writer(df).partitionedBy(days(col("timePublish"))).create()
    return True


# Find which articles were already processed in a previous run
try:
    df_existing = spark.table(RESULT_TABLE)
    existing_post_ids = [row.postID for row in df_existing.select("postID").distinct().collect()]
    print(f"Tìm thấy {len(existing_post_ids)} articles đã được xử lý trước đó")
except Exception as e:
    print(f"Table result_multi_model chưa tồn tại hoặc rỗng: {e}")
    existing_post_ids = []

# Keep only the articles that have not been processed yet
if existing_post_ids:
    df_articles_new = df_articles.filter(~col("articleID").isin(existing_post_ids))
    print(f"Tổng số articles mới cần xử lý: {df_articles_new.count()}")
else:
    df_articles_new = df_articles
    print(f"Xử lý toàn bộ {df_articles.count()} articles")

print("\n=== Collecting data to process ===")
# Collect to the driver for processing (suitable for a small dataset)
articles_to_process = df_articles_new.select("articleID", "description", "timePublish",
                                              "likeCount", "commentCount", "shareCount", "type").collect()
print(f"Đã collect {len(articles_to_process)} articles về driver")

# Run every article through the model that is already loaded on the driver
results = []
batch_size = 50  # Save and flush every 50 records
start_time = time.time()

print("\n=== Processing articles with model ===")
for idx, row in enumerate(articles_to_process):
    if idx % 10 == 0:
        elapsed = time.time() - start_time
        avg_time = elapsed / (idx + 1) if idx > 0 else 0
        remaining = avg_time * (len(articles_to_process) - idx)
        print(f"[{time.strftime('%H:%M:%S')}] Processed {idx}/{len(articles_to_process)} | "
              f"Elapsed: {elapsed:.1f}s | ETA: {remaining:.1f}s")

    article_id = row['articleID']
    description = row['description']

    if not description or description.strip() == '':
        # Nothing to predict on: store placeholder labels
        results.append(_build_record(row, ''))
    else:
        try:
            predictions = predict_labels(description, model, tokenizer, label_mappings, device, annotator)
            results.append(_build_record(
                row,
                predictions['normalized_text'],
                predictions['ner_labels'],
                predictions['topic_label'],
                predictions['intent_label'],
            ))
        except Exception as e:
            # Keep the article in the output with placeholder labels and the raw text
            print(f"Error processing article {article_id}: {e}")
            results.append(_build_record(row, description))

    # Save the batch and reset `results` to free memory
    if (idx + 1) % batch_size == 0 and len(results) > 0:
        try:
            created = _save_records(results, create_notice="Creating table for first batch...")
        except Exception as batch_err:
            # `results` is kept, so these rows are retried with the next batch
            print(f"✗ Error saving batch: {batch_err}")
        else:
            if created:
                print(f"✓ Table created with {len(results)} records")
            else:
                print(f"✓ Saved batch: {len(results)} records | Total processed: {idx + 1}")
            results = []  # Reset

# Save the remaining records (if any)
if len(results) > 0:
    try:
        created = _save_records(results)
    except Exception as final_err:
        # Report the failure instead of dropping the remaining rows silently
        print(f"✗ Error saving final batch ({len(results)} records not saved): {final_err}")
    else:
        if created:
            print(f"✓ Table created with {len(results)} records")
        else:
            print(f"✓ Saved final batch: {len(results)} records")

total_time = time.time() - start_time
print(f"\n=== Completed processing {len(articles_to_process)} articles in {total_time:.1f}s ===")

# Verify what was saved
print("\n=== Checking saved results ===")
try:
    df_check = spark.table(RESULT_TABLE)
    print(f"Total records in table: {df_check.count()}")
    print("\nRecent 5 records:")
    df_check.orderBy(col("created_at").desc()).show(5, truncate=80)
except Exception as e:
    print(f"Could not read table: {e}")

Tìm thấy 0 articles đã được xử lý trước đó


                                                                                

Xử lý toàn bộ 3295 articles

=== Collecting data to process ===


                                                                                

Đã collect 3295 articles về driver

=== Processing articles with model ===
[15:15:54] Processed 0/3295 | Elapsed: 0.0s | ETA: 0.0s
[15:15:57] Processed 10/3295 | Elapsed: 2.6s | ETA: 790.1s
[15:15:59] Processed 20/3295 | Elapsed: 4.7s | ETA: 734.7s
[15:16:01] Processed 30/3295 | Elapsed: 6.8s | ETA: 712.8s
[15:16:03] Processed 40/3295 | Elapsed: 8.9s | ETA: 704.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 50
[15:16:09] Processed 50/3295 | Elapsed: 14.7s | ETA: 938.3s
[15:16:11] Processed 60/3295 | Elapsed: 16.8s | ETA: 890.6s
[15:16:13] Processed 70/3295 | Elapsed: 18.9s | ETA: 858.9s
[15:16:15] Processed 80/3295 | Elapsed: 21.1s | ETA: 838.0s
[15:16:17] Processed 90/3295 | Elapsed: 23.2s | ETA: 818.0s


                                                                                

✓ Saved batch: 50 records | Total processed: 100
[15:16:21] Processed 100/3295 | Elapsed: 26.8s | ETA: 847.6s
[15:16:23] Processed 110/3295 | Elapsed: 28.9s | ETA: 829.3s
[15:16:25] Processed 120/3295 | Elapsed: 31.0s | ETA: 813.5s
[15:16:27] Processed 130/3295 | Elapsed: 33.0s | ETA: 797.8s
[15:16:29] Processed 140/3295 | Elapsed: 35.0s | ETA: 783.0s


                                                                                

✓ Saved batch: 50 records | Total processed: 150
[15:16:33] Processed 150/3295 | Elapsed: 38.7s | ETA: 805.4s
[15:16:35] Processed 160/3295 | Elapsed: 40.7s | ETA: 792.4s
[15:16:37] Processed 170/3295 | Elapsed: 42.7s | ETA: 780.2s
[15:16:39] Processed 180/3295 | Elapsed: 44.8s | ETA: 770.9s
[15:16:41] Processed 190/3295 | Elapsed: 46.9s | ETA: 761.9s


                                                                                

✓ Saved batch: 50 records | Total processed: 200
[15:16:44] Processed 200/3295 | Elapsed: 50.2s | ETA: 772.9s
[15:16:46] Processed 210/3295 | Elapsed: 52.3s | ETA: 765.1s
[15:16:48] Processed 220/3295 | Elapsed: 54.5s | ETA: 758.0s
[15:16:50] Processed 230/3295 | Elapsed: 56.5s | ETA: 749.9s
[15:16:52] Processed 240/3295 | Elapsed: 58.6s | ETA: 742.8s
✓ Saved batch: 50 records | Total processed: 250
[15:16:56] Processed 250/3295 | Elapsed: 61.7s | ETA: 748.5s
[15:16:58] Processed 260/3295 | Elapsed: 63.7s | ETA: 741.1s
[15:17:00] Processed 270/3295 | Elapsed: 65.8s | ETA: 733.9s
[15:17:02] Processed 280/3295 | Elapsed: 67.8s | ETA: 727.1s
[15:17:04] Processed 290/3295 | Elapsed: 69.8s | ETA: 720.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 300
[15:17:07] Processed 300/3295 | Elapsed: 73.0s | ETA: 725.9s
[15:17:09] Processed 310/3295 | Elapsed: 75.0s | ETA: 719.6s
[15:17:11] Processed 320/3295 | Elapsed: 76.9s | ETA: 713.1s
[15:17:13] Processed 330/3295 | Elapsed: 78.9s | ETA: 706.7s
[15:17:15] Processed 340/3295 | Elapsed: 81.0s | ETA: 701.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 350
[15:17:18] Processed 350/3295 | Elapsed: 84.0s | ETA: 705.0s
[15:17:20] Processed 360/3295 | Elapsed: 86.1s | ETA: 700.4s
[15:17:22] Processed 370/3295 | Elapsed: 88.3s | ETA: 696.0s
[15:17:24] Processed 380/3295 | Elapsed: 90.4s | ETA: 691.6s
[15:17:26] Processed 390/3295 | Elapsed: 92.5s | ETA: 687.1s


                                                                                

✓ Saved batch: 50 records | Total processed: 400
[15:17:30] Processed 400/3295 | Elapsed: 95.7s | ETA: 690.7s
[15:17:32] Processed 410/3295 | Elapsed: 97.7s | ETA: 685.5s
[15:17:34] Processed 420/3295 | Elapsed: 99.9s | ETA: 682.1s
[15:17:36] Processed 430/3295 | Elapsed: 101.9s | ETA: 677.2s
[15:17:38] Processed 440/3295 | Elapsed: 103.8s | ETA: 672.3s
✓ Saved batch: 50 records | Total processed: 450
[15:17:41] Processed 450/3295 | Elapsed: 106.8s | ETA: 674.0s
[15:17:43] Processed 460/3295 | Elapsed: 108.9s | ETA: 669.7s
[15:17:45] Processed 470/3295 | Elapsed: 111.0s | ETA: 665.8s
[15:17:47] Processed 480/3295 | Elapsed: 113.0s | ETA: 661.5s
[15:17:49] Processed 490/3295 | Elapsed: 115.2s | ETA: 658.1s
✓ Saved batch: 50 records | Total processed: 500
[15:17:52] Processed 500/3295 | Elapsed: 118.4s | ETA: 660.4s
[15:17:54] Processed 510/3295 | Elapsed: 120.4s | ETA: 656.4s
[15:17:56] Processed 520/3295 | Elapsed: 122.5s | ETA: 652.5s
[15:17:58] Processed 530/3295 | Elapsed: 124.6s | 

                                                                                

✓ Saved batch: 50 records | Total processed: 550
[15:18:04] Processed 550/3295 | Elapsed: 129.8s | ETA: 646.8s
[15:18:06] Processed 560/3295 | Elapsed: 131.8s | ETA: 642.3s
[15:18:08] Processed 570/3295 | Elapsed: 133.7s | ETA: 638.3s
[15:18:10] Processed 580/3295 | Elapsed: 135.8s | ETA: 634.4s
[15:18:12] Processed 590/3295 | Elapsed: 137.7s | ETA: 630.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 600
[15:18:15] Processed 600/3295 | Elapsed: 140.9s | ETA: 632.0s
[15:18:17] Processed 610/3295 | Elapsed: 143.0s | ETA: 628.2s
[15:18:19] Processed 620/3295 | Elapsed: 145.1s | ETA: 624.8s
[15:18:21] Processed 630/3295 | Elapsed: 147.1s | ETA: 621.1s
[15:18:23] Processed 640/3295 | Elapsed: 149.1s | ETA: 617.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 650
[15:18:26] Processed 650/3295 | Elapsed: 152.3s | ETA: 618.8s
[15:18:28] Processed 660/3295 | Elapsed: 154.4s | ETA: 615.4s
[15:18:30] Processed 670/3295 | Elapsed: 156.5s | ETA: 612.0s
[15:18:32] Processed 680/3295 | Elapsed: 158.6s | ETA: 608.8s
[15:18:35] Processed 690/3295 | Elapsed: 160.6s | ETA: 605.6s
✓ Saved batch: 50 records | Total processed: 700
[15:18:37] Processed 700/3295 | Elapsed: 163.4s | ETA: 604.8s
[15:18:39] Processed 710/3295 | Elapsed: 165.4s | ETA: 601.3s
[15:18:41] Processed 720/3295 | Elapsed: 167.4s | ETA: 597.9s
[15:18:43] Processed 730/3295 | Elapsed: 169.5s | ETA: 594.6s
[15:18:45] Processed 740/3295 | Elapsed: 171.6s | ETA: 591.7s
✓ Saved batch: 50 records | Total processed: 750
[15:18:48] Processed 750/3295 | Elapsed: 174.4s | ETA: 591.1s
[15:18:50] Processed 760/3295 | Elapsed: 176.5s | ETA: 587.9s
[15:18:53] Processed 770/3295 | Elapsed: 178.6s | ETA: 585.1s
[15:18:55] Processed 780/3295 | Elapsed: 180.7s

                                                                                

✓ Saved batch: 50 records | Total processed: 800
[15:19:00] Processed 800/3295 | Elapsed: 185.9s | ETA: 579.0s
[15:19:02] Processed 810/3295 | Elapsed: 188.0s | ETA: 576.1s
[15:19:04] Processed 820/3295 | Elapsed: 190.1s | ETA: 573.1s
[15:19:06] Processed 830/3295 | Elapsed: 192.2s | ETA: 570.1s
[15:19:08] Processed 840/3295 | Elapsed: 194.2s | ETA: 566.9s
✓ Saved batch: 50 records | Total processed: 850
[15:19:11] Processed 850/3295 | Elapsed: 197.2s | ETA: 566.6s
[15:19:13] Processed 860/3295 | Elapsed: 199.2s | ETA: 563.4s
[15:19:15] Processed 870/3295 | Elapsed: 201.3s | ETA: 560.5s
[15:19:17] Processed 880/3295 | Elapsed: 203.3s | ETA: 557.4s
[15:19:19] Processed 890/3295 | Elapsed: 205.4s | ETA: 554.4s
✓ Saved batch: 50 records | Total processed: 900
[15:19:22] Processed 900/3295 | Elapsed: 208.4s | ETA: 554.0s
[15:19:24] Processed 910/3295 | Elapsed: 210.5s | ETA: 551.0s
[15:19:26] Processed 920/3295 | Elapsed: 212.5s | ETA: 548.0s
[15:19:28] Processed 930/3295 | Elapsed: 214.5s

                                                                                

✓ Saved batch: 50 records | Total processed: 1450
[15:21:23] Processed 1450/3295 | Elapsed: 329.5s | ETA: 418.9s
[15:21:25] Processed 1460/3295 | Elapsed: 331.5s | ETA: 416.4s
[15:21:27] Processed 1470/3295 | Elapsed: 333.6s | ETA: 413.9s
[15:21:29] Processed 1480/3295 | Elapsed: 335.6s | ETA: 411.2s
[15:21:31] Processed 1490/3295 | Elapsed: 337.6s | ETA: 408.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 1500
[15:21:35] Processed 1500/3295 | Elapsed: 340.8s | ETA: 407.6s
[15:21:37] Processed 1510/3295 | Elapsed: 342.8s | ETA: 405.0s
[15:21:39] Processed 1520/3295 | Elapsed: 344.9s | ETA: 402.5s
[15:21:41] Processed 1530/3295 | Elapsed: 346.9s | ETA: 399.9s
[15:21:43] Processed 1540/3295 | Elapsed: 349.0s | ETA: 397.5s
✓ Saved batch: 50 records | Total processed: 1550
[15:21:46] Processed 1550/3295 | Elapsed: 352.1s | ETA: 396.2s
[15:21:48] Processed 1560/3295 | Elapsed: 354.2s | ETA: 393.7s
[15:21:50] Processed 1570/3295 | Elapsed: 356.2s | ETA: 391.1s
[15:21:52] Processed 1580/3295 | Elapsed: 358.4s | ETA: 388.7s
[15:21:54] Processed 1590/3295 | Elapsed: 360.5s | ETA: 386.3s
✓ Saved batch: 50 records | Total processed: 1600
[15:21:57] Processed 1600/3295 | Elapsed: 363.1s | ETA: 384.5s
[15:21:59] Processed 1610/3295 | Elapsed: 365.2s | ETA: 382.0s
[15:22:01] Processed 1620/3295 | Elapsed: 367.3s | ETA: 379.5s
[15:22:03] Processed 1630/3295 

                                                                                

✓ Saved batch: 50 records | Total processed: 1850
[15:22:52] Processed 1850/3295 | Elapsed: 418.5s | ETA: 326.7s
[15:22:54] Processed 1860/3295 | Elapsed: 420.6s | ETA: 324.3s
[15:22:56] Processed 1870/3295 | Elapsed: 422.6s | ETA: 321.9s
[15:22:59] Processed 1880/3295 | Elapsed: 424.8s | ETA: 319.5s
[15:23:01] Processed 1890/3295 | Elapsed: 426.8s | ETA: 317.1s


                                                                                

✓ Saved batch: 50 records | Total processed: 1900
[15:23:04] Processed 1900/3295 | Elapsed: 430.1s | ETA: 315.6s
[15:23:06] Processed 1910/3295 | Elapsed: 432.1s | ETA: 313.1s
[15:23:08] Processed 1920/3295 | Elapsed: 434.1s | ETA: 310.7s
[15:23:10] Processed 1930/3295 | Elapsed: 436.1s | ETA: 308.3s
[15:23:12] Processed 1940/3295 | Elapsed: 438.2s | ETA: 305.9s
✓ Saved batch: 50 records | Total processed: 1950
[15:23:15] Processed 1950/3295 | Elapsed: 441.2s | ETA: 304.1s
[15:23:17] Processed 1960/3295 | Elapsed: 443.2s | ETA: 301.7s
[15:23:19] Processed 1970/3295 | Elapsed: 445.2s | ETA: 299.3s
[15:23:21] Processed 1980/3295 | Elapsed: 447.3s | ETA: 296.9s
[15:23:23] Processed 1990/3295 | Elapsed: 449.4s | ETA: 294.5s
✓ Saved batch: 50 records | Total processed: 2000
[15:23:26] Processed 2000/3295 | Elapsed: 452.1s | ETA: 292.6s
[15:23:28] Processed 2010/3295 | Elapsed: 454.1s | ETA: 290.2s
[15:23:30] Processed 2020/3295 | Elapsed: 456.1s | ETA: 287.7s
[15:23:32] Processed 2030/3295 

                                                                                

✓ Saved batch: 50 records | Total processed: 2100
[15:23:48] Processed 2100/3295 | Elapsed: 474.4s | ETA: 269.8s
[15:23:50] Processed 2110/3295 | Elapsed: 476.4s | ETA: 267.4s
[15:23:52] Processed 2120/3295 | Elapsed: 478.5s | ETA: 265.1s
[15:23:54] Processed 2130/3295 | Elapsed: 480.6s | ETA: 262.8s
[15:23:57] Processed 2140/3295 | Elapsed: 482.6s | ETA: 260.4s


                                                                                

✓ Saved batch: 50 records | Total processed: 2150
[15:24:01] Processed 2150/3295 | Elapsed: 486.7s | ETA: 259.1s
[15:24:03] Processed 2160/3295 | Elapsed: 488.9s | ETA: 256.8s
[15:24:05] Processed 2170/3295 | Elapsed: 491.0s | ETA: 254.5s
[15:24:07] Processed 2180/3295 | Elapsed: 493.1s | ETA: 252.1s
[15:24:09] Processed 2190/3295 | Elapsed: 495.1s | ETA: 249.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 2200
[15:24:13] Processed 2200/3295 | Elapsed: 498.8s | ETA: 248.2s
[15:24:15] Processed 2210/3295 | Elapsed: 500.9s | ETA: 245.8s
[15:24:17] Processed 2220/3295 | Elapsed: 502.8s | ETA: 243.4s
[15:24:19] Processed 2230/3295 | Elapsed: 504.9s | ETA: 241.0s
[15:24:21] Processed 2240/3295 | Elapsed: 506.9s | ETA: 238.6s


                                                                                

✓ Saved batch: 50 records | Total processed: 2250
[15:24:25] Processed 2250/3295 | Elapsed: 510.7s | ETA: 237.1s
[15:24:27] Processed 2260/3295 | Elapsed: 512.7s | ETA: 234.7s
[15:24:29] Processed 2270/3295 | Elapsed: 514.7s | ETA: 232.3s
[15:24:31] Processed 2280/3295 | Elapsed: 516.7s | ETA: 229.9s
[15:24:33] Processed 2290/3295 | Elapsed: 518.7s | ETA: 227.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 2300
[15:24:36] Processed 2300/3295 | Elapsed: 521.9s | ETA: 225.7s
[15:24:38] Processed 2310/3295 | Elapsed: 524.0s | ETA: 223.3s
[15:24:40] Processed 2320/3295 | Elapsed: 526.1s | ETA: 221.0s
[15:24:42] Processed 2330/3295 | Elapsed: 528.1s | ETA: 218.6s
[15:24:44] Processed 2340/3295 | Elapsed: 530.4s | ETA: 216.4s


                                                                                

✓ Saved batch: 50 records | Total processed: 2350
[15:24:47] Processed 2350/3295 | Elapsed: 533.5s | ETA: 214.5s
[15:24:49] Processed 2360/3295 | Elapsed: 535.5s | ETA: 212.1s
[15:24:51] Processed 2370/3295 | Elapsed: 537.6s | ETA: 209.7s
[15:24:54] Processed 2380/3295 | Elapsed: 539.7s | ETA: 207.4s
[15:24:56] Processed 2390/3295 | Elapsed: 541.7s | ETA: 205.0s
✓ Saved batch: 50 records | Total processed: 2400
[15:24:59] Processed 2400/3295 | Elapsed: 544.8s | ETA: 203.1s
[15:25:01] Processed 2410/3295 | Elapsed: 546.8s | ETA: 200.7s
[15:25:03] Processed 2420/3295 | Elapsed: 548.9s | ETA: 198.4s
[15:25:05] Processed 2430/3295 | Elapsed: 551.0s | ETA: 196.0s
[15:25:07] Processed 2440/3295 | Elapsed: 553.0s | ETA: 193.7s
✓ Saved batch: 50 records | Total processed: 2450
[15:25:10] Processed 2450/3295 | Elapsed: 555.8s | ETA: 191.6s
[15:25:12] Processed 2460/3295 | Elapsed: 557.9s | ETA: 189.3s
[15:25:14] Processed 2470/3295 | Elapsed: 560.0s | ETA: 187.0s
[15:25:16] Processed 2480/3295 

                                                                                

✓ Saved batch: 50 records | Total processed: 2550
[15:25:32] Processed 2550/3295 | Elapsed: 577.9s | ETA: 168.8s
[15:25:34] Processed 2560/3295 | Elapsed: 580.0s | ETA: 166.5s
[15:25:36] Processed 2570/3295 | Elapsed: 582.1s | ETA: 164.1s
[15:25:38] Processed 2580/3295 | Elapsed: 584.2s | ETA: 161.8s
[15:25:40] Processed 2590/3295 | Elapsed: 586.2s | ETA: 159.5s


                                                                                

✓ Saved batch: 50 records | Total processed: 2600
[15:25:43] Processed 2600/3295 | Elapsed: 589.4s | ETA: 157.5s
[15:25:45] Processed 2610/3295 | Elapsed: 591.6s | ETA: 155.2s
[15:25:47] Processed 2620/3295 | Elapsed: 593.6s | ETA: 152.9s
[15:25:50] Processed 2630/3295 | Elapsed: 595.7s | ETA: 150.6s
[15:25:52] Processed 2640/3295 | Elapsed: 597.7s | ETA: 148.2s
✓ Saved batch: 50 records | Total processed: 2650
[15:25:55] Processed 2650/3295 | Elapsed: 600.6s | ETA: 146.1s
[15:25:57] Processed 2660/3295 | Elapsed: 602.7s | ETA: 143.8s
[15:25:59] Processed 2670/3295 | Elapsed: 604.7s | ETA: 141.5s
[15:26:01] Processed 2680/3295 | Elapsed: 606.7s | ETA: 139.2s
[15:26:03] Processed 2690/3295 | Elapsed: 608.7s | ETA: 136.9s
✓ Saved batch: 50 records | Total processed: 2700
[15:26:06] Processed 2700/3295 | Elapsed: 611.7s | ETA: 134.7s
[15:26:08] Processed 2710/3295 | Elapsed: 613.7s | ETA: 132.4s
[15:26:10] Processed 2720/3295 | Elapsed: 615.7s | ETA: 130.1s
[15:26:12] Processed 2730/3295 

                                                                                

✓ Saved batch: 50 records | Total processed: 2750
[15:26:17] Processed 2750/3295 | Elapsed: 623.4s | ETA: 123.5s
[15:26:19] Processed 2760/3295 | Elapsed: 625.5s | ETA: 121.2s
[15:26:21] Processed 2770/3295 | Elapsed: 627.5s | ETA: 118.9s
[15:26:24] Processed 2780/3295 | Elapsed: 629.7s | ETA: 116.6s
[15:26:26] Processed 2790/3295 | Elapsed: 631.7s | ETA: 114.3s


                                                                                

✓ Saved batch: 50 records | Total processed: 2800
[15:26:29] Processed 2800/3295 | Elapsed: 635.4s | ETA: 112.3s
[15:26:31] Processed 2810/3295 | Elapsed: 637.3s | ETA: 110.0s
[15:26:33] Processed 2820/3295 | Elapsed: 639.3s | ETA: 107.6s
[15:26:35] Processed 2830/3295 | Elapsed: 641.3s | ETA: 105.3s
[15:26:37] Processed 2840/3295 | Elapsed: 643.4s | ETA: 103.0s
✓ Saved batch: 50 records | Total processed: 2850
[15:26:40] Processed 2850/3295 | Elapsed: 646.3s | ETA: 100.9s
[15:26:42] Processed 2860/3295 | Elapsed: 648.3s | ETA: 98.6s
[15:26:44] Processed 2870/3295 | Elapsed: 650.5s | ETA: 96.3s
[15:26:46] Processed 2880/3295 | Elapsed: 652.5s | ETA: 94.0s
[15:26:48] Processed 2890/3295 | Elapsed: 654.5s | ETA: 91.7s
✓ Saved batch: 50 records | Total processed: 2900
[15:26:51] Processed 2900/3295 | Elapsed: 657.5s | ETA: 89.5s
[15:26:54] Processed 2910/3295 | Elapsed: 659.7s | ETA: 87.2s
[15:26:56] Processed 2920/3295 | Elapsed: 661.7s | ETA: 85.0s
[15:26:58] Processed 2930/3295 | Elaps

                                                                                

✓ Saved batch: 50 records | Total processed: 3150
[15:27:46] Processed 3150/3295 | Elapsed: 712.1s | ETA: 32.8s
[15:27:48] Processed 3160/3295 | Elapsed: 714.1s | ETA: 30.5s
[15:27:50] Processed 3170/3295 | Elapsed: 716.1s | ETA: 28.2s
[15:27:52] Processed 3180/3295 | Elapsed: 718.1s | ETA: 26.0s
[15:27:54] Processed 3190/3295 | Elapsed: 720.1s | ETA: 23.7s


                                                                                

✓ Saved batch: 50 records | Total processed: 3200
[15:27:57] Processed 3200/3295 | Elapsed: 723.6s | ETA: 21.5s
[15:27:59] Processed 3210/3295 | Elapsed: 725.6s | ETA: 19.2s
[15:28:01] Processed 3220/3295 | Elapsed: 727.6s | ETA: 16.9s
[15:28:04] Processed 3230/3295 | Elapsed: 729.7s | ETA: 14.7s
[15:28:06] Processed 3240/3295 | Elapsed: 731.8s | ETA: 12.4s


                                                                                

✓ Saved batch: 50 records | Total processed: 3250
[15:28:09] Processed 3250/3295 | Elapsed: 735.0s | ETA: 10.2s
[15:28:11] Processed 3260/3295 | Elapsed: 737.0s | ETA: 7.9s
[15:28:13] Processed 3270/3295 | Elapsed: 739.1s | ETA: 5.6s
[15:28:15] Processed 3280/3295 | Elapsed: 741.2s | ETA: 3.4s
[15:28:17] Processed 3290/3295 | Elapsed: 743.2s | ETA: 1.1s


                                                                                

✓ Saved final batch: 45 records

=== Completed processing 3295 articles in 745.5s ===

=== Checking saved results ===
Total records in table: 3295

Recent 5 records:


                                                                                

+---------------------------------------------------+-------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-----------+------------+---------+------------+----------+------+--------------------------+--------------------------+
|                                             postID|        timePublish|                                                          description_Normalized|                                                                       Label_NER|Label_Topic|Label_Intent|likeCount|commentCount|shareCount|  type|                created_at|                updated_at|
+---------------------------------------------------+-------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+-----------+------------+---------+------------+-----

## 8. Verify Dữ Liệu Đã Lưu

In [8]:
# Sanity-check the result table (cell 7 already wrote it batch by batch).
# Any failure while reading is reported as a message instead of raising,
# so the notebook keeps running.
try:
    df_verify = spark.table("nessie.silver_tables.result_multi_model")
    total_count = df_verify.count()

    print(f"✓ Total records in table: {total_count}")
    print("\nRecent saved records:")
    # Same "newest first" ordering is reused for both displays below.
    newest_first = col("created_at").desc()
    df_verify.orderBy(newest_first).show(5, truncate=False)

    # Row count per distinct created_at value (top 20, newest first);
    # presumably each saved batch shares one timestamp -- confirm in cell 7.
    print("\n=== Records saved by time ===")
    rows_per_timestamp = df_verify.groupBy("created_at").count()
    rows_per_timestamp.orderBy(newest_first).show(20, truncate=False)

except Exception as e:
    print(f"✗ Error reading table: {e}")

✓ Total records in table: 3295

Recent saved records:


                                                                                

+--------------------------------------------------+-------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------+------------+---------+------------+----------+------+--------------------------+--------------------------+
|postID                                            |timePublish     



+--------------------------+-----+
|created_at                |count|
+--------------------------+-----+
|2025-12-06 15:28:18.630403|45   |
|2025-12-06 15:28:08.21911 |50   |
|2025-12-06 15:27:56.539175|50   |
|2025-12-06 15:27:45.384132|50   |
|2025-12-06 15:27:34.661695|50   |
|2025-12-06 15:27:23.890264|50   |
|2025-12-06 15:27:13.172456|50   |
|2025-12-06 15:27:02.232199|50   |
|2025-12-06 15:26:50.862312|50   |
|2025-12-06 15:26:39.822783|50   |
|2025-12-06 15:26:28.087923|50   |
|2025-12-06 15:26:16.231217|50   |
|2025-12-06 15:26:05.181591|50   |
|2025-12-06 15:25:54.054918|50   |
|2025-12-06 15:25:42.645255|50   |
|2025-12-06 15:25:31.266543|50   |
|2025-12-06 15:25:20.530875|50   |
|2025-12-06 15:25:09.393024|50   |
|2025-12-06 15:24:58.097393|50   |
|2025-12-06 15:24:46.750668|50   |
+--------------------------+-----+
only showing top 20 rows



                                                                                

## 9. Thống kê kết quả

In [9]:
# Re-read the stored predictions from the result table
df_verify = spark.table("nessie.silver_tables.result_multi_model")

print(f"Total records in result_multi_model: {df_verify.count()}")
print("\nSample records:")
df_verify.show(10, truncate=False)

# Frequency tables for the two classification outputs, most common first
most_frequent_first = col("count").desc()

print("\n=== Label Statistics ===")
print("\nTopic distribution:")
topic_counts = df_verify.groupBy("Label_Topic").count()
# Only the 10 most frequent topic combinations are displayed
topic_counts.orderBy(most_frequent_first).show(10, truncate=False)

print("\nIntent distribution:")
intent_counts = df_verify.groupBy("Label_Intent").count()
# No explicit row limit here, so show() falls back to its default
intent_counts.orderBy(most_frequent_first).show(truncate=False)

Total records in result_multi_model: 3295

Sample records:
+-----------------------------------------+-------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+----------------+---------+------------+----------+--------+--------------------------+--------------------------+
|postID                                   |timePublish        |description_Normalized                                                                                            

                                                                                

+-------------------------+-----+
|Label_Topic              |count|
+-------------------------+-----+
|MAJOR                    |645  |
|UNIVERSITY               |564  |
|MAJOR|UNIVERSITY         |345  |
|CAREER|MAJOR             |308  |
|OTHER                    |208  |
|STUDY                    |154  |
|TUITION|UNIVERSITY       |129  |
|MAJOR|TUITION|UNIVERSITY |108  |
|MAJOR|SUBJECT_COMBINATION|83   |
|None                     |80   |
+-------------------------+-----+
only showing top 10 rows


Intent distribution:




+----------------+-----+
|Label_Intent    |count|
+----------------+-----+
|share_info      |1577 |
|ask_info        |879  |
|ask_advice      |401  |
|other           |182  |
|ask_confirmation|108  |
|ask_comparison  |74   |
|ask_experience  |74   |
+----------------+-----+



                                                                                

## 10. Dừng Spark Session

In [10]:
# Stop the Spark session as the final step of the notebook.
# Cells that use `spark` must be re-run from the session-creation cell afterwards.
# The printed confirmation is Vietnamese for "Spark Session has been stopped!".
spark.stop()
print("Spark Session đã được dừng!")

Spark Session đã được dừng!
