In [24]:
import json
import pandas as pd
from pathlib import Path
from dataclasses import dataclass, asdict
# https://medium.com/willhanchen/%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E8%99%95%E7%90%86-spacy-%E5%88%9D%E6%8E%A2%E5%BC%B7%E5%A4%A7%E7%9A%84%E5%B7%A5%E5%85%B7%E5%BA%ABspacy-%E8%AE%93%E6%A9%9F%E5%99%A8%E8%AE%80%E6%87%82%E6%88%91%E5%80%91%E7%9A%84%E8%AA%9E%E8%A8%80%E5%90%A7-4a35daa895d0
import spacy
from spacy.cli import download
import nltk
from nltk.corpus import stopwords
import string
import re

import gensim
from gensim.models import FastText, word2vec, Word2Vec
from gensim.utils import tokenize
import numpy as np
from tqdm import tqdm


# for model
import torch
import torch.nn as nn
import lightning as L
import math
from natsort import natsorted

from torch.utils.data import DataLoader, Dataset
from torch.utils.data import random_split   
from torch.nn.utils.rnn import pad_sequence

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable

## Text

In [None]:
class TextData:
    # embedding model use fasttext
    def __init__(self, train_data:pd.DataFrame, test_data:pd.DataFrame):
        self.train_data = train_data
        self.test_data = test_data
        self.embedding_model_name = None
        
        self.setup()
        return 
    
    def train_embedding_model(self, result_files:dict, model_type:str="skipgram", embedding_size:int=100):
        self.embedding_model_name = model_type
        data_folder = Path(result_files["folder"])
        file_to_train = data_folder / result_files["embedding_training"]
        
        sentences = word2vec.LineSentence(file_to_train)
        
        self.model = FastText(
            sentences,
            vector_size=embedding_size,
            window=5,
            min_count=1,
            sg=1 if model_type == "skipgram" else 0, # skipgram = 1, cbow = 0
            hs=0,
            negative=5,
            epochs=10,
        )
        
        model_folder = Path("model")
        model_folder.mkdir(exist_ok=True, parents=True)
        
        model_file_name =  f"{model_type}.model"
        self.model.save(str(model_folder / model_file_name))
        
        return {"model_folder": str(model_folder), "model_file_name":model_file_name, "model_type": model_type}
    
    def load_embedding_model(self, model_path:str):
        self.embedding_model_name = model_path
        self.model = FastText.load(model_path)
        
        return
    
    def setup(self):
        # 第一次執行需要下載停用詞資源
        nltk.download("stopwords")
        download("en_core_web_sm")
        self.nlp = spacy.load("en_core_web_sm")
        return
    
    def clean_text_spacy(self, text:str) -> str:
        condition = lambda token : not token.is_stop and not token.is_punct and not token.is_digit
        
        doc = self.nlp(text)
        # 篩選出非停用字、非標點符號、非數字的 token，並轉為小寫
        tokens = [token.text.lower() for token in doc if condition(token)]
        return " ".join(tokens)
    
    def clean_text_nltk(self, text: str) -> str:
        # 轉成小寫
        text = text.lower()
        # 移除數字
        text = re.sub(r'\d+', '', text)
        # 移除標點符號
        text = re.sub(r'[{}]'.format(re.escape(string.punctuation)), '', text)
        # 移除多餘空白
        text = text.strip()
        # 斷詞
        tokens = text.split()
        # 移除停用字
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]
        # 回傳清理後的結果
        return " ".join(tokens)
    
    def batch_process_text(self, process_funcs:list[Callable], max_workers:int=None):
        result = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_func) for process_func in process_funcs]
            for future in as_completed(futures):
                result.append(future.result())
                
        return result
    
    def wrapper_function(self,id_:int, type_:int, headline_in:str, short_description_in:str, label:int, process_func:str):
        
        def process_it():
            headline = process_func(headline_in)
            short_description = process_func(short_description_in)
            item = "<CLS> " + headline + " <SEP> " +  short_description + " <END>"
            return {"id": id_, "type": type_, "process_text": item, "label": label}
        
        return process_it
    
    
    def process_all_data(self, process:str, batch_size:int=100, max_workers:int=None):
        process_func = self.clean_text_spacy if process == "spacy" else self.clean_text_nltk
        
        train_data_df = self.train_data[["id" , "headline", "short_description", "label"]].copy()
        train_data_df["type"] = "train"
        test_data_df = self.test_data[["id" ,"headline", "short_description"]].copy()
        test_data_df["type"] = "test"
        test_data_df["label"] = "IDK"
        
        data_df = pd.concat([train_data_df, test_data_df], axis=0)
        
        
        jobs = []
        for _, row in tqdm(data_df.iterrows(), total=data_df.shape[0], desc="Build Jobs"):
            id_ = row["id"]
            type_ = row["type"]
            headline = row["headline"]
            short_description = row["short_description"]
            label = row["label"]
            
            # 使用 wrapper_function 來包裝 process_func
            job = self.wrapper_function(id_, type_, headline, short_description, label , process_func)
            jobs.append(job)
        
        result = []
        for batch in tqdm(range(0, len(jobs), batch_size), desc="Processing text"):
            batch_jobs = jobs[batch:batch + batch_size]
            out = self.batch_process_text(batch_jobs, max_workers)
            result.extend(out)
        
        
        output_df = pd.DataFrame(result)
        
        folder = Path("temp")
        folder.mkdir(exist_ok=True, parents=True)
        
        prefix = f"temp_{process}"
        
        # to temp file for embedding
        with open(folder / f"{prefix}.txt", "w", encoding="utf-8") as f:
            # add special tokens
            f.write("<PAD> <CLS> <SEP> <END> <UNK>\n")
            for text in output_df["process_text"]:
                f.write(text + "\n")
        
        # save processed data
        output_df.to_csv(folder / f"{prefix}.csv", index=False)
        
        output_dict = {"folder":str(folder), "embedding_training" :f"{prefix}.txt", "record": f"{prefix}.csv" }
        # split train test data
        train_test_path = TextData.split_train_test_data(output_dict)
        
        return output_dict | train_test_path
    
    @staticmethod
    def split_train_test_data(process_all_data_output:dict):
        
        folder = process_all_data_output["folder"]
        folder = Path(folder) 
        record_file = process_all_data_output["record"]
        record_path = folder / record_file
        df = pd.read_csv(record_path, low_memory=False, dtype={"process_text": str})
        
        df_train = df[df["type"] == "train"].copy()
        df_test = df[df["type"] == "test"].copy()
        
        df_train = df_train.sort_values(by=["id"]).reset_index(drop=True)
        df_test = df_test.sort_values(by=["id"]).reset_index(drop=True)
        
        df_train = df_train.drop(columns=["type"])
        df_test = df_test.drop(columns=["type"])
        
        prefix = record_file.replace(".csv", "")    
        
        train_file = folder / f"{prefix}_train.csv"
        test_file = folder / f"{prefix}_test.csv"
        
        df_train.to_csv(train_file, index=False)
        df_test.to_csv(test_file, index=False)
        return {"train_path": str(train_file), "test_path": str(test_file)}
    
    
    @staticmethod
    def load_from_folder(folder:str): 
        path = Path(folder)
        if not path.exists():
            raise FileNotFoundError(f"Folder {folder} does not exist.")
        
        train_data_json = path / "News_train.json"
        test_data_json = path / "News_test.json"
        
        with open(train_data_json, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            train_data = [json.loads(line) for line in lines]   
            
        with open(test_data_json, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            test_data = [json.loads(line) for line in lines]
        
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
        
        return TextData(train_data, test_data)
    
    
    def tokenize(self, text:str | list[str]) -> np.ndarray:
        # 使用 FastText 模型進行斷詞
        unk_id = self.model.wv.key_to_index.get("<UNK>", 0)
        
        if isinstance(text, str):
            text_ls = text.split()
        elif isinstance(text, list):
            text_ls = text

        token_nums = [
            self.model.wv.key_to_index.get(text, unk_id)
            for text in text_ls
        ]
        
        return token_nums
    
    
    def embedding_data(self):
        vocab = self.model.wv.index_to_key
        vocab_size = len(vocab)
        embed_dim = self.model.wv.vector_size
        embedding_weight = torch.FloatTensor(
            np.array([self.model.wv[word] for word in vocab])
        )
        
        return {
            "vocab": vocab,
            "vocab_size": vocab_size,
            "embed_dim": embed_dim,
            "embedding_weight": embedding_weight
        }

In [13]:
DATA_FOLDER = "./2025-deep-learning-hw-2-text-classification"

In [14]:
text_data = TextData.load_from_folder(DATA_FOLDER)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/keithlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m51.4 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [7]:
PROCESS_TYPE = "spacy"
PROCESS_DATA_BATCH_SIZE = 1000

In [None]:
output = text_data.process_all_data(PROCESS_TYPE, batch_size=PROCESS_DATA_BATCH_SIZE)

In [59]:
if True:
    folder = Path("temp")
    folder.mkdir(exist_ok=True, parents=True)

    prefix = f"temp_{PROCESS_TYPE}"
    output = {
        "folder":str(folder), 
        "embedding_training" :f"{prefix}.txt", 
        "record": f"{prefix}.csv",
        "train_path":f"{prefix}_train.csv", 
        "test_path": f"{prefix}_test.csv" 
    }
    print(output)

{'folder': 'temp', 'embedding_training': 'temp_spacy.txt', 'record': 'temp_spacy.csv', 'train_path': 'temp_spacy_train.csv', 'test_path': 'temp_spacy_test.csv'}


In [30]:
EMBEDDING_METHOD = "skipgram"
EMBEDDING_SIZE = 128

In [None]:
model_file = text_data.train_embedding_model(output, model_type=EMBEDDING_METHOD, embedding_size=EMBEDDING_SIZE)

In [31]:
if True:
    model_file = {"model_folder": "model", "model_file_name":f"{EMBEDDING_METHOD}.model", "model_type": EMBEDDING_METHOD}

In [32]:
EMBEDDING_MODEL_PATH = Path(model_file["model_folder"]) / model_file["model_file_name"]

In [33]:
text_data.load_embedding_model(str(EMBEDDING_MODEL_PATH))

## Dataset

In [55]:
class TextDataSet(Dataset):
    def __init__(self, df_in:pd.DataFrame, key_to_index:dict, for_test:bool=False):
        super().__init__()
        self.key_to_index = key_to_index
        self.data = df_in
        
        self.labels = natsorted(set(df_in["label"].values))
        self.for_test = for_test
        return
    
    def detokenize(self, token_nums:list[int]) -> str:
        # 使用 FastText 模型進行斷詞
        index_to_key = list(self.key_to_index.keys())
        text = " ".join([
            index_to_key[token_num]
            for token_num in token_nums
        ])
        
        return text
    
    def tokenize(self, text:str | list[str]) -> np.ndarray:
        # 使用 FastText 模型進行斷詞
        unk_id = self.key_to_index.get("<UNK>", 0)
        
        if isinstance(text, str):
            text_ls = text.split()
        elif isinstance(text, list):
            text_ls = text

        token_nums = [
            self.key_to_index.get(text, unk_id)
            for text in text_ls
        ]
        
        return token_nums
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index:int):
        line_data = self.data.iloc[index]
        text = line_data["process_text"]
        label = int(line_data["label"])
        token_nums = self.tokenize(text)
        token_nums = torch.tensor(token_nums, dtype=torch.long)
        
        if self.for_test:
            return token_nums
        
        return token_nums, label

In [None]:
class TextDataModule(L.LightningDataModule):
    def __init__(
        self,     
        text_data:TextData,
        process_all_data_output:dict, 
        batch_size:int=32, 
        num_workers:int=0, 
        pin_memory:bool=False,
        val_ratio:float=0.1,
    ):
        
        super().__init__()
        self.text_data = text_data  
        self.key_to_index = text_data.model.wv.key_to_index
        self.batch_size = batch_size
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.train_df_path = process_all_data_output["train_path"]
        self.test_df_path = process_all_data_output["test_path"]
        self.val_ratio = val_ratio
        return 

    def setup(self, stage=None):
        if stage == "fit" or stage is None:
            train_df = pd.read_csv(self.train_df_path)
            large_data = TextDataSet(train_df, key_to_index=self.key_to_index)
            total_size = len(large_data)
            val_size = int(total_size * self.val_ratio)
            train_size = total_size - val_size
            
            self.train_dataset, self.val_dataset = random_split(
                large_data, [train_size, val_size]
            )
            
            self.predict_df = pd.read_csv(self.test_df_path)
            self.predict_dataset = TextDataSet(self.predict_df, key_to_index=self.key_to_index, for_test=True)
            
        
        elif stage == "predict":
            self.predict_df = pd.read_csv(self.test_df_path)
            self.predict_dataset = TextDataSet(self.predict_df, key_to_index=self.key_to_index, for_test=True)
            return
    
    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            shuffle=True,
        )
    
    def val_dataloader(self):
        return DataLoader(
            self.val_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=self.pin_memory,
            shuffle=False,
        )
    
    def predict_dataloader(self):
        pass
    

## Model


In [36]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # 若 d_model 為奇數，cos 會略少一個維度
        if d_model % 2 == 1:
            pe[:, 1::2] = torch.cos(position * div_term[:-1])
        else:
            pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x shape: (batch_size, seq_len, d_model)
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)


@dataclass(slots=True)
class EmbeddingConfig:
    vocab_size: int
    embed_dim: int

    # fast_text_pretrained
    state: bool = False
    model_path: str = None
    freeze: bool = False

    @staticmethod
    def config(vocab_size: int, embed_dim: int):
        return EmbeddingConfig(vocab_size=vocab_size, embed_dim=embed_dim)

    @staticmethod
    def load_embedding_pretrain(embedding_path: str, freeze: bool = True):
        return EmbeddingConfig(
            vocab_size=0,
            embed_dim=0,
            state=True,
            model_path=embedding_path,
            freeze=freeze,
        )

    @staticmethod
    def embedding_data(model: Word2Vec):
        vocab = model.wv.index_to_key
        vocab_size = len(vocab)
        embed_dim = model.wv.vector_size
        embedding_weight = torch.FloatTensor(
            np.array([model.wv[word] for word in vocab])
        )

        return {
            "vocab": vocab,
            "vocab_size": vocab_size,
            "embed_dim": embed_dim,
            "embedding_weight": embedding_weight,
        }

    def build_embedding_weight(self) -> torch.Tensor:

        if not self.state:
            raise ValueError(
                "EmbeddingConfig: fasttext pretrained embedding not found."
            )

        file_path = Path(self.model_path)
        if not file_path.exists():
            raise ValueError(
                f"EmbeddingConfig: fasttext pretrained embedding not found. {file_path}"
            )

        model = FastText.load(str(file_path))
        embedding_data = self.embedding_data(model)
        self.vocab_size = embedding_data["vocab_size"]
        self.embed_dim = embedding_data["embed_dim"]

        return embedding_data["embedding_weight"]


class TransformerClassifier(L.LightningModule):
    def __init__(
        self,
        config_dict: dict,
        num_classes: int,
        nhead: int = 8,
        num_layers: int = 2,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
        max_seq_length: int = 512,
    ):
        super().__init__()

        # vocab_size, embed_dim, embedding_weight = self.decode_config(config_dict)
        config = EmbeddingConfig(**config_dict)
        self.embedding, config = self.build_embedding(config)
        self.embed_dim = config.embed_dim

        self.pos_encoder = PositionalEncoding(self.embed_dim, dropout, max_seq_length)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.embed_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )

        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

        # 可採用簡單的池化方式，例如取 [CLS] token 的輸出
        
        self.classifier = nn.Sequential(
            nn.Linear(self.embed_dim, self.embed_dim//2),
            nn.Tanh(),
            nn.Linear(self.embed_dim//2, num_classes),
        )

        self.save_hyperparameters()
        return

    def build_embedding(self, config: EmbeddingConfig):

        if not config.state:
            return nn.Embedding(config.vocab_size, config.embed_dim), config

        embedding_weight = config.build_embedding_weight()

        return (
            nn.Embedding.from_pretrained(embedding_weight, freeze=config.freeze),
            config,
        )

    def forward(self, src):
        # src shape: (batch_size, seq_len) --> token ids
        x = self.embedding(src) * math.sqrt(
            self.embed_dim
        )  # (batch_size, seq_len, embed_dim)
        x = self.pos_encoder(x)
        # TransformerEncoder 預設輸入 shape: (seq_len, batch_size, embed_dim)
        # x = x.transpose(0, 1)
        x = self.transformer_encoder(x)  # (seq_len, batch_size, embed_dim)
        # 此處假設第一個 token 為 [CLS] token，可用於分類
        # cls_token = x[0]  # shape: (batch_size, embed_dim)
        # x = x[:, 0, :]  # shape: (batch_size, embed_dim)
        x = x.mean(dim=1)  # shape: (batch_size, embed_dim)
        logits = self.classifier(x)  # shape: (batch_size, num_classes)
        return logits

    @staticmethod
    def load_from_embedding_pretrained(
        embedding_path: str,
        num_classes: int,
        feeze_pretrained_embedding: bool = True,
        nhead: int = 8,
        num_layers: int = 2,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
        max_seq_length: int = 512,
    ):
        embedding_config = EmbeddingConfig.load_embedding_pretrain(
            embedding_path, feeze_pretrained_embedding
        )

        return TransformerClassifier(
            asdict(embedding_config),
            num_classes=num_classes,
            nhead=nhead,
            num_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            max_seq_length=max_seq_length,
        )
    @staticmethod
    def build_model(
        vocab_size: int,
        embed_dim: int,
        num_classes: int,
        nhead: int = 8,
        num_layers: int = 2,
        dim_feedforward: int = 512,
        dropout: float = 0.1,
        max_seq_length: int = 512,
    ):
        embedding_config = EmbeddingConfig.config(
            vocab_size=vocab_size, embed_dim=embed_dim
        )
        return TransformerClassifier(
            asdict(embedding_config),
            num_classes=num_classes,
            nhead=nhead,
            num_layers=num_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            max_seq_length=max_seq_length,
        )

## Build Training Loop

In [37]:
model = TransformerClassifier.load_from_embedding_pretrained(
    str(EMBEDDING_MODEL_PATH),
    num_classes=4,
    feeze_pretrained_embedding=True,
    nhead=8,
    num_layers=2,
    dim_feedforward=512,
    dropout=0.1,
    max_seq_length=512,
)

In [38]:
print(model)

TransformerClassifier(
  (embedding): Embedding(74602, 128)
  (pos_encoder): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-1): 2 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=512, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=512, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): Tanh()
    (2): Linear(in_features=64, 

In [39]:
dummy_input = torch.randint(0, 100, (2, 512))
print(dummy_input.shape)
dummy_output = model(dummy_input)
print(dummy_output.shape)

torch.Size([2, 512])
torch.Size([2, 4])


In [66]:
#  # 範例：假設詞彙大小 10000, 嵌入維度 128, 分類類別數 5, 輸入長度 50
# batch_size = 16
# seq_len = 50
# vocab_size = 10000
# embed_dim = 128
# num_classes = 5

# model = TransformerClassifier(vocab_size, embed_dim, num_classes)
# sample_input = torch.randint(0, vocab_size, (batch_size, seq_len))
# output = model(sample_input)
# print(output.shape)  # 預期：(batch_size, num_classes)

In [None]:
# vocab_size = 10000
# embed_dim = 128
# model = TransformerClassifier.build_model(
#     vocab_size=vocab_size,
#     embed_dim=embed_dim,
#     num_classes=5,
#     nhead=8,
#     num_layers=6,
#     dim_feedforward=512,
#     dropout=0.1,
#     max_seq_length=512,
# )
# print(model)
# dummy_input = torch.randint(0, vocab_size, (16, 50))
# output = model(dummy_input)
# print(output.shape)  # 預期：(batch_size, num_classes)