In [20]:
import json
import pandas as pd
from pathlib import Path
from dataclasses import dataclass
# https://medium.com/willhanchen/%E8%87%AA%E7%84%B6%E8%AA%9E%E8%A8%80%E8%99%95%E7%90%86-spacy-%E5%88%9D%E6%8E%A2%E5%BC%B7%E5%A4%A7%E7%9A%84%E5%B7%A5%E5%85%B7%E5%BA%ABspacy-%E8%AE%93%E6%A9%9F%E5%99%A8%E8%AE%80%E6%87%82%E6%88%91%E5%80%91%E7%9A%84%E8%AA%9E%E8%A8%80%E5%90%A7-4a35daa895d0
import spacy
from spacy.cli import download
import nltk
from nltk.corpus import stopwords
import string
import re

import gensim
from gensim.models import FastText, word2vec

from tqdm import tqdm

# for model
import torch
import torch.nn as nn
import math

from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable

In [None]:
class TextData:
    # embedding model use fasttext
    def __init__(self, train_data:pd.DataFrame, test_data:pd.DataFrame):
        self.train_data = train_data
        self.test_data = test_data
        self.embedding_model_name = None
        
        self.setup()
        return 
    
    def train_embedding_model(self, result_files:dict, model_type:str="skipgram", embedding_size:int=100):
        self.embedding_model_name = model_type
        data_folder = Path(result_files["folder"])
        file_to_train = data_folder / result_files["embedding_training"]
        
        sentences = word2vec.LineSentence(file_to_train)
        
        self.model = FastText(
            sentences,
            vector_size=embedding_size,
            window=5,
            min_count=1,
            sg=1 if model_type == "skipgram" else 0, # skipgram = 1, cbow = 0
            hs=0,
            negative=5,
            epochs=10,
        )
        
        model_folder = Path("model")
        model_folder.mkdir(exist_ok=True, parents=True)
        
        model_file_name = model_folder / f"{model_type}.model"
        self.model.save(model_file_name)
        
        return {"model_folder": model_folder, "model_file_name":model_file_name, "model_type": model_type}
    
    def load_embedding_model(self, model_path:str):
        self.embedding_model_name = model_path
        self.model = FastText.load(model_path)
        
        return
    
    def setup(self):
        # 第一次執行需要下載停用詞資源
        nltk.download("stopwords")
        download("en_core_web_sm")
        self.nlp = spacy.load("en_core_web_sm")
        return
    
    def clean_text_spacy(self, text:str) -> str:
        condition = lambda token : not token.is_stop and not token.is_punct and not token.is_digit
        
        doc = self.nlp(text)
        # 篩選出非停用字、非標點符號、非數字的 token，並轉為小寫
        tokens = [token.text.lower() for token in doc if condition(token)]
        return " ".join(tokens)
    
    def clean_text_nltk(self, text: str) -> str:
        # 轉成小寫
        text = text.lower()
        # 移除數字
        text = re.sub(r'\d+', '', text)
        # 移除標點符號
        text = re.sub(r'[{}]'.format(re.escape(string.punctuation)), '', text)
        # 移除多餘空白
        text = text.strip()
        # 斷詞
        tokens = text.split()
        # 移除停用字
        stop_words = set(stopwords.words("english"))
        tokens = [word for word in tokens if word not in stop_words]
        # 回傳清理後的結果
        return " ".join(tokens)
    
    def batch_process_text(self, process_funcs:list[Callable], max_workers:int=None):
        result = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_func) for process_func in process_funcs]
            for future in as_completed(futures):
                result.append(future.result())
                
        return result
    
    def wrapper_function(self,id_:int, type_:int, headline_in:str, short_description_in:str, label:int, process_func:str):
        
        def process_it():
            headline = process_func(headline_in)
            short_description = process_func(short_description_in)
            item = "<HL_S> " + headline + " <HL_E> <SD_S> " +  short_description + " <SD_E>"
            return {"id": id_, "type": type_, "process_text": item, "label": label}
        
        return process_it
    
    
    def process_all_data(self, process:str, batch_size:int=100, max_workers:int=None):
        process_func = self.clean_text_spacy if process == "spacy" else self.clean_text_nltk
        
        train_data_df = self.train_data[["id" , "headline", "short_description", "label"]].copy()
        train_data_df["type"] = "train"
        test_data_df = self.test_data[["id" ,"headline", "short_description"]].copy()
        test_data_df["type"] = "test"
        test_data_df["label"] = "IDK"
        
        data_df = pd.concat([train_data_df, test_data_df], axis=0)
        
        
        jobs = []
        for _, row in tqdm(data_df.iterrows(), total=data_df.shape[0], desc="Build Jobs"):
            id_ = row["id"]
            type_ = row["type"]
            headline = row["headline"]
            short_description = row["short_description"]
            label = row["label"]
            
            # 使用 wrapper_function 來包裝 process_func
            job = self.wrapper_function(id_, type_, headline, short_description, label , process_func)
            jobs.append(job)
        
        result = []
        for batch in tqdm(range(0, len(jobs), batch_size), desc="Processing text"):
            batch_jobs = jobs[batch:batch + batch_size]
            out = self.batch_process_text(batch_jobs, max_workers)
            result.extend(out)
        
        
        output_df = pd.DataFrame(result)
        
        folder = Path("temp")
        folder.mkdir(exist_ok=True, parents=True)
        
        prefix = f"temp_{process}"
        
        # to temp file for embedding
        with open(folder / f"{prefix}.txt", "w", encoding="utf-8") as f:
            for text in output_df["process_text"]:
                f.write(text + "\n")
        
        # save processed data
        output_df.to_csv(folder / f"{prefix}.csv", index=False)
        
        return {"folder":str(folder), "embedding_training" :f"{prefix}.txt", "record": f"{prefix}.csv" }
    
    @staticmethod
    def load_from_folder(folder:str): 
        path = Path(folder)
        if not path.exists():
            raise FileNotFoundError(f"Folder {folder} does not exist.")
        
        train_data_json = path / "News_train.json"
        test_data_json = path / "News_test.json"
        
        with open(train_data_json, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            train_data = [json.loads(line) for line in lines]   
            
        with open(test_data_json, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            test_data = [json.loads(line) for line in lines]
        
        train_data = pd.DataFrame(train_data)
        test_data = pd.DataFrame(test_data)
        
        return TextData(train_data, test_data)
        

In [3]:
text_data = TextData.load_from_folder("./2025-deep-learning-hw-2-text-classification")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/keithlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
output = text_data.process_all_data("spacy", batch_size=1000)

Build Jobs:   0%|          | 0/136608 [00:00<?, ?it/s]

Build Jobs: 100%|██████████| 136608/136608 [00:06<00:00, 21645.63it/s]
Processing text: 100%|██████████| 137/137 [19:49<00:00,  8.68s/it]


In [8]:
model_file = text_data.train_embedding_model(output, model_type="skipgram", embedding_size=150)

In [9]:
model_file

{'model_folder': PosixPath('model'),
 'model_file_name': 'skipgram.model',
 'model_type': 'skipgram'}

In [19]:
text_data.model.wv["boy"]

array([ 0.1951654 , -0.86813563, -0.09932711, -0.5751072 , -0.14427118,
       -0.47718504,  0.18150555,  0.28145316,  0.00348425,  0.02763144,
        0.48817632, -0.19027652, -0.12103428,  0.1658833 , -0.27329892,
       -0.19987366,  0.12913217,  0.5826882 , -0.281025  ,  0.6271863 ,
       -0.47733048,  0.36992878, -0.43996844,  0.04975969,  0.30396414,
       -0.35255644, -0.27388632, -0.63669693, -0.02225855, -0.18977751,
       -0.3521751 , -0.48049617,  0.40071884, -0.10234027,  0.6778273 ,
       -0.16870269, -0.17364618, -0.18735719, -0.5169075 ,  0.20306389,
       -0.04351829,  0.16755113,  0.24116327,  0.31998774,  0.34392014,
       -0.4167296 ,  0.4412915 ,  0.07830356, -0.41733602,  0.6143932 ,
       -0.25059256,  0.0719436 ,  0.04181987,  0.13672096,  0.60727376,
        0.6580578 , -0.19885924, -0.30628523, -0.23641297,  0.00603269,
        0.03962618,  0.02013242, -0.23035033, -0.10245129,  0.31155032,
        0.1301886 ,  0.05717582, -0.62526053,  0.1374269 ,  0.05

In [12]:
len(text_data.model.wv.index_to_key)

74601

In [None]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super().__init__()
        self.dropout = nn.Dropout(dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        # 若 d_model 為奇數，cos 會略少一個維度
        if d_model % 2 == 1:
            pe[:, 1::2] = torch.cos(position * div_term[:-1])
        else:
            pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)  # shape: (1, max_len, d_model)
        self.register_buffer("pe", pe)

    def forward(self, x):
        # x shape: (batch_size, seq_len, d_model)
        x = x + self.pe[:, : x.size(1), :]
        return self.dropout(x)


class TransformerClassifier(nn.Module):
    def __init__(
        self,
        vocab_size,
        embed_dim,
        num_classes,
        nhead=8,
        num_layers=2,
        dim_feedforward=512,
        dropout=0.1,
        max_seq_length=512,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.pos_encoder = PositionalEncoding(embed_dim, dropout, max_seq_length)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=nhead,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
        )
        self.transformer_encoder = nn.TransformerEncoder(
            encoder_layer, num_layers=num_layers
        )

        # 可採用簡單的池化方式，例如取 [CLS] token 的輸出
        self.fc = nn.Linear(embed_dim, num_classes)
        self.embed_dim = embed_dim

    def forward(self, src):
        # src shape: (batch_size, seq_len) --> token ids
        x = self.embedding(src) * math.sqrt(
            self.embed_dim
        )  # (batch_size, seq_len, embed_dim)
        x = self.pos_encoder(x)
        # TransformerEncoder 預設輸入 shape: (seq_len, batch_size, embed_dim)
        x = x.transpose(0, 1)
        x = self.transformer_encoder(x)  # (seq_len, batch_size, embed_dim)
        # 此處假設第一個 token 為 [CLS] token，可用於分類
        cls_token = x[0]  # shape: (batch_size, embed_dim)
        logits = self.fc(cls_token)
        return logits