In [None]:
import json
import pandas as pd
from pathlib import Path
from dataclasses import dataclass
import spacy
from spacy.cli import download
import gensim
from gensim.models import FastText

from tqdm import tqdm


from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Callable

In [None]:
class TextData:
    """Hold the train/test news DataFrames and turn them into cleaned text.

    The cleaned text is written to a plain-text corpus (one document per line)
    intended for training an embedding model (the original author's note names
    FastText), and to a CSV that keeps id/type/label next to the text.

    NOTE: earlier drafts of FastText train/load helpers and of an NLTK-based
    cleaner were kept here as commented-out code; they were removed because
    nothing could call them. Adding a ``clean_text_<name>`` method makes
    ``process_all_data("<name>")`` available again.
    """

    # Columns of every record produced by process_all_data, in output order.
    _RECORD_COLUMNS = ["id", "type", "process_text", "label"]

    def __init__(self, train_data: pd.DataFrame, test_data: pd.DataFrame):
        """Store both frames and load the spaCy pipeline.

        Args:
            train_data: frame with at least id, headline, short_description, label.
            test_data: frame with at least id, headline, short_description.
        """
        self.train_data = train_data
        self.test_data = test_data
        # Placeholder for the embedding model name/path; nothing in this class sets it yet.
        self.embedding_model_name = None

        self.setup()

    def setup(self):
        """Load the spaCy English pipeline, downloading it only when it is missing."""
        model_name = "en_core_web_sm"
        try:
            self.nlp = spacy.load(model_name)
        except OSError:
            # spacy.load raises OSError when the model package is not installed;
            # only then pay for the download (needed on the first run).
            download(model_name)
            self.nlp = spacy.load(model_name)

    def clean_text_spacy(self, text: str) -> str:
        """Lower-case ``text`` and drop stop words, punctuation, digits and whitespace.

        Args:
            text: raw text. Non-string values (e.g. NaN/None from a missing
                field) are treated as empty text.

        Returns:
            The kept tokens, lower-cased and joined by single spaces.
        """
        if not isinstance(text, str):
            return ""

        doc = self.nlp(text)
        # Whitespace tokens (extra spaces, newlines) must be dropped as well:
        # a newline inside the result would break the one-document-per-line
        # corpus file written by process_all_data.
        tokens = [
            token.text.lower()
            for token in doc
            if not (token.is_stop or token.is_punct or token.is_digit or token.is_space)
        ]
        return " ".join(tokens)

    def batch_process_text(self, process_funcs: list[Callable], max_workers: int = None):
        """Run zero-argument callables on a thread pool.

        Args:
            process_funcs: callables taking no arguments.
            max_workers: pool size passed to ThreadPoolExecutor (None = its default).

        Returns:
            The callables' results, in the same order as ``process_funcs``.

        Raises:
            Whatever exception a callable raised (re-raised by ``Future.result``).
        """
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(process_func) for process_func in process_funcs]
            # Read the futures in submission order (not completion order) so
            # the output rows are deterministic.
            return [future.result() for future in futures]

    def wrapper_function(self, id_: int, type_: str, headline_in: str, short_description_in: str,
                         label: object, process_func: Callable[[str], str]):
        """Build a zero-argument job that cleans one record.

        Args:
            id_: record id, copied to the result.
            type_: split name ("train" or "test" when built by process_all_data).
            headline_in: raw headline.
            short_description_in: raw short description.
            label: record label, copied to the result unchanged.
            process_func: cleaner applied to both text fields.

        Returns:
            A callable returning ``{"id", "type", "process_text", "label"}``, where
            ``process_text`` wraps the cleaned headline in ``<HL_S> ... <HL_E>`` and
            the cleaned description in ``<SD_S> ... <SD_E>``.
        """

        def process_it():
            headline = process_func(headline_in)
            short_description = process_func(short_description_in)
            item = "<HL_S> " + headline + " <HL_E> <SD_S> " + short_description + " <SD_E>"
            return {"id": id_, "type": type_, "process_text": item, "label": label}

        return process_it

    def process_all_data(self, process: str, batch_size: int = 100, max_workers: int = None):
        """Clean every train and test record and write the results to disk.

        Args:
            process: cleaner name; resolved to the method ``clean_text_<process>``
                (currently only "spacy" exists).
            batch_size: number of jobs handed to the thread pool at a time.
            max_workers: thread-pool size (None = ThreadPoolExecutor default).

        Returns:
            ``{"embedding_training": "temp_<process>.txt", "record": "temp_<process>.csv"}``.

        Raises:
            ValueError: if ``process`` names no cleaner or ``batch_size`` < 1.
        """
        process_func = getattr(self, f"clean_text_{process}", None)
        if not callable(process_func):
            raise ValueError(
                f"Unsupported process {process!r}: no clean_text_{process} method is defined."
            )
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer.")

        train_data_df = self.train_data[["id", "headline", "short_description", "label"]].copy()
        train_data_df["type"] = "train"
        test_data_df = self.test_data[["id", "headline", "short_description"]].copy()
        test_data_df["type"] = "test"
        # Test rows carry no label; "IDK" is the placeholder written to the CSV.
        test_data_df["label"] = "IDK"

        data_df = pd.concat([train_data_df, test_data_df], axis=0, ignore_index=True)

        # Column order matches wrapper_function's positional parameters.
        job_columns = ["id", "type", "headline", "short_description", "label"]
        rows = data_df[job_columns].itertuples(index=False, name=None)
        jobs = [
            self.wrapper_function(*row, process_func)
            for row in tqdm(rows, total=len(data_df), desc="Build Jobs")
        ]

        # NOTE(review): the cleaning is done on threads; if this stays slow,
        # spaCy's batch API (nlp.pipe) may be worth evaluating.
        result = []
        for start in tqdm(range(0, len(jobs), batch_size), desc="Processing text"):
            result.extend(self.batch_process_text(jobs[start:start + batch_size], max_workers))

        # Explicit columns keep the frame well-formed even when there are no rows.
        output_df = pd.DataFrame(result, columns=self._RECORD_COLUMNS)

        prefix = f"temp_{process}"

        # Plain-text corpus for embedding training: one document per line.
        with open(f"{prefix}.txt", "w", encoding="utf-8") as f:
            f.writelines(f"{text}\n" for text in output_df["process_text"])

        # Full record (id, type, cleaned text, label).
        output_df.to_csv(f"{prefix}.csv", index=False)

        return {"embedding_training": f"{prefix}.txt", "record": f"{prefix}.csv"}

    @staticmethod
    def _read_json_lines(file_path) -> list:
        """Parse a JSON-lines file into a list of objects, skipping blank lines."""
        with open(file_path, "r", encoding="utf-8") as f:
            return [json.loads(line) for line in f if line.strip()]

    @staticmethod
    def load_from_folder(folder: str):
        """Build a TextData from ``News_train.json`` and ``News_test.json`` in ``folder``.

        Both files are read as JSON lines (one JSON object per line).

        Raises:
            FileNotFoundError: if ``folder`` (or one of the two files) does not exist.
        """
        path = Path(folder)
        if not path.exists():
            raise FileNotFoundError(f"Folder {folder} does not exist.")

        train_data = pd.DataFrame(TextData._read_json_lines(path / "News_train.json"))
        test_data = pd.DataFrame(TextData._read_json_lines(path / "News_test.json"))

        return TextData(train_data, test_data)
        

In [3]:
# Read News_train.json / News_test.json from the dataset folder and wrap them in a
# TextData helper; constructing TextData also runs its spaCy setup.
text_data = TextData.load_from_folder("./2025-deep-learning-hw-2-text-classification")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/keithlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
# Clean every train/test record with the spaCy cleaner, 1000 jobs per batch, and
# write the temp_spacy.txt / temp_spacy.csv files. Slow: the recorded run took ~18 minutes.
output = text_data.process_all_data("spacy", batch_size=1000)

Build Jobs:   0%|          | 0/136608 [00:00<?, ?it/s]

Build Jobs: 100%|██████████| 136608/136608 [00:06<00:00, 21841.06it/s]
Processing text: 100%|██████████| 137/137 [18:21<00:00,  8.04s/it]


In [5]:
# Show the paths of the files written by process_all_data.
output

{'embedding_training': 'temp_spacy.txt', 'record': 'temp_spacy.csv'}

In [28]:

# Train word embeddings in unsupervised mode with the skip-gram model.
# NOTE(review): `fasttext` is not imported by the import cell at the top of this
# notebook (only gensim's FastText is) — confirm where this name comes from.
model = fasttext.train_unsupervised("corpus.txt", model="skipgram", minCount=1)


Read 0M words
Number of words:  18
Number of labels: 0
Progress: 100.0% words/sec/thread:   70227 lr:  0.000000 avg.loss:      -nan ETA:   0h 0m 0s


In [29]:
# Inspect the vocabulary of the trained model.
model.words

['<END>',
 '</s>',
 '<START>',
 'is',
 'example',
 'great',
 'data',
 'text',
 'more',
 'with',
 'Another',
 'embeddings',
 'word',
 'for',
 'FastText',
 'sentence',
 'an',
 'This']

In [34]:
# Collect the distinct whitespace-separated tokens of corpus.txt, to compare
# against the model vocabulary. A set comprehension replaces the quadratic
# `sum(list_of_lists, [])` flatten and avoids rebinding `lines` to three
# different kinds of value; the encoding is explicit so the result does not
# depend on the platform default.
with open("corpus.txt", "r", encoding="utf-8") as f:
    lines = {token for line in f for token in line.split()}
lines

{'<END>',
 '<START>',
 'Another',
 'FastText',
 'This',
 'an',
 'data',
 'embeddings',
 'example',
 'for',
 'great',
 'is',
 'more',
 'sentence',
 'text',
 'with',
 'word'}

In [39]:

# List the words in the model's vocabulary.
print("Words in the model:", model.words)

# Get the embedding vector of a single word.
word_embedding = model.get_word_vector("example")
print("Word embedding for 'example':", word_embedding)

# Find the words most similar to the query word with the built-in
# nearest-neighbor lookup. The label is built from the query itself so the
# printed text always names the word that was actually looked up.
# NOTE(review): the vocabulary listed above contains 'FastText', not 'Fasttext' —
# confirm which casing is intended.
query_word = "Fasttext"
nearest_neighbors = model.get_nearest_neighbors(query_word)
print(f"Nearest neighbors for '{query_word}':", nearest_neighbors)

Words in the model: ['<END>', '</s>', '<START>', 'is', 'example', 'great', 'data', 'text', 'more', 'with', 'Another', 'embeddings', 'word', 'for', 'FastText', 'sentence', 'an', 'This']
Word embedding for 'example': [-4.0063375e-04  5.8068166e-04  1.3081767e-03 -2.6234989e-03
  1.9974632e-03 -4.3969785e-04  6.2194240e-04 -2.4216913e-03
 -1.3190987e-04  2.3871241e-03  2.3822008e-04 -7.0877204e-04
  2.4723849e-04  1.5360215e-03  2.9191854e-03 -3.2843975e-04
  1.9873954e-04 -5.6091917e-04 -1.1284921e-03  5.3638930e-04
  1.6385373e-03 -1.0516634e-03 -8.7366643e-04 -1.5904498e-03
  2.8785563e-03 -3.9984708e-04 -1.8464609e-03 -3.6308047e-04
  1.3621560e-03  4.3135913e-04  1.6672076e-03 -2.3732992e-04
  8.5021317e-04 -8.0895174e-04 -1.0747846e-03  1.0381772e-03
 -3.3981912e-04 -6.8921386e-04  1.2226522e-03  1.7460980e-04
  2.7880720e-05  8.9875195e-04  4.5307167e-04 -1.0518832e-03
  2.1610567e-03 -2.9102925e-04  1.5453915e-03  2.1079767e-03
 -7.3856988e-04 -6.1839382e-04 -5.8842701e-04  2.5627

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/keithlin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:

# Install spaCy and its English model first: pip install spacy && python -m spacy download en_core_web_sm



Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [3]:
# Sample sentence for trying out the text cleaners.
sample_text = "FastText is great for word embeddings! In 2025, we are using it for text classification."

In [4]:
# clean_text_spacy is a TextData method, not a module-level function, so it is
# called through the `text_data` instance created in the loading cell above.
text_data.clean_text_spacy(sample_text)

'fasttext great word embeddings text classification'

In [7]:


# NOTE(review): `clean_text` is not defined in any visible cell of this notebook,
# so this cell depends on leftover kernel state — confirm which cleaner it was
# meant to call. The recorded output keeps "using", which the spaCy cleaner drops.
cleaned_text = clean_text(sample_text)
print("Cleaned text:", cleaned_text)

Cleaned text: fasttext great word embeddings using text classification
