<a href="https://colab.research.google.com/github/JohnsonYu0924/114_2_text-analysis/blob/main/L8_corpus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 建構 Corpus（語料庫）


- PlaintextCorpusReader 是不分類，整體語料庫
- CategorizedPlaintextCorpusReader 是有分類，分類後的語料庫

In [None]:
# Mount Google Drive so the corpus folder is reachable from Colab.
from google.colab import drive
drive.mount('/content/drive')

import nltk, os, json
from nltk.corpus.reader.plaintext import PlaintextCorpusReader, CategorizedPlaintextCorpusReader


# Root folder that holds the corpus text files.
path = "/content/drive/MyDrive/combine/"
# print(os.listdir(path))

# Uncategorized corpus: files are selected by a filename regex.
txtPattern = r"[\w\s.]+\.txt"
myCorpus1 = PlaintextCorpusReader(path, txtPattern)
myCorpus1.fileids()  # not the last expression of the cell, so nothing is displayed

# Categorized corpus: `Pattern` selects the files, `catePattern` captures the
# category label from each filename.
# NOTE(review): outputs recorded further down show categories 'neg'/'pos' and
# filenames such as "1952 U.S. LEXIS 2631.opin.neg.txt", which these patterns
# would not match -- presumably those cells were run with different patterns
# or a different folder; confirm before relying on a fresh Run All.
Pattern = r".*(_Han|_Soong|_Tsai).*\.txt"
catePattern = r".*(_Han|_Soong|_Tsai).*"

myCorpus2 = CategorizedPlaintextCorpusReader(path, Pattern, cat_pattern = catePattern)
myCorpus2.fileids()  # not displayed (see above)
myCorpus2.categories()


Mounted at /content/drive


['_Han', '_Soong', '_Tsai']

In [None]:
print()

NameError: name 'myCorpus2' is not defined

In [None]:
print(myCorpus2.words()[26:64])  # tokens 26..63 of the whole corpus, as split by the reader's tokenizer

['I', 'have', 'no', 'doubt', 'that', 'the', 'military', 'had', 'authority', 'to', 'select', 'this', 'particular', 'property', 'for', 'destruction', '.', 'But', 'whatever', 'the', 'weight', 'of', 'authority', 'may', 'be', ',', 'I', 'believe', 'that', 'the', 'Fifth', 'Amendment', 'requires', 'compensation', 'for', 'the', 'taking', '.']


In [None]:
len(myCorpus2.paras("1952 U.S. LEXIS 2631.opin.neg.txt"))

# paras() returns a list of paragraphs.
# Each paragraph is itself a list of sentences,
# and each sentence is a list of tokens,
# so len() here is the number of paragraphs in this one file.

7

用 loop 總結你的文本

In [None]:
# Summarize every document in the corpus: paragraph, sentence and token counts.
for fileid in myCorpus2.fileids():  # no arguments -> every file in the corpus
    n_paras = len(myCorpus2.paras(fileid))
    n_sents = len(myCorpus2.sents(fileid))
    n_words = len(myCorpus2.words(fileid))
    print(fileid)
    print(f"{n_paras} paragraphs, {n_sents} sentences, and {n_words} words\n")

# for file in myCorpus2.fileids():
# myCorpus2.fileids() 會回傳 corpus 裡所有的檔案名稱（list）
# for file in ... 表示要對 corpus 中 每一個文件都做一次統計

# paras = len(myCorpus2.paras(file))
# myCorpus2.paras(file) → 回傳「段落列表」
# len(...) → 計算這篇文件有幾段（paragraphs）

# sents = len(myCorpus2.sents(file))
# myCorpus2.sents(file) → 回傳「句子列表」
# len(...) → 計算句子的數量

# words = len(myCorpus2.words(file))
# myCorpus2.words(file) → 回傳整篇文章 tokenize 後的「字詞 list」
# len(...) → 計算單字數量

1952 U.S. LEXIS 2631.opin.neg.txt
7 paragraphs, 11 sentences, and 190 words

1959 U.S. LEXIS 1490.opin.neg.txt
2 paragraphs, 2 sentences, and 35 words

1985 U.S. LEXIS 63.opin.neg.txt
6 paragraphs, 7 sentences, and 45 words

1986 U.S. LEXIS 25.opin.pos.txt
8 paragraphs, 38 sentences, and 1230 words

1986 U.S. LEXIS 72.opin.2.pos.txt
6 paragraphs, 6 sentences, and 44 words

1989 U.S. LEXIS 579.opin.6.pos 2.txt
5 paragraphs, 10 sentences, and 231 words

1989 U.S. LEXIS 579.opin.6.pos.txt
5 paragraphs, 10 sentences, and 231 words



In [None]:
## Restrict the listing to a single category
for file in myCorpus2.fileids(categories="pos"):   ## only the files in that category
    print(file)

1986 U.S. LEXIS 25.opin.pos.txt
1986 U.S. LEXIS 72.opin.2.pos.txt
1989 U.S. LEXIS 579.opin.6.pos 2.txt
1989 U.S. LEXIS 579.opin.6.pos.txt


In [None]:
# 用函數總結你的文本   ## codes from the book and reword a bit!
import time, nltk
from nltk import pos_tag, sent_tokenize, wordpunct_tokenize

def describe(self, fileids=None, categories=None):
    """
    Summarise the selected part of a categorized corpus in one dictionary.

    Parameters
    ----------
    self : corpus reader
        Object providing ``paras``, ``sents`` and ``words`` (each called as
        ``method(fileids, categories)``) plus ``fileids(categories=...)`` and
        ``categories(fileids)``.
    fileids : str or iterable of str, optional
        Restrict the summary to these files.
    categories : str or iterable of str, optional
        Restrict the summary to the files of these categories.

    Returns
    -------
    dict
        Keys: 'files', 'topics', 'paras', 'sents', 'words', 'vocab',
        'lexical diversity', 'paragraphs per doc',
        'sentences per paragraph', 'secs'.
        'files' and 'topics' describe the *selection*, so the per-document
        ratio is consistent with the filtered counts. Ratios are 0.0 when
        their denominator is zero (empty selection).
    """
    started = time.time()   # wall-clock start, reported as 'secs'

    def _ratio(numerator, denominator):
        # Safe division: an empty selection yields 0.0 instead of raising.
        return float(numerator) / float(denominator) if denominator else 0.0

    # Count paragraphs and sentences by walking the selected views once each.
    n_paras = sum(1 for _ in self.paras(fileids, categories))
    n_sents = sum(1 for _ in self.sents(fileids, categories))

    # One pass over the tokens: total count plus the set of distinct tokens.
    n_words = 0
    vocab = set()
    for word in self.words(fileids, categories):
        n_words += 1
        vocab.add(word)

    # Work out which files the filters selected (None means "whole corpus").
    if fileids is not None:
        selected = [fileids] if isinstance(fileids, str) else list(fileids)
    elif categories is not None:
        selected = list(self.fileids(categories=categories))
    else:
        selected = None

    if selected is None:
        n_fileids = len(self.fileids())
        n_topics = len(self.categories())
    else:
        n_fileids = len(selected)
        # Skip the lookup for an empty selection; there is nothing to categorise.
        n_topics = len(self.categories(selected)) if selected else 0

    # Return data structure with information
    return {
        'files':  n_fileids,
        'topics': n_topics,
        'paras':  n_paras,
        'sents':  n_sents,
        'words':  n_words,
        'vocab':  len(vocab),
        'lexical diversity': _ratio(n_words, len(vocab)),
        'paragraphs per doc':  _ratio(n_paras, n_fileids),
        'sentences per paragraph':  _ratio(n_sents, n_paras),
        'secs':   time.time() - started,
    }


In [None]:
describe(myCorpus2)

{'files': 7,
 'topics': 2,
 'paras': 39,
 'sents': 84,
 'words': 2006,
 'vocab': 638,
 'lexical diversity': 3.1442006269592477,
 'paragraphs per doc': 5.571428571428571,
 'sentences per paragraph': 2.1538461538461537,
 'secs': 0.03154444694519043}

In [None]:
describe(myCorpus2, categories="neg")

{'files': 7,
 'topics': 2,
 'paras': 15,
 'sents': 20,
 'words': 270,
 'vocab': 140,
 'lexical diversity': 1.9285714285714286,
 'paragraphs per doc': 2.142857142857143,
 'sentences per paragraph': 1.3333333333333333,
 'secs': 0.00561213493347168}

## 如何使用課本中的程式碼 (Slide 15)

In [None]:
! pip install readability-lxml

Collecting readability-lxml
  Downloading readability_lxml-0.8.4.1-py3-none-any.whl.metadata (4.0 kB)
Collecting cssselect (from readability-lxml)
  Downloading cssselect-1.3.0-py3-none-any.whl.metadata (2.6 kB)
Collecting lxml_html_clean (from lxml[html_clean]->readability-lxml)
  Downloading lxml_html_clean-0.4.3-py3-none-any.whl.metadata (2.3 kB)
Downloading readability_lxml-0.8.4.1-py3-none-any.whl (19 kB)
Downloading cssselect-1.3.0-py3-none-any.whl (18 kB)
Downloading lxml_html_clean-0.4.3-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean, cssselect, readability-lxml
Successfully installed cssselect-1.3.0 lxml_html_clean-0.4.3 readability-lxml-0.8.4.1


In [None]:
#from ch03 import reader

!wget https://raw.githubusercontent.com/foxbook/atap/refs/heads/master/snippets/ch03/reader.py

--2025-12-01 07:06:31--  https://raw.githubusercontent.com/foxbook/atap/refs/heads/master/snippets/ch03/reader.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9710 (9.5K) [text/plain]
Saving to: ‘reader.py’


2025-12-01 07:06:31 (2.86 MB/s) - ‘reader.py’ saved [9710/9710]



In [None]:
import importlib
reader = importlib.import_module("reader")

In [None]:
# Filename regex and HTML tag list handed to the textbook's HTMLCorpusReader.
documentPattern = r'.*\.json'
myTags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'li']

# NOTE(review): `mydir` is not assigned in any visible cell, so this cell relies
# on leftover kernel state. The recorded error mentions '/content/mc', which
# suggests mydir was '/content/' at the time -- define it explicitly and make
# sure the "mc/" folder exists there.
myCorpus3 = reader.HTMLCorpusReader(mydir + "mc/",
                fileids = documentPattern, encoding='utf8', \
                tags=myTags)

OSError: No such file or directory: '/content/mc'

In [None]:
myCorpus3.fileids()

NameError: name 'myCorpus3' is not defined

## Text Vectorization

In [None]:
## Slide 17
## Function to tokenize and create frequency vectors using NLTK
import nltk
import string

# tokenize(): split into tokens, drop punctuation, stem.
def tokenize(text):
    """
    Lower-case ``text``, split it with ``nltk.word_tokenize`` and return the
    Snowball (English) stems of the non-punctuation tokens as a list.

    Example: "This is great, great!" -> ['this', 'is', 'great', 'great'].

    This is the only definition of ``tokenize``. An earlier generator variant
    (which yielded stems but also tried to ``return`` a list) was shadowed by
    this one and has been removed.
    """
    stemmer = nltk.stem.SnowballStemmer('english')  # e.g. running -> run, studies -> studi
    stems = []

    for token in nltk.word_tokenize(text.lower()):
        # NOTE(review): this is a substring test against string.punctuation, so
        # it drops single punctuation marks; multi-character tokens that are
        # not a substring of it are kept.
        if token in string.punctuation:
            continue
        stems.append(stemmer.stem(token))

    return stems

# vectorize function
from collections import defaultdict

def vectorize(doc):
    """Return a defaultdict mapping each stem of ``doc`` (via tokenize) to its count."""
    term_counts = defaultdict(int)
    for term in tokenize(doc):
        term_counts[term] = term_counts[term] + 1
    return term_counts

# vectorize
# 例如輸入："This is great, great!"
# tokenize 後：['this', 'is', 'great', 'great']
# vector 會是：{'this': 1, 'is': 1, 'great': 2}

## 英文文件之間的重複字很少，所以如果把兩篇文章變成 Bag-of-Words vector 來比較，你會看到很多 0。


例如輸入：```"This is great, great!"```

tokenize 後：```['this', 'is', 'great', 'great'] ```

vector 會是：
```{'this': 1, 'is': 1, 'great': 2} ```


In [None]:
# Tokenize 文本，然後儲存成字串清單
import re
strCorpus = []  # one whitespace-normalised string per document
for file in myCorpus2.fileids():  # every file name in the corpus
    doc = myCorpus2.raw(file)  # raw text of the whole file
    # Raw string: "\s" in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    doc = re.sub(r"\s+", " ", doc)  # collapse any run of whitespace to one space
    strCorpus.append(doc)

  doc = re.sub("\s+", " ", doc) # 清理：把多個空白統一成一個空白


In [None]:
# Tiny three-document corpus used to demonstrate each encoding.
toyCorpus = [
    "The elephant sneezed at the sight of potatoes.",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
]


In [None]:
toyFreqVectors = map(vectorize, toyCorpus) # apply vectorize() to every document of toyCorpus
print(list(toyFreqVectors)) # map() is lazy; list() materialises it, after which the iterator is exhausted

[defaultdict(<class 'int'>, {'the': 2, 'eleph': 1, 'sneez': 1, 'at': 1, 'sight': 1, 'of': 1, 'potato': 1}), defaultdict(<class 'int'>, {'bat': 2, 'can': 1, 'see': 2, 'via': 1, 'echoloc': 1, 'the': 1, 'sight': 1, 'sneez': 1}), defaultdict(<class 'int'>, {'wonder': 1, 'she': 1, 'open': 1, 'the': 2, 'door': 1, 'to': 1, 'studio': 1})]


In [None]:
# Same as the toy example, applied to the real corpus strings.
freqVectors = map(vectorize, strCorpus)
print(list(freqVectors))  # consumes the lazy map; freqVectors is empty afterwards

[defaultdict(<class 'int'>, {'author': 3, 'wodougla': 1, 'type': 1, 'dissent': 3, 'dissentbi': 1, 'dougla': 2, 'mr.': 2, 'justic': 2, 'with': 1, 'whom': 1, 'black': 1, 'concur': 1, 'i': 2, 'have': 1, 'no': 1, 'doubt': 1, 'that': 5, 'the': 18, 'militari': 1, 'had': 1, 'to': 5, 'select': 1, 'this': 3, 'particular': 1, 'properti': 4, 'for': 4, 'destruct': 3, 'but': 2, 'whatev': 2, 'weight': 1, 'of': 4, 'may': 2, 'be': 3, 'believ': 1, 'fifth': 1, 'amend': 1, 'requir': 1, 'compens': 1, 'take': 1, 'was': 4, 'destroy': 1, 'not': 1, 'becaus': 2, 'it': 6, 'in': 1, 'natur': 1, 'a': 2, 'public': 2, 'nuisanc': 1, 'deem': 1, 'necessari': 1, 'help': 1, 'win': 1, 'war': 2, 'as': 3, 'clear': 1, 'appropri': 2, 'end': 1, 'anim': 1, 'food': 1, 'and': 2, 'suppli': 1, 'requisit': 1, 'defens': 1, 'effort': 2, 'court': 1, 'say': 1, 'depriv': 1, 'enemi': 1, 'valuabl': 1, 'logist': 1, 'weapon': 1, 'seem': 1, 'me': 1, 'guid': 1, 'principl': 1, 'should': 2, 'whenev': 1, 'govern': 1, 'determin': 1, 'one': 1, 'per

### Scikit-Learn

CountVectorizer = 自動：

- tokenize
- lowercase
- remove punctuation
- build vocabulary
- build word-count vectors

不需要你自己寫 tokenize 或 vectorize function。

In [None]:
# scikit-learn: CountVectorizer turns text into bag-of-words frequency vectors
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer() # create the transformer object
toyFreqVectors2 = vectorizer.fit_transform(toyCorpus)
# fit: read the corpus and collect the vocabulary
# transform: turn every document into a term-count vector

print(toyFreqVectors2.shape)
print(toyFreqVectors2.toarray())

# shape:
# 3 documents -> 3 rows
# 20 unique tokens -> 20 columns (matches the printed shape (3, 20))

# .toarray() expands the sparse matrix into a dense 2-D array

(3, 20)
[[1 0 0 0 0 0 1 1 0 1 0 0 1 0 1 0 2 0 0 0]
 [0 1 1 1 0 1 0 0 0 0 2 0 1 1 0 0 1 0 1 0]
 [0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 2 1 0 1]]


In [None]:
# Refit the same vectorizer on the real corpus (fit_transform is called again on `vectorizer`).
freqVectors2 = vectorizer.fit_transform(strCorpus)

print(freqVectors2.shape)
fvec = freqVectors2.toarray()  # dense copy, shown as the cell output
fvec

(7, 587)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 2, 1, 1],
       [0, 0, 0, ..., 2, 1, 1]])

### Gensim: Yet another way to create frequency vectors

In [None]:
! pip install gensim

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [None]:
! pip install scipy
! pip install numpy
! pip install --upgrade gensim



In [None]:
import gensim

tokToyCorpus = [tokenize(doc) for doc in toyCorpus] ## gensim is given pre-tokenized documents (lists of tokens)
id2word = gensim.corpora.Dictionary(tokToyCorpus)  # dictionary built from the token lists
toyFreqVectors3 = [id2word.doc2bow(doc) for doc in tokToyCorpus]  # one bag-of-words per document

In [None]:
print(toyFreqVectors3)
# Each document is printed as [(id, freq), (id, freq), ...]

# Token lists produced by tokenize(), in order of appearance:
#doc1 = ['the','eleph','sneez','at','the','sight','of','potato']
#doc2 = ['bat','can','see','via','echoloc','see','the','bat','sight','sneez']
#doc3 = ['wonder','she','open','the','door','to','the','studio']

# Ids are assigned by the dictionary, not by order of appearance.
# In the printed output doc1 uses ids 0..6 and only id 6 has count 2;
# 'the' is the only token occurring twice in doc1, so id 6 is 'the'.
# Ids 4, 5 and 6 also appear in doc2, which shares 'sight', 'sneez' and 'the'.
# Inspect id2word.token2id for the complete token -> id mapping.
# So (6, 2) means: the token with id 6 appears 2 times in that document.

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 2)], [(4, 1), (5, 1), (6, 1), (7, 2), (8, 1), (9, 1), (10, 2), (11, 1)], [(6, 2), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1)]]


In [None]:
# Same three steps as the toy example, on the real corpus.
tokCorpus = [tokenize(doc) for doc in strCorpus]  # token list per document
id2word = gensim.corpora.Dictionary(tokCorpus)  # rebinds id2word (previously built from the toy corpus)
freqVectors3 = [id2word.doc2bow(doc) for doc in tokCorpus]  # bag-of-words per document

In [None]:
#freqVectors3[0:5]

In [None]:
len(freqVectors3)

7

## One-Hot Encoding

用一個「二元值」表示是否包含某 token，集合（set）型式的 one-hot label。

有些模型（特別是 early neural nets）
只需要知道：某個詞是否出現
而不需要知道它出現幾次。

#### NLTK One-Hot Encoding

In [None]:
# NLTK, One-Hot Encoding
def vectorizeOH(doc):
    """Map every distinct token of ``doc`` to True (first-seen order is kept)."""
    return dict.fromkeys(doc, True)

tokToyCorpus = [tokenize(doc) for doc in toyCorpus]  # token list per toy document
toyOHvectors = map(vectorizeOH, tokToyCorpus)  # lazy map; materialised by list() in the next cell

In [None]:
print(list(toyOHvectors))

[{'the': True, 'eleph': True, 'sneez': True, 'at': True, 'sight': True, 'of': True, 'potato': True}, {'bat': True, 'can': True, 'see': True, 'via': True, 'echoloc': True, 'the': True, 'sight': True, 'sneez': True}, {'wonder': True, 'she': True, 'open': True, 'the': True, 'door': True, 'to': True, 'studio': True}]


In [None]:
tokCorpus = [tokenize(doc) for doc in strCorpus]  # token list per document
OHvectors = map(vectorizeOH, tokCorpus)  # lazy map of presence dictionaries
print(list(OHvectors))  # consumes the map; OHvectors is exhausted afterwards

[{'author': True, 'wodougla': True, 'type': True, 'dissent': True, 'dissentbi': True, 'dougla': True, 'mr.': True, 'justic': True, 'with': True, 'whom': True, 'black': True, 'concur': True, 'i': True, 'have': True, 'no': True, 'doubt': True, 'that': True, 'the': True, 'militari': True, 'had': True, 'to': True, 'select': True, 'this': True, 'particular': True, 'properti': True, 'for': True, 'destruct': True, 'but': True, 'whatev': True, 'weight': True, 'of': True, 'may': True, 'be': True, 'believ': True, 'fifth': True, 'amend': True, 'requir': True, 'compens': True, 'take': True, 'was': True, 'destroy': True, 'not': True, 'becaus': True, 'it': True, 'in': True, 'natur': True, 'a': True, 'public': True, 'nuisanc': True, 'deem': True, 'necessari': True, 'help': True, 'win': True, 'war': True, 'as': True, 'clear': True, 'appropri': True, 'end': True, 'anim': True, 'food': True, 'and': True, 'suppli': True, 'requisit': True, 'defens': True, 'effort': True, 'court': True, 'say': True, 'depri

#### Scikit-learn, One-Hot Encoding

而是把 Bag-of-Words 的「頻率矩陣」轉成「0/1 二值矩陣」

In [None]:
# Scikit-learn, One-Hot Encoding
from sklearn.preprocessing import Binarizer

# Build the term-count (frequency) vectors first
freq = CountVectorizer()
corpus = freq.fit_transform(toyCorpus) # bag-of-words count matrix

# Binarizer turns the counts into 0/1 values
onehot = Binarizer()
# Binarize a dense copy of the count matrix
onehot.fit_transform(corpus.toarray())

## Same layout as the count matrix printed earlier, but the counts of 2 have become 1.

array([[1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0],
       [0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1]])

In [None]:
freq = CountVectorizer()
corpus = freq.fit_transform(strCorpus)
onehot = Binarizer()
# Binarize the count matrix as returned by CountVectorizer (no dense conversion) and keep the result
OHvectors2 = onehot.fit_transform(corpus)
# Dense version, computed only to be shown as the cell output
onehot.fit_transform(corpus.toarray())

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]])

#### Gensim, One-Hot Encoding

In [None]:
# gensim one-hot encoding
tokToyCorpus = [tokenize(text) for text in toyCorpus]  # token list per document
id2word = gensim.corpora.Dictionary(tokToyCorpus)  # gensim dictionary built from the token lists

# doc2bow gives (token_id, count) pairs; keep the id and replace every count with 1.
toyOHvectors3 = [
    [(token_id, 1) for token_id, _count in id2word.doc2bow(tokens)]
    for tokens in tokToyCorpus
]

In [None]:
tokCorpus = [tokenize(text) for text in strCorpus]  # tokenize every document
id2word = gensim.corpora.Dictionary(tokCorpus)  # gensim dictionary built from the token lists
# Keep each token id from doc2bow and set its value to 1 (presence only).
OHvectors3 = [
    [(token_id, 1) for token_id, _count in id2word.doc2bow(tokens)]
    for tokens in tokCorpus
]

In [None]:
len(OHvectors3)

7

## tf-idf

In [None]:
# NLTK, tf-idf encoding
from nltk.text import TextCollection

def vectorizeTF(corpus):
    """
    Yield one ``{term: tf-idf}`` dictionary per document of ``corpus``.

    ``corpus`` is an iterable of raw document strings. Every document is
    tokenized with ``tokenize`` and the scores come from an NLTK
    ``TextCollection`` built over all tokenized documents. This is a
    generator: nothing is computed until it is iterated.
    """
    tokenized_docs = [tokenize(doc) for doc in corpus]
    collection = TextCollection(tokenized_docs)

    for tokens in tokenized_docs:
        scores = {}
        for term in tokens:
            scores[term] = collection.tf_idf(term, tokens)
        yield scores

## one word used a lot across different document, or a word only show up in a few document.

In [None]:
# vectorizeTF expects the whole corpus (it builds corpus-wide statistics), so
# call it once; mapping it over the documents would treat each string as a corpus.
toyTFvectors = vectorizeTF(toyCorpus)  # lazy generator of {term: tf-idf} dicts

In [None]:
# vectorizeTF takes the whole corpus and yields one {term: tf-idf} dict per document.
toyTFvectors = list(vectorizeTF(toyCorpus))

# Print each document's scores, separated by a blank line.
for i, vec in enumerate(toyTFvectors):
    print(f"Document {i}:")
    print(vec)
    print()

Document 0:
{'the': 0.0, 'eleph': 0.13732653608351372, 'sneez': 0.05068313851352055, 'at': 0.13732653608351372, 'sight': 0.05068313851352055, 'of': 0.13732653608351372, 'potato': 0.13732653608351372}

Document 1:
{'bat': 0.21972245773362198, 'can': 0.10986122886681099, 'see': 0.21972245773362198, 'via': 0.10986122886681099, 'echoloc': 0.10986122886681099, 'the': 0.0, 'sight': 0.04054651081081644, 'sneez': 0.04054651081081644}

Document 2:
{'wonder': 0.13732653608351372, 'she': 0.13732653608351372, 'open': 0.13732653608351372, 'the': 0.0, 'door': 0.13732653608351372, 'to': 0.13732653608351372, 'studio': 0.13732653608351372}



In [None]:
# vectorizeTF expects the whole corpus (it builds corpus-wide statistics), so
# call it once; mapping it over the documents would treat each string as a corpus.
TFvectors = vectorizeTF(strCorpus)  # lazy generator of {term: tf-idf} dicts

In [None]:
# Materialise the per-document {term: tf-idf} dictionaries for the real corpus.
TFvectors = list(vectorizeTF(strCorpus))

# Print each document's scores, separated by a blank line.
for i, vec in enumerate(TFvectors):
    print(f"Document {i}:")
    print(vec)
    print()

Document 0:
{'author': 0.0, 'wodougla': 0.007638798588386391, 'type': 0.0, 'dissent': 0.0, 'dissentbi': 0.005166450368214657, 'dougla': 0.015277597176772783, 'mr.': 0.015277597176772783, 'justic': 0.0, 'with': 0.0034122913898501383, 'whom': 0.0034122913898501383, 'black': 0.011865305786922641, 'concur': 0.011865305786922641, 'i': 0.0018798863393568092, 'have': 0.0034122913898501383, 'no': 0.007638798588386391, 'doubt': 0.011865305786922641, 'that': 0.004699715848392023, 'the': 0.0, 'militari': 0.011865305786922641, 'had': 0.0034122913898501383, 'to': 0.010258299896988197, 'select': 0.007638798588386391, 'this': 0.010236874169550414, 'particular': 0.0034122913898501383, 'properti': 0.047461223147690565, 'for': 0.008206639917590559, 'destruct': 0.03559591736076792, 'but': 0.006824582779700277, 'whatev': 0.015277597176772783, 'weight': 0.011865305786922641, 'of': 0.008206639917590559, 'may': 0.015277597176772783, 'be': 0.015499351104643969, 'believ': 0.011865305786922641, 'fifth': 0.01186

#### Scikit-Learn, tf-idf encoding

In [None]:
# Scikit-Learn, tf-idf encoding
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()
toyTFvectors2 = tfidf.fit_transform(toyCorpus)

In [None]:
print(toyTFvectors2)
toyTFvectors2.shape

# How to read one printed entry, e.g. "(0, 16)  0.4473":
# part    meaning
# 0       row: document 0
# 16      column: index of the token in the fitted vocabulary
# 0.4473  tf-idf weight of that token in that document

# The rarer a token is across documents, the larger its IDF.
# TF x IDF = tf-idf score.
# NOTE: with TfidfVectorizer's defaults the IDF is smoothed, idf = ln((1 + N) / (1 + df)) + 1,
# so a token present in all 3 documents gets idf = 1, not 0 -- column 16 is non-zero in every row of the output.
# The unsmoothed formula log(N / df) = log(3/3) = 0 matches the NLTK result shown earlier ('the': 0.0), not this one.

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 23 stored elements and shape (3, 20)>
  Coords	Values
  (0, 16)	0.44730460893892116
  (0, 6)	0.37867626873820165
  (0, 14)	0.37867626873820165
  (0, 0)	0.37867626873820165
  (0, 12)	0.28799306292785165
  (0, 7)	0.37867626873820165
  (0, 9)	0.37867626873820165
  (1, 16)	0.1786694534059618
  (1, 12)	0.23006945204561577
  (1, 2)	0.30251368128649075
  (1, 3)	0.30251368128649075
  (1, 10)	0.6050273625729815
  (1, 18)	0.30251368128649075
  (1, 5)	0.30251368128649075
  (1, 1)	0.30251368128649075
  (1, 13)	0.30251368128649075
  (2, 16)	0.4343672818844283
  (2, 19)	0.3677238693250534
  (2, 11)	0.3677238693250534
  (2, 8)	0.3677238693250534
  (2, 4)	0.3677238693250534
  (2, 17)	0.3677238693250534
  (2, 15)	0.3677238693250534


(3, 20)

In [None]:
tfidf = TfidfVectorizer()
TFvectors2 = tfidf.fit_transform(strCorpus)

In [None]:
TFvectors2.shape

(7, 587)

#### Gensim, tf-idf encoding

In [None]:
# gensim, tf-idf encoding
tokToyCorpus = [tokenize(doc) for doc in toyCorpus]  # token list per document
lexicon = gensim.corpora.Dictionary(tokToyCorpus)  # dictionary built from the token lists
# NOTE(review): this rebinds `tfidf`, which earlier cells bound to a scikit-learn TfidfVectorizer.
tfidf = gensim.models.TfidfModel(dictionary=lexicon, normalize=True)  # model initialised from the dictionary
toyTFvectors3 = [tfidf[lexicon.doc2bow(doc)] for doc in tokToyCorpus]  # apply the model to each bag-of-words

In [None]:
print(toyTFvectors3)

[[(0, np.float64(0.4837965208957426)), (1, np.float64(0.4837965208957426)), (2, np.float64(0.4837965208957426)), (3, np.float64(0.4837965208957426)), (4, np.float64(0.17855490118826325)), (5, np.float64(0.17855490118826325))], [(4, np.float64(0.10992597952954358)), (5, np.float64(0.10992597952954358)), (7, np.float64(0.5956913654963344)), (8, np.float64(0.2978456827481672)), (9, np.float64(0.2978456827481672)), (10, np.float64(0.5956913654963344)), (11, np.float64(0.2978456827481672))], [(12, np.float64(0.408248290463863)), (13, np.float64(0.408248290463863)), (14, np.float64(0.408248290463863)), (15, np.float64(0.408248290463863)), (16, np.float64(0.408248290463863)), (17, np.float64(0.408248290463863))]]


## Distributed Representation


TF, TF-IDF, BOW	Sparse（稀疏）、超大維度、90% 是 0
- TF-IDF 是：[(3, 0.44), (26, 0.21), (88, 0.87), ...] （稀疏）

Doc2Vec	Dense（密集）、低維度、沒有 0
- Doc2Vec 得到的向量看起來像：[-0.13, 0.87, 0.05, -0.42, ...] （例如 100 維）
- Doc2Vec 可理解語意（semantic representation）



TF-IDF / BOW 只能比較哪些字出現一樣。如果字不重複就完全無法比較

Doc2Vec 即使字完全不一樣，只要語意接近，向量相似度（cosine similarity）會靠近。這是分散式語意的強大之處。

In [None]:
## Slide 20, Distributed Representation
# gensim (the only option among the libraries used in this notebook)
from gensim.models.doc2vec import TaggedDocument, Doc2Vec

corpus = [list(tokenize(doc)) for doc in toyCorpus]  # tokenize the corpus first
corpus = [
    TaggedDocument(words, ['d{}'.format(idx)])  # Doc2Vec trains on tagged documents
    for idx, words in enumerate(corpus)
]
# min_count=0 keeps every token; the vector length is left at the library default.
toyModel = Doc2Vec(corpus, min_count=0)

# `docvecs` is a deprecated alias in gensim 4; the document vectors live in `dv`.
print(toyModel.dv[0])

[-5.2559674e-03 -5.9936498e-03 -9.9146254e-03  8.5665295e-03
  3.5847796e-03  2.6267971e-04 -9.8911952e-03 -5.1613934e-03
 -9.7270506e-03  2.0224657e-03  2.8346470e-03  4.6406458e-03
 -4.3243794e-03 -3.1749432e-03 -3.0726346e-03 -8.7502059e-03
  2.1682635e-03  9.2512239e-03 -9.5137712e-03 -3.4688634e-03
 -3.7633737e-03  2.6175780e-03 -5.7026939e-03  2.6463985e-03
  5.7865707e-03 -8.1161438e-03 -8.3585950e-03 -9.9731311e-03
  4.9417545e-03 -9.1574620e-03  5.8580614e-03  6.8114670e-03
 -6.5167653e-03 -4.5419913e-03 -1.2705415e-03  1.6345874e-03
 -1.4837370e-03 -8.5483706e-03 -3.6299569e-03  1.7294660e-03
 -2.0310427e-03 -7.2464654e-03  4.2032171e-03 -8.5925050e-03
  2.7253102e-03 -4.6202657e-03  6.4654934e-04 -2.0457348e-03
  5.4251067e-03 -8.0400398e-03 -2.1325520e-03 -7.6262681e-05
 -6.6337567e-03 -6.5803230e-03 -1.9557416e-03  8.8215312e-03
 -1.2637944e-03  3.5586639e-03 -5.7755318e-03  8.8291727e-03
  2.9386890e-03  9.3009342e-03  4.3924297e-03 -4.2032376e-03
  2.2444502e-03 -4.41885

  print(toyModel.docvecs[0])


In [None]:
corpus = [list(tokenize(doc)) for doc in strCorpus]  # tokenize every document
corpus = [
    TaggedDocument(words, ['d{}'.format(idx)])  # tag each document as d0, d1, ...
    for idx, words in enumerate(corpus)
]

# min_count=3: only tokens that occur at least 3 times are used by the model.
model = Doc2Vec(corpus, min_count=3)

# `docvecs` is a deprecated alias in gensim 4; the document vectors live in `dv`.
print(model.dv[3])

[-4.2198738e-01  5.6964403e-01 -1.2523958e-01  6.6518670e-01
  1.6320406e-01 -9.4609839e-01  1.2806635e-01  1.7377440e+00
 -4.7184888e-01 -7.8881389e-01 -5.0169196e-02 -1.4315276e+00
  5.9740704e-01  1.1773859e+00  5.9406996e-01 -4.0982604e-01
  9.1145575e-01  3.1282258e-01 -1.4885694e-01 -5.6883878e-01
 -5.4572470e-02  4.3965080e-01  2.0627999e-01  7.4375468e-01
  2.4681708e-01  5.1529241e-01 -4.4300494e-01 -5.5855680e-01
  3.0511139e-02 -9.0532142e-01  6.3319165e-01  3.3364210e-02
  2.5430176e-01 -1.4377232e-01 -4.0192521e-01  6.2891191e-01
 -1.6899855e-01 -9.8733127e-01 -2.4156374e-01 -7.3019344e-01
 -1.0147717e-01  1.3203858e-01 -4.3308035e-01  2.8660846e-01
  3.5042610e-02 -8.8911134e-01 -9.8572457e-01  1.8258080e-01
 -1.4504999e-01  7.7341920e-01 -5.7639575e-01  3.4444530e-02
 -3.4880501e-01  2.0311555e-01 -2.3843253e-01 -1.9748969e-01
 -7.8260446e-01  7.5431979e-01 -1.1491742e+00 -2.9330088e-02
  7.9523295e-01  1.1019710e+00  2.9741484e-01  1.8451114e-01
 -5.4168254e-02  1.14406

  print(model.docvecs[3])


## 文件分類器 (Document Classifier)

使用 One-Hot Encoding → Naive Bayes → 訓練 → 預測。

非常基本、非常簡化的小範例。

- Step 1：向量化文字（X）: One-Hot Encoding 版本
（每篇文章變成一個 0/1 的向量）\\

- Step 2：做標籤（y）: 根據每篇 document 的 category （正負文本最常用）

- Step 3：使用機器學習模型: Multinomial Naive Bayes
（適合文字分類）

- Step 4：進行預測: 拿文件向量去預測類別
（示範如何分類）

製作 X matrix：文件向量（one-hot）:作用：把每個 document 轉成數學向量

In [None]:
# Create the X matrix (with one-hot encoding)
freq = CountVectorizer()
corpus = freq.fit_transform(strCorpus)  # term-count matrix, one row per document
onehot = Binarizer()
# Converts the sparse array to a dense one (so we can view it)
documents = onehot.fit_transform(corpus.toarray())  # 0/1 matrix used as X below
documents

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 1, 1],
       [0, 0, 0, ..., 1, 1, 1]])

製作 y vector：標注每份文件（分類類別）

X：文件向量 \
y：文件的類別（正評？負評？哪一種案件？）

In [None]:
# Create the y vector with one label string per document.
# categories(doc) returns a list (e.g. ['neg']); appending the list itself
# makes y two-dimensional and scikit-learn warns when it has to flatten it,
# so take the single category of each file.
labels = []
for doc in myCorpus2.fileids():
    lab = myCorpus2.categories(doc)[0]
    labels.append(lab)
    print(lab)

['neg']
['neg']
['neg']
['pos']
['pos']
['pos']
['pos']


訓練模型（Naive Bayes）

使用 Multinomial Naive Bayes （常用於文字分類）訓練分類模型。

alpha=0.0 = 不做 smoothing（教科書示範）

class_prior=[0.4, 0.6]  假設兩類的機率（演示用）

In [None]:
# Use the fit method (with a naive bayes model)
# Note: 7 documents is way too few to train a model
# NOTE(review): this rebinds `model`, which an earlier cell bound to the Doc2Vec model.
# NOTE(review): the recorded output shows a np.log warning during fit and the
# next cell predicts 'neg' for every training document; alpha=0.0 (no smoothing)
# is the likely cause -- consider a small positive alpha and confirm.
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=0.0, class_prior=[0.4, 0.6])

model.fit(documents, labels)

  y = column_or_1d(y, warn=True)
  self.feature_log_prob_ = np.log(smoothed_fc) - np.log(


用模型進行預測

In [None]:
# Now that the model is fit, we can use the predict method
# Here I am using the same documents for fit and predict
# In real applications, you would use them on different documents
model.predict(documents)

  ret = a @ b


array(['neg', 'neg', 'neg', 'neg', 'neg', 'neg', 'neg'], dtype='<U3')

## 你其實就可以跑 topic model 了

In [None]:
#import nltk
#import string
#from nltk.corpus import stopwords

#nltk.download('stopwords')

#def tokenize(text):
#    stemmer = nltk.stem.SnowballStemmer('english')

#    # 內建 stopwords
#    stop_words = set(stopwords.words('english'))

#    # 自己加的 stopwords（法律文件常見無意義字）
#    custom_stop = {
#        "court", "judge", "case", "opinion", "states",
#        "justice", "law", "u", "s", "v", "section"
#    }

#    # 合併
#    stop_words = stop_words.union(custom_stop)

#    # 轉小寫
#    text = text.lower()
#    tokens = []

#    for token in nltk.word_tokenize(text):

#        # 去除標點符號
#        if token in string.punctuation:
#            continue

#        # 去除停用詞
#        if token in stop_words:
#            continue

#        # 只保留字母（刪掉純數字、No. 123、1852、lexis 之類雜訊）
#        if not token.isalpha():
#            continue

#        # 詞幹化
#        tokens.append(stemmer.stem(token))

#    return tokens


### 整理資料

LDA 所需要的全部資料：

tokCorpus = tokenized documents（list of tokens every document）

id2word = dictionary（token → integer id）

freqVectors3 = doc-term matrix（稀疏 BOW）

In [None]:
# Rebuild the three LDA inputs so this section does not depend on the one-hot cells above.
tokCorpus = [tokenize(doc) for doc in strCorpus]  # tokenized documents
id2word = gensim.corpora.Dictionary(tokCorpus)  # dictionary built from the token lists
freqVectors3 = [id2word.doc2bow(doc) for doc in tokCorpus]  # bag-of-words per document

### 建立 Topic Model（最常用是 LDA）

In [None]:
from gensim.models.ldamodel import LdaModel

# Train an LDA topic model on the bag-of-words vectors.
num_topics = 5  # the number of topics is your own choice (5, 10, 20, ...)
lda_model = LdaModel(corpus=freqVectors3,
                     id2word=id2word,
                     num_topics=num_topics,
                     random_state=42,
                     passes=10)


### 查看每個 Topic 的 Top Words

In [None]:
lda_model.print_topics(num_words=10)

[(0,
  '0.063*"the" + 0.040*"of" + 0.016*"to" + 0.016*"it" + 0.016*"i" + 0.016*"that" + 0.016*"justic" + 0.016*"in" + 0.012*"and" + 0.012*"\'s"'),
 (1,
  '0.002*"of" + 0.002*"the" + 0.002*"in" + 0.002*"that" + 0.002*"to" + 0.002*"it" + 0.002*"parti" + 0.002*"dissent" + 0.002*"\'s" + 0.002*"justic"'),
 (2,
  '0.037*"the" + 0.030*"dissent" + 0.013*"of" + 0.013*"that" + 0.013*"in" + 0.013*"author" + 0.013*"justic" + 0.013*"type" + 0.013*"violat" + 0.013*"brennan"'),
 (3,
  '0.005*"the" + 0.004*"parti" + 0.004*"of" + 0.003*"that" + 0.003*"to" + 0.003*"it" + 0.002*"is" + 0.002*"\'s" + 0.002*"a" + 0.002*"and"'),
 (4,
  '0.080*"the" + 0.037*"of" + 0.031*"to" + 0.029*"parti" + 0.022*"it" + 0.020*"that" + 0.013*"is" + 0.013*"and" + 0.013*"\'s" + 0.013*"by"')]

### 查看每份文件的 Topic Distribution

In [None]:
# Print the topic distribution of every document.
for i, doc in enumerate(freqVectors3):
    print(f"Document {i}:")
    print(lda_model.get_document_topics(doc))

# Each entry is a (topic_id, probability) pair.
# In the output recorded below, document 0 is almost entirely topic 4 (probability about 0.995).

Document 0:
[(4, np.float32(0.99511147))]
Document 1:
[(2, np.float32(0.97303814))]
Document 2:
[(4, np.float32(0.97864693))]
Document 3:
[(4, np.float32(0.99923295))]
Document 4:
[(2, np.float32(0.9762821))]
Document 5:
[(0, np.float32(0.9960197))]
Document 6:
[(0, np.float32(0.99601966))]


In [None]:
!pip install pyLDAvis

import pyLDAvis
try:
    # Newer pyLDAvis releases ship the gensim adapter as `gensim_models`.
    import pyLDAvis.gensim_models as gensimvis
except ImportError:
    # Older releases expose it as `pyLDAvis.gensim`.
    import pyLDAvis.gensim as gensimvis

pyLDAvis.enable_notebook()
gensimvis.prepare(lda_model, freqVectors3, id2word)

# If the visualisation shows garbled text, check the text encoding and that a suitable font is available.

Collecting pyLDAvis
  Downloading pyLDAvis-3.4.1-py3-none-any.whl.metadata (4.2 kB)
Collecting funcy (from pyLDAvis)
  Downloading funcy-2.0-py2.py3-none-any.whl.metadata (5.9 kB)
Downloading pyLDAvis-3.4.1-py3-none-any.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading funcy-2.0-py2.py3-none-any.whl (30 kB)
Installing collected packages: funcy, pyLDAvis
Successfully installed funcy-2.0 pyLDAvis-3.4.1


  return datetime.utcnow().replace(tzinfo=utc)


  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
  return datetime.utcnow().replace(tzinfo=utc)
