<a href="https://colab.research.google.com/github/JohnsonYu0924/114_2_text-analysis/blob/main/L10_Text_Classification%EF%BC%88%E6%96%87%E6%9C%AC%E5%88%86%E9%A1%9E%EF%BC%89.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Text Classification（文本分類）

## Import 套件 & 讀取資料

In [None]:
import os
# Display the current working directory of the notebook kernel.
os.getcwd()

'/content'

In [None]:
# Source for code: https://www.analyticsvidhya.com/blog/2018/04/a-comprehensive-guide-to-understand-and-implement-text-classification-in-python/

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
import pandas, numpy, textblob, string

# Load the dataset.
mydir = "/content/"
corpus = "corpus.txt"

# Read the corpus exactly once. The context manager guarantees the file handle
# is closed, and the explicit encoding keeps decoding independent of the
# platform default.
# NOTE(review): assumes corpus.txt is UTF-8 encoded - confirm against the data file.
with open(mydir + corpus, encoding="utf-8") as corpus_file:
    data = corpus_file.read()

# Each non-empty line is parsed as "<label> <text ...>": the first
# whitespace-separated token is the label, the remaining tokens are the text.
labels, texts = [], []
for line in data.split("\n"):
    content = line.split()
    if not content:  # skip blank lines
        continue
    labels.append(content[0])
    texts.append(" ".join(content[1:]))

# Create a dataframe from the parsed texts and labels.
trainDF = pandas.DataFrame({'text': texts, 'label': labels})

# Show the first rows.
trainDF.head()


Unnamed: 0,text,label
0,the movie was captivating and emotionally powe...,pos
1,i enjoyed every moment of this beautifully dir...,pos
2,the performances were outstanding and heartfelt,pos
3,visually impressive with a strong narrative st...,pos
4,absolutely loved this movie from start to finish,pos


## Train–Validation Split + Label Encoding

In [None]:
# Split the dataset into training and validation subsets.
# train_x / valid_x hold the texts, train_y / valid_y the matching labels.
# random_state pins the shuffle so the split (and every score computed from it)
# is reproducible on Restart & Run All; stratify keeps the label proportions
# the same in both subsets, which matters on a small corpus.
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(
    trainDF['text'],
    trainDF['label'],
    random_state=42,
    stratify=trainDF['label'],
)

# Label-encode the target variable: map the text classes to integers.
# The encoder is fitted on the training labels and reused for validation.
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.transform(valid_y)


## Text Vectorization（Count / TF-IDF / N-gram）

### CountVectorizer（Term Frequency）

In [None]:
# Build a CountVectorizer: turns text into a numeric token-count matrix.
# token_pattern=r'\w{1,}' treats every run of one or more word characters
# as a token.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')

# fit(): learn the vocabulary from the training texts only, so nothing from
# the validation set leaks into the feature space.
count_vect.fit(train_x)

# transform(): convert each document into its count vector. Validation tokens
# that were never seen during fit are simply ignored.
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)


### TF-IDF（word level）

In [None]:
# token_pattern=r'\w{1,}': every run of one or more word characters
# (letters, digits or underscore) is one token.
# max_features=5000: keep at most the 5000 terms with the highest term
# frequency across the fitted corpus (the ranking is by raw frequency,
# not by TF-IDF weight).
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)

# Fit on the training texts only: the vocabulary and the IDF weights must not
# be influenced by the validation documents.
tfidf_vect.fit(train_x)

xtrain_tfidf = tfidf_vect.transform(train_x)
xvalid_tfidf = tfidf_vect.transform(valid_x)


### TF-IDF（N-gram 2–3 grams）

In [None]:
# Build the n-gram TF-IDF vectorizer: features are word 2-grams and 3-grams
# (ngram_range=(2,3)), capped at 5000 features.
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                   ngram_range=(2,3), max_features=5000)

# Learn the n-gram vocabulary and IDF weights from the training texts only,
# so the validation set stays unseen until evaluation.
tfidf_vect_ngram.fit(train_x)

# Convert train / valid texts into n-gram TF-IDF matrices.
xtrain_tfidf_ngram = tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram = tfidf_vect_ngram.transform(valid_x)


## Training Models + Evaluation

### Naive Bayes – Count Vectors

In [None]:
# Multinomial Naive Bayes trained on the CountVectorizer features.
classifier = naive_bayes.MultinomialNB()
classifier.fit(xtrain_count, train_y)           # train on the training count vectors
predictions = classifier.predict(xvalid_count)  # predict the validation set

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy and F1
# are unaffected by swapping the two, but precision and recall trade places,
# so the ground truth must come first.
# Accuracy : fraction of all predictions that are correct.
# Precision: of the samples predicted as the positive class, the fraction that truly are.
# Recall   : of the samples that truly are the positive class, the fraction that was found.
# F1       : harmonic mean of precision and recall.
acc = metrics.accuracy_score(valid_y, predictions)
prec = metrics.precision_score(valid_y, predictions)
rec = metrics.recall_score(valid_y, predictions)
f1 = metrics.f1_score(valid_y, predictions)

print("Naive Bayes - Count Vectors")
print("Accuracy:", round(acc, 2))
print("Precision:", round(prec, 2))
print("Recall:", round(rec, 2))
print("F1:", round(f1, 2))


Naive Bayes - Count Vectors
Accuracy: 0.4
Precision: 0.5
Recall: 0.33
F1: 0.4


### Logistic Regression – Count Vectors

In [None]:
# Logistic Regression trained on the CountVectorizer features.
# max_iter=300 raises the solver's iteration cap above the library default.
classifier = linear_model.LogisticRegression(max_iter=300)
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

# All four metrics use the same (default, binary) averaging so that the
# reported F1 is the harmonic mean of the precision and recall printed
# next to it, and is comparable with the Naive Bayes cells.
acc = metrics.accuracy_score(valid_y, predictions)
prec = metrics.precision_score(valid_y, predictions)
rec = metrics.recall_score(valid_y, predictions)
f1 = metrics.f1_score(valid_y, predictions)

print("Logistic Regression - Count Vectors")
print("Accuracy:", round(acc, 2))
print("Precision:", round(prec, 2))
print("Recall:", round(rec, 2))
print("F1:", round(f1, 2))


Logistic Regression - Count Vectors
Accuracy: 0.2
Precision: 0.25
Recall: 0.5
F1: 0.13


### Naive Bayes – N-gram TF-IDF

In [None]:
# Multinomial Naive Bayes on the word 2-3 gram TF-IDF features.
classifier = naive_bayes.MultinomialNB()
classifier.fit(xtrain_tfidf_ngram, train_y)
predictions = classifier.predict(xvalid_tfidf_ngram)

# Score the validation predictions; every metric is called as (y_true, y_pred)
# with its default averaging.
acc, prec, rec, f1 = (
    score_fn(valid_y, predictions)
    for score_fn in (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
)

# Report each score rounded to two decimals.
print("Naive Bayes - Ngram Level TF-IDF Vectors")
for metric_label, metric_value in (
    ("Accuracy:", acc),
    ("Precision:", prec),
    ("Recall:", rec),
    ("F1:", f1),
):
    print(metric_label, round(metric_value, 2))

Naive Bayes - Ngram Level TF-IDF Vectors
Accuracy: 0.2
Precision: 0.25
Recall: 0.5
F1: 0.33


### SVM – N-gram TF-IDF

Support Vector Machine (SVM)

SVM + n-gram TF-IDF
- 對文字分類通常非常強
- 表現常比 Naive Bayes 和 Logistic Regression 都好
- 但訓練速度慢，記憶體需求高（特別是 n-gram 特徵）

In [None]:
# Support Vector Classifier (library-default settings) on the n-gram TF-IDF features.
classifier = svm.SVC()
classifier.fit(xtrain_tfidf_ngram, train_y)
predictions = classifier.predict(xvalid_tfidf_ngram)

# Weighted averaging: per-class scores are averaged, weighted by class support.
# When the model never predicts one of the classes, that class's precision is
# undefined; zero_division=0 scores it as 0 explicitly (the same value the
# default produces) instead of emitting an undefined-metric warning.
acc = metrics.accuracy_score(valid_y, predictions)
prec = metrics.precision_score(valid_y, predictions, average='weighted', zero_division=0)
rec = metrics.recall_score(valid_y, predictions, average='weighted', zero_division=0)
f1 = metrics.f1_score(valid_y, predictions, average='weighted', zero_division=0)

print("SVM - Ngram Level TF IDF Vectors")
print("Accuracy:", round(acc, 2))
print("Precision:", round(prec, 2))
print("Recall:", round(rec, 2))
print("F1:", round(f1, 2))


SVM - Ngram Level TF IDF Vectors
Accuracy: 0.4
Precision: 0.16
Recall: 0.4
F1: 0.23


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Random Forest – Count Vectors

Random Forest 通常表現最差，尤其在 TF-IDF 上
- 特徵數量太大（幾千～幾萬），決策樹在高維度資料上效率很差。
- 決策樹偏向處理有明顯閾值的連續數值特徵
- 文字分析通常是稀疏矩陣（大部分是0），樹模型不擅長處理稀疏矩陣。

In [None]:
# Random Forest on the CountVectorizer features.
# random_state pins the forest's internal randomness (bootstrap samples and
# feature sub-sampling) so the reported scores are reproducible on re-run.
classifier = ensemble.RandomForestClassifier(random_state=42)
classifier.fit(xtrain_count, train_y)
predictions = classifier.predict(xvalid_count)

# Weighted averaging: per-class scores are averaged, weighted by class support.
acc = metrics.accuracy_score(valid_y, predictions)
prec = metrics.precision_score(valid_y, predictions, average='weighted')
rec = metrics.recall_score(valid_y, predictions, average='weighted')
f1 = metrics.f1_score(valid_y, predictions, average='weighted')

print("Random Forest - Count Vectors")
print("Accuracy:", round(acc, 2))
print("Precision:", round(prec, 2))
print("Recall:", round(rec, 2))
print("F1:", round(f1, 2))

Random Forest - Count Vectors
Accuracy: 0.2
Precision: 0.1
Recall: 0.2
F1: 0.13


## K-fold model evaluation

### Prepare full X, y

In [None]:
# Targets: encode every text label in the corpus as an integer.
# This re-fits the shared `encoder` on the full label column.
y = encoder.fit_transform(trainDF['label'])

# Inputs: the raw text of every document.
X = trainDF['text']

# Count features for the whole corpus (vocabulary learned from all documents).
# NOTE(review): because the vectorizers below are fitted on all of X before
# any folds are cut, text from each test fold contributes to the features.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
Xcount = count_vect.fit_transform(X)

# Word-level TF-IDF features for the whole corpus, capped at 5000 features.
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
Xtfidf = tfidf_vect.fit_transform(X)


### K-fold Cross Validation

In [None]:
from sklearn.model_selection import KFold, cross_validate
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# Scorers evaluated on every fold. cross_validate reports each one under
# the key "test_<name>".
scoring = {
    scorer_name: make_scorer(metric_fn)
    for scorer_name, metric_fn in (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
}

# 3-fold cross-validation. shuffle=True shuffles the samples before they are
# cut into folds, and random_state=42 makes that shuffle repeatable.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)
modelNB = naive_bayes.MultinomialNB()

# NOTE(review): Xcount is already vectorised when it reaches this cell, so the
# feature extraction is not re-fitted inside each fold.
results1 = cross_validate(modelNB, Xcount, y, cv=kfold, scoring=scoring)

# Print the per-fold score arrays, one metric per line.
for heading, result_key in (
    ("Accuracy for each fold:", 'test_accuracy'),
    ("Precision for each fold:", 'test_precision'),
    ("Recall for each fold:", 'test_recall'),
    ("F1-score for each fold:", 'test_f1_score'),
):
    print(heading, results1[result_key])


Accuracy for each fold: [0.57142857 0.42857143 0.5       ]
Precision for each fold: [0.66666667 0.4        0.5       ]
Recall for each fold: [0.5        0.66666667 0.66666667]
F1-score for each fold: [0.57142857 0.5        0.57142857]
