In [14]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from data_processing import get_clean_data  # 导入预处理函数

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec, FastText
from models import build_cnn_model, build_lstm_model  # 从 models.py 中导入模型构建函数
from sklearn.metrics import classification_report

In [2]:
# Preprocess the raw train/test samples and save the result locally.
# NOTE(review): save=True presumably writes the cleaned tweets to disk for the
# following cell to read back — confirm in data_processing.get_clean_data.
train_path = "../data/datasets/ghosh/train_sample.txt"
test_path = "../data/datasets/ghosh/test_sample.txt"
train_cleaned, train_labels, test_cleaned, test_labels = get_clean_data(train_path, test_path, save=True)

In [6]:
# Read the cleaned tweets back from disk; each line of a file becomes one entry.
with open("train_tweets_clean.txt", "r", encoding="utf-8") as train_file:
    train_lines = train_file.read().splitlines()
with open("test_tweets_clean.txt", "r", encoding="utf-8") as test_file:
    test_lines = test_file.read().splitlines()

# Wrap each split in a single-column frame so later cells can address it as "Text".
train_cleaned_df = pd.DataFrame(data=train_lines, columns=["Text"])
test_cleaned_df = pd.DataFrame(data=test_lines, columns=["Text"])

TODO：可以补充高频词统计（展示词向量时可优先展示高频词的词向量），并对预处理后的数据进行可视化。

In [4]:
import pandas as pd
from gensim.models import Word2Vec, FastText

In [7]:
# Split every cleaned tweet on whitespace to get one token list per tweet.
train_sentences = [text.split() for text in train_cleaned_df["Text"]]
test_sentences = [text.split() for text in test_cleaned_df["Text"]]

In [8]:
# Train Word2Vec on the tokenised training tweets and persist it to disk.
skipgram_model = Word2Vec(
    sentences=train_sentences,
    sg=1,             # 1 selects the skip-gram training algorithm
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=1,      # keep every token, even ones seen only once
    workers=4,
)
skipgram_model.save("skipgram.model")

In [9]:
# Train FastText on the same tokenised training tweets and persist it to disk.
# No `sg` argument is passed, so the library default training algorithm is used.
fasttext_model = FastText(
    sentences=train_sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,
    min_count=1,      # keep every token, even ones seen only once
    workers=4,
)
fasttext_model.save("fasttext.model")

展示部分词向量

In [11]:
# Pick a few words whose vectors are displayed (the FastText cell reuses this list).
words = ["love", "my", "mom", "like", "just"]

print("Skip-gram")
skipgram_vectors = skipgram_model.wv
for word in words:
    if word not in skipgram_vectors:
        continue  # silently skip words the model has no vector for
    preview = skipgram_vectors[word][:5]  # only the first 5 dimensions
    print(f"{word}: {preview}")

Skip-gram
love: [ 0.0033984   0.00861393  0.00138705  0.0082688  -0.00935446]
my: [ 0.0053937   0.00297525 -0.01076279  0.00685094 -0.00698826]
mom: [-0.00038277  0.00436646  0.0007396   0.003766   -0.00822128]
like: [-0.0041687  -0.00663539  0.00905645  0.00450114 -0.00133147]
just: [-0.00535828 -0.00484555 -0.00930921 -0.00540644  0.00328046]


In [12]:
# Show the first 5 dimensions of the FastText vector for each word in `words`.
# NOTE(review): FastText can compose vectors from character n-grams, so this
# membership test may never be False — confirm against the gensim docs.
print("FastText")
fasttext_vectors = fasttext_model.wv
for word in words:
    if word not in fasttext_vectors:
        continue
    preview = fasttext_vectors[word][:5]
    print(f"{word}: {preview}")

FastText
love: [-0.00380938  0.00047163 -0.00141727  0.00353753  0.00149987]
my: [ 0.00290381  0.00189729  0.001617    0.00323172 -0.00172936]
mom: [-0.00123451 -0.00050335  0.00012947  0.00161764  0.00149152]
like: [ 1.4855972e-03 -4.1661761e-03  2.8692514e-03  4.2900465e-05
 -2.5856479e-03]
just: [ 0.00101466 -0.00263574  0.00162182 -0.00044807 -0.0011162 ]


准备训练数据并训练模型

In [15]:
# Fit a Keras tokenizer on the training texts only.
tokenizer = Tokenizer()
train_texts = train_cleaned_df["Text"].tolist()
tokenizer.fit_on_texts(train_texts)

# Vocabulary size: one more than the number of indexed words, so the largest
# word index is still a valid row when this is used as a matrix dimension.
vocab_size = 1 + len(tokenizer.word_index)

In [16]:
# Convert each split's texts into lists of integer word indices, using the
# tokenizer fitted on the training texts.
X_train, X_test = (
    tokenizer.texts_to_sequences(frame["Text"].tolist())
    for frame in (train_cleaned_df, test_cleaned_df)
)

In [17]:
# Sequence length budget: the 95th percentile of training sequence lengths,
# truncated to an int and never allowed to fall below 10.
train_lengths = list(map(len, X_train))
max_len = max(int(np.percentile(train_lengths, 95)), 10)

In [18]:
# Bring every sequence to exactly max_len: shorter ones are padded at the end,
# longer ones are cut at the end. Both splits share the same settings.
pad_kwargs = dict(maxlen=max_len, padding='post', truncating='post')
X_train = pad_sequences(X_train, **pad_kwargs)
X_test = pad_sequences(X_test, **pad_kwargs)

In [19]:
# Build an embedding matrix that aligns the tokenizer's vocabulary with a word-vector model.
def create_embedding_matrix(model, tokenizer, vocab_size, embedding_dim):
    """Return a ``(vocab_size, embedding_dim)`` matrix of pretrained word vectors.

    Row ``i`` receives ``model.wv[word]`` for the word that ``tokenizer.word_index``
    maps to index ``i``. Rows with no matching word, or whose word is missing
    from ``model.wv``, stay all-zero.

    Args:
        model: Object exposing ``wv``, which must support ``word in wv`` and
            ``wv[word]`` returning a vector of length ``embedding_dim``.
        tokenizer: Object exposing ``word_index``, a mapping from word to
            integer row index.
        vocab_size: Number of rows in the returned matrix.
        embedding_dim: Number of columns in the returned matrix.

    Returns:
        numpy.ndarray of shape ``(vocab_size, embedding_dim)``.
    """
    embedding_matrix = np.zeros((vocab_size, embedding_dim))
    for word, i in tokenizer.word_index.items():
        # word_index may contain indices beyond the requested vocab_size
        # (e.g. when the caller caps the vocabulary); skip them instead of
        # failing with an IndexError on the row assignment.
        if i >= vocab_size:
            continue
        if word in model.wv:
            embedding_matrix[i] = model.wv[word]
    return embedding_matrix

In [20]:
# Word-vector dimension, read from the trained skip-gram vectors so it cannot
# drift from the vector_size used at training time.
embedding_dim = skipgram_model.wv.vector_size
# The same dimension is used for both embedding matrices, so FastText must match.
assert fasttext_model.wv.vector_size == embedding_dim, "skip-gram and FastText vector sizes differ"

In [21]:
# Embedding matrix whose rows come from the skip-gram vectors.
skipgram_embedding_matrix = create_embedding_matrix(
    model=skipgram_model,
    tokenizer=tokenizer,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
)

In [22]:
# Embedding matrix whose rows come from the FastText vectors.
fasttext_embedding_matrix = create_embedding_matrix(
    model=fasttext_model,
    tokenizer=tokenizer,
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
)

In [23]:
# CNN model built from the skip-gram embedding matrix. The architecture lives
# in build_cnn_model (models.py), which is not visible in this notebook.
cnn_skipgram = build_cnn_model(skipgram_embedding_matrix, vocab_size, max_len, embedding_dim)



In [24]:
# CNN model built from the FastText embedding matrix. The architecture lives
# in build_cnn_model (models.py), which is not visible in this notebook.
cnn_fasttext = build_cnn_model(fasttext_embedding_matrix, vocab_size, max_len, embedding_dim)

In [25]:
# LSTM model built from the skip-gram embedding matrix. The architecture lives
# in build_lstm_model (models.py), which is not visible in this notebook.
lstm_skipgram = build_lstm_model(skipgram_embedding_matrix, vocab_size, max_len, embedding_dim)

In [26]:
# LSTM model built from the FastText embedding matrix. The architecture lives
# in build_lstm_model (models.py), which is not visible in this notebook.
lstm_fasttext = build_lstm_model(fasttext_embedding_matrix, vocab_size, max_len, embedding_dim)

In [27]:
# Training hyperparameters.
epochs = 5
batch_size = 32

# Train the CNN model that uses the skip-gram word vectors. The test split is
# passed as validation data, so the per-epoch val_* metrics are computed on it.
history_cnn = cnn_skipgram.fit(
    X_train,
    train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, test_labels),
)

Epoch 1/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 134ms/step - accuracy: 0.4112 - loss: 0.6937 - val_accuracy: 0.4500 - val_loss: 0.6953
Epoch 2/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.6252 - loss: 0.6884 - val_accuracy: 0.4500 - val_loss: 0.6960
Epoch 3/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step - accuracy: 0.5738 - loss: 0.6851 - val_accuracy: 0.4500 - val_loss: 0.6983
Epoch 4/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - accuracy: 0.5665 - loss: 0.6789 - val_accuracy: 0.4500 - val_loss: 0.7048
Epoch 5/5
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.5500 - loss: 0.6736 - val_accuracy: 0.4500 - val_loss: 0.7139


In [28]:
# Predict and evaluate.
cnn_pred_probs = cnn_skipgram.predict(X_test)
# Threshold the predicted scores at 0.5 and flatten to a 1-D array of 0/1 labels.
cnn_preds = (cnn_pred_probs > 0.5).astype(int).reshape(-1)
print("=== CNN (skip-gram) Classification Report ===")
# zero_division=0 reports 0.0 for a class that is never predicted instead of
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(test_labels, cnn_preds, zero_division=0))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step
=== CNN (skip-gram) Classification Report ===
              precision    recall  f1-score   support

           0       0.45      1.00      0.62         9
           1       0.00      0.00      0.00        11

    accuracy                           0.45        20
   macro avg       0.23      0.50      0.31        20
weighted avg       0.20      0.45      0.28        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
