In [5]:
# -*- coding: utf-8 -*-
"""
Created on Thu Apr 18 15:40:27 2024

@author: Natsu
"""

import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# Load the raw text lines from a txt file
def load_data(file_path):
    """Read a UTF-8 text file and return its lines as a list.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line; each string keeps its line ending
        exactly as the file iterator yields it.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(handle)

def extract_last_digit(input_file):
    """Collect the trailing 0/1 label of every labelled line in a text file.

    Each line is stripped of surrounding whitespace first. Lines that are
    empty, or whose last character is neither ``0`` nor ``1``, are skipped.

    Args:
        input_file: Path of the UTF-8 encoded text file to scan.

    Returns:
        A NumPy array holding the extracted labels in file order, or ``None``
        when the file cannot be opened or decoded (the error is printed).
    """
    try:
        # Decode explicitly as UTF-8, as load_data does, so the result does
        # not depend on the platform's default encoding.
        with open(input_file, 'r', encoding='utf-8') as f_input:
            stripped_lines = (line.strip() for line in f_input)
            # endswith() on an empty string is False, so blank lines are
            # filtered out by the same test.
            last_digits = [
                int(line[-1])
                for line in stripped_lines
                if line.endswith(('0', '1'))
            ]
    except (OSError, ValueError) as e:
        # OSError: missing/unreadable file. ValueError covers decoding
        # failures (UnicodeDecodeError) and malformed paths.
        print(f"发生错误：{e}")
        return None

    # Convert the collected labels to a NumPy array.
    return np.array(last_digits)

def clean_text(text):
    """Lower-case *text* and strip everything except ASCII letters and whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The lower-cased string with digits, punctuation and any other
        character outside ``a-z``/``A-Z``/whitespace removed.
    """
    non_letter = re.compile(r'[^a-zA-Z\s]')
    # zhang: lower-casing happens before filtering
    lowered = text.lower()
    return non_letter.sub('', lowered)

# File paths (change these to your local paths)
file_path = 'C:\\6405prj\\imdb_nolabel.txt'
data_lines = load_data(file_path)
label_file_path = 'C:\\6405prj\\imdb_labelled.txt'
labels = extract_last_digit(label_file_path)

# extract_last_digit returns None on a read error and skips lines that do not
# end in 0/1, so check that the labels exist and line up with the texts here
# rather than failing later inside model fitting.
if labels is None:
    raise RuntimeError(f"Could not load labels from {label_file_path}")
if len(labels) != len(data_lines):
    raise ValueError(
        f"Found {len(data_lines)} text lines in {file_path} but "
        f"{len(labels)} labels in {label_file_path}"
    )

# Apply the text-cleaning function to every line
cleaned_data = [clean_text(line) for line in data_lines]
# Download the NLTK tokenizer models (only needs to run once)
nltk.download('punkt')

def tokenize_text(text):
    """Split *text* into tokens with NLTK's ``word_tokenize``.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for *text* (the token sequence).
    """
    return word_tokenize(text)

# Tokenize every cleaned line, then re-join the tokens with single spaces so
# the vectorizer receives plain strings.
tokenized_data = [tokenize_text(line) for line in cleaned_data]
tokenized_texts = [' '.join(tokens) for tokens in tokenized_data]

# zhang: parameter grid for the TF-IDF vectorizer and the naive Bayes classifier
param_grid = {
    'tfidf__max_df': [0.5, 0.75, 0.95],
    'tfidf__min_df': [2, 5],
    'tfidf__max_features': [500, 1000, 5000],
    'clf__alpha': [0.01, 0.1, 1],
}

# zhang: build the pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# zhang: create the GridSearchCV object
grid_search = GridSearchCV(pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)

# zhang: run the grid search.
# Tune on the same tokenized text that the final vectorizer below is fitted
# on, so the selected hyper-parameters match the final model's input.
grid_search.fit(tokenized_texts, labels)

# zhang: report the best parameters and the corresponding accuracy
print("最佳参数：", grid_search.best_params_)
print("最佳准确率：", grid_search.best_score_)

# Rebuild the vectorizer from the best parameters, dropping the pipeline prefix
best_tfidf_params = {
    key.replace("tfidf__", ""): value
    for key, value in grid_search.best_params_.items()
    if key.startswith("tfidf__")
}
best_tfidf = TfidfVectorizer(**best_tfidf_params)

# Convert the text data into the TF-IDF feature matrix
features_tfidf = best_tfidf.fit_transform(tokenized_texts)

# Print the shape of the TF-IDF feature matrix
print("TF-IDF特征矩阵形状：", features_tfidf.shape)

# Usage example: read the labels from the labelled file and show them
input_file = 'C:\\6405prj\\imdb_labelled.txt'
labels = extract_last_digit(input_file)
if labels is not None:
    print("提取的行尾字符数组：", labels, labels.size)

# Split the feature matrix and the sentiment labels into train and test sets.
# NOTE(review): the vectorizer and the grid search above were fitted on all
# rows, including those that land in X_test, so the accuracy reported below
# is optimistic. Splitting the raw texts before any fitting would avoid this.
X_train, X_test, y_train, y_test = train_test_split(
    features_tfidf, labels, test_size=0.2, random_state=44
)

# Initialise the naive Bayes classifier with the smoothing value selected by
# the grid search instead of silently falling back to the default alpha.
nb_classifier = MultinomialNB(alpha=grid_search.best_params_['clf__alpha'])

# Train the model
nb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Compute the model accuracy
accuracy = accuracy_score(y_test, y_pred)
print("模型准确率：", accuracy)

# Print the classification report
print("分类报告：")
print(classification_report(y_test, y_pred))

# New, unseen text data
new_texts = ["the film is disappointing", "I think it is interesting"]

# Apply the same preprocessing and feature extraction to the new data
tokenized_new_texts = [tokenize_text(clean_text(text)) for text in new_texts]
new_features_tfidf = best_tfidf.transform(
    [' '.join(tokens) for tokens in tokenized_new_texts]
)

# Predict with the trained model
new_predictions = nb_classifier.predict(new_features_tfidf)

# Print the predictions
for text, prediction in zip(new_texts, new_predictions):
    print(f"文本：{text}，预测情感：{prediction}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\59158\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Fitting 5 folds for each of 54 candidates, totalling 270 fits
最佳参数： {'clf__alpha': 1, 'tfidf__max_df': 0.5, 'tfidf__max_features': 1000, 'tfidf__min_df': 2}
最佳准确率： 0.773
TF-IDF特征矩阵形状： (1000, 1000)
提取的行尾字符数组： [0 0 0 0 1 0 0 1 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 1 0 1 1
 1 1 0 0 0 0 0 0 0 0 0 0 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 1
 0 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 1 0 0 1 1
 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 1 1 0 0 0 0 1 0 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 0 1 0 0 0 0
 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1 0
 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1
 1 1 1 1 1 1 0 1 1 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0
 0 0 0 0 0 1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1