# RNN测试

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
import requests
import io

class WeiboSentimentDataset:
    def __init__(self):
        # 数据集URL
        self.url = "https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/weibo_senti_100k/weibo_senti_100k.csv"
    
    def download_dataset(self):
        try:
            print("下载数据集...")
            response = requests.get(self.url)
            response.raise_for_status()  # 检查下载是否成功
            return pd.read_csv(io.StringIO(response.content.decode('utf-8')))
        except Exception as e:
            print(f"下载失败: {e}")
            # 如果下载失败，使用小型示例数据集
            return self.get_sample_dataset()
    
    def get_sample_dataset(self):
        # 示例数据，实际项目中应该使用更大的数据集
        data = {
            'review': [
                "这个产品很好用，我很喜欢",
                "质量特别差，退货了",
                "一般般，可以接受",
                "很满意，物超所值",
                "不推荐购买，浪费钱",
                # ... 添加更多示例
                "服务态度很好，下次还会购买",
                "出现故障，客服态度很差",
                "性价比很高，推荐购买",
                "完全不值这个价格",
                "快递很快，产品完好",
            ],
            'label': [1, 0, 1, 1, 0, 1, 0, 1, 0, 1]
        }
        return pd.DataFrame(data)

def load_and_preprocess_data():
    # 加载数据集
    dataset = WeiboSentimentDataset()
    df = dataset.download_dataset()
    
    # 分割训练集和测试集
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        df['review'].values, 
        df['label'].values,
        test_size=0.2,
        random_state=42
    )
    
    # 创建tokenizer
    tokenizer = Tokenizer(num_words=50000, oov_token='<OOV>')
    tokenizer.fit_on_texts(train_texts)
    
    # 转换文本为序列
    train_sequences = tokenizer.texts_to_sequences(train_texts)
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    
    # 设置最大序列长度
    max_length = 100
    
    # 填充序列
    X_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
    X_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')
    
    print(f"词汇表大小: {len(tokenizer.word_index) + 1}")
    print(f"训练数据形状: {X_train.shape}")
    print(f"测试数据形状: {X_test.shape}")
    
    return X_train, train_labels, X_test, test_labels, tokenizer, max_length

def build_improved_lstm_model(vocab_size, max_length):
    model = Sequential([
        # 嵌入层
        Embedding(vocab_size, 128, input_length=max_length),
        
        # 第一个LSTM层
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        
        # 第二个LSTM层
        LSTM(64),
        Dropout(0.3),
        
        # 全连接层
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(1, activation='sigmoid')
    ])
    
    # 编译模型
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['accuracy']
    )
    
    return model

def train_model(model, X_train, y_train, X_test, y_test, batch_size=32, epochs=20):
    # 创建验证集
    X_train_split, X_val, y_train_split, y_val = train_test_split(
        X_train, y_train, test_size=0.1, random_state=42
    )
    
    # 定义回调函数
    callbacks = [
        # 早停
        tf.keras.callbacks.EarlyStopping(
            monitor='val_loss',
            patience=3,
            restore_best_weights=True
        ),
        # 学习率调整
        tf.keras.callbacks.ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=2
        )
    ]
    
    # 训练模型
    history = model.fit(
        X_train_split, y_train_split,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        callbacks=callbacks,
        verbose=1
    )
    
    # 评估模型
    test_loss, test_accuracy = model.evaluate(X_test, y_test)
    print(f"\n测试集准确率: {test_accuracy:.4f}")
    
    return history, test_accuracy

def predict_sentiment(model, text, tokenizer, max_length):
    # 预处理文本
    sequence = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    
    # 预测
    prediction = model.predict(padded)[0][0]
    sentiment = "正面评价" if prediction > 0.5 else "负面评价"
    confidence = prediction if prediction > 0.5 else 1 - prediction
    
    return sentiment, confidence

def main():
    # 加载和预处理数据
    print("开始加载数据...")
    X_train, y_train, X_test, y_test, tokenizer, max_length = load_and_preprocess_data()
    
    # 构建模型
    vocab_size = len(tokenizer.word_index) + 1
    model = build_improved_lstm_model(vocab_size, max_length)
    print("\n模型结构:")
    model.summary()
    
    # 训练模型
    print("\n开始训练...")
    history, test_accuracy = train_model(
        model, X_train, y_train, X_test, y_test,
        batch_size=32,
        epochs=10
    )
    
    # 测试新评论
    test_texts = [
        "这个产品非常好用，超出我的预期",
        "质量很差，客服态度也不好",
        "价格合理，性能还可以",
        "完全是浪费钱，后悔购买",
        "物流快，包装完好，推荐购买"
    ]
    
    print("\n预测新评论:")
    for text in test_texts:
        sentiment, confidence = predict_sentiment(model, text, tokenizer, max_length)
        print(f"\n文本: '{text}'")
        print(f"预测: {sentiment} (置信度: {confidence:.4f})")
        
    # 保存模型
    model.save('sentiment_model.h5')
    print("\n模型已保存为 'sentiment_model.h5'")

if __name__ == "__main__":
    main()

开始加载数据...
下载数据集...
下载失败: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/SophonPlus/ChineseNlpCorpus/master/datasets/weibo_senti_100k/weibo_senti_100k.csv
词汇表大小: 10
训练数据形状: (8, 100)
测试数据形状: (2, 100)

模型结构:





开始训练...
Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.5714 - loss: 0.6936 - val_accuracy: 1.0000 - val_loss: 0.6732 - learning_rate: 0.0010
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.7143 - loss: 0.6877 - val_accuracy: 1.0000 - val_loss: 0.6402 - learning_rate: 0.0010
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.7143 - loss: 0.6873 - val_accuracy: 1.0000 - val_loss: 0.6136 - learning_rate: 0.0010
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.7143 - loss: 0.6561 - val_accuracy: 1.0000 - val_loss: 0.5809 - learning_rate: 0.0010
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step - accuracy: 0.7143 - loss: 0.6472 - val_accuracy: 1.0000 - val_loss: 0.5347 - learning_rate: 0.0010
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/




文本: '物流快，包装完好，推荐购买'
预测: 正面评价 (置信度: 0.8013)

模型已保存为 'sentiment_model.h5'
