## 欢迎进入 ModelWhale Notebook  

在这里，你可以编写代码和文档  

### 关于文件目录  


**project**：project 目录是本项目的工作空间，可以将与项目运行有关的所有文件放在这里，目录中文件的增、删、改操作都会被保留  


**input**：input 目录是数据集的挂载位置，所有挂载进项目的数据集都在这里，未挂载数据集时 input 目录被隐藏  


**temp**：temp 目录是临时磁盘空间，训练或分析过程中产生的不必要文件可以存放在这里，目录中的文件不会保存  


In [None]:
# Try the classic first example.
print("hello ModelWhale")

In [None]:
# 查看个人持久化工作区文件
!ls /home/mw/project/

In [None]:
# 查看当前挂载的数据集目录
!ls /home/mw/input/

# 数据预处理

In [3]:
import os
import json
import pandas as pd

# Input directories (mounted dataset) and output directory (workspace).
weibo_dir = '/home/mw/input/NLP61475/rumor/rumor/rumor_weibo'
forward_comment_dir = '/home/mw/input/NLP61475/rumor/rumor/rumor_forward_comment'
output_dir = '/home/mw/work/NLP6'

# Accumulators for the parsed records.
weibo_data = []
forward_comment_data = []

# Load every .json file in the rumor_weibo folder; the parsed content of each
# file is appended as one record. sorted() keeps the row order of the output
# CSV reproducible, because os.listdir() returns names in arbitrary order.
for filename in sorted(os.listdir(weibo_dir)):
    if not filename.endswith('.json'):
        continue
    filepath = os.path.join(weibo_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        weibo_data.append(json.load(file))

# Load every .json file in the rumor_forward_comment folder; each file holds
# an iterable of comment dicts.
for filename in sorted(os.listdir(forward_comment_dir)):
    if not filename.endswith('.json'):
        continue
    filepath = os.path.join(forward_comment_dir, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        comments = json.load(file)
    # The rumor code is the second '_'-separated part of the file name,
    # truncated at the first '.'.
    rumor_code = filename.split('_')[1].split('.')[0]
    for comment in comments:
        # Tag each comment with its rumor code so it can be matched later.
        comment['rumorCode'] = rumor_code
        forward_comment_data.append(comment)

# Convert to DataFrames and write them out as CSV. The output directory is
# created first so that to_csv() does not fail when it is missing.
weibo_df = pd.DataFrame(weibo_data)
forward_comment_df = pd.DataFrame(forward_comment_data)
os.makedirs(output_dir, exist_ok=True)
weibo_df.to_csv(os.path.join(output_dir, 'weibo_data.csv'), index=False, encoding='utf-8')
forward_comment_df.to_csv(os.path.join(output_dir, 'forward_comment_data.csv'), index=False, encoding='utf-8')


# 这部分实现了谣言检测系统：利用预训练的句向量模型和 NER 模型，将输入的谣言与辟谣数据进行相似度匹配

In [1]:


import json
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, models, util
from transformers import pipeline, BertTokenizer, BertForTokenClassification
import torch

# Load the Weibo rumor data and the fact-check (debunking) data set.
def load_data(rumor_csv_path='/home/mw/work/NLP6/weibo_data.csv',
              fact_json_path='/home/mw/input/NLP61475/rumor/rumor/fact.json'):
    """Load the rumor table and the fact-check table.

    Args:
        rumor_csv_path: Path of the rumor CSV file, read with ``pd.read_csv``.
        fact_json_path: Path of the fact-check file, read as JSON Lines
            (one JSON document per line, UTF-8).

    Returns:
        A ``(rumor_df, fact_df)`` tuple of DataFrames. Rows of ``fact_df``
        whose ``title`` is missing are dropped; the original row index is
        kept, so it may have gaps.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If no fact record provides a ``title`` field.
    """
    # Rumor data.
    rumor_df = pd.read_csv(rumor_csv_path)

    # Fact-check data, one JSON object per line.
    fact_data = []
    with open(fact_json_path, 'r', encoding='utf-8') as f:
        for line in f:
            record = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) carry no
            # record and would make json.loads() raise, so skip them.
            if not record:
                continue
            fact_data.append(json.loads(record))

    fact_df = pd.DataFrame(fact_data)
    fact_df = fact_df.dropna(subset=['title'])

    return rumor_df, fact_df

def extract_entities(text):
    """Return the set of entity strings found in ``text``.

    Runs the module-level ``ner_pipeline`` on the text and collects the
    ``'word'`` field of every result; duplicates collapse because the
    result is a set.
    """
    found_words = set()
    for hit in ner_pipeline(text):
        found_words.add(hit['word'])
    return found_words

def entity_similarity(text1, text2):
    """Return the Jaccard similarity of the entity sets of two texts.

    The score is ``|A & B| / |A | B|`` over the sets produced by
    ``extract_entities``. It is 0.0 when either text yields no entities.
    """
    first = extract_entities(text1)
    second = extract_entities(text2)
    if first and second:
        shared = first & second
        combined = first | second
        return len(shared) / len(combined)
    return 0.0

def combined_similarity(text1, text2):
    """Return the equally weighted mix of embedding and entity similarity.

    The embedding part is the cosine similarity between the encodings that
    the module-level ``model`` produces for the two texts; the entity part
    comes from ``entity_similarity``. Each part contributes half the score.
    """
    cosine = util.pytorch_cos_sim(
        model.encode([text1], convert_to_tensor=True),
        model.encode([text2], convert_to_tensor=True),
    )
    embed_sim = cosine.item()
    entity_sim = entity_similarity(text1, text2)
    return 0.5 * embed_sim + 0.5 * entity_sim

def debunk_rumor(input_rumor):
    """Print the fact-check entry whose title best matches ``input_rumor``.

    Every title in the module-level ``fact_df`` is scored as
    ``0.5 * embedding cosine similarity + 0.5 * entity Jaccard similarity``.
    The rumor's embedding and entity set are computed once up front instead
    of once per fact, because they do not depend on the fact being compared.

    Relies on the module-level ``fact_df``, ``model`` and ``util`` and on
    ``extract_entities`` being usable when it is called.

    Args:
        input_rumor: Text of the rumor to check.

    Raises:
        ValueError: If ``fact_df`` has no titles to compare against.
    """
    fact_titles = fact_df['title']
    if len(fact_titles) == 0:
        # np.argmax() on an empty sequence would raise a less helpful
        # ValueError further down; fail early with a clear message instead.
        raise ValueError("fact_df contains no titles to compare against")

    # Loop-invariant work: encode and tag the rumor a single time.
    rumor_embedding = model.encode([input_rumor], convert_to_tensor=True)
    rumor_entities = extract_entities(input_rumor)

    similarity_scores = []
    for fact_text in fact_titles:
        fact_embedding = model.encode([fact_text], convert_to_tensor=True)
        embed_sim = util.pytorch_cos_sim(rumor_embedding, fact_embedding).item()

        # Jaccard overlap of the entity sets; 0.0 when either set is empty.
        entity_sim = 0.0
        if rumor_entities:
            fact_entities = extract_entities(fact_text)
            if fact_entities:
                shared = rumor_entities & fact_entities
                combined = rumor_entities | fact_entities
                entity_sim = len(shared) / len(combined)

        similarity_scores.append(0.5 * embed_sim + 0.5 * entity_sim)

    # argmax is positional, so the row is fetched with iloc rather than loc.
    most_similar_index = np.argmax(similarity_scores)
    most_similar_fact = fact_df.iloc[most_similar_index]

    print("微博谣言:", input_rumor)
    print(f"辟谣判断：{most_similar_fact['explain']}")
    print(f"辟谣依据：{most_similar_fact['title']}")

if __name__ == "__main__":
    # Load the rumor table and the fact-check table.
    rumor_df, fact_df = load_data()

    # Run on the GPU when one is available and fall back to the CPU
    # otherwise, instead of failing on machines without CUDA.
    use_cuda = torch.cuda.is_available()

    # Build the sentence-embedding model (transformer + pooling) from a
    # local checkpoint directory.
    simcse_path = '/home/mw/input/MINI_LLM3084'
    word_embedding_model = models.Transformer(simcse_path)
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
    model.to('cuda' if use_cuda else 'cpu')

    # Build the NER pipeline from a local checkpoint directory.
    # NOTE(review): the output recorded for this cell reports that
    # 'classifier.bias' and 'classifier.weight' were newly initialized for
    # this checkpoint, i.e. the token-classification head appears to be
    # untrained. The extracted entities are then likely unreliable; confirm
    # and consider a checkpoint fine-tuned for NER.
    ner_model_path = '/home/mw/input/bert_base_case9058'
    tokenizer = BertTokenizer.from_pretrained(ner_model_path)
    ner_model = BertForTokenClassification.from_pretrained(ner_model_path)
    ner_pipeline = pipeline(
        'ner',
        model=ner_model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
        device=0 if use_cuda else -1,  # -1 selects the CPU
    )

    # Run the detector on one example rumor.
    weibo_rumor = "据最新研究发现，此次新型肺炎病毒传播途径是华南海鲜市场进口的豺——一种犬科动物携带的病毒，然后传给附近的狗，狗传狗，狗传人。狗不生病，人生病。人生病后又传给狗，循环传染。"
    debunk_rumor(weibo_rumor)

  from tqdm.autonotebook import tqdm, trange
  return self.fget.__get__(instance, owner)()
Some weights of BertForTokenClassification were not initialized from the model checkpoint at /home/mw/input/bert_base_case9058 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


微博谣言: 据最新研究发现，此次新型肺炎病毒传播途径是华南海鲜市场进口的豺——一种犬科动物携带的病毒，然后传给附近的狗，狗传狗，狗传人。狗不生病，人生病。人生病后又传给狗，循环传染。
辟谣判断：尚无定论
辟谣依据：狗能感染新型冠状病毒


In [11]:
import pandas as pd

# Copy the first rows of a CSV file into a new CSV file.
def read_and_save_first_10000_rows(file_path, output_file, nrows=100000):
    """Write the first ``nrows`` data rows of a CSV file to another CSV file.

    Despite the function name, the default limit is 100000 rows (the value
    the code has always used), not 10000.

    Args:
        file_path: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
        nrows: Maximum number of data rows to copy. Defaults to 100000.
    """
    df = pd.read_csv(file_path, nrows=nrows)
    # index=False keeps the DataFrame index out of the output file.
    df.to_csv(output_file, index=False)

# Example usage: truncate the positive-comments file.
input_file = '/home/mw/work/NLP6/positive_comments.csv'  # replace with the path of your CSV file
output_file = '/home/mw/work/NLP6/p_comments_100000.csv'  # replace with the path to save the CSV to

read_and_save_first_10000_rows(file_path=input_file, output_file=output_file)


In [12]:
import pandas as pd

# Copy the first rows of a CSV file into a new CSV file.
def read_and_save_first_10000_rows(file_path, output_file, nrows=100000):
    """Write the first ``nrows`` data rows of a CSV file to another CSV file.

    Despite the function name, the default limit is 100000 rows (the value
    the code has always used), not 10000.

    Args:
        file_path: Path of the CSV file to read.
        output_file: Path of the CSV file to write.
        nrows: Maximum number of data rows to copy. Defaults to 100000.
    """
    df = pd.read_csv(file_path, nrows=nrows)
    # index=False keeps the DataFrame index out of the output file.
    df.to_csv(output_file, index=False)

# Example usage: truncate the negative-comments file.
# NOTE(review): the output file name says 1000000, but the helper called
# below reads at most 100000 rows. Confirm which is intended before renaming,
# since later steps may read this exact path.
input_file = '/home/mw/work/NLP6/negative_comments.csv'  # replace with the path of your CSV file
output_file = '/home/mw/work/NLP6/n_comments_1000000.csv'  # replace with the path to save the CSV to

read_and_save_first_10000_rows(file_path=input_file, output_file=output_file)
