In [1]:
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import time

In [2]:
result_df = pd.read_excel('combine_df.xlsx')

In [3]:
import time
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# Start timing
start_time = time.time()

def load_model_and_tokenizer(model_name):
    """
    Loads the BERT model and tokenizer.

    Args:
    model_name (str): The name of the pretrained model.

    Returns:
    tuple: A tuple containing the loaded model and tokenizer.
    """
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertForSequenceClassification.from_pretrained(model_name)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device).eval()
    return model, tokenizer

# Load model and tokenizer
model_name = "hw2942/bert-base-chinese-finetuning-financial-news-sentiment-v2"
model, tokenizer = load_model_and_tokenizer(model_name)

def sentiment_analysis(text):
    """
    Performs sentiment analysis on the given text.

    Args:
    text (str): The text to analyze.

    Returns:
    int: The sentiment prediction (0 for Negative, 1 for Positive).
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(model.device)
    with torch.no_grad():
        prediction = model(**inputs).logits.argmax().item()
    return prediction

# Apply sentiment analysis and adjust results
result_df['label'] = result_df['NewsContent'].apply(sentiment_analysis)
result_df = result_df[result_df['label'] != 1].replace({2: 1, 0: 0})

# Display results
print(result_df.head())

# End timing and display
total_run_time = time.time() - start_time
print(f"Total run time: {total_run_time:.2f} seconds")

   NewsID                                        NewsContent  \
0       1  　　本报记者 田雨 李京华    　　中国建设银行股份有限公司原董事长张恩照受贿案３日一审宣...   
1       1  　　中国农业银行信用卡中心由北京搬到上海了！  　　农行行长杨明生日前在信用卡中心揭牌仪式上...   
2       3  　　在新基金快速发行以及申购资金回流的情况下，市场总体上呈现资金流动性过剩格局，考虑到现阶段...   
3       4  　　胜利股份（000407）公司子公司填海造地2800亩，以青岛的地价估算，静态价值在10亿...   
5       8  　　由于全球最大的俄罗斯Uralkaly钾矿被淹，产量大减，同时满洲里口岸铁路在修复线，导致...   

         Explicit_Company  label  
0                    建设银行      0  
1                    农业银行      1  
2        中国国航, 外运发展, 金路集团      1  
3        金瑞矿业, 日科化学, 胜利股份      1  
5  ST南风, 中化岩土, 山西路桥, 冠农股份      1  
Total run time: 6424.25 seconds


In [4]:
# Save the modified DataFrame to a new XLSX file
result_df.to_excel('result_df_with_sentiment_combine.xlsx', index=False)

In [5]:
result_df

Unnamed: 0,NewsID,NewsContent,Explicit_Company,label
0,1,本报记者 田雨 李京华 中国建设银行股份有限公司原董事长张恩照受贿案３日一审宣...,建设银行,0
1,1,中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生日前在信用卡中心揭牌仪式上...,农业银行,1
2,3,在新基金快速发行以及申购资金回流的情况下，市场总体上呈现资金流动性过剩格局，考虑到现阶段...,"中国国航, 外运发展, 金路集团",1
3,4,胜利股份（000407）公司子公司填海造地2800亩，以青岛的地价估算，静态价值在10亿...,"金瑞矿业, 日科化学, 胜利股份",1
5,8,由于全球最大的俄罗斯Uralkaly钾矿被淹，产量大减，同时满洲里口岸铁路在修复线，导致...,"ST南风, 中化岩土, 山西路桥, 冠农股份",1
...,...,...,...,...
596133,1037009,吉电股份10月13日在交易所互动平台中披露，截至10月10日公司股东户数为171303户，较...,吉电股份,0
596135,1037011,每经AI快讯，有投资者在投资者互动平台提问：请问董秘：浙文互联连续下跌4个多月，近期大部分股...,科达股份,0
596147,1037025,10月12日晚间，三星医疗发布2023年前三季度业绩预告，公司预计前三季度实现归属于母公司所...,"三星新材, 三星医疗",1
596152,1037030,每经AI快讯，有投资者在投资者互动平台提问：公司领导，请问公司经营是不是出现重大问题了，股票...,亿华通,0


# Q2_1

In [3]:
result_df = pd.read_excel('result_df_with_sentiment.xlsx')

In [16]:
from neo4j import GraphDatabase
import pandas as pd
import time

# 连接到Neo4j数据库
uri = "bolt://localhost:7687"  # 或您的Neo4j实例URI
username = "neo4j"  # 替换为您的用户名
password = "12345678"
driver = GraphDatabase.driver(uri, auth=(username, password))

In [22]:
def create_graph(driver):
    with driver.session() as session:
        # 读取公司信息并创建ID到股票代码的映射
        df_companies = pd.read_csv('KnowledgeGraph/hidy.nodes.company.csv')
        id_to_code = dict(zip(df_companies[':ID'], df_companies['code']))

        # 创建公司节点
        for index, row in df_companies.iterrows():
            session.run("MERGE (:Company {name: $company_name, code: $code})", 
                        company_name=row['company_name'], code=row['code'])

        # 处理单向关系类型
        for rel in ['invest', 'supply']:
            df_rel = pd.read_csv(f'KnowledgeGraph/hidy.relationships.{rel}.csv')
            for index, row in df_rel.iterrows():
                start_code = id_to_code.get(row[':START_ID'])
                end_code = id_to_code.get(row[':END_ID'])
                if start_code and end_code:
                    session.run(f"MATCH (a:Company {{code: $start_code}}), (b:Company {{code: $end_code}}) "
                                f"MERGE (a)-[:{rel.upper()}]->(b)",
                                start_code=start_code, end_code=end_code)

        # 处理双向关系类型
        for rel in ['compete', 'cooperate', 'dispute', 'same_industry']:
            df_rel = pd.read_csv(f'KnowledgeGraph/hidy.relationships.{rel}.csv')
            for index, row in df_rel.iterrows():
                start_code = id_to_code.get(row[':START_ID'])
                end_code = id_to_code.get(row[':END_ID'])
                if start_code and end_code:
                    session.run(f"MATCH (a:Company {{code: $start_code}}), (b:Company {{code: $end_code}}) "
                                f"MERGE (a)-[:{rel.upper()}]->(b) "
                                f"MERGE (b)-[:{rel.upper()}]->(a)",
                                start_code=start_code, end_code=end_code)

# 开始计时
start_time = time.time()

# 调用函数创建图谱
create_graph(driver)

# 总体运行时间
total_run_time = time.time()
print("Total run time: {:.2f} seconds".format(total_run_time - start_time))

Total run time: 36.28 seconds
