# DSAA 5002 - Data Mining and Knowledge Discovery in Data Science
---

# Task 1 (50 marks) Data Preprocessing and Analysis

**Background:** 
**Assuming you are a sentiment analyst at a securities firm, your task is to assess the impact of each news article on the A-share listed companies explicitly mentioned.**

# Q2. Data Analysis - Text Knowledge Mining

# Part 3 - Application of the BiLSTM-based Sentiment Analysis Model

---
## 1. Application Domain Reset

In [2]:
import pandas as pd

# Read the company-tagged application-domain news.
input_file = 'News_output\\News_application_domain_set_withCompany.xlsx'
df_application_domain_set_withCompany = pd.read_excel(input_file)

# Split the news into rows without a comma in Explicit_Company (one company)
# and rows with a comma (several companies).
# `na=False` makes a missing Explicit_Company count as "no comma"; comparing
# `.str.find(',')` against -1 evaluated NaN as "has a comma" and filed such
# rows under the multi-company frame.
has_comma = df_application_domain_set_withCompany['Explicit_Company'].str.contains(
    ',', regex=False, na=False
)
df_SingleCompany = df_application_domain_set_withCompany[~has_comma]
df_MultiCompany = df_application_domain_set_withCompany[has_comma]

# Save the two frames to separate Excel files.
output_file_SingleCompany = 'News_output\\News_application_domain_set_withSingleCompany.xlsx'
output_file_MultiCompany = 'News_output\\News_application_domain_set_withMultiCompany.xlsx'

df_SingleCompany.to_excel(output_file_SingleCompany, index=False)
df_MultiCompany.to_excel(output_file_MultiCompany, index=False)

# Number of news articles that mention a single company entity.
news_num_in_application_domain_with_SingleCompany = len(df_SingleCompany)
# Number of news articles that mention multiple company entities.
news_num_in_application_domain_with_MultiCompany = len(df_MultiCompany)

In [7]:
# Report the size of each split, then the single-to-multi ratio on its own line.
print(
    "News_num in application_domain with SingleCompany: "
    + str(news_num_in_application_domain_with_SingleCompany)
)
print(
    "News_num in application_domain with MultiCompany: "
    + str(news_num_in_application_domain_with_MultiCompany)
)
print(
    str(
        news_num_in_application_domain_with_SingleCompany
        / news_num_in_application_domain_with_MultiCompany
    )
)

News_num in application_domain with SingleCompany: 375124
News_num in application_domain with MultiCompany: 149102
2.515888452200507


In [4]:
import pandas as pd
# Load the company-tagged application-domain news into a fresh frame.
input_file = 'News_output\\News_application_domain_set_withCompany.xlsx'
application_set = pd.read_excel(io=input_file)

In [5]:
# Data reorganization: remove the data source and fold the title into the body.
def _join_title_and_content(frame):
    """Return ``Title + ' ' + NewsContent`` per row, treating missing cells as empty.

    A plain string concatenation yields NaN as soon as either cell is NaN,
    which would discard the text that is present. Missing cells are therefore
    replaced by '' first, and the separating space is only used when both
    parts are non-empty.
    """
    title = frame['Title'].fillna('').astype(str)
    content = frame['NewsContent'].fillna('').astype(str)
    joined = title + ' ' + content
    return joined.where(title.ne('') & content.ne(''), title + content)


# Guarded so the cell can be re-run: once Title has been merged and dropped,
# the merge is skipped and the drop below becomes a no-op.
if 'Title' in application_set.columns:
    application_set = application_set.assign(NewsContent=_join_title_and_content)

# Drop the NewsSource column and the now-redundant Title column.
application_set = application_set.drop(columns=['NewsSource', 'Title'], errors='ignore')

In [6]:
# Show the reorganized frame; pandas abbreviates the rendering of large frames.
application_set

Unnamed: 0,NewsID,NewsContent,Explicit_Company
0,1,建设银行原董事长张恩照一审被判15年 本报记者 田雨 李京华 中国建设银行股份...,建设银行
1,2,农行信用卡中心搬到上海滩 中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生...,农业银行
2,3,外运发展：价值型蓝筹股补涨要求强烈 在新基金快速发行以及申购资金回流的情况下，市场总体上...,"中国国航, 外运发展"
3,4,胜利股份：稳步走强形成标准上升通道 胜利股份（000407）公司子公司填海造地2800亩...,胜利股份
4,5,[港股快讯]恒指收市报18960点 成交467亿港元 全景网11月30日讯 外围股市造好...,新世界股份
...,...,...,...
524221,1037031,亿华通：公司电解槽相关产品目前还处于产品的研发及测试阶段 尚未实现批量销售 每经AI快讯，有...,亿华通
524222,1037032,依米康：接受中泰证券调研 依米康（SZ 300249，收盘价：10.38元）发布公告称，20...,"中泰证券, 依米康"
524223,1037033,天风证券给予中核科技买入评级 核电行业景气上行 公司有望乘风而起 天风证券10月13日发布研...,"天风证券, 中核科技"
524224,1037034,海特生物：公司在抗癌药CPT获批后 会考虑适时开展CPT在海外的临床并谋求上市 有投资者提问...,海特生物


In [None]:
# Persist the reorganized application set (row index is not written).
output_application_file = 'Result_dataset\\Application_set.xlsx'
application_set.to_excel(excel_writer=output_application_file, index=False)

## 2. Label the application-domain file with the BiLSTM we trained

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
import warnings
# Ignore the UserWarning, pandas SettingWithCopyWarning and FutureWarning
# categories from here on, so that they do not appear in the cell outputs.
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
# Name of the pretrained checkpoint handed to BertTokenizer.from_pretrained.
model_path = 'bert-base-chinese'
# Location of the saved BiLSTM weights that are loaded for prediction.
# Despite the "output" in the name, this notebook only reads this file.
output_model_path = 'model\\bilstm_model_v1.bin' # use v1 here

In [4]:
# Define the dataset
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes news texts on access (no labels).

    Args:
        news: Indexable sequence of news texts, e.g. a list of strings.
        tokenizer: Object exposing a Hugging Face style ``encode_plus`` method.
        max_length: Fixed token length every sample is padded/truncated to.
            Defaults to 640, the value that used to be hard-coded here.
    """

    def __init__(self, news, tokenizer, max_length=640):
        self.news = news
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of news texts."""
        return len(self.news)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx``.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask``, each of
            length ``max_length``.
        """
        text = self.news[idx]

        # Cells read from a spreadsheet may be empty (None/NaN) or numeric.
        # The tokenizer expects text, so empty cells become '' and any other
        # non-string value is converted with str().
        if text is None or (isinstance(text, float) and text != text):
            text = ''
        elif not isinstance(text, str):
            text = str(text)

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # return_tensors='pt' adds a leading batch axis of size 1; flatten()
        # removes it so the DataLoader can stack samples into a batch.
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
        }

In [5]:
# Define the BiLSTM Model
class BiLSTM(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> linear classifier.

    Args:
        input_dim: Number of rows of the embedding table (vocabulary size).
        hidden_dim: Embedding width and hidden size of each LSTM direction.
        output_dim: Number of output classes.
        dropout: Dropout probability applied to the embedded tokens.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super(BiLSTM, self).__init__()
        # Kept on the instance so forward() does not rely on a module-level
        # `hidden_dim` variable. A plain int attribute is not part of the
        # state dict, so saved weights load exactly as before.
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits of shape (batch, output_dim) for token ids ``text``.

        ``text`` is indexed as (batch, seq_len), since the LSTM is built with
        ``batch_first=True``.
        """
        embedded = self.dropout(self.embedding(text))
        output, _ = self.lstm(embedded)
        # A bidirectional LSTM concatenates forward and backward features on
        # the last axis. Take the forward half at the last time step and the
        # backward half at the first time step, i.e. where each direction has
        # read the whole sequence.
        # NOTE(review): the last time step is used as-is, so for right-padded
        # input it is a padding position. Presumably this matches how the
        # saved weights were trained; confirm before changing the readout.
        hidden = torch.cat(
            (output[:, -1, :self.hidden_dim], output[:, 0, self.hidden_dim:]),
            dim=1,
        )
        return self.fc(hidden)

In [7]:
# Load the tokenizer that turns text into token ids. Only the tokenizer is
# loaded from `model_path` here; the classifier itself is the BiLSTM.
tokenizer = BertTokenizer.from_pretrained(model_path)

# Hyperparameters used to rebuild the BiLSTM.
# NOTE(review): these presumably have to equal the values used when the saved
# weights were trained, otherwise loading the state dict will not fit - confirm.
input_dim = tokenizer.vocab_size  # number of rows of the embedding table
hidden_dim = 128  # embedding width and hidden size of each LSTM direction
output_dim = 2  # two sentiment classes
dropout = 0.1  # dropout probability on the embedded tokens

In [8]:
# Read back the application set saved in section 1.
application_data_path = 'Result_dataset\\Application_set.xlsx'
df_application = pd.read_excel(io=application_data_path)

In [9]:
# Show the loaded application set as a quick check of its columns and size.
df_application

Unnamed: 0,NewsID,NewsContent,Explicit_Company
0,1,建设银行原董事长张恩照一审被判15年 本报记者 田雨 李京华 中国建设银行股份...,建设银行
1,2,农行信用卡中心搬到上海滩 中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生...,农业银行
2,3,外运发展：价值型蓝筹股补涨要求强烈 在新基金快速发行以及申购资金回流的情况下，市场总体上...,"中国国航, 外运发展"
3,4,胜利股份：稳步走强形成标准上升通道 胜利股份（000407）公司子公司填海造地2800亩...,胜利股份
4,5,[港股快讯]恒指收市报18960点 成交467亿港元 全景网11月30日讯 外围股市造好...,新世界股份
...,...,...,...
524221,1037031,亿华通：公司电解槽相关产品目前还处于产品的研发及测试阶段 尚未实现批量销售 每经AI快讯，有...,亿华通
524222,1037032,依米康：接受中泰证券调研 依米康（SZ 300249，收盘价：10.38元）发布公告称，20...,"中泰证券, 依米康"
524223,1037033,天风证券给予中核科技买入评级 核电行业景气上行 公司有望乘风而起 天风证券10月13日发布研...,"天风证券, 中核科技"
524224,1037034,海特生物：公司在抗癌药CPT获批后 会考虑适时开展CPT在海外的临床并谋求上市 有投资者提问...,海特生物


In [10]:
# Prediction phase: the dataset carries only the news texts, no labels.
application_news = df_application['NewsContent'].to_list()
application_dataset = NewsDataset(news=application_news, tokenizer=tokenizer)

In [11]:
# Create the data loader. shuffle=False (the default) is spelled out because
# the predictions are attached to df_application by row position below.
batch_size = 64
application_loader = DataLoader(application_dataset, batch_size=batch_size, shuffle=False)

# Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Rebuild the network and restore the trained weights. map_location='cpu'
# lets a checkpoint that was saved from a GPU load on a CPU-only machine;
# the model is moved to the selected device afterwards.
model = BiLSTM(input_dim, hidden_dim, output_dim, dropout)
model.load_state_dict(torch.load(output_model_path, map_location='cpu'))
model.to(device)
model.eval()  # evaluation mode, so dropout is inactive

# Perform predictions: the class with the largest logit wins.
predictions = []
with torch.no_grad():
    for batch in tqdm(application_loader, desc='Predicting'):
        input_ids = batch['input_ids'].to(device)
        outputs = model(input_ids)
        _, predicted = torch.max(outputs, 1)
        predictions.extend(predicted.cpu().tolist())

# Append predictions to the dataset (one label per row, in loader order).
df_application['label'] = predictions

Predicting: 100%|██████████████████████████████████████████████████████████████████| 8192/8192 [52:30<00:00,  2.60it/s]


In [12]:
# Show the frame again, now with the predicted `label` column attached.
df_application

Unnamed: 0,NewsID,NewsContent,Explicit_Company,label
0,1,建设银行原董事长张恩照一审被判15年 本报记者 田雨 李京华 中国建设银行股份...,建设银行,0
1,2,农行信用卡中心搬到上海滩 中国农业银行信用卡中心由北京搬到上海了！ 农行行长杨明生...,农业银行,1
2,3,外运发展：价值型蓝筹股补涨要求强烈 在新基金快速发行以及申购资金回流的情况下，市场总体上...,"中国国航, 外运发展",1
3,4,胜利股份：稳步走强形成标准上升通道 胜利股份（000407）公司子公司填海造地2800亩...,胜利股份,1
4,5,[港股快讯]恒指收市报18960点 成交467亿港元 全景网11月30日讯 外围股市造好...,新世界股份,1
...,...,...,...,...
524221,1037031,亿华通：公司电解槽相关产品目前还处于产品的研发及测试阶段 尚未实现批量销售 每经AI快讯，有...,亿华通,1
524222,1037032,依米康：接受中泰证券调研 依米康（SZ 300249，收盘价：10.38元）发布公告称，20...,"中泰证券, 依米康",1
524223,1037033,天风证券给予中核科技买入评级 核电行业景气上行 公司有望乘风而起 天风证券10月13日发布研...,"天风证券, 中核科技",1
524224,1037034,海特生物：公司在抗癌药CPT获批后 会考虑适时开展CPT在海外的临床并谋求上市 有投资者提问...,海特生物,1


In [13]:
# Tally how many articles were assigned each predicted label (0 / 1).
sentiment_distribution_final = df_application.loc[:, 'label'].value_counts()
print(sentiment_distribution_final)

label
1    416302
0    107924
Name: count, dtype: int64


In [14]:
# Store the labeled application set; this file is what is submitted as Task1.xlsx.
output_application_file = 'Result_dataset\\Application_set_BiLSTM\\Application_set_labeled_with_BiLSTM_v1.xlsx'
df_application.to_excel(excel_writer=output_application_file, index=False)

**Application_set_labeled_with_BiLSTM_v1.xlsx is our Task1.xlsx**

In [None]:
# Write the same labeled frame once more under the deliverable name Task1.xlsx.
output_application_file = 'Task1.xlsx'
df_application.to_excel(excel_writer=output_application_file, index=False)