# DSAA 5002 - Data Mining and Knowledge Discovery in Data Science
---

# Task 1 (50 marks) Data Preprocessing and Analysis

**Background:** 
**Assuming you are a sentiment analyst at a securities firm, your task is to assess the impact of each news article on the A-share listed companies explicitly mentioned.**

# Q2. Data Analysis - Text Knowledge Mining

# Part 2 - BiLSTM-based Sentiment Analysis Model Training

---
## 1. Model Training

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
import pandas as pd
from sklearn.model_selection import train_test_split

import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

  warn("The installed version of bitsandbytes was compiled without GPU support. "


'NoneType' object has no attribute 'cadam32bit_grad_fp32'


In [3]:
# Where the annotated corpus lives, which pretrained tokenizer to use,
# and where the BiLSTM checkpoint is stored
data_path = 'Training_dataset/Training_set_with_final_annotation.xlsx'
model_path = 'bert-base-chinese'
output_model_path = 'model\\bilstm_model_v1.bin'

from sklearn.model_selection import train_test_split

# Read the spreadsheet and pull the text / target columns out as plain lists
df = pd.read_excel(data_path)
news, labels = (df[column].tolist() for column in ('News', 'Final_Sentiment'))

# Hold out 10% of the rows for testing; the fixed seed keeps the split reproducible
train_news, test_news, train_labels, test_labels = train_test_split(
    news,
    labels,
    test_size=0.1,
    random_state=42,
)

In [4]:
# Define the dataset
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news text per item.

    Args:
        news: Indexable collection of news texts.
        labels: Indexable collection of integer class labels, aligned with
            ``news``. May be a list or a tensor.
        tokenizer: Object exposing ``encode_plus`` (the notebook passes a
            ``BertTokenizer``).
        max_length: Length every sample is padded / truncated to.
            Defaults to 640, the value previously hard-coded here.
    """

    def __init__(self, news, labels, tokenizer, max_length=640):
        self.news = news
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of news texts."""
        return len(self.news)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx``.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            tokenizer output flattened) and a ``labels`` tensor of dtype long.
        """
        text = self.news[idx]
        label = self.labels[idx]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        return {
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # as_tensor accepts plain numbers and existing tensors alike,
            # avoiding the copy-construct path of torch.tensor(tensor)
            'labels': torch.as_tensor(label, dtype=torch.long),
        }

In [5]:
# Define the BiLSTM Model
class BiLSTM(nn.Module):
    """Embedding -> single-layer bidirectional LSTM -> linear classifier.

    Args:
        input_dim: Vocabulary size of the embedding table.
        hidden_dim: Embedding size and per-direction LSTM hidden size.
        output_dim: Number of output classes (logits).
        dropout: Dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, hidden_dim, output_dim, dropout):
        super().__init__()
        # Kept on the instance so forward() does not depend on a global
        # variable that happens to be named hidden_dim.
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(input_dim, hidden_dim)
        self.lstm = nn.LSTM(hidden_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Return class logits for a batch of token-id sequences.

        Args:
            text: Integer tensor of token ids, indexed (batch, sequence)
                because the LSTM is built with ``batch_first=True``.

        Returns:
            Tensor of shape (batch, output_dim).
        """
        embedded = self.dropout(self.embedding(text))
        output, _ = self.lstm(embedded)
        # The last axis holds [forward | backward] features. The forward
        # direction finishes at the last time step, the backward direction
        # at the first one.
        # NOTE(review): with inputs padded to a fixed length the last time
        # step may be a padding position - confirm this is intended.
        forward_last = output[:, -1, :self.hidden_dim]
        backward_last = output[:, 0, self.hidden_dim:]
        hidden = torch.cat((forward_last, backward_last), dim=1)
        return self.fc(hidden)

In [9]:
# Load the pretrained tokenizer and wrap the training split in a Dataset.
# The dataset is built from the label list before it is converted to a tensor.
tokenizer = BertTokenizer.from_pretrained(model_path)
train_dataset = NewsDataset(news=train_news, labels=train_labels, tokenizer=tokenizer)
train_labels = torch.tensor(train_labels)

# Model hyperparameters: the embedding table covers the tokenizer vocabulary
input_dim = tokenizer.vocab_size
hidden_dim, output_dim, dropout = 128, 2, 0.1

In [15]:
# Resume training of the BiLSTM classifier from the checkpoint at
# output_model_path and write the updated weights back after every epoch.

# Pick the device (GPU when available, otherwise CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the BiLSTM model and restore the saved weights.
# map_location lets a checkpoint written on another device load here.
model = BiLSTM(input_dim, hidden_dim, output_dim, dropout)
model.load_state_dict(torch.load(output_model_path, map_location=device))

# Move the model to the device (GPU or CPU)
model.to(device)

# Define training parameters
batch_size = 64
lr = 2e-4
# The progress labels used to be offset by a hard-coded 48, i.e. the restored
# checkpoint is assumed to have 48 epochs behind it - adjust when resuming
# from a different checkpoint.
start_epoch = 48
# Total number of epochs the model should have seen once this cell finishes
epochs = 64

# Define optimizer and loss function (created after the model is on its device)
optimizer = optim.Adam(model.parameters(), lr=lr)
loss_fn = nn.CrossEntropyLoss()

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Train the model
model.train()
for epoch in range(start_epoch, epochs):
    total_loss = 0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{epochs}', leave=False)

    for batch in progress_bar:
        # Only the token ids are fed to the model; the attention mask is unused
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix({'loss': loss.item()})

    # Mean of the per-batch losses over this epoch
    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch+1}/{epochs} - Average Loss: {avg_loss:.4f}')

    # Save the model after every epoch so an interrupted run loses little work
    torch.save(model.state_dict(), output_model_path)

                                                                                                                       

Epoch 49/64 - Average Loss: 0.0827


                                                                                                                       

Epoch 50/64 - Average Loss: 0.0823


                                                                                                                       

Epoch 51/64 - Average Loss: 0.0792


                                                                                                                       

Epoch 52/64 - Average Loss: 0.0739


                                                                                                                       

Epoch 53/64 - Average Loss: 0.0709


                                                                                                                       

Epoch 54/64 - Average Loss: 0.0729


                                                                                                                       

Epoch 55/64 - Average Loss: 0.0671


                                                                                                                       

Epoch 56/64 - Average Loss: 0.0672


                                                                                                                       

Epoch 57/64 - Average Loss: 0.0627


                                                                                                                       

Epoch 58/64 - Average Loss: 0.0645


                                                                                                                       

Epoch 59/64 - Average Loss: 0.0597


                                                                                                                       

Epoch 60/64 - Average Loss: 0.0568


                                                                                                                       

Epoch 61/64 - Average Loss: 0.0550


                                                                                                                       

Epoch 62/64 - Average Loss: 0.0551


                                                                                                                       

Epoch 63/64 - Average Loss: 0.0524


                                                                                                                       

Epoch 64/64 - Average Loss: 0.0538




## 2. Model Testing

### 2.1 Test bilstm_model_v0, obtained after 48 epochs

In [11]:
# Evaluate the bilstm_model_v0 checkpoint on the held-out test split.

# Load the saved model weights into your BiLSTM model.
# The model stays on the CPU in this cell, so map the checkpoint there too.
model = BiLSTM(input_dim, hidden_dim, output_dim, dropout)
output_model_path = 'model\\bilstm_model_v0.bin'
model.load_state_dict(torch.load(output_model_path, map_location='cpu'))
model.eval()  # switch dropout off for evaluation

batch_size = 64

# Test the model
total_correct = 0
total_samples = 0
# Convert labels to tensor format; as_tensor leaves an existing tensor
# untouched, so re-running the cell does not copy-construct from a tensor.
test_labels = torch.as_tensor(test_labels)
test_dataset = NewsDataset(test_news, test_labels, tokenizer)
# Accuracy does not depend on sample order, so the loader is not shuffled
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
progress_bar = tqdm(test_loader, desc=f'Test:', leave=False)
with torch.no_grad():
    for batch in progress_bar:
        input_ids = batch['input_ids']
        labels = batch['labels']

        outputs = model(input_ids)
        # Index of the largest logit is the predicted class
        _, predicted_labels = torch.max(outputs, dim=1)

        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy:.4f}')


                                                                                                                       

Test Accuracy: 0.8882




### 2.2 Test bilstm_model_v1, obtained after 64 epochs

In [16]:
# Evaluate the bilstm_model_v1 checkpoint on the held-out test split.

# Load the saved model weights into your BiLSTM model.
# The model stays on the CPU in this cell, so map the checkpoint there too.
model = BiLSTM(input_dim, hidden_dim, output_dim, dropout)
output_model_path = 'model\\bilstm_model_v1.bin'
model.load_state_dict(torch.load(output_model_path, map_location='cpu'))
model.eval()  # switch dropout off for evaluation

batch_size = 64

# Test the model
total_correct = 0
total_samples = 0
# Convert labels to tensor format; test_labels may already be a tensor from
# an earlier cell, and as_tensor passes an existing tensor through unchanged.
test_labels = torch.as_tensor(test_labels)
test_dataset = NewsDataset(test_news, test_labels, tokenizer)
# Accuracy does not depend on sample order, so the loader is not shuffled
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
progress_bar = tqdm(test_loader, desc=f'Test:', leave=False)
with torch.no_grad():
    for batch in progress_bar:
        input_ids = batch['input_ids']
        labels = batch['labels']

        outputs = model(input_ids)
        # Index of the largest logit is the predicted class
        _, predicted_labels = torch.max(outputs, dim=1)

        total_correct += (predicted_labels == labels).sum().item()
        total_samples += labels.size(0)

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy:.4f}')


                                                                                                                       

Test Accuracy: 0.8799




---

In [None]:
# import pandas as pd
# from tqdm import tqdm
# from sklearn.model_selection import KFold
# import torch
# from transformers import BertConfig, BertTokenizer, BertForSequenceClassification, AdamW
# import warnings

# warnings.filterwarnings("ignore", category=UserWarning)
# warnings.filterwarnings("ignore", category=pd.errors.SettingWithCopyWarning)
# warnings.filterwarnings("ignore", category=FutureWarning)

# # Read the Excel file
# file_path = 'D:\\ProjectHub\\Jupyter Notebook\\DSAA 5002 DM\\DM-Project\\\Training_dataset\\Training_set_with_final_annotation.xlsx'
# df = pd.read_excel(file_path)

# df_train = df[:90000]
# df_test = df[90000:]

# # Data preparation
# tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")
# # Splitting the dataset into training and testing sets
# num_folds = 3
# kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

# # Load the pretrained BERT configuration and modify dropout
# config = BertConfig.from_pretrained("bert-base-chinese")
# config.hidden_dropout_prob = 0.1  # Modify the dropout in the hidden layers
# config.attention_probs_dropout_prob = 0.1  # Modify the dropout in the attention mechanism

# num_epochs = 10
# for fold, (train_indices, val_indices) in enumerate(kf.split(df_train)):
#     # Split the dataset
#     train_data = df_train.iloc[train_indices]
#     val_data = df_train.iloc[val_indices]
#     print(len(train_data))
#     print(len(val_data))
#     # Modify input data to include the company name
#     modified_train_data = []
#     for index, row in train_data.iterrows():
#         news_text = row['News']
#         company_name = row['Explicit_Company']
#         modified_text = company_name + "，" + news_text
#         sentiment = row['Final_Sentiment']
#         modified_train_data.append((modified_text, sentiment))
    
#     # Load the BERT model with modified dropout
#     model = BertForSequenceClassification.from_pretrained("bert-base-chinese", config=config)
    
#     # Loss function and optimizer
#     optimizer = AdamW(model.parameters(), lr=1e-5)
#     loss_fn = torch.nn.CrossEntropyLoss()
#     # Total number of iterations per epoch
#     total_iterations = len(modified_train_data)
#     # Train the model using modified_train_data as input
#     for epoch in range(num_epochs):
#         model.train()  # Enter training mode
#         # tqdm progress bar set to total iterations
#         with tqdm(total=total_iterations) as pbar:
#             for modified_text, sentiment in modified_train_data:
#                 input_ids = tokenizer.encode(modified_text, add_special_tokens=True, padding=True, max_length=512, truncation=True)
#                 input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension
#                 labels = torch.tensor(sentiment).unsqueeze(0)
#                 labels = labels.long()
                
#                 outputs = model(input_ids, labels=labels)
#                 loss = outputs.loss

#                 optimizer.zero_grad()
#                 loss.backward()
#                 optimizer.step()
                
#                 pbar.update(1)
                
#             # Evaluate model performance on the validation set
#             model.eval()  # Enter evaluation mode
#             validation_loss = 0
#             correct_predictions = 0
#             total_samples = 0

#             for index, row in val_data.iterrows():
#                 news_text = row['News']
#                 company_name = row['Explicit_Company']
#                 modified_text = company_name + "，" + news_text
#                 sentiment = row['Final_Sentiment']

#                 input_ids = tokenizer.encode(modified_text, add_special_tokens=True, padding=True, max_length=512, truncation=True)
#                 input_ids = torch.tensor(input_ids).unsqueeze(0)  # Add batch dimension

#                 labels = torch.tensor(sentiment).unsqueeze(0)
#                 labels = labels.long()

#                 with torch.no_grad():
#                     outputs = model(input_ids, labels=labels)
#                     validation_loss += outputs.loss.item()
#                     predictions = torch.argmax(outputs.logits, dim=1)
#                     correct_predictions += torch.sum(predictions == labels).item()
#                     total_samples += 1

#             validation_loss /= total_samples
#             accuracy = correct_predictions / total_samples

#             print(f"Fold: {fold+1}, Epoch: {epoch+1}")
#             print(f"Validation Loss = {validation_loss}")
#             print(f"Accuracy = {accuracy}")

#     # Save model parameters
#     model_save_path = f"model_fold_{fold+1}.pt"
#     torch.save(model.state_dict(), model_save_path)
