# DSAA 5002 - Data Mining and Knowledge Discovery in Data Science
---

# Task 1 (50 marks) Data Preprocessing and Analysis

**Background: 
Assuming you are a sentiment analyst at a securities firm, your task is to assess the impact of each news article on the A-share listed companies explicitly mentioned.**

# Preliminary_Exp of Q2

---
## Exp2. Full Data BERT Tuning


### !!!PLEASE USE COLAB!!!

In [None]:
import os
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and checkpoint paths used
# later in this notebook live under /content/drive/MyDrive.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!pip install faiss-cpu pyarrow pandas matplotlib scikit-learn torch tqdm

Collecting faiss-cpu
  Downloading faiss_cpu-1.7.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.6/17.6 MB[0m [31m37.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.7.4


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from tqdm import tqdm
import warnings
# Warning categories silenced for the whole notebook.
# `SettingWithCopyWarning` is looked up defensively because it is not exposed
# under `pd.errors` by every pandas release; when it is absent it is skipped
# instead of aborting the cell with an AttributeError.
IGNORED_WARNING_CATEGORIES = tuple(
    category
    for category in (
        UserWarning,
        getattr(pd.errors, "SettingWithCopyWarning", None),
        FutureWarning,
    )
    if category is not None
)
for _category in IGNORED_WARNING_CATEGORIES:
    warnings.filterwarnings("ignore", category=_category)

In [None]:
# Choose the compute device: the GPU when CUDA is usable, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [None]:
# Locations of the pretrained checkpoint, the annotated dataset and the fine-tuned output
model_path = 'bert-base-chinese'
data_path = '/content/drive/MyDrive/BertTraining/Training_set_with_final_annotation.xlsx'
output_model_path = '/content/drive/MyDrive/BertTraining/bert_model.bin'

# Read the annotated spreadsheet and extract the text / sentiment columns as plain lists
df = pd.read_excel(data_path)
news, labels = df['News'].tolist(), df['Final_Sentiment'].tolist()

from sklearn.model_selection import train_test_split
# Hold out 10% of the samples for testing; the fixed seed keeps the split repeatable
train_news, test_news, train_labels, test_labels = train_test_split(
    news,
    labels,
    test_size=0.1,
    random_state=42,
)


In [None]:
# Define a custom dataset
class NewsDataset(Dataset):
    """Map-style dataset that tokenizes one news text per item for classification.

    Args:
        news: Sequence of raw news texts.
        labels: Sequence of integer class labels, aligned index-by-index with
            ``news``.
        tokenizer: Callable tokenizer (e.g. a ``BertTokenizer``) that accepts a
            text plus the keyword arguments used in ``__getitem__`` and returns
            a mapping containing ``input_ids`` and ``attention_mask`` tensors.
        max_length: Length every sequence is padded/truncated to. Defaults to
            512, the value that was previously hard-coded.

    Raises:
        ValueError: If ``news`` and ``labels`` do not have the same length.
    """

    def __init__(self, news, labels, tokenizer, max_length=512):
        # Fail fast: a length mismatch would otherwise surface as an IndexError
        # in the middle of an epoch (or silently ignore surplus labels).
        if len(news) != len(labels):
            raise ValueError(
                f"news and labels must have the same length, "
                f"got {len(news)} and {len(labels)}"
            )
        self.news = news
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of samples."""
        return len(self.news)

    def __getitem__(self, idx):
        """Tokenize sample ``idx`` and return its model inputs and label.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors and a
            scalar ``labels`` tensor of dtype ``torch.long``.
        """
        text = self.news[idx]
        label = self.labels[idx]

        # Calling the tokenizer directly replaces the deprecated `encode_plus`.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        return {
            # flatten() collapses each tensor to 1-D so the DataLoader can
            # stack individual items into a batch.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Build the tokenizer for the pretrained checkpoint, then wrap the training split
tokenizer = BertTokenizer.from_pretrained(model_path)
train_dataset = NewsDataset(
    news=train_news,
    labels=train_labels,
    tokenizer=tokenizer,
)


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

In [None]:
# Load the pretrained checkpoint with a 2-class classification head
# (the tokenizer is created in an earlier cell, not here)
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=2).to(device)

# Define training parameters
batch_size = 16
# The previous value (8e-4) is far above the 2e-5 .. 5e-5 range recommended for
# fine-tuning BERT; such a large step tends to wreck the pretrained weights.
lr = 2e-5
epochs = 10

# Define optimizer and loss function
optimizer = AdamW(model.parameters(), lr=lr)
# The training loop reads the loss from the model output (`outputs.loss`);
# `loss_fn` is kept so that any code referring to it by name keeps working.
loss_fn = torch.nn.CrossEntropyLoss()

# Create data loader (shuffled each epoch)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)


model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2. Model Tuning

In [None]:
# Resume from the last checkpoint when one has been saved. On a fresh run
# (nothing saved yet) keep training the `model` created in the previous cell.
# Note: re-running this cell continues training for another `epochs` epochs.
if os.path.isdir(output_model_path):
    model = BertForSequenceClassification.from_pretrained(output_model_path).to(device)

# Bind the optimizer to the parameters of the model that is actually trained.
# An optimizer built for an earlier `model` object would keep stepping that
# object's parameters and leave a reloaded model completely unchanged.
optimizer = AdamW(model.parameters(), lr=lr)

model.train()

for epoch in range(1, epochs + 1):
    total_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch}/{epochs}', leave=False)

    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so the notebook-level `labels` list is not overwritten
        batch_labels = batch['labels'].to(device)

        # Passing `labels` makes the model return the loss alongside the logits
        outputs = model(input_ids, attention_mask=attention_mask, labels=batch_labels)
        loss = outputs.loss
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch}/{epochs} - Average Loss: {avg_loss:.4f}')

    # Save after every epoch so an interrupted session can resume from here
    model.save_pretrained(output_model_path)



Epoch 1/10 - Average Loss: 0.5470


### 3. Model Test

In [None]:
import torch

# Load the test dataset (no shuffling: order does not matter for accuracy)
test_dataset = NewsDataset(test_news, test_labels, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Load the saved model onto the same device used for training; without the
# explicit move the evaluation would run on the CPU even when a GPU is available
model = BertForSequenceClassification.from_pretrained(output_model_path).to(device)

# Test the model
model.eval()
total_correct = 0
total_samples = 0
progress_bar = tqdm(test_loader, desc=f'Test: ', leave=False)

with torch.no_grad():  # inference only, no gradients needed
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Named `batch_labels` so the notebook-level `labels` list is not overwritten
        batch_labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        # Predicted class = index of the largest logit for each sample
        predicted_labels = outputs.logits.argmax(dim=1)

        total_correct += (predicted_labels == batch_labels).sum().item()
        total_samples += batch_labels.size(0)

accuracy = total_correct / total_samples
print(f'Test Accuracy: {accuracy:.4f}')


**Not an efficient method, and the accuracy is only mediocre.**