In [1]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive
import pandas as pd
drive.mount('/content/drive')
model_name = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name,num_labels=3)
model.config.hidden_dropout_prob = 0.2
model.config.attention_probs_dropout_prob = 0.2



Mounted at /content/drive


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Load and Prepare CLEAR Corpus

In [2]:

aito_comment = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Capstone/Data/annotated_aito_comment.xlsx")
li_comment = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Capstone/Data/annotated_li_comment.xlsx")
df = pd.concat([aito_comment, li_comment])
columns_to_keep = ['comment', 'sentiment']

# Create a new DataFrame with only the specified columns
df = df[columns_to_keep]
df.reset_index(drop=True, inplace=True)
clear_df = df.sample(n=10000, random_state=42)

In [3]:
clear_df['sentiment'] = clear_df['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})

In [4]:
clear_df['sentiment'].value_counts()

sentiment
0.0    4325
1.0    3483
2.0    2166
Name: count, dtype: int64

In [5]:
clear_df

Unnamed: 0,comment,sentiment
71276,这种功能看起来很高级，但就是怕几年后不好用，或者紧急情况能不能打开？水能载舟亦能覆舟,0.0
35766,福特探险者乍样啊，想考虑30万左右6座suv,1.0
25361,m7送了一个电动踏板,2.0
18273,花哪个钱我还不如买个A6,0.0
45681,打火机都不一样,1.0
...,...,...
15152,你要命还是要好看,0.0
11923,不论外观还是后背箱的存储空间，跑货拉拉一定遥遥领先,2.0
46084,汽车厂每年都会让供应商降价，供应商降价了肯定会从别的地方找回来。,0.0
44753,未来的众泰,0.0


In [6]:
#!pip install jieba

In [7]:
#import jieba
#def divide_comments(comment):
#    return ' '.join(jieba.cut(comment))
#clear_df['divided_comment']= clear_df['comment'].apply(divide_comments)

In [8]:
clear_df

Unnamed: 0,comment,sentiment
71276,这种功能看起来很高级，但就是怕几年后不好用，或者紧急情况能不能打开？水能载舟亦能覆舟,0.0
35766,福特探险者乍样啊，想考虑30万左右6座suv,1.0
25361,m7送了一个电动踏板,2.0
18273,花哪个钱我还不如买个A6,0.0
45681,打火机都不一样,1.0
...,...,...
15152,你要命还是要好看,0.0
11923,不论外观还是后背箱的存储空间，跑货拉拉一定遥遥领先,2.0
46084,汽车厂每年都会让供应商降价，供应商降价了肯定会从别的地方找回来。,0.0
44753,未来的众泰,0.0


In [9]:
clear_df.replace([np.inf, -np.inf], np.nan, inplace=True)
clear_df.dropna(subset=['sentiment'], inplace=True)
#clear_df['divided_comment'] = clear_df['divided_comment'].astype(str)
clear_df['sentiment'] = clear_df['sentiment'].astype(int)

In [10]:
X_train, X_temp, y_train, y_temp = train_test_split(list(clear_df['comment']), list(clear_df['sentiment']), test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [11]:
from torch.utils.data import DataLoader, Dataset

class SentimentDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Example usage
#dataset = SentimentDataset(clear_df['comment'],clear_df['sentiment'], tokenizer)
#loader = DataLoader(dataset, batch_size=2, shuffle=True)


In [12]:
train_dataset = SentimentDataset(X_train, y_train, tokenizer)
val_dataset = SentimentDataset(X_val, y_val, tokenizer)
test_dataset = SentimentDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


In [13]:
def train(model, loader, optimizer):
    model.train()
    total_loss = 0
    i = 0
    for batch in loader:
        optimizer.zero_grad()
        outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        i=i+1
        #print("Step"+str(i)+" Loss:"+str(total_loss))
    return total_loss / len(loader)


from sklearn.metrics import accuracy_score, precision_recall_fscore_support,classification_report

def evaluate(model, loader):
    model.eval()
    total_loss = 0
    predictions, true_labels = [], []
    loss_fn = torch.nn.CrossEntropyLoss()

    with torch.no_grad():
        for batch in loader:
            outputs = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'],labels=batch['labels'])
            loss = outputs.loss
            total_loss += loss.item()
            preds = torch.argmax(outputs.logits, dim=1)
            predictions.extend(preds.cpu().numpy())
            true_labels.extend(batch['labels'].cpu().numpy())

    # Calculate evaluation metrics
    average_val_loss = total_loss / len(loader)
    accuracy = accuracy_score(true_labels, predictions)
    report = classification_report(true_labels, predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='macro')
    return average_val_loss,accuracy, precision, recall, f1,report







In [14]:
optimizer = AdamW(model.parameters(), lr=2e-6)
epochs = 50
train_loss_history = []
average_val_loss_history = []

# Early stopping parameters
patience = 4  # Number of epochs to wait for improvement before stopping
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer)
    train_loss_history.append(train_loss)

    average_val_loss, accuracy, precision, recall, f1, report = evaluate(model, val_loader)
    average_val_loss_history.append(average_val_loss)

    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss:.4f}, Validation Loss: {average_val_loss:.4f}, "
          f"Validation Accuracy: {accuracy:.4f}, Validation precision: {precision:.4f}, "
          f"Validation recall: {recall:.4f}, Validation F1: {f1:.4f}")

    # Early stopping logic
    if average_val_loss < best_val_loss:
        best_val_loss = average_val_loss
        patience_counter = 0  # Reset counter
        print("Validation loss improved, resetting patience counter.")
    else:
        patience_counter += 1
        print(f"No improvement in validation loss for {patience_counter} epoch(s).")
        if patience_counter > patience:
            print("Stopping early due to lack of improvement in validation loss.")
            break




Epoch 1/50, Train Loss: 0.8145, Validation Loss: 0.6724, Validation Accuracy: 0.7139, Validation precision: 0.7115, Validation recall: 0.7220, Validation F1: 0.7120
Validation loss improved, resetting patience counter.
Epoch 2/50, Train Loss: 0.5659, Validation Loss: 0.6186, Validation Accuracy: 0.7594, Validation precision: 0.7509, Validation recall: 0.7579, Validation F1: 0.7538
Validation loss improved, resetting patience counter.
Epoch 3/50, Train Loss: 0.4181, Validation Loss: 0.6902, Validation Accuracy: 0.7467, Validation precision: 0.7431, Validation recall: 0.7409, Validation F1: 0.7412
No improvement in validation loss for 1 epoch(s).
Epoch 4/50, Train Loss: 0.2965, Validation Loss: 0.7582, Validation Accuracy: 0.7493, Validation precision: 0.7410, Validation recall: 0.7504, Validation F1: 0.7429
No improvement in validation loss for 2 epoch(s).
Epoch 5/50, Train Loss: 0.2007, Validation Loss: 0.8397, Validation Accuracy: 0.7340, Validation precision: 0.7594, Validation recal

In [None]:
average_val_loss, accuracy, precision, recall, f1, report = evaluate(model, test_loader)

In [15]:
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("\nClassification Report:\n", report)

Accuracy: 0.7399732620320856
Precision: 0.7353245817789639
Recall: 0.7402041101416176
F1 Score: 0.7375199787050949

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.77      0.78       636
           1       0.69      0.70      0.70       546
           2       0.72      0.75      0.74       314

    accuracy                           0.74      1496
   macro avg       0.74      0.74      0.74      1496
weighted avg       0.74      0.74      0.74      1496



In [16]:


# Save the model weights
model_path = "/content/drive/MyDrive/Colab Notebooks/Capstone/model_chinese_weights.bin"
torch.save(model.state_dict(), model_path)

In [1]:
# Now plot them
#x_values = [x for x in range(1,len(train_loss_history)+1)]

# Now plot them with appropriate color and name
#plt.figure(figsize=(10, 5))  # Adjust the figure size if needed
#plt.plot(x_values, train_loss_history, label='Training Loss', color='skyblue')  # Set the color and label for training loss
#plt.plot(x_values, average_val_loss_history, label='Validation Loss', color='orange')  # Set color and label for validation loss

# Add labels, a legend, and display the plot
#plt.title('Training vs Validation Loss Over Epochs')
#plt.xlabel('Epochs')
#plt.ylabel('Loss')
#plt.legend()
#plt.savefig('/content/drive/MyDrive/Colab Notebooks/Capstone/saved_chinese_bert.png')
#plt.show()


# load model and get the predictions.

In [None]:
model_path = "/content/drive/MyDrive/Colab Notebooks/Capstone/model_chinese_weights.bin"

# Load the model state dictionary
model.load_state_dict(torch.load(model_path))

In [None]:
model.eval()

aito_comment_to_preict = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Capstone/Data/annotated_aito_comment.xlsx")
li_comment_to_preict = pd.read_excel("/content/drive/MyDrive/Colab Notebooks/Capstone/Data/annotated_li_comment.xlsx")

columns_to_keep = ['comment']
df = df[columns_to_keep]
df.reset_index(drop=True, inplace=True)
df = pd.concat([aito_comment_to_preict, li_comment_to_preict])
with torch.no_grad():  
    predictions = model(df['comment'])
df['sentiment'] = predictions
df.to_excel('predicted_li_aito.xlsx') # save predictions