In [1]:
%pip install sentence-transformers
%pip install tensorboard
%pip install pandas

Looking in indexes: http://mirrors.aliyun.com/pypi/simple
Collecting sentence-transformers
  Downloading http://mirrors.aliyun.com/pypi/packages/45/18/1ec591befcbdb2c97192a40fbe7c43a8b8a8b3c89b1fa101d3eeed4d79a4/sentence_transformers-3.2.1-py3-none-any.whl (255 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m255.8/255.8 kB[0m [31m23.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers<5.0.0,>=4.41.0 (from sentence-transformers)
  Downloading http://mirrors.aliyun.com/pypi/packages/75/d5/294a09a62bdd88da9a1007a341d4f8fbfc43be520c101e6afb526000e9f4/transformers-4.46.1-py3-none-any.whl (10.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting scikit-learn (from sentence-transformers)
  Downloading http://mirrors.aliyun.com/pypi/packages/c6/29/044048c5e911373827c0e1d3051321b9183b2a4f8d4e2f11c08fcff83f13/scikit_learn-1.5.2-cp312-cp312-manylinux_2_17_x86_64.manyli

In [1]:
from sentence_transformers import SentenceTransformer   
import torch
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd 
import tensorboard as tb

  from tqdm.autonotebook import tqdm, trange


#### 定义数据集结构

In [5]:
check_list = {
    "E" : 0,
    "I" : 1,
    "N" : 0, 
    "S" : 1,
    "F" : 0,
    "T" : 1,
    "J" : 0,
    "P" : 1
}

class MBTIDataset(Dataset):
    def __init__(self, folder_path, mbti_type):
        # mbti_type可以是"EI"/"NS"/"FT"/"JP"中的一个,表示二分类的类型
        assert mbti_type in ["EI", "NS", "FT", "JP"]
        self.type_idx = {"EI":0, "NS":1, "FT":2, "JP":3}[mbti_type]
        
        print("开始读取数据文件...")
        # 读取文件夹下所有csv文件并合并
        import os
        csv_files = [f for f in os.listdir(folder_path) if f.endswith('.csv')]
        data_frames = []
        for i, csv_file in enumerate(csv_files):
            print(f"正在读取第{i+1}/{len(csv_files)}个文件: {csv_file}")
            csv_path = os.path.join(folder_path, csv_file)
            df = pd.read_csv(csv_path)
            data_frames.append(df)
        data_sheet = pd.concat(data_frames, ignore_index=True)
        print(f"成功读取{len(csv_files)}个文件")
        
        print("开始处理帖子数据...")
        self.posts = []
        total = len(data_sheet['posts'])
        for i, posts_50 in enumerate(data_sheet['posts']):
            if i % 1000 == 0:
                print(f"处理进度: {i}/{total}")
            posts_50_list = posts_50.split("|||")
            if len(posts_50_list) < 50:
                posts_50_list += [""] * (50 - len(posts_50_list))
            if len(posts_50_list) > 50:
                posts_50_list = posts_50_list[:50]
            self.posts.append(posts_50_list)
        print("帖子数据处理完成")
                        
        print("开始处理标签数据...")
        self.mbti_label = data_sheet['type']
        for i in range(len(self.mbti_label)):
            if i % 1000 == 0:
                print(f"标签处理进度: {i}/{len(self.mbti_label)}")
            # 只取对应维度的二分类标签
            self.mbti_label[i] = check_list[self.mbti_label[i][self.type_idx]]
            self.mbti_label[i] = torch.tensor(self.mbti_label[i])
        print("标签处理完成")
        print(f"数据集初始化完成,共{len(self.posts)}条数据")

    def __len__(self):
        return len(self.posts)

    def __getitem__(self, idx):
        return {"posts" : self.posts[idx], "type" : self.mbti_label[idx]}

In [7]:
dataset = MBTIDataset('../data/', "EI")
for i in range(len(dataset)):
    if len(dataset[i]["posts"]) != 50:
        print(f"{i} error, length is {len(dataset[i]['posts'])}")  
    
print("Data Preprocessing Done")
print(dataset[0])

开始读取数据文件...
正在读取第1/29个文件: reddit_mbti_chunked_0.csv
正在读取第2/29个文件: reddit_mbti_chunked_1.csv
正在读取第3/29个文件: reddit_mbti_chunked_11.csv
正在读取第4/29个文件: reddit_mbti_chunked_12.csv
正在读取第5/29个文件: reddit_mbti_chunked_13.csv
正在读取第6/29个文件: reddit_mbti_chunked_14.csv
正在读取第7/29个文件: reddit_mbti_chunked_2.csv
正在读取第8/29个文件: reddit_mbti_chunked_3.csv
正在读取第9/29个文件: reddit_mbti_chunked_4.csv
正在读取第10/29个文件: reddit_mbti_chunked_5.csv
正在读取第11/29个文件: reddit_mbti_chunked_6.csv
正在读取第12/29个文件: reddit_mbti_chunked_7.csv
正在读取第13/29个文件: reddit_mbti_chunked_8.csv
正在读取第14/29个文件: reddit_mbti_chunked_9.csv
正在读取第15/29个文件: reddit_mbti_chunked_15.csv
正在读取第16/29个文件: reddit_mbti_chunked_16.csv
正在读取第17/29个文件: reddit_mbti_chunked_17.csv
正在读取第18/29个文件: reddit_mbti_chunked_18.csv
正在读取第19/29个文件: reddit_mbti_chunked_19.csv
正在读取第20/29个文件: reddit_mbti_chunked_20.csv
正在读取第21/29个文件: reddit_mbti_chunked_21.csv
正在读取第22/29个文件: reddit_mbti_chunked_22.csv
正在读取第23/29个文件: reddit_mbti_chunked_23.csv
正在读取第24/29个文件: reddit_mbti_chunked_24.csv

#### 创建嵌入模型和分类模型

In [8]:
import torch
import torch.nn as nn
from sentence_transformers import SentenceTransformer
import math

class RoPE(nn.Module):

    def __init__(self, embedding_dim):
        super(RoPE, self).__init__()
        self.embedding_dim = embedding_dim
        
    def forward(self, x, seq_len):
        # x shape: [batch_size, seq_len, embedding_dim]
        device = x.device
        position = torch.arange(seq_len, device=device).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.embedding_dim, 2, device=device) * (-math.log(10000.0) / self.embedding_dim))
        
        # 计算旋转角度
        theta = position * div_term  # [seq_len, embedding_dim/2]
        
        # 分别计算sin和cos
        sin = torch.sin(theta)  # [seq_len, embedding_dim/2] 
        cos = torch.cos(theta)  # [seq_len, embedding_dim/2]
        
        # 对输入进行旋转变换
        x_even = x[..., 0::2]  # 偶数维度
        x_odd = x[..., 1::2]   # 奇数维度
        
        # 旋转变换
        x_rotated_even = x_even * cos - x_odd * sin
        x_rotated_odd = x_even * sin + x_odd * cos
        
        # 交错合并
        x_rotated = torch.zeros_like(x)
        x_rotated[..., 0::2] = x_rotated_even
        x_rotated[..., 1::2] = x_rotated_odd
        
        return x_rotated

class MBTIClassifier(nn.Module):
    def __init__(self, device=torch.device('cuda'), model_path='../model/models--sentence-transformers--all-MiniLM-L6-v2'):
        super(MBTIClassifier, self).__init__()
        
        self.device = device
        self.embedding_dim = 384
        self.num_classes = 16
        
        self.embedding_model = SentenceTransformer(model_path, device=self.device)
        self.rope = RoPE(self.embedding_dim)      
        self.layer_norm = nn.LayerNorm(self.embedding_dim)
        self.transformer_encoder_layer = nn.TransformerEncoderLayer(
            d_model=self.embedding_dim,
            nhead=8,
            dim_feedforward=768,
            dropout=0.1
        )
        self.transformer_encoder = nn.TransformerEncoder(
            self.transformer_encoder_layer,
            num_layers=2
        )
        self.transformer_decoder_layer = nn.TransformerDecoderLayer(
            d_model=self.embedding_dim,
            nhead=8,
            dim_feedforward=768,
            dropout=0.2
        )
        self.transformer_decoder = nn.TransformerDecoder(
            self.transformer_decoder_layer,
            num_layers=4
        )
        self.ffn = nn.Sequential(
            nn.Tanh(),
            nn.Linear(self.embedding_dim, self.num_classes),
        )
        
        self.to(self.device)
        
    def forward(self, batch_sentences_list):
        # Preprocess each sentence in the batch
        preprocessed_batch = []
        for sentences_list in batch_sentences_list:
            preprocessed_list = []
            for sentence in sentences_list:
                sentence = sentence.lower()
                sentence = sentence.replace("  ", " ")
                preprocessed_list.append(sentence)
            preprocessed_batch.append(preprocessed_list)
        
        # Flatten the batch to process with SentenceTransformer
        flattened_sentences = [sentence for sentences_list in preprocessed_batch for sentence in sentences_list]
        
        # Encode all sentences in the batch
        embeddings = self.embedding_model.encode(flattened_sentences, convert_to_tensor=True, device=self.device)
        
        # Reshape the embeddings to (batch_size, sequence_length, embedding_dim)
        batch_size = len(preprocessed_batch)
        sequence_length = len(preprocessed_batch[0])  # Assuming all lists have the same length
        embeddings = embeddings.view(batch_size, sequence_length, -1)
        embeddings = embeddings.permute(1, 0, 2)  # (sequence_length, batch_size, embedding_dim)
        
        # 对句子序列进行位置编码
        embeddings = self.rope(embeddings, sequence_length)
        embeddings = self.layer_norm(embeddings)
        
        transformer_encoder_output = self.transformer_encoder(embeddings)

        # 创建并对decoder输入进行位置编码
        mbti_tensor = torch.ones(4, batch_size, self.embedding_dim).to(self.device)
        mbti_tensor = self.rope(mbti_tensor, 4)

        # Pass through the transformer decoder
        transformer_output = self.transformer_decoder(mbti_tensor, transformer_encoder_output)
        transformer_output = transformer_output.squeeze(0)
        
        # Pass through the feedforward network
        output = self.ffn(transformer_output)
        return output   
    
    def train(self, mode=True):
        """
        Sets the module in training mode.
        """
        super(MBTIClassifier, self).train(mode)
        self.embedding_model.train()
        self.layer_norm.train()
        self.transformer_encoder.train()
        self.transformer_decoder.train()
        self.ffn.train()

    def eval(self):
        """
        Sets the module in evaluation mode.
        """
        super(MBTIClassifier, self).eval()
        self.embedding_model.eval()
        self.layer_norm.eval()
        self.transformer_encoder.eval()
        self.transformer_decoder.eval()
        self.ffn.eval()


In [9]:
import itertools
import shutil
import torch
from torch.optim import AdamW, Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, LinearLR
from torch.utils.data import DataLoader, random_split
from torch.utils.tensorboard import SummaryWriter
import torch.nn as nn
import os

def split_dataset(dataset, train_ratio=0.8):
    train_size = int(train_ratio * len(dataset))
    val_size = len(dataset) - train_size
    return random_split(dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))

def train_epoch(model, dataloader, criterion, optimizer, device, writer, global_step):
    model.train()
    epoch_loss = 0.0
    for i, batch in enumerate(dataloader):
        
        # posts is a list of 50 strings for each of the 16 samples
        # We need to convert it to (16, 50) to fit the model
        posts = batch['posts']  # Shape: (50, 16)
        real_batch_size = len(posts[0])
        posts_shape_fix = [[] for _ in range(real_batch_size)]
        for i_, j_ in itertools.product(range(real_batch_size), range(50)):
            try:
                posts_shape_fix[i_].append(posts[j_][i_])
            except Exception as e:  
                print(f"Error: {e}, current i is {i_}, current j is {j_}")
                print(f"current posts[j] is {posts[j_]}")    
                exit(0)
        posts = posts_shape_fix

        labels = batch['type'].to(device)

        optimizer.zero_grad()
        outputs = model(posts)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        writer.add_scalar('Loss/train', loss.item(), global_step)
        global_step += 1

        if i % 10 == 0:
            print(f"Iter {i}, train loss: {loss.item()}")

    avg_loss = epoch_loss / len(dataloader)
    return avg_loss, global_step

def validate_epoch(model, dataloader, criterion, device):
    model.eval()
    val_loss = 0.0
    val_acc = 0.0
    with torch.no_grad():
        for batch in dataloader:
            # posts is a list of 50 strings for each of the 16 samples
            # We need to convert it to (16, 50) to fit the model
            posts = batch['posts']  # Shape: (50, 16)
            real_batch_size = len(posts[0])
            posts_shape_fix = [[] for _ in range(real_batch_size)]
            for i, j in itertools.product(range(real_batch_size), range(50)):
                try:
                    posts_shape_fix[i].append(posts[j][i])
                except Exception as e:  
                    print(f"Error: {e}, current i is {i}, current j is {j}")
                    print(f"current posts[j] is {posts[j]}")    
                    exit(0)
            posts = posts_shape_fix
            labels = batch['type'].to(device)
            
            outputs = model(posts)
            loss = criterion(outputs, labels)
            accuracy = (outputs.argmax(dim=1) == labels).float().mean()
            val_loss += loss.item()
            val_acc += accuracy.item()
    
    return val_loss / len(dataloader), val_acc / len(dataloader) 

def train(dataset, model, device, batch_size=32, epochs=10, lr=2e-5, patience=3, scheduler_type='linear', model_save_name='best_model.pth'):
    train_dataset, val_dataset = split_dataset(dataset)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=lr, weight_decay=0.01)
    
    if scheduler_type == 'linear':
        scheduler = LinearLR(optimizer, start_factor=1.0, end_factor=0.1, total_iters=epochs)
    elif scheduler_type == 'cosine':
        scheduler = CosineAnnealingLR(optimizer, T_max=epochs)
    
    # 清空之前的日志目录
    log_dir = 'runs/mbti_classification'
    if os.path.exists(log_dir):
        shutil.rmtree(log_dir)

    writer = SummaryWriter(log_dir=log_dir)
    model.to(device)
    
    global_step = 0
    best_val_acc = 0.0
    epochs_no_improve = 0
    
    for epoch in range(epochs):
        train_loss, global_step = train_epoch(model, train_loader, criterion, optimizer, device, writer, global_step)
        print("validating...")
        val_loss, val_acc = validate_epoch(model, val_loader, criterion, device)
        
        print(f"Epoch {epoch}, Average train loss: {train_loss}, Average val loss: {val_loss}, Average val accuracy: {val_acc}")   
        
        writer.add_scalar('Loss/val', val_loss, epoch)
        writer.add_scalar('Accuracy/val', val_acc, epoch)
        scheduler.step()
        
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            epochs_no_improve = 0
            torch.save(model.state_dict(), f'./result/{model_save_name}')
            print("Model saved!")
        else:
            epochs_no_improve += 1
            if epochs_no_improve >= patience:
                print("Early stopping!")
                break
    
    writer.close()

# 使用示例：
# dataset = 你的数据集
# model = 你的模型
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# train(dataset, model, device)


In [10]:
model = MBTIClassifier()
train(dataset, model, torch.device('cuda'), scheduler_type='linear', epochs=100, batch_size=128, lr=1e-3, patience=2)



TypeError: RoPE.forward() missing 1 required positional argument: 'seq_len'

In [14]:
model = MBTIClassifier()
model.load_state_dict(torch.load('./result/best_model.pth'))
train(dataset, model, torch.device('cuda'), scheduler_type='cosine', epochs=100, batch_size=64, lr=5e-5, patience=120)

Iter 0, train loss: 1.0516107082366943
Iter 10, train loss: 0.7230733036994934
Iter 20, train loss: 0.8668584227561951
Iter 30, train loss: 1.0122649669647217
Iter 40, train loss: 0.9145032167434692
Iter 50, train loss: 0.8512702584266663
Iter 60, train loss: 1.331147313117981
Iter 70, train loss: 0.885358989238739
Iter 80, train loss: 1.171416997909546
Iter 90, train loss: 1.0519291162490845
Iter 100, train loss: 0.9134889245033264
validating...
Epoch 0, Average train loss: 0.9575668414798352, Average val loss: 1.106034361890384, Average val accuracy: 0.686224490404129
Model saved!
Iter 0, train loss: 1.101692795753479
Iter 10, train loss: 1.2009905576705933
Iter 20, train loss: 0.875921368598938
Iter 30, train loss: 1.0464540719985962
Iter 40, train loss: 0.7764913439750671
Iter 50, train loss: 0.8539900779724121
Iter 60, train loss: 0.987011194229126
Iter 70, train loss: 0.8474991917610168
Iter 80, train loss: 1.010483741760254
Iter 90, train loss: 0.8990306258201599
Iter 100, train

In [None]:
def evaluate(model, dataset, device, batch_size=32):
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    model.eval()
    total_acc = 0.0
    total_samples = 0
    
    with torch.no_grad():
        for batch in dataloader:
            # posts is a list of 50 strings for each of the 16 samples
            # We need to convert it to (16, 50) to fit the model
            posts = batch['posts']  # Shape: (50, 16)
            real_batch_size = len(posts[0])
            posts_shape_fix = [[] for _ in range(real_batch_size)]
            for i, j in itertools.product(range(real_batch_size), range(50)):
                try:
                    posts_shape_fix[i].append(posts[j][i])
                except Exception as e:  
                    print(f"Error: {e}, current i is {i}, current j is {j}")
                    print(f"current posts[j] is {posts[j]}")    
                    exit(0)
            posts = posts_shape_fix
            labels = batch['type'].to(device)
            
            outputs = model(posts)
            accuracy = (outputs.argmax(dim=1) == labels).float().sum().item()
            total_acc += accuracy
            total_samples += labels.size(0)
    
    return total_acc / total_samples

# 使用示例：
# dataset = 你的数据集
# model = 训练好的模型
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
accuracy = evaluate(model, dataset, device)
print(f"Model accuracy on the entire dataset: {accuracy * 100:.2f}%")
