In [78]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from tqdm import trange
from dataclasses import dataclass
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import random_split
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from transformers import BertTokenizer  
from torch.optim import AdamW



import warnings

warnings.filterwarnings("ignore")
device = "cuda" if torch.cuda.is_available() else "cpu"

# 0. Preparation
## 0.1 GPT Blocks

In [2]:
class CausalSelfAttention(nn.Module):
    def __init__(self, config):
        super().__init__()
        # 'embd_size' sized vector divided into 'num_heads' heads
        assert config.embd_size % config.num_heads == 0, f"embedding dim should be divisible by number of heads"
        self.num_heads = config.num_heads
        self.embd_size = config.embd_size
        # batched key, query, and value projections for all heads
        self.c_attn = nn.Linear(config.embd_size, 3 * config.embd_size)
        self.c_proj = nn.Linear(config.embd_size, config.embd_size)
        self.c_proj.SCALE_INIT = 1.0
        # not really a bias, more of a mask, but following OpenAI/HF naming convention
        # self.register_buffer("bias", torch.tril(torch.ones(config.context_length, config.context_length)).view(1, 1, config.context_length, config.context_length))

    def forward(self, x):
        B, T, C = x.shape
     
        qkv = self.c_attn(x)    # (B, T, 3C)
        q, k, v = qkv.split(self.embd_size, dim=-1)    # (B,T,C), (B,T,C), (B,T,C)
        q = q.view(B, T, self.num_heads, self.embd_size // self.num_heads).transpose(1, 2)    # (B,nh,T,hs)
        k = k.view(B, T, self.num_heads, self.embd_size // self.num_heads).transpose(1, 2)    # (B,nh,T,hs)
        v = v.view(B, T, self.num_heads, self.embd_size // self.num_heads).transpose(1, 2)    # (B,nh,T,hs)
    
        out = F.scaled_dot_product_attention(q, k, v, is_causal=True)    # (B,nh,T,hs)
        out = out.transpose(1, 2).contiguous().view(B, T, C)    # (B,nh,T,hs) --> (B,T,nh,hs) --> (B,T,C=nh*hs)
        out = self.c_proj(out)    # (B,T,C) --> (B,T,C)
        return out


class MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.c_fc = nn.Linear(config.embd_size, 4 * config.embd_size)
        self.gelu = nn.GELU(approximate='tanh')    # approximate='tanh' used to try to reproduce gpt2 paper
        self.c_proj = nn.Linear(4 * config.embd_size, config.embd_size)
        self.c_proj.SCALE_INIT = 1.0

    def forward(self, x):
        x = self.c_fc(x)
        x = self.gelu(x)
        x = self.c_proj(x)
        return x


class Block(nn.Module):
    """ Transformer Encoder block """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = nn.LayerNorm(config.embd_size)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = nn.LayerNorm(config.embd_size)
        self.mlp = MLP(config)
    
    def forward(self, x):
        x = x + self.attn(self.ln_1(x))
        x = x + self.mlp(self.ln_2(x))
        return x
    

In [5]:
class GPTLM(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(self.config.vocab_size, self.config.embd_size),
            wpe = nn.Embedding(self.config.context_length, self.config.embd_size),
            h = nn.ModuleList([Block(self.config) for _ in range(self.config.num_layers)]),
            ln_f = nn.LayerNorm(self.config.embd_size)
        ))
        # language modeling head
        self.lm_head = nn.Linear(self.config.embd_size, self.config.vocab_size, bias=False)
        # weight sharing scheme (reduces 768*50267=~40M params, fewer params, more efficient)
        self.transformer.wte.weight = self.lm_head.weight
        # init params (iterates over all submodules and applies _init_weights)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'SCALE_INIT'):
                std /= (2 * self.config.num_layers)**0.5
            torch.nn.init.normal_(module.weight, mean=0, std=std)    # as per openai gpt-2 source code
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        assert T <= self.config.context_length, f'sequence length {T} should be <= {self.config.context_length}'
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)    # (T,)
        pos_embd = self.transformer.wpe(pos)    # (T, embd_size)
        tok_embd = self.transformer.wte(idx)    # (B, T, embd_size)
        x = pos_embd + tok_embd    # (B, T, embd_size)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)    # (B, T, embd_size)
        logits = self.lm_head(x)    # (B, T, vocab_size)
        loss = None
        if targets is not None:
            loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1))
        return logits, loss

    def generate(self, input_ids, max_length=65): ## only to generate the <好评> or <差评>
        gen_tokens = input_ids
        while gen_tokens.shape[-1] < max_length:
            with torch.inference_mode():
                logits, loss = self(gen_tokens.to(device))
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                next_tok = torch.argmax(probs, dim=-1, keepdim=True)
                gen_tokens = torch.cat([gen_tokens, next_tok], dim=-1)
        return gen_tokens
    
    @classmethod
    def from_pretrained(cls, model_type):
        """ Loads pretrained GPT2 model weights from huggingface """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'uer/gpt2-distil-chinese-cluecorpussmall'}
        from transformers import GPT2LMHeadModel
        print(f"loading weights from pretrained gpt: {model_type}")

        config_args = {
            'uer/gpt2-distil-chinese-cluecorpussmall': dict(num_layers=6, num_heads=12, embd_size=768),    # 82M params
            'gpt2': dict(num_layers=12, num_heads=12, embd_size=768),    # 124M params
            'gpt2-medium': dict(num_layers=24, num_heads=16, embd_size=1024),    # 350M params
            'gpt2-large': dict(num_layers=36, num_heads=20, embd_size=1280),    # 774M params
            'gpt2-xl': dict(num_layers=48, num_heads=25, embd_size=1600),    # 1558M params
        }[model_type]
        config_args['vocab_size'] = 21128
        config_args['context_length'] = 1024

        # create a from-scratch minGPT model
        config = GPTConfig(**config_args)
        model = GPTLM(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]

        # init a huggingface transformers model
        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        assert len(sd_keys) == len(sd_keys_hf), f"mismatched keys {len(sd_keys)} != {len(sd_keys_hf)}"

        # copy while ensuring all parameters are aligned in names and shape
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # need to transpose Conv1D weights
                assert sd_hf[k].shape[::-1]  == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k].T)
            else:
                # print(k, sd_hf[k].shape, sd[k].shape)
                assert sd_hf[k].shape == sd[k].shape
                with torch.no_grad():
                    sd[k].copy_(sd_hf[k])
        return model
    

## 0.2 Tokenizer, Dataset and Dataloader
1. We use **BertTokenizer** found on *huggingface* in order to tokenize Chinese text.
2. In order to better classify the sentiment, we add four special tokens to the beginning of each text:`<好评>`, `<差评>`, `<review>`, `<sentiment>`.
3. We may find out that in the dataset, bias is quite common, where the positive samples are much more than the negative samples(4305-695). In order to balance the dataset, we simply **over-sample** the negative samples.

In [15]:
tokenizer = BertTokenizer.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall")

## Important!!!!! Add special tokens in order to train the model

special_tokens_dict = {"additional_special_tokens": ["<好评>", "<差评>", "<review>", "<sentiment>"]}
tokenizer.add_special_tokens(special_tokens_dict)
len(tokenizer)

21132

In [16]:
df = pd.read_csv("./reviews.csv")


df_positive = df[df['sentiment'] == 1]
df_negative = df[df['sentiment'] == 0]
n_positive = len(df_positive)

df_negative_over = df_negative.sample(n=n_positive, replace=True, random_state=42)
df = pd.concat([df_positive, df_negative_over])


train_df, val_df = train_test_split(df, test_size=0.1, random_state=42)

In [18]:
def preprocess_sample(review_text, label, tokenizer):
    label_str = "<好评>" if label == 1 else "<差评>"
    prompt = f"<review>\n{review_text}\n<sentiment>\n"
    full_text = prompt + label_str

    encoding = tokenizer(full_text,
                         return_tensors="pt",
                         padding="max_length",  
                         max_length=64,
                         truncation=True, add_special_tokens=False)
    
    input_ids = encoding.input_ids.squeeze(0)
    labels = input_ids.clone()

    # 计算 prompt 部分的 token 数量，设置为 -100 来屏蔽其 loss
    prompt_encoding = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
    prompt_length = prompt_encoding.input_ids.shape[1]
    labels[:prompt_length] = -100 # set to -100 to ignore loss

    return {
        "input_ids": input_ids,
        "labels": labels,
        "prompt_length": prompt_length
    }
    
class TrainDataset(Dataset):
    def __init__(self, dataframe, tokenizer=None):
        self.df = dataframe
        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = BertTokenizer.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall")
            self.tokenizer.add_special_tokens({"additional_special_tokens": ["<好评>", "<差评>", "<review>", "<sentiment>"]})

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        review = self.df.iloc[index]['review']
        sentiment = self.df.iloc[index]['sentiment']  # 1 or 0

     
        return preprocess_sample(review, sentiment, self.tokenizer)

class ValidDataset(Dataset):
    def __init__(self, dataframe, tokenizer=None):
        self.df = dataframe
        if tokenizer is not None:
            self.tokenizer = tokenizer
        else:
            self.tokenizer = BertTokenizer.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall")
            self.tokenizer.add_special_tokens({"additional_special_tokens": ["<好评>", "<差评>", "<review>", "<sentiment>"]})

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        review_text = self.df.iloc[index]['review']
        label = self.df.iloc[index]['sentiment']  # 1 or 0
        label_str = "<好评>" if label == 1 else "<差评>"
        prompt = f"<review>\n{review_text}\n<sentiment>\n"
        # full_text = prompt + label_str

        encoding = tokenizer(prompt,
                            return_tensors="pt",
                            padding="max_length",  
                            max_length=64,
                            truncation=True, add_special_tokens=False)
        
        input_ids = encoding.input_ids.squeeze(0) 
        
        attention_mask = encoding.attention_mask.squeeze(0)

        # 计算 prompt 部分的 token 数量，设置为 -100 来屏蔽其 loss
        labels = tokenizer(label_str, return_tensors="pt", add_special_tokens=False).input_ids.squeeze(0)
        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask
        }

In [19]:
trainset = TrainDataset(train_df, tokenizer)
validset = ValidDataset(val_df, tokenizer)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True)
validloader = DataLoader(validset, batch_size=64, shuffle=True) 

# 1. GPT-2 with autoregressive 
## 1.1 Train a tiny toy GPT-2 model
**Hyperparams are listed as follows**
1. `num_layers`: 2
2. `hidden_size`: 384
3. `num_attention_heads`: 2

In [54]:
@dataclass
class GPTConfig:
    context_length: int = 128   # max context / sequence length
    vocab_size: int = 21132    # number of tokens for bert-tokenizer + special tokens
    # num_layers: int = 6
    num_layers: int = 2
    # embd_size: int = 768   # embedding dim
    embd_size: int = 384
    num_heads: int = 2
    hidden_size: int = 384

In [55]:
config = GPTConfig()
model = GPTLM(config).to(device)

In [30]:
def evaluate(model):
    model.eval()
    total_samples = 0
    correct_preds = 0

    
    with torch.no_grad():
        for batch in validloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_size = input_ids.size(0)
            actual_lengths = attention_mask.sum(dim=1) 
 
            for i in range(batch_size):
                
                prompt_len = int(actual_lengths[i].item())
                single_input_ids = input_ids[i][:prompt_len].unsqueeze(0)
                single_mask = attention_mask[i][:prompt_len].unsqueeze(0)
                generated_ids = model.generate(single_input_ids, max_length=single_input_ids.shape[1]+1)
                answer_ids = generated_ids[0, prompt_len:prompt_len+1]

                # print(f"generated_ids: {generated_ids}")
                # print(f"answer_ids: {answer_ids}")
                # print(f"labels: {labels[i]}")

    
                true_text = labels[i]
                # print(f"answer_ids: {answer_ids}, true_text: {true_text}")
     
                total_samples += 1
                
                # 这里简单判断：如果预测结果中包含真实标签，就认为预测正确
                if true_text == answer_ids:
                    correct_preds += 1
                # if total_samples >= 200:
                #     return correct_preds / total_samples
                
    accuracy = correct_preds / total_samples if total_samples > 0 else 0.0
    model.train()
    return accuracy
            
def train(model, optimizer, num_epochs):
    model.train()
    for epoch in trange(num_epochs):

        for step, batch in enumerate(trainloader):
            input_ids = batch["input_ids"].to(device)
            labels = batch["labels"].to(device)
            
            optimizer.zero_grad()
            logits, loss = model(input_ids, labels)
            
            loss.backward()
            optimizer.step()
            
            if step % 50 == 0:
                print(f"Epoch: {epoch} Step {step}: Loss = {loss.item():.6f}")
        
        eval_accuracy = evaluate(model)
        print(f"Validation Accuracy: {eval_accuracy:.4f}")
        
        model.train()

    print("Training complete!")

In [38]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
train(model, optimizer, num_epochs=3)

  0%|          | 0/3 [00:00<?, ?it/s]

Epoch: 0 Step 0: Loss = 8.574745
Epoch: 0 Step 50: Loss = 1.433478
Epoch: 0 Step 100: Loss = 0.573784


 33%|███▎      | 1/3 [00:04<00:09,  4.77s/it]

Validation Accuracy: 0.5319
Epoch: 1 Step 0: Loss = 0.393401
Epoch: 1 Step 50: Loss = 0.199039
Epoch: 1 Step 100: Loss = 0.118902


 67%|██████▋   | 2/3 [00:09<00:04,  4.52s/it]

Validation Accuracy: 0.5517
Epoch: 2 Step 0: Loss = 0.099751
Epoch: 2 Step 50: Loss = 0.069668
Epoch: 2 Step 100: Loss = 0.052152


100%|██████████| 3/3 [00:13<00:00,  4.51s/it]

Validation Accuracy: 0.5470
Training complete!





In [39]:
torch.save(model.state_dict(), "toy-gpt2-5470")

In [40]:
def predict_sentiment(review_text, model, tokenizer, device, gen_max_length=1):
    prompt = f"<review>\n{review_text}\n<sentiment>\n"
    
    encoding = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, add_special_tokens=False)
    input_ids = encoding.input_ids.to(device)
    print(input_ids)

    # print(attention_mask)
    output_ids = model.generate(
        input_ids,
        max_length=input_ids.shape[1] + gen_max_length,
    )

    print(output_ids)
    generated_tokens = output_ids[0][input_ids.shape[1]:]

    generated_text = tokenizer.decode(generated_tokens)
    
    return generated_text.strip()

In [43]:
comment = "烂片！！！"
model.eval()
result = predict_sentiment(comment, model, tokenizer, device)
print("Prediction:", result)

tensor([[21130,  4162,  4275,  8013,  8013,  8013, 21131]], device='cuda:0')
tensor([[21130,  4162,  4275,  8013,  8013,  8013, 21131, 21129]],
       device='cuda:0')
Prediction: <差评>


## 1.2 Post-train a original GPT-2 117M(GPT small) with pretrained weights.
> In this part, we first load the pretrained weights from huggingface and extract the weights of the model. Then we load the weights into our own model.

In [56]:
@dataclass
class GPTConfig:
    context_length: int = 1024   # max context / sequence length
    vocab_size: int = 21132    # number of tokens for bert-tokenizer + special tokens
    num_layers: int = 2
    # embd_size: int = 768   # embedding dim
    embd_size: int = 768
    num_heads: int = 12
    hidden_size: int = 768





In [59]:
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall")
model.resize_token_embeddings(len(tokenizer))
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(21132, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-5): 6 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=21132, bias=False)
)

In [46]:
def evaluate(model):
    model.eval()
    total_samples = 0
    correct_preds = 0

    
    with torch.no_grad():
        for batch in validloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_size = input_ids.size(0)
            actual_lengths = attention_mask.sum(dim=1) 
 
            for i in range(batch_size):
                
                prompt_len = int(actual_lengths[i].item())
                single_input_ids = input_ids[i][:prompt_len].unsqueeze(0)
                single_mask = attention_mask[i][:prompt_len].unsqueeze(0)
                generated_ids = model.generate(single_input_ids, attention_mask = single_mask, max_length=64+1, pad_token_id=0, do_sample=False)
                answer_ids = generated_ids[0, prompt_len:prompt_len+1]

                # print(f"generated_ids: {generated_ids}")
                # print(f"answer_ids: {answer_ids}")
                # print(f"labels: {labels[i]}")

                # 获取当前样本的真实标签文本
                true_text = labels[i]
                # print(f"answer_ids: {answer_ids}, true_text: {true_text}")
     
                total_samples += 1
                
                if true_text == answer_ids:
                    correct_preds += 1
                if total_samples >= 200:
                    return correct_preds / total_samples
                
    accuracy = correct_preds / total_samples if total_samples > 0 else 0.0
    return accuracy
            
        
            

In [60]:
from torch.optim import AdamW
from tqdm import trange
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 10
for epoch in trange(num_epochs):
    for step, batch in enumerate(trainloader):
        input_ids = batch["input_ids"].to(device)
        labels = batch["labels"].to(device)
        
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        
        if step % 50 == 0:
            print(f"Step {step}: Loss = {loss.item():.8f}")
    if epoch % 3 == 0:
        print("Evaluating...")
        eval_accuracy = evaluate(model)
        print(f"Validation Accuracy: {eval_accuracy:.4f}")
        model.train()
            

print("Training complete!")

  0%|          | 0/10 [00:00<?, ?it/s]

Step 0: Loss = 10.38696480
Step 50: Loss = 0.02465775
Step 100: Loss = 0.01558975
Evaluating...


 10%|█         | 1/10 [00:41<06:10, 41.17s/it]

Validation Accuracy: 0.6250
Step 0: Loss = 0.01364681
Step 50: Loss = 0.01095307
Step 100: Loss = 0.00756635


 20%|██        | 2/10 [00:59<03:42, 27.86s/it]

Step 0: Loss = 0.01118568
Step 50: Loss = 0.00486148
Step 100: Loss = 0.00687969


 30%|███       | 3/10 [01:18<02:45, 23.65s/it]

Step 0: Loss = 0.00792512
Step 50: Loss = 0.00663822
Step 100: Loss = 0.00481947
Evaluating...


 40%|████      | 4/10 [02:01<03:06, 31.16s/it]

Validation Accuracy: 0.8250
Step 0: Loss = 0.00388948
Step 50: Loss = 0.00548437
Step 100: Loss = 0.00470487


 50%|█████     | 5/10 [02:19<02:13, 26.67s/it]

Step 0: Loss = 0.00419920
Step 50: Loss = 0.00166438
Step 100: Loss = 0.00204354


 60%|██████    | 6/10 [02:38<01:36, 24.01s/it]

Step 0: Loss = 0.00154979
Step 50: Loss = 0.00122290
Step 100: Loss = 0.00309407
Evaluating...


 70%|███████   | 7/10 [03:23<01:32, 31.00s/it]

Validation Accuracy: 0.8950
Step 0: Loss = 0.00078810
Step 50: Loss = 0.00114134
Step 100: Loss = 0.00068413


 80%|████████  | 8/10 [03:42<00:54, 27.13s/it]

Step 0: Loss = 0.00026817
Step 50: Loss = 0.00019991
Step 100: Loss = 0.00161236


 90%|█████████ | 9/10 [04:02<00:24, 24.73s/it]

Step 0: Loss = 0.00041864
Step 50: Loss = 0.00069275
Step 100: Loss = 0.00043163
Evaluating...


100%|██████████| 10/10 [04:50<00:00, 29.02s/it]

Validation Accuracy: 0.9100
Training complete!





In [61]:
torch.save(model.state_dict(), "GPT2-small.pt")

In [63]:
config = GPTConfig()
model = GPTLM(config)


def load_pretrained_gpt2(model_path: str):
    """
    从本地的 Hugging Face GPT-2 预训练权重加载到自定义的 GPTLM 模型中。
    :param model_path: 本地的 GPT-2 预训练模型文件路径（.pt 文件）
    :return: 加载完毕的 GPTLM 模型
    """

    # 1️ 加载 Hugging Face 格式的 .pt 预训练权重
    print(f"Loading pretrained GPT-2 weights from {model_path}...")
    checkpoint = torch.load(model_path, map_location=torch.device("cpu"))

    # 确保 checkpoint 是 state_dict
    if "state_dict" in checkpoint:
        checkpoint = checkpoint["state_dict"]  # 提取权重部分

    print(f"Checkpoint keys loaded: {len(checkpoint.keys())}")

    # 2️ 定义你的 GPTLM 模型
    config_args = dict(
        num_layers=12,    # 层数
        num_heads=12,     # 头数
        embd_size=768,    # 隐藏层维度
        vocab_size=21132, # 词汇大小（确保和 tokenizer 匹配）
        context_length=1024  # 最大序列长度
    )
    
    config = GPTConfig(**config_args)
    model = GPTLM(config)

    # 3️ 获取自定义模型的 state_dict
    model_sd = model.state_dict()
    
    # 过滤掉 Hugging Face 中不需要的 keys，如 attn.bias 和 masked_bias
    checkpoint = {k: v for k, v in checkpoint.items() if not k.endswith('.attn.bias') and not k.endswith('.attn.masked_bias')}
    
    # 确保 key 数量匹配
    print(f"Filtered checkpoint keys: {len(checkpoint.keys())}, Model keys: {len(model_sd.keys())}")

    # 4️ 处理 Hugging Face 中的 Conv1D 层（需要转置）
    transposed_layers = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
    
    for k in checkpoint:
        if k in model_sd:
            if any(k.endswith(w) for w in transposed_layers):
                # 转置 Hugging Face 的 Conv1D 层权重
                checkpoint[k] = checkpoint[k].T

    # 5️ 加载权重到模型
    model.load_state_dict(checkpoint, strict=False)  # strict=False 允许部分 key 不匹配
    print("Weights successfully loaded!")


    model.eval()
    print("Model is ready for inference.")
    
    return model


gpt_model = load_pretrained_gpt2("GPT2-small.pt").to(device)


Loading pretrained GPT-2 weights from GPT2-small.pt...
Checkpoint keys loaded: 77
Filtered checkpoint keys: 77, Model keys: 149
Weights successfully loaded!
Model is ready for inference.


In [71]:
comment = "但是太烂了，太棒了"
gpt_model.eval()
result = predict_sentiment(comment, gpt_model, tokenizer, device)
print("预测结果:", result)

tensor([[21130,   852,  3221,  1922,  4162,   749,  8024,  1922,  3472,   749,
         21131]], device='cuda:0')
tensor([[21130,   852,  3221,  1922,  4162,   749,  8024,  1922,  3472,   749,
         21131, 21128]], device='cuda:0')
预测结果: <好评>


In [67]:
def evaluate(model):
    model.eval()
    total_samples = 0
    correct_preds = 0

    
    with torch.no_grad():
        for batch in validloader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['labels'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_size = input_ids.size(0)
            actual_lengths = attention_mask.sum(dim=1) 
 
            for i in range(batch_size):
                
                prompt_len = int(actual_lengths[i].item())
                single_input_ids = input_ids[i][:prompt_len].unsqueeze(0)
                single_mask = attention_mask[i][:prompt_len].unsqueeze(0)
                generated_ids = model.generate(single_input_ids, max_length=single_input_ids.shape[1]+1)
                # 筛选出生成的部分：从实际 prompt 长度位置开始，直到生成结束
                answer_ids = generated_ids[0, prompt_len:prompt_len+1]

                # print(f"generated_ids: {generated_ids}")
                # print(f"answer_ids: {answer_ids}")
                # print(f"labels: {labels[i]}")
                true_text = labels[i]
                # print(f"answer_ids: {answer_ids}, true_text: {true_text}")
     
                total_samples += 1
   
                if true_text == answer_ids:
                    correct_preds += 1
                # if total_samples >= 200:
                #     return correct_preds / total_samples
                
    accuracy = correct_preds / total_samples if total_samples > 0 else 0.0
    model.train()
    return accuracy
            
evaluate(gpt_model)

0.8861788617886179

# 2. GPT-2 with classification head
> This part is a another try which is mentioned in GPT-1 paper. In this part, the GPT model works as a feature extractor to extract the features of the text, and then we add a classification head to classify the sentiment.

Note that in this part, we do not add the special tokens to tokenizers. Instead, we use the **last hidden state of the model** to work as a **encoder**.

1. We can see that with the help of the **classification head**, the model can achieve a better performance than the autoregressive model.
2. Only train the classification with the pretrained weights, the model performs poor(only a little better than random guess). This is maybe because of the **feature extractor** is not good enough.

In [86]:
class GPTDataset(Dataset):

    def __init__(self) -> None:
        super().__init__()
        self.data = df
        self.data = self.data.dropna()

    def __getitem__(self, index):
        return self.data.iloc[index]["review"], self.data.iloc[index]["sentiment"]
    
    def __len__(self):
        return len(self.data)
    
dataset = GPTDataset()
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])
len(trainset), len(validset)


(7749, 861)

In [87]:
def collate_func(batch):
    texts, sentiments = [], []
    for text, sentiment in batch:
        texts.append(text)
        sentiments.append(sentiment)
        
    inputs = tokenizer(texts, max_length = 64, padding="max_length", truncation=True, return_tensors="pt")
    inputs['sentiments'] = torch.tensor(sentiments)
    # inputs.pop("token_type_ids")
    # inputs.pop("attention_mask")
    return inputs

trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collate_func)

In [106]:
@dataclass
class GPTConfig:
    context_length: int = 1024    # max context / sequence length
    vocab_size: int = 21128    # number of tokens for bert-tokenizer
    num_layers: int = 12
    embd_size: int = 768    # embedding dim
    num_heads: int = 12
    hidden_size: int = 768

class GPT(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.transformer = nn.ModuleDict(dict(
            wte = nn.Embedding(self.config.vocab_size, self.config.embd_size),
            wpe = nn.Embedding(self.config.context_length, self.config.embd_size),
            h = nn.ModuleList([Block(self.config) for _ in range(self.config.num_layers)]),
            ln_f = nn.LayerNorm(self.config.embd_size)
        ))
        self.transformer.wte.weight =  nn.Linear(self.config.embd_size, self.config.vocab_size, bias=False).weight
        
        # init params (iterates over all submodules and applies _init_weights)
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            std = 0.02
            if hasattr(module, 'SCALE_INIT'):
                std /= (2 * self.config.num_layers)**0.5
            torch.nn.init.normal_(module.weight, mean=0, std=std)    # as per openai gpt-2 source code
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0, std=0.02)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        assert T <= self.config.context_length, f'sequence length {T} should be <= {self.config.context_length}'
        pos = torch.arange(0, T, dtype=torch.long, device=idx.device)    # (T,)
        pos_embd = self.transformer.wpe(pos)    # (T, embd_size)
        tok_embd = self.transformer.wte(idx)    # (B, T, embd_size)
        x = pos_embd + tok_embd    # (B, T, embd_size)
        for block in self.transformer.h:
            x = block(x)
        x = self.transformer.ln_f(x)    # (B, T, embd_size)
        # logits = self.lm_head(x)    # (B, T, vocab_size)
        
        # loss = None
        # if targets is not None:
        #     loss = F.cross_entropy(logits.view(-1, logits.shape[-1]), targets.view(-1))
        # return logits, loss
        return x

    @classmethod
    def from_pretrained(cls, model_type):
        """ Loads pretrained GPT2 model weights from huggingface """
        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl', 'uer/gpt2-distil-chinese-cluecorpussmall'}
        from transformers import GPT2Model, GPT2LMHeadModel
        print(f"loading weights from pretrained gpt: {model_type}")

        config_args = {
            'uer/gpt2-distil-chinese-cluecorpussmall': dict(num_layers=6, num_heads=12, embd_size=768),    # 82M params
            'gpt2': dict(num_layers=12, num_heads=12, embd_size=768),    # 124M params
            'gpt2-medium': dict(num_layers=24, num_heads=16, embd_size=1024),    # 350M params
            'gpt2-large': dict(num_layers=36, num_heads=20, embd_size=1280),    # 774M params
            'gpt2-xl': dict(num_layers=48, num_heads=25, embd_size=1600),    # 1558M params
        }[model_type]
        # config_args['vocab_size'] = 50257
        config_args['vocab_size'] = 21128
        config_args['context_length'] = 1024

        # create a from-scratch minGPT model
        config = GPTConfig(**config_args)
        model = GPT(config)
        sd = model.state_dict()
        sd_keys = sd.keys()
        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')]

        # init a huggingface transformers model
        
        # model_hf = GPT2LMHeadModel.from_pretrained(model_type)
        model_hf = GPT2Model.from_pretrained(model_type)
        sd_hf = model_hf.state_dict()
        sd_keys_hf = sd_hf.keys()
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')]
        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')]
        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']

        assert len(sd_keys) == len(sd_keys_hf), f"mismatched keys {len(sd_keys)} != {len(sd_keys_hf)}"

        # copy while ensuring all parameters are aligned in names and shape
        for k in sd_keys_hf:
            if any(k.endswith(w) for w in transposed):
                # need to transpose Conv1D weights
                assert sd_hf[k].shape[::-1]  == sd["transformer." + k].shape
                with torch.no_grad():
                    sd["transformer." + k].copy_(sd_hf[k].T)
            else:
                assert sd_hf[k].shape == sd["transformer." + k].shape
                with torch.no_grad():
                    sd["transformer." + k].copy_(sd_hf[k])
        return model

class GPT2ForClassification(nn.Module):
    def __init__(self, base_model, num_classes):
        super().__init__()
        self.gpt2 = base_model
        self.dropout = nn.Dropout(0.1)
        self.classifer = nn.Linear(base_model.config.hidden_size, num_classes)
        
    def forward(self, input_ids, attention_mask=None):
        outputs = self.gpt2(input_ids)
        hidden_state = outputs
        last_hidden_state = hidden_state[:, -1, :]
        output = self.dropout(last_hidden_state)
        logits = self.classifer(output)
        return logits
        
tokenizer = BertTokenizer.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall")

In [82]:
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(batch["input_ids"], attention_mask=batch["attention_mask"])
            pred = torch.argmax(output, dim=-1)
            acc_num += (pred.long() == batch["sentiments"].long()).float().sum()
    return acc_num / len(validset)

def train(model, epoch=3, log_step=100, optimizer=None):
    global_step = 0
    critierion = nn.CrossEntropyLoss()
    for ep in trange(epoch):
        model.train()

        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            logits = model(batch["input_ids"], attention_mask=batch["attention_mask"])
            loss = critierion(logits, batch["sentiments"])
            loss.backward()
            optimizer.step()
            
            if global_step % log_step == 0:
                print(f"ep: {ep+1}, global_step: {global_step}, loss: {loss.item()}")
                
            global_step += 1
        acc = evaluate()
        print(f"ep: {ep+1}, acc: {acc}")

In [88]:
config = GPTConfig()
baseModel = GPT(config)
model = GPT2ForClassification(baseModel, 2).to(device)


In [89]:
optimizer = AdamW(model.parameters(), lr=1e-5)

train(model, 8, 100, optimizer)

  0%|          | 0/8 [00:00<?, ?it/s]

ep: 1, global_step: 0, loss: 0.7901384234428406
ep: 1, global_step: 100, loss: 0.7122424840927124
ep: 1, global_step: 200, loss: 0.5884904861450195


 12%|█▎        | 1/8 [00:25<02:55, 25.14s/it]

ep: 1, acc: 0.7479674816131592
ep: 2, global_step: 300, loss: 0.3923863470554352
ep: 2, global_step: 400, loss: 0.26907235383987427


 25%|██▌       | 2/8 [00:49<02:29, 24.91s/it]

ep: 2, acc: 0.7886178493499756
ep: 3, global_step: 500, loss: 0.3506380319595337
ep: 3, global_step: 600, loss: 0.265786737203598
ep: 3, global_step: 700, loss: 0.2473343312740326


 38%|███▊      | 3/8 [01:14<02:04, 24.97s/it]

ep: 3, acc: 0.9047619104385376
ep: 4, global_step: 800, loss: 0.2590174078941345
ep: 4, global_step: 900, loss: 0.04546459764242172


 50%|█████     | 4/8 [01:40<01:41, 25.37s/it]

ep: 4, acc: 0.9500580430030823
ep: 5, global_step: 1000, loss: 0.038341064006090164
ep: 5, global_step: 1100, loss: 0.04042918607592583
ep: 5, global_step: 1200, loss: 0.13934391736984253


 62%|██████▎   | 5/8 [02:10<01:20, 26.93s/it]

ep: 5, acc: 0.9465737342834473
ep: 6, global_step: 1300, loss: 0.035134296864271164
ep: 6, global_step: 1400, loss: 0.19272421300411224


 75%|███████▌  | 6/8 [02:40<00:55, 27.87s/it]

ep: 6, acc: 0.9628338813781738
ep: 7, global_step: 1500, loss: 0.0009227603441104293
ep: 7, global_step: 1600, loss: 0.01415631826967001
ep: 7, global_step: 1700, loss: 0.0009144622599706054


 88%|████████▊ | 7/8 [03:12<00:29, 29.13s/it]

ep: 7, acc: 0.9709639549255371
ep: 8, global_step: 1800, loss: 0.0060768830589950085
ep: 8, global_step: 1900, loss: 0.0030458541586995125


100%|██████████| 8/8 [03:44<00:00, 28.04s/it]

ep: 8, acc: 0.9767711758613586





In [95]:
comment = "呵呵，一起来围观一下吧。"
id2_label = {0: "差评！", 1: "好评！"}
model.eval()
# tokenizer(sen)['input_ids']
with torch.inference_mode():
    inputs = tokenizer(comment, return_tensors="pt")
    # inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(inputs['input_ids'].cuda(), attention_mask=inputs['attention_mask'].cuda())
    pred = torch.argmax(logits, dim=1)
    print(f"Input：{comment}\nPrediction:{id2_label.get(pred.item())}")

Input：呵呵，一起来围观一下吧。
Prediction:好评！


In [96]:
torch.save(model.state_dict(), "GPT2-small_with_classification_head.pt")

In [112]:
config = GPTConfig()
baseModel = GPT(config)
baseModel = baseModel.from_pretrained("uer/gpt2-distil-chinese-cluecorpussmall")

model = GPT2ForClassification(baseModel, 2).to(device)
"Only train the last layer"
for param in model.gpt2.parameters():
    param.requires_grad = False 

loading weights from pretrained gpt: uer/gpt2-distil-chinese-cluecorpussmall


In [113]:
optimizer = AdamW(model.parameters(), lr=1e-5)

train(model, 15, 100, optimizer)

  0%|          | 0/15 [00:00<?, ?it/s]

ep: 1, global_step: 0, loss: 1.0900163650512695
ep: 1, global_step: 100, loss: 0.6946725249290466
ep: 1, global_step: 200, loss: 0.7056988477706909


  7%|▋         | 1/15 [00:03<00:55,  3.99s/it]

ep: 1, acc: 0.5121951103210449
ep: 2, global_step: 300, loss: 0.6752122044563293
ep: 2, global_step: 400, loss: 0.7305796146392822


 13%|█▎        | 2/15 [00:07<00:51,  3.93s/it]

ep: 2, acc: 0.5180023312568665
ep: 3, global_step: 500, loss: 0.6500470042228699
ep: 3, global_step: 600, loss: 0.6702553629875183
ep: 3, global_step: 700, loss: 0.6600936651229858


 20%|██        | 3/15 [00:11<00:46,  3.90s/it]

ep: 3, acc: 0.5191637277603149
ep: 4, global_step: 800, loss: 0.6901057958602905
ep: 4, global_step: 900, loss: 0.6669338345527649


 27%|██▋       | 4/15 [00:15<00:42,  3.89s/it]

ep: 4, acc: 0.5133565664291382
ep: 5, global_step: 1000, loss: 0.6859195232391357
ep: 5, global_step: 1100, loss: 0.6924697160720825
ep: 5, global_step: 1200, loss: 0.6346258521080017


 33%|███▎      | 5/15 [00:19<00:38,  3.89s/it]

ep: 5, acc: 0.5238094925880432
ep: 6, global_step: 1300, loss: 0.7667348980903625
ep: 6, global_step: 1400, loss: 0.6618183851242065


 40%|████      | 6/15 [00:23<00:34,  3.89s/it]

ep: 6, acc: 0.5354239344596863
ep: 7, global_step: 1500, loss: 0.7271660566329956
ep: 7, global_step: 1600, loss: 0.7051323652267456
ep: 7, global_step: 1700, loss: 0.6964710354804993


 47%|████▋     | 7/15 [00:27<00:31,  3.89s/it]

ep: 7, acc: 0.5331010222434998
ep: 8, global_step: 1800, loss: 0.6328598856925964
ep: 8, global_step: 1900, loss: 0.6865546703338623


 53%|█████▎    | 8/15 [00:31<00:27,  3.89s/it]

ep: 8, acc: 0.5400696992874146
ep: 9, global_step: 2000, loss: 0.7053009271621704
ep: 9, global_step: 2100, loss: 0.6811754703521729


 60%|██████    | 9/15 [00:35<00:23,  3.89s/it]

ep: 9, acc: 0.5400696992874146
ep: 10, global_step: 2200, loss: 0.6535419225692749
ep: 10, global_step: 2300, loss: 0.7783303260803223
ep: 10, global_step: 2400, loss: 0.7006934285163879


 67%|██████▋   | 10/15 [00:39<00:20,  4.14s/it]

ep: 10, acc: 0.5481997728347778
ep: 11, global_step: 2500, loss: 0.699901282787323
ep: 11, global_step: 2600, loss: 0.6970802545547485


 73%|███████▎  | 11/15 [00:44<00:17,  4.31s/it]

ep: 11, acc: 0.5493611693382263
ep: 12, global_step: 2700, loss: 0.6449683308601379
ep: 12, global_step: 2800, loss: 0.6801198124885559
ep: 12, global_step: 2900, loss: 0.6923301219940186


 80%|████████  | 12/15 [00:49<00:13,  4.65s/it]

ep: 12, acc: 0.5493611693382263
ep: 13, global_step: 3000, loss: 0.6453996896743774
ep: 13, global_step: 3100, loss: 0.6626791954040527


 87%|████████▋ | 13/15 [00:53<00:08,  4.42s/it]

ep: 13, acc: 0.5598141551017761
ep: 14, global_step: 3200, loss: 0.6464694738388062
ep: 14, global_step: 3300, loss: 0.6497233510017395
ep: 14, global_step: 3400, loss: 0.6897488236427307


 93%|█████████▎| 14/15 [00:58<00:04,  4.51s/it]

ep: 14, acc: 0.5540069341659546
ep: 15, global_step: 3500, loss: 0.6780378818511963
ep: 15, global_step: 3600, loss: 0.7538015246391296


100%|██████████| 15/15 [01:03<00:00,  4.26s/it]

ep: 15, acc: 0.5656213760375977



