In [1]:
import os
import shutil
import urllib.request
import zipfile
from pathlib import Path
from typing import Optional

In [2]:
# Source archive and the local paths the download step reads and writes.
url = 'https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip'
zip_path = 'sms_spam_collection.zip'
extracted_path = 'sms_spam_collection'
# The extracted file is given a .tsv suffix by the download step.
data_file_path = Path(extracted_path).joinpath('SMSSpamCollection.tsv')

def download_and_unzip_spam_data(url: str, zip_path: str, extracted_path: str, data_file_path: Path) -> None:
    """Download the dataset archive, extract it and rename the data file.

    Nothing is downloaded when ``data_file_path`` already exists.

    Args:
        url (str): Address the zip archive is fetched from.
        zip_path (str): Local file name the archive is written to.
        extracted_path (str): Directory the archive is extracted into.
        data_file_path (Path): Final location of the data file; the extracted
            ``SMSSpamCollection`` member is renamed to this path.
    """
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction")
        return

    # Stream the response to disk in chunks rather than buffering the whole
    # archive in memory.
    with urllib.request.urlopen(url=url) as response, open(zip_path, 'wb') as out_file:
        shutil.copyfileobj(response, out_file)

    # Unpack every member of the archive into the target directory.
    with zipfile.ZipFile(file=zip_path) as zip_ref:
        zip_ref.extractall(extracted_path)

    # The archive member has no extension; move it to the requested file name.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Fetch and unpack the dataset using the paths configured above in this cell.
download_and_unzip_spam_data(
    url=url,
    zip_path=zip_path,
    extracted_path=extracted_path,
    data_file_path=data_file_path,
)

sms_spam_collection/SMSSpamCollection.tsv already exists. Skipping download and extraction


In [3]:
import pandas as pd
from pandas.core.frame import DataFrame
# The file is tab-separated and has no header row, so column names are supplied here.
df = pd.read_csv(
    data_file_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
# value_counts() shows how the labels are distributed; based on that
# distribution a balanced dataset needs to be created.
label_counts = df['Label'].value_counts()
label_counts

Label
ham     4825
spam     747
Name: count, dtype: int64

In [5]:
def create_balanced_dataset(df: DataFrame, random_state: int = 123) -> DataFrame:
    """Create a class-balanced dataset by downsampling the larger class.

    Rows whose ``Label`` is neither ``"ham"`` nor ``"spam"`` are dropped.

    Args:
        df (DataFrame): Original dataset with a ``Label`` column.
        random_state (int, optional): Seed for the random downsampling.
            Defaults to 123.

    Returns:
        DataFrame: The ham rows followed by the spam rows, with the same
        number of rows in each class. Original index values are kept.
    """
    ham_rows = df[df['Label'] == 'ham']
    spam_rows = df[df['Label'] == 'spam']
    rows_per_class = min(len(ham_rows), len(spam_rows))

    # Downsample whichever class is larger. Sampling ham when the counts are
    # equal keeps the output identical to always sampling ham, while the else
    # branch avoids the ValueError .sample raises when asked for more rows
    # than the ham class contains.
    if len(ham_rows) >= len(spam_rows):
        ham_rows = ham_rows.sample(rows_per_class, random_state=random_state)
    else:
        spam_rows = spam_rows.sample(rows_per_class, random_state=random_state)

    # Concatenate ham first, then spam.
    return pd.concat([ham_rows, spam_rows])
# Build the balanced frame, then show the per-label row counts.
balanced_df = create_balanced_dataset(df=df)
balanced_label_counts = balanced_df['Label'].value_counts()
balanced_label_counts

Label
ham     747
spam    747
Name: count, dtype: int64

In [6]:
# Convert the text labels into integer class ids.
label_to_id = {"ham": 0, "spam": 1}
# Guard against re-running this cell: pushing the already-encoded 0/1 values
# through the mapping a second time would turn every label into NaN.
if not balanced_df['Label'].isin(list(label_to_id.values())).all():
    balanced_df['Label'] = balanced_df['Label'].map(label_to_id)
balanced_df['Label']

4307    0
4138    0
4831    0
4461    0
5440    0
       ..
5537    1
5540    1
5547    1
5566    1
5567    1
Name: Label, Length: 1494, dtype: int64

In [7]:
def random_split(df: DataFrame, train_frac: float, validation_frac: float, random_state: int = 123) -> tuple[DataFrame, DataFrame, DataFrame]:
    """Shuffle the rows and split them into train, validation and test sets.

    The test set receives whatever remains after the train and validation
    portions have been taken.

    Args:
        df (DataFrame): Data to split.
        train_frac (float): Fraction of rows for the training set.
        validation_frac (float): Fraction of rows for the validation set.
        random_state (int, optional): Seed for the shuffle. Defaults to 123.

    Returns:
        tuple[DataFrame, DataFrame, DataFrame]: ``(train, validation, test)``.

    Raises:
        ValueError: If a fraction is negative or the two fractions sum to
            more than 1.
    """
    # The small tolerance absorbs floating-point error in sums that should equal 1.
    if train_frac < 0 or validation_frac < 0 or train_frac + validation_frac > 1 + 1e-9:
        raise ValueError(
            "train_frac and validation_frac must be non-negative and sum to at most 1, "
            f"got train_frac={train_frac}, validation_frac={validation_frac}"
        )

    # Shuffle all rows and renumber the index from 0.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
    train_end = int(len(shuffled) * train_frac)  # end position of the training rows
    validation_end = train_end + int(len(shuffled) * validation_frac)  # end position of the validation rows

    train_df = shuffled.iloc[:train_end]
    validation_df = shuffled.iloc[train_end:validation_end]
    test_df = shuffled.iloc[validation_end:]

    return train_df, validation_df, test_df

# 70% train, 10% validation, the remainder test.
train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Sanity check: the three parts must account for every row.
assert len(train_df) + len(validation_df) + len(test_df) == len(balanced_df)

# index=False (the documented boolean form) keeps the row index out of the CSV files.
train_df.to_csv("train.csv", index=False)
validation_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)

In [8]:
from torch.utils.data import Dataset, DataLoader
import tiktoken
from tiktoken.core import Encoding
import torch

class SpamDataset(Dataset):
    """Dataset of tokenized texts truncated or padded to one common length.

    The CSV file is read once in the constructor. Each item is a
    ``(token_ids, label)`` pair of ``torch.long`` tensors.
    """
    # The tokenizer annotation is quoted so the class can be defined without
    # tiktoken being importable; any object with ``encode(text=...)`` works.
    def __init__(self, csv_file: str, tokenizer: "Encoding", max_length: Optional[int] = None, pad_token_id: int = 50256):
        """Load the CSV, encode every text, then truncate and pad the sequences.

        Args:
            csv_file (str): Path to a CSV file with ``Text`` and ``Label`` columns.
            tokenizer (Encoding): Tokenizer whose ``encode(text=...)`` returns a
                list of token ids.
            max_length (Optional[int]): Target sequence length. Longer sequences
                are truncated to it. When ``None`` (the default), the length of
                the longest encoded text in this file is used.
            pad_token_id (int, optional): Token id appended to shorter sequences.
                Defaults to 50256 (presumably GPT-2's ``<|endoftext|>`` id —
                confirm against the tokenizer in use).
        """
        super().__init__()
        self.data = pd.read_csv(filepath_or_buffer=csv_file)  # load the dataset
        self.encoded_texts = [tokenizer.encode(text=text) for text in self.data["Text"]]  # tokenize each text

        if max_length is None:
            # No length given: use the longest sequence in this dataset.
            self.max_length = self._longest_encoded_length()
        else:
            # Truncate sequences that exceed the requested length.
            self.max_length = max_length
            self.encoded_texts = [token_ids[:self.max_length] for token_ids in self.encoded_texts]

        # Pad every sequence up to max_length.
        self.encoded_texts = [
            token_ids + [pad_token_id] * (self.max_length - len(token_ids))
            for token_ids in self.encoded_texts
        ]

    def __getitem__(self, index: int) -> tuple[torch.Tensor, torch.Tensor]:
        """Return the padded token ids and the label at position ``index``."""
        encoded = self.encoded_texts[index]
        # Select the column first so no full row Series is built per lookup.
        label = self.data["Label"].iloc[index]
        return (
            torch.tensor(encoded, dtype=torch.long),
            torch.tensor(label, dtype=torch.long)
        )

    def __len__(self) -> int:
        """Return the number of rows read from the CSV file."""
        return len(self.data)

    def _longest_encoded_length(self) -> int:
        """Return the length of the longest encoded text.

        Returns:
            int: The maximum sequence length, or 0 when there are no texts.
        """
        return max((len(token_ids) for token_ids in self.encoded_texts), default=0)

tokenizer = tiktoken.get_encoding("gpt2")
# Load the data. The training set determines the sequence length
# (max_length=None); the validation and test sets reuse that length.
train_dataset = SpamDataset(csv_file="train.csv", tokenizer=tokenizer, max_length=None)
val_dataset, test_dataset = (
    SpamDataset(csv_file=split_file, tokenizer=tokenizer, max_length=train_dataset.max_length)
    for split_file in ("validation.csv", "test.csv")
)

In [9]:
# Create the data loaders.
batch_size = 8
num_workers = 0
# Training: reshuffle each epoch and drop the final partial batch.
train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers, drop_last=True)
# Evaluation: keep every sample and a fixed order. With drop_last=True up to
# batch_size - 1 validation/test samples would be silently left out.
val_dataloader = DataLoader(dataset=val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers, drop_last=False)

# Iterate over every training batch once; the shapes printed below are those
# of the last batch produced.
for batch in train_dataloader:
    input_batch, target_batch = batch
print("Input Batch dimensions:", input_batch.shape)
print("Label Batch dimensions:", target_batch.shape)

Input Batch dimensions: torch.Size([8, 120])
Label Batch dimensions: torch.Size([8])


In [11]:
# Prepare the model configuration.
CHOOSE_MODEL = "gpt2-small (124M)"
INPUT_PROMPT = "Every effort moves"

# Size-specific settings, listed as (name, emb_dim, n_layers, n_heads).
model_configs = {
    name: {"emb_dim": emb_dim, "n_layers": n_layers, "n_heads": n_heads}
    for name, emb_dim, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

# Settings shared by every model size, followed by those of the chosen model.
BASE_CONFIG = {
    **dict(vocab_size=50257, context_length=1024, drop_rate=0.0, qkv_bias=True),
    **model_configs[CHOOSE_MODEL],
}
# 加载参数到模型
from GPTModel import GPTModel2, load_weights_into_gpt, generate, text_to_token_ids, token_ids_to_text
from gpt_download import download_and_load_gpt2
import tiktoken

# "gpt2-small (124M)" -> "124M": take the last space-separated token and strip the parentheses.
model_size = CHOOSE_MODEL.split(" ")[-1].strip("(").rstrip(")")
settings, params = download_and_load_gpt2(model_size=model_size, models_dir="gpt2")

model = GPTModel2(BASE_CONFIG)  # initialise the model
load_weights_into_gpt(model, params)  # load the pretrained parameters
model.eval()  # switch the model to evaluation mode
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not crash on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer = tiktoken.get_encoding("gpt2")
# NOTE(review): the prompt ids are passed to generate() as returned by
# text_to_token_ids; presumably generate() handles device placement — confirm.
token_ids = generate(
    model=model,
    idx=text_to_token_ids(text=INPUT_PROMPT, tokenizer=tokenizer),
    context_size=BASE_CONFIG['context_length'],
    max_new_tokens=15
)
token_ids_to_text(token_ids=token_ids, tokenizer=tokenizer)

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


'Every effort moves forward, but it\'s not enough.\n\n"I\'m not going'

In [None]:
from torch import nn
# Add a classification head.
# Freeze the existing layers (no gradients are computed or stored for them).
for param in model.parameters():
    param.requires_grad = False
# Replace the final output layer with a trainable 2-class head. A freshly
# constructed nn.Linear lives on the CPU, so move it to the device the rest of
# the model is on to avoid a device mismatch in the forward pass.
model_device = next(model.parameters()).device
model.out_head = nn.Linear(in_features=BASE_CONFIG['emb_dim'], out_features=2).to(model_device).requires_grad_(True)
# Show the model architecture; only the last expression of a cell is displayed.
model

GPTModel2(
  (token_embeddings): Embedding(50257, 768)
  (position_embeddings): Embedding(1024, 768)
  (dropout_embeddings): Dropout(p=0.0, inplace=False)
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (pre_layer_norm): LayerNorm()
      (post_layer_nrom): LayerNorm()
      (multi_head_attention): MultiHeadAttention(
        (query_layer): Linear(in_features=768, out_features=768, bias=True)
        (key_layer): Linear(in_features=768, out_features=768, bias=True)
        (value_layer): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
        (out_layer): Linear(in_features=768, out_features=768, bias=True)
      )
      (dropout): Dropout(p=0.0, inplace=False)
      (feed_forward): FeedForwardLayer(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
    )
 