In [1]:
import torch
import torch.nn as nn

In [2]:
!pip install tiktoken

Collecting tiktoken
  Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tiktoken-0.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.4/1.2 MB[0m [31m13.0 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tiktoken
Successfully installed tiktoken-0.9.0


In [3]:
import tiktoken
tokenizer = tiktoken.get_encoding('gpt2')

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
GPT2_124M_CFG={
    'dropout':0.1,
    'n_layers':12,
    'n_heads':12,
    'emb_size':768,
    'context_length':1024,
    'vocab_size':50257,
    'qkv_bias':False
}

In [6]:

class LayerNormalization(nn.Module):
  def __init__(self,emb_size):
       super().__init__()
       self.scale = nn.Parameter(torch.ones(emb_size))
       self.shift = nn.Parameter(torch.ones(emb_size))
       self.eps = 1e-5

  def forward(self,x):
      mean = torch.mean(x,dim=-1,keepdim=True)
      var = torch.var(x,dim=-1,keepdim=True,unbiased=False)
      return self.scale* (x-mean)/torch.sqrt(var+self.eps) + self.shift

class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))
class FeedForward(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.layers = nn.Sequential(
        nn.Linear(cfg['emb_size'],4*cfg['emb_size']),
        GELU(),
        nn.Linear(4*cfg['emb_size'],cfg['emb_size'])
    )
  def forward(self,x):
        return self.layers(x)

In [7]:
class MultiheadAttention(nn.Module):
  def __init__(self,din,dout,n_heads,context_length,dropout,qkv_bias=False):
        super().__init__()
        self.w_queries = nn.Linear(din,dout,qkv_bias)
        self.w_keys = nn.Linear(din,dout,qkv_bias)
        self.w_values = nn.Linear(din,dout,qkv_bias)
        self.register_buffer('mask',torch.triu(torch.ones(context_length,context_length),diagonal=1))
        self.out_layer = nn.Linear(dout,dout)
        self.n_heads= n_heads
        self.dropout = nn.Dropout(dropout)

  def forward(self,x):
    batch, context_length, emb_size = x.shape
    queries = self.w_queries(x)
    keys = self.w_keys(x)
    values = self.w_values(x)
    head_dim = emb_size//self.n_heads

    queries = queries.view(batch,context_length,self.n_heads,head_dim)
    keys = keys.view(batch,context_length,self.n_heads,head_dim)
    values = values.view(batch,context_length,self.n_heads,head_dim)
     # b,cl,nheads,hd.


     # b, nheads, cl, hd
    queries = queries.transpose(1,2)
    keys = keys.transpose(1,2)
    values = values.transpose(1,2)

    attention_scores = queries @ keys.transpose(2,3)
       #b, nheads, cl,cl

    attention_scores.masked_fill_(self.mask.bool()[:context_length,:context_length],-torch.inf)

    attention_weights = torch.softmax(attention_scores/(keys.shape[-1])**0.5, dim =-1)
    attention_weights = self.dropout(attention_weights)
    context_vectors = (attention_weights @ values).transpose(1,2)
    # b, nheads,cl,cl.     b,nheads, cl, hd
    # b, nheads, cl,hd.     .T -> b, cl,nheads,hd
    context_vectors = context_vectors.contiguous().view(batch, context_length, emb_size)
    context_vectors = self.out_layer(context_vectors)
    return context_vectors


In [8]:
class Transformer(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.ff = FeedForward(cfg)
    self.dropout = nn.Dropout(cfg['dropout'])
    self.norm1 = LayerNormalization(cfg['emb_size'])
    self.norm2 = LayerNormalization(cfg['emb_size'])
    self.att = MultiheadAttention(
        din = cfg['emb_size'],
        dout = cfg['emb_size'],
        n_heads=cfg['n_heads'],
        context_length = cfg['context_length'],
        dropout = cfg['dropout'],
        qkv_bias=cfg['qkv_bias'])

  def forward(self,x):
    shortcut = x
    x = self.norm1(x)
    x= self.att(x)
    x= self.dropout(x)
    x= shortcut+x

    shortcut = x
    x = self.norm2(x)
    x= self.ff(x)
    x= self.dropout(x)
    x= shortcut+x

    return x

In [9]:
class GPT2(nn.Module):
  def __init__(self,cfg):
    super().__init__()
    self.tok_emb = nn.Embedding(cfg['vocab_size'],cfg['emb_size'])
    self.pos_emb = nn.Embedding(cfg['context_length'],cfg['emb_size'])
    self.out_head = nn.Linear(cfg['emb_size'],cfg['vocab_size'],bias=False)
    self.drop_emb = nn.Dropout(cfg['dropout'])
    self.blocks = nn.Sequential(
        *[Transformer(cfg) for _ in range(cfg['n_layers'])]
    )
    self.final_norm = LayerNormalization(cfg['emb_size'])

  def forward(self,x):
    batch, n_tokens = x.shape
    token_embed = self.tok_emb(x)
    position_embed = self.pos_emb(torch.arange(n_tokens,device=x.device))
    x =  token_embed + position_embed
    x= self.drop_emb(x)
    x= self.blocks(x)
    x= self.final_norm(x)
    logits= self.out_head(x)
    return logits


In [10]:
def get_text_simple(idx,model,max_words,context_length):
  model.eval()
  for _ in range(max_words):
   idx_new = idx[:,-context_length:]
   with torch.no_grad():
    logits = model(idx_new)
   temp = logits[:,-1,:]
   temp= torch.softmax(temp,dim=-1)
   next = torch.argmax(temp,dim=-1,keepdim=True)
   idx = torch.cat((idx,next),dim=-1)
  return idx

In [11]:
def text_to_token_ids(text, tokenizer):
    encoded = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    encoded_tensor = torch.tensor(encoded).unsqueeze(0)
    return encoded_tensor

def token_ids_to_text(token_ids, tokenizer):
    flat = token_ids.squeeze(0)
    return tokenizer.decode(flat.tolist())

start_context = "Every effort moves you"

In [12]:
def generate(idx,context_length,model,max_words,temperature=0.0,top_k=None,end_token=None):
  model.eval()
  for _ in range(max_words):
      new_idx = idx[:,-context_length:]
      with torch.no_grad():
        logits = model(new_idx)
      logits = logits[:,-1,:]

     # temperature scaling + topk sampling
      if top_k is not None:
          top_logits,_ = torch.topk(logits,top_k)
          min_val = top_logits[:,-1].unsqueeze(-1)
          logits = torch.where(logits<min_val,torch.tensor(float("-inf")).to(logits.device),logits)
      if temperature>0.0:
          logits = logits/temperature
          probas = torch.softmax(logits,dim=-1)
          next = torch.multinomial(probas,num_samples=1)
      else:
        logits = torch.softmax(logits,dim=-1)
        next = torch.argmax(logits,dim=-1,keepdim=True)
      if next==end_token:
          break
      idx = torch.cat((idx,next),dim=-1)
  return idx


In [13]:
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Create an unverified SSL context
    ssl_context = ssl._create_unverified_context()

    # Downloading the file
    with urllib.request.urlopen(url, context=ssl_context) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [14]:
import pandas as pd

df = pd.read_csv(data_file_path,sep='\t',header=None, names=['Label','Text'])
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [15]:
df['Label'].value_counts()

Unnamed: 0_level_0,count
Label,Unnamed: 1_level_1
ham,4825
spam,747


In [16]:
def create_balanced_dataset(df):

    # Count the instances of "spam"
    num_spam = df[df["Label"] == "spam"].shape[0]

    # Randomly sample "ham" instances to match the number of "spam" instances
    ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)

    # Combine ham "subset" with "spam"
    balanced_df = pd.concat([ham_subset, df[df["Label"] == "spam"]])

    return balanced_df

balanced_df = create_balanced_dataset(df)
print(balanced_df["Label"].value_counts())

Label
ham     747
spam    747
Name: count, dtype: int64


In [17]:
balanced_df["Label"] = balanced_df["Label"].map({"ham": 0, "spam": 1})

In [18]:
def random_split(df, train_frac, validation_frac):
    # Shuffle the entire DataFrame
    df = df.sample(frac=1, random_state=123).reset_index(drop=True)

    # Calculate split indices
    train_end = int(len(df) * train_frac)
    validation_end = train_end + int(len(df) * validation_frac)

    # Split the DataFrame
    train_df = df[:train_end]
    validation_df = df[train_end:validation_end]
    test_df = df[validation_end:]

    return train_df, validation_df, test_df

train_df, validation_df, test_df = random_split(balanced_df, 0.7, 0.1)
# Test size is implied to be 0.2 as the remainder


In [19]:
print(len(train_df))
print(len(validation_df))
print(len(test_df))

1045
149
300


In [20]:
train_df.to_csv("train.csv", index=None)
validation_df.to_csv("validation.csv", index=None)
test_df.to_csv("test.csv", index=None)

In [21]:
import torch
from torch.utils.data import Dataset

class spamDataset(Dataset):
  def __init__(self, csv_file, tokenizer, max_length=None, pad_id = 50256):
      self.data = pd.read_csv(csv_file)
      self.encoded_texts = [
          tokenizer.encode(text) for text in self.data['Text']
      ]
      if max_length is None:
        self.max_length = max([len(tokens) for tokens in self.encoded_texts])
      else:
        self.max_length = max_length
        self.encoded_texts = [
          tokens[:self.max_length] for tokens in self.encoded_texts
        ]
      self.encoded_texts = [encoded_txt + [pad_id]*(self.max_length-len(encoded_txt)) for encoded_txt in self.encoded_texts]

  def __getitem__(self, index):
     encoded = self.encoded_texts[index]
     label = self.data['Label'][index]
     return (
             torch.tensor(encoded,dtype=torch.long),
             torch.tensor(label,dtype=torch.long)
            )
  def __len__(self):
    return len(self.data)


In [22]:
train_dataset = spamDataset('train.csv',tokenizer)
train_dataset.max_length

120

In [23]:
test_dataset = spamDataset('test.csv',tokenizer,max_length=train_dataset.max_length)
test_dataset.max_length

120

In [24]:
val_dataset = spamDataset('validation.csv',tokenizer,max_length=train_dataset.max_length)
val_dataset.max_length

120

In [25]:
from torch.utils.data import DataLoader

num_workers = 0
batch_size = 8

torch.manual_seed(123)

train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True,
    shuffle=True
)

validation_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True,
    shuffle=True
)

test_dataloader = DataLoader(
    dataset=test_dataset,
    batch_size=batch_size,
    num_workers=num_workers,
    drop_last=True,
    shuffle=True
)

In [26]:
for input_batch, target_batch in train_dataloader:
      pass
print(input_batch.shape)
print(target_batch.shape)

torch.Size([8, 120])
torch.Size([8])


In [27]:
print(len(train_dataloader)," # of batches")
print(len(test_dataloader)," # of batches")
print(len(validation_dataloader)," # of batches")

130  # of batches
37  # of batches
18  # of batches


In [28]:
CHOOSE_MODEL ="(gpt2-small 124M)"
input_promt ="Every effort moves"

BASE_CONFIG ={
    'dropout':0.0,
    'n_layers':12,
    'n_heads':12,
    'emb_size':768,
    'context_length':1024,
    'vocab_size':50257,
    'qkv_bias':True
}

assert train_dataset.max_length <= BASE_CONFIG['context_length'],(
    f"the dataset exceeds models context length"
)

In [58]:
from gpt_download3 import download_and_load_gpt2

setting, params = download_and_load_gpt2(model_size='124M',models_dir="gpt2")

gpt = GPT2(cfg=BASE_CONFIG)
gpt.eval();

def assign(left,right):
  if left.shape != right.shape:
    raise ValueError("shape mismatch")
  return torch.nn.Parameter(torch.tensor(right))



File already exists and is up-to-date: gpt2/124M/checkpoint




File already exists and is up-to-date: gpt2/124M/encoder.json




File already exists and is up-to-date: gpt2/124M/hparams.json




File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001




File already exists and is up-to-date: gpt2/124M/model.ckpt.index




File already exists and is up-to-date: gpt2/124M/model.ckpt.meta




File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [59]:
import numpy as np


def load_weights_into_gpt(gpt,params):
  gpt.tok_emb.weight = assign(gpt.tok_emb.weight, params['wte'])
  gpt.pos_emb.weight = assign(gpt.pos_emb.weight, params['wpe'])

  for b in range(len(params['blocks'])):

        q_w, k_w, v_w = np.split(params['blocks'][b]['attn']['c_attn']['w'],3,axis=-1)
        gpt.blocks[b].att.w_queries.weight = assign(gpt.blocks[b].att.w_queries.weight,q_w.T)
        gpt.blocks[b].att.w_keys.weight = assign(gpt.blocks[b].att.w_keys.weight,k_w.T)
        gpt.blocks[b].att.w_values.weight = assign(gpt.blocks[b].att.w_values.weight,v_w.T)

        q_b, k_b, v_b = np.split(params['blocks'][b]['attn']['c_attn']['b'],3,axis=-1)
        gpt.blocks[b].att.w_queries.bias = assign(gpt.blocks[b].att.w_queries.bias,q_b)
        gpt.blocks[b].att.w_keys.bias = assign(gpt.blocks[b].att.w_keys.bias,k_b)
        gpt.blocks[b].att.w_values.bias = assign(gpt.blocks[b].att.w_values.bias,v_b)

        gpt.blocks[b].att.out_layer.weight = assign(gpt.blocks[b].att.out_layer.weight, params['blocks'][b]['attn']['c_proj']['w'].T)
        gpt.blocks[b].att.out_layer.bias = assign(gpt.blocks[b].att.out_layer.bias, params['blocks'][b]['attn']['c_proj']['b'])

        gpt.blocks[b].ff.layers[0].weight = assign(gpt.blocks[b].ff.layers[0].weight, params['blocks'][b]['mlp']['c_fc']['w'].T)
        gpt.blocks[b].ff.layers[0].bias = assign(gpt.blocks[b].ff.layers[0].bias, params['blocks'][b]['mlp']['c_fc']['b'])
        gpt.blocks[b].ff.layers[2].weight = assign(gpt.blocks[b].ff.layers[2].weight, params['blocks'][b]['mlp']['c_proj']['w'].T)
        gpt.blocks[b].ff.layers[2].bias = assign(gpt.blocks[b].ff.layers[2].bias, params['blocks'][b]['mlp']['c_proj']['b'])


        gpt.blocks[b].norm1.scale = assign(gpt.blocks[b].norm1.scale, params['blocks'][b]['ln_1']['g'])
        gpt.blocks[b].norm1.shift = assign(gpt.blocks[b].norm1.shift, params['blocks'][b]['ln_1']['b'])

        gpt.blocks[b].norm2.scale = assign(gpt.blocks[b].norm2.scale, params['blocks'][b]['ln_2']['g'])
        gpt.blocks[b].norm2.shift = assign(gpt.blocks[b].norm2.shift, params['blocks'][b]['ln_2']['b'])

  gpt.final_norm.scale = assign(gpt.final_norm.scale,params['g'])
  gpt.final_norm.shift = assign(gpt.final_norm.shift,params['b'])
  gpt.out_head.weight = assign(gpt.out_head.weight, params['wte'])

In [60]:
load_weights_into_gpt(gpt,params)
gpt.to(device);

In [61]:
torch.manual_seed(123)
token_ids = generate(
                     idx = text_to_token_ids("Every effort moves you",tokenizer).to(device),
                     context_length=1024,
                     model=gpt,
                     max_words = 20,
                     temperature=1.5,top_k=50,end_token=None
                     )

In [62]:
token_ids_to_text(token_ids,tokenizer)

'Every effort moves you toward an equal share for each vote plus half. Inequality is often not an accurate representation of human'

In [63]:
text2 =("Is the following text spam? Answer with 'yes' or 'no'"
        "'You are a winner you have been specially selected to received $1000 cash or $2000 award.")
token_ids = generate(
                     idx = text_to_token_ids(text2,tokenizer).to(device),
                     context_length=1024,
                     model=gpt,
                     max_words = 50,
                     temperature=1.5,top_k=50,end_token=None
                     )

In [64]:
token_ids_to_text(token_ids,tokenizer)

'Is the following text spam? Answer with \'yes\' or \'no\'\'You are a winner you have been specially selected to received $1000 cash or $2000 award.You can view any form that you received by selecting a form number: "Cash Card Card" (3D) - PayPal Credit CARD or IDI Online: [FBA]\n\n\nMoney Order\n\n\n1 Credit Card/ IDI card ('

In [65]:
num_class = 2

In [66]:
for params in gpt.parameters():
  params.requires_grad=False

In [67]:
gpt.out_head = nn.Linear(BASE_CONFIG['emb_size'],num_class)

In [68]:
for params in gpt.final_norm.parameters():
   params.requires_grad=True

In [69]:
for params in gpt.blocks[-1].parameters():
   params.requires_grad=True

In [70]:
inputs = text_to_token_ids("do you have time",tokenizer).to(device)
inputs.shape

torch.Size([1, 4])

In [71]:
gpt.to(device);

In [72]:
with torch.no_grad():
  output = gpt(inputs)
output.shape

torch.Size([1, 4, 2])

In [73]:
output

tensor([[[ 1.4661, -0.2504],
         [10.3746, -3.1089],
         [ 8.9922, -1.8229],
         [ 5.2944, -1.0763]]])

In [102]:
def calc_accuracy_loader(data_loader,model,device,num_batches=None):
    model.eval()
    correct_predictions , num_examples =0,0

    if num_batches is  None:
      num_batches = len(data_loader)
    else:
      num_batches = min( len(data_loader),num_batches)

    for i,(input_batch, target_batch) in enumerate(data_loader):
        if i< num_batches:
          input_batch, target_batch = input_batch.to(device), target_batch.to(device)
          with torch.no_grad():
            output = model(input_batch)[:,-1,:]
          pred_labels = torch.argmax(output,dim=-1)
          num_examples += pred_labels.shape[0]
          correct_predictions += (pred_labels == target_batch).sum().item()
        else:
          break
    return correct_predictions/num_examples

In [75]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [76]:
gpt.to(device)
train_acc = calc_accuracy_loader(train_dataloader,gpt,device,num_batches=10)
val_acc = calc_accuracy_loader(validation_dataloader,gpt,device,num_batches=10)
test_acc = calc_accuracy_loader(test_dataloader,gpt,device,num_batches=10)

print("the train accuracy is :",train_acc*100)
print("the validation accuracy is :",val_acc*100)
print("the test accuracy is :",test_acc*100)

the train accuracy is : 47.5
the validation accuracy is : 42.5
the test accuracy is : 55.00000000000001


In [103]:
def calc_loss_batch(model,input_batch,target_batch,device):
   input_batch,target_batch =  input_batch.to(device),target_batch.to(device)
   logits = model(input_batch)[:,-1,:]
   loss = nn.functional.cross_entropy(logits,target_batch)
   return loss

In [104]:
def calc_loss_loader(data_loader,model,device,num_batches=None):
    model.eval()
    total_loss = 0.

    if  len(data_loader) == 0:
      return float("nan")
    elif num_batches is None:
      num_batches = len(data_loader)
    else:
      num_batches = min( len(data_loader),num_batches)

    for i,(input_batch, target_batch) in enumerate(data_loader):
        if i< num_batches:
          input_batch, target_batch = input_batch.to(device), target_batch.to(device)
          loss = calc_loss_batch(model, input_batch, target_batch, device)
          total_loss += loss.item()
        else:
          break
    return total_loss/num_batches

In [79]:
gpt.to(device)
train_loss = calc_loss_loader(train_dataloader,gpt,device,num_batches=10)
val_loss = calc_loss_loader(validation_dataloader,gpt,device,num_batches=10)
test_loss = calc_loss_loader(test_dataloader,gpt,device,num_batches=10)

print("the train loss is :",train_loss)
print("the validation loss is :",val_loss)
print("the test loss is :",test_loss)

the train loss is : 2.5197812795639036
the validation loss is : 2.3361956238746644
the test loss is : 2.8181297421455382


Finetuning model on supervised data

In [105]:
def evaluate_model(model,train_loader,val_loader,device,eval_iter):
  model.eval()
  with torch.no_grad():
    train_loss = calc_loss_loader(train_loader,model,device,num_batches=eval_iter)
    val_loss = calc_loss_loader(val_loader,model,device,num_batches=eval_iter)
  return train_loss, val_loss

In [106]:
def train_classifier_simple(model,train_loader,val_loader,optimizer,device,num_epochs,eval_freq,eval_iter):
  train_losses, val_losses, train_accs, val_accs = [], [], [], []
  example_seen, global_step = 0 ,1

  for epoch in range(num_epochs):
    model.train()
    for input_batch, target_batch in train_loader:
      optimizer.zero_grad()
      loss = calc_loss_batch(model,input_batch,target_batch,device)
      loss.backward()
      optimizer.step()
      example_seen += input_batch.shape[0]
      global_step += 1

      if global_step% eval_freq == 0:
        train_loss, val_loss = evaluate_model(model,train_loader,val_loader,device,eval_iter)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        print(f"epoch:{epoch+1} step:{global_step}. train_loss:{train_loss}. val_loss:{val_loss}")
    train_accuracy = calc_accuracy_loader(train_loader,model,device,num_batches=eval_iter)
    val_accuracy= calc_accuracy_loader(val_loader,model,device,num_batches=eval_iter)
    print("training accurcay:",train_accuracy)
    print("validation accuracy:",val_accuracy)
    train_accs.append(train_accuracy)
    val_accs.append(val_accuracy)
  return train_losses, val_losses, train_accs, val_accs

In [107]:
import time
start_time = time.time()
torch.manual_seed(123)
optimizer = torch.optim.AdamW(gpt.parameters(),lr=5e-5,weight_decay=0.1)

train_losses, val_losses, train_accs, val_accs= train_classifier_simple(gpt,train_dataloader,
                                                                        validation_dataloader,optimizer,device,
                                                                        num_epochs=5,eval_freq=50,eval_iter=5)
end_time = time.time()
print("total time taken:",(end_time-start_time)/60,"min")

epoch:1 step:50. train_loss:0.13297739587724208. val_loss:0.0415451155975461
epoch:1 step:100. train_loss:0.0384824201464653. val_loss:0.0463905394077301
training accurcay: 1.0
validation accuracy: 0.975
epoch:2 step:150. train_loss:0.09851536089554429. val_loss:0.05158086847513914
epoch:2 step:200. train_loss:0.017506040912121535. val_loss:0.10436288760975003
epoch:2 step:250. train_loss:0.018467493914067747. val_loss:0.20461810659617186
training accurcay: 0.925
validation accuracy: 0.975
epoch:3 step:300. train_loss:0.029174886597320437. val_loss:0.09178385958075523
epoch:3 step:350. train_loss:0.05620777998119593. val_loss:0.0359452607226558
training accurcay: 0.95
validation accuracy: 0.975
epoch:4 step:400. train_loss:0.15595321580767632. val_loss:0.02922084084711969
epoch:4 step:450. train_loss:0.18419695184566082. val_loss:0.009554243134334683
epoch:4 step:500. train_loss:0.015533103747293353. val_loss:0.16502193324267864
training accurcay: 1.0
validation accuracy: 0.975
epoch:5

In [108]:
train_acc = calc_accuracy_loader(train_dataloader,gpt,device)
val_acc = calc_accuracy_loader(validation_dataloader,gpt,device)
test_acc = calc_accuracy_loader(test_dataloader,gpt,device)

print("the train accuracy is :",train_acc*100)
print("the validation accuracy is :",val_acc*100)
print("the test accuracy is :",test_acc*100)

the train accuracy is : 98.46153846153847
the validation accuracy is : 98.61111111111111
the test accuracy is : 96.28378378378379


In [113]:
def classify_review(model,text,tokenizer,max_length,device,padding_id=50256):
  model.eval()
  token_ids = tokenizer.encode(text)
  token_ids = token_ids[:min(max_length,model.pos_emb.weight.shape[0])]

  token_ids += [padding_id]*(max_length-len(token_ids))
  token_ids = torch.tensor(token_ids,device=device).unsqueeze(0)
  with torch.no_grad():
    output = model(token_ids)[:,-1,:]
  predicted_label =torch.argmax(output,dim=-1).item()

  return "spam" if predicted_label == 1 else "no spam"


In [114]:
text = """
You are a winner you have been specially selected to received $1000 cash or $2000 award.
"""
print(classify_review(gpt,text,tokenizer,max_length=train_dataset.max_length,device=device))

spam


In [121]:
text = """
Hi dinner money lottery give me cash
"""
print(classify_review(gpt,text,tokenizer,max_length=train_dataset.max_length,device=device))

no spam


In [123]:
torch.save(gpt.state_dict(),'spam_classifier.pth')

In [124]:
gpt_state_dict = torch.load('spam_classifier.pth')
gpt.load_state_dict(gpt_state_dict)

<All keys matched successfully>

In [152]:
text = """
Dear Friend,

Are you tired of working a 9 to 5 job? Want to become your own boss and make thousands of dollars from home? This is your chance! Our proven system has helped people just like you to earn money more quickly and easily than they ever imagined.

Just visit our website and sign up to start making money today: [Insert Link Here]

Don't wait, this opportunity won't last long!
"""
print(classify_review(gpt,text,tokenizer,max_length=train_dataset.max_length,device=device))

spam
