In [1]:
import rootutils
# Locate the project root by searching from the current directory for the
# `.project-root` marker file, and put that root on the Python path so that
# project packages (e.g. `core`, imported further down) resolve from this notebook.
root_path = rootutils.setup_root(".", indicator=".project-root", pythonpath=True)

In [2]:
import torch

In [3]:
import pandas as pd

# Resolve the dataset relative to the project root rather than a machine-specific
# absolute path, so the notebook runs on any checkout of the repository.
# NOTE(review): assumes the `.project-root` marker lives in the repository root,
# next to the `Data/` directory — confirm before merging.
data_file_path = root_path / "Data" / "SMSSpamCollection"

# The file is tab-separated and has no header row, so column names are supplied
# explicitly: the first field is the class label, the second the message text.
df = pd.read_csv(
 data_file_path, sep="\t", header=None, names=["Label", "Text"]
)
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
def create_balanced_data(df, random_state=123):
    """Return a class-balanced frame with equally many "spam" and "ham" rows.

    The larger class is randomly downsampled (without replacement) to the size
    of the smaller one. Sampling is seeded so that re-running the notebook
    yields the same balanced set, and therefore the same train/validation/test
    splits downstream.

    Args:
        df: DataFrame with a ``"Label"`` column containing the strings
            ``"spam"`` and ``"ham"``.
        random_state: Seed forwarded to ``DataFrame.sample``. Pass ``None`` to
            draw a different random subset on every call.

    Returns:
        A new DataFrame holding the spam rows followed by the ham rows, with a
        fresh 0..n-1 index.
    """
    spam_df = df[df["Label"] == "spam"]
    ham_df = df[df["Label"] == "ham"]

    if len(ham_df) >= len(spam_df):
        # Usual case for this dataset: ham is the majority class.
        ham_df = ham_df.sample(len(spam_df), random_state=random_state)
    else:
        # Spam outnumbers ham; sampling ham up to len(spam_df) would raise,
        # so shrink spam instead.
        spam_df = spam_df.sample(len(ham_df), random_state=random_state)

    return pd.concat([spam_df, ham_df]).reset_index(drop=True)

In [5]:
# Balance the two classes, then replace the string labels with integer class
# ids (spam -> 1, ham -> 0) in a single expression.
balanced_df = create_balanced_data(df).assign(
    Label=lambda frame: frame["Label"].map({'spam': 1, 'ham': 0})
)
balanced_df

Unnamed: 0,Label,Text
0,1,Free entry in 2 a wkly comp to win FA Cup fina...
1,1,FreeMsg Hey there darling it's been 3 week's n...
2,1,WINNER!! As a valued network customer you have...
3,1,Had your mobile 11 months or more? U R entitle...
4,1,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
1489,0,Jus finish my lunch on my way home lor... I to...
1490,0,All day working day:)except saturday and sunday..
1491,0,R we still meeting 4 dinner tonight?
1492,0,"Yeah work is fine, started last week, all the ..."


In [6]:
def random_split(df, train_ratio=0.7, validation_ratio=0.1):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Rows are shuffled with a fixed seed (123) and re-indexed from 0. The first
    ``train_ratio`` share of rows becomes the training set, the next
    ``validation_ratio`` share the validation set, and whatever remains the
    test set. Partition sizes are truncated to whole rows.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames sliced from the
        shuffled frame.
    """
    shuffled = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_rows = len(shuffled)

    # Cut points along the shuffled frame; each size is truncated independently.
    train_end = int(total_rows * train_ratio)
    validation_end = train_end + int(total_rows * validation_ratio)

    boundaries = ((None, train_end), (train_end, validation_end), (validation_end, None))
    return tuple(shuffled.iloc[start:stop] for start, stop in boundaries)


In [7]:
# 70% train / 10% validation / remaining rows test (same values as the defaults).
df_train, df_validation, df_test = random_split(
    balanced_df, train_ratio=0.7, validation_ratio=0.1
)

In [8]:
from core.data.dataset import SMSDataset

In [9]:
import tiktoken

In [10]:
# Use the "gpt2" encoding from tiktoken so token ids line up with the pretrained
# GPT-2 weights loaded later in this notebook.
tokenizer = tiktoken.get_encoding("gpt2")

In [11]:
# `random_split` hands back slices of the shuffled frame, so assigning a column
# on them in place (`df_train['Text'] = ...`) can raise pandas'
# SettingWithCopyWarning. Rebind each split to a new frame via `.assign`
# instead, and cast all three splits (not just train) so every message handed
# to the tokenizer is a `str`.
df_train = df_train.assign(Text=df_train["Text"].astype(str))
df_validation = df_validation.assign(Text=df_validation["Text"].astype(str))
df_test = df_test.assign(Text=df_test["Text"].astype(str))

In [12]:
# Inspect the training messages after the string cast above.
df_train['Text']

0       U have a secret admirer who is looking 2 make ...
1       We tried to contact you re your reply to our o...
2                            I donno if they are scorable
3       No shit, but I wasn't that surprised, so I wen...
4            No messages on her phone. I'm holding it now
                              ...                        
1040    I'm home. Doc gave me pain meds says everythin...
1041    I want to send something that can sell fast.  ...
1042                  U WILL SWITCH YOUR FONE ON DAMMIT!!
1043          Now project pa. After that only i can come.
1044    BIG BROTHER ALERT! The computer has selected u...
Name: Text, Length: 1045, dtype: object

In [13]:
# Sanity check: encode the first training message and display its integer token ids.
tokenizer.encode(df_train['Text'].iloc[0])

[52,
 423,
 257,
 3200,
 21099,
 81,
 508,
 318,
 2045,
 362,
 787,
 2800,
 351,
 471,
 12,
 19796,
 503,
 508,
 484,
 371,
 9,
 36955,
 282,
 508,
 6834,
 37902,
 523,
 2041,
 12,
 13345,
 319,
 7769,
 2713,
 34583,
 2231,
 5824]

In [14]:
# NOTE(review): `train_dataset` is rebuilt without `max_length` in the data-loader
# cell that follows, so this `max_length = 100` variant is overwritten before any
# visible cell uses it — looks like a leftover experiment; confirm and remove.
train_dataset = SMSDataset(df_train, tokenizer, max_length = 100)

In [15]:
# Loader settings and RNG seed for this cell.
num_workers = 0
batch_size = 8
torch.manual_seed(123)
from core.data.dataset import SMSDataset
from core.data.dataloader import create_dataloader_sms

# The training set is built first without an explicit `max_length`; the other
# two splits then reuse the `max_length` it exposes.
train_dataset = SMSDataset(df_train, tokenizer)
validation_dataset, test_dataset = (
    SMSDataset(split_frame, tokenizer, max_length=train_dataset.max_length)
    for split_frame in (df_validation, df_test)
)

# One loader per split, all created with identical settings
# (order: train, validation, test).
train_loader, validation_loader, test_loader = (
    create_dataloader_sms(split_dataset, batch_size, num_workers=num_workers)
    for split_dataset in (train_dataset, validation_dataset, test_dataset)
)

In [31]:
# Settings shared by every GPT-2 size; the size-specific fields
# (emb_dim, n_layers, n_heads) are merged in by a later cell.
BASE_CONFIG = dict(
    vocab_size=50257,       # rows of the token embedding table
    context_length=1024,    # rows of the positional embedding table
    dropout=0.0,            # dropout probability used throughout the model
    qvk_bias=True,          # query/key/value projections carry a bias term
)

In [32]:
# Architecture hyper-parameters per GPT-2 size, built from a compact
# (name, emb_dim, n_layers, n_heads) table.
model_configs = {
    model_name: {"emb_dim": emb_dim, "n_layers": n_layers, "n_heads": n_heads}
    for model_name, emb_dim, n_layers, n_heads in (
        ("gpt2-small (124M)", 768, 12, 12),
        ("gpt2-medium (355M)", 1024, 24, 16),
        ("gpt2-large (774M)", 1280, 36, 20),
        ("gpt2-xl (1558M)", 1600, 48, 25),
    )
}

In [33]:
# Key into `model_configs`; the parenthesised size tag is also parsed out later
# to pick which pretrained checkpoint to download.
CHOOSE_MODEL = "gpt2-small (124M)"

In [34]:
# Merge the chosen size's emb_dim / n_layers / n_heads into the shared settings.
# This mutates BASE_CONFIG in place; re-running the cell with the same
# CHOOSE_MODEL writes the same values again.
BASE_CONFIG.update(model_configs[CHOOSE_MODEL])
BASE_CONFIG

{'vocab_size': 50257,
 'context_length': 1024,
 'dropout': 0.0,
 'qvk_bias': True,
 'emb_dim': 768,
 'n_layers': 12,
 'n_heads': 12}

In [35]:
# Sample prompt. NOTE(review): not referenced by any cell visible here —
# presumably used for a generation sanity check further down; confirm.
INPUT_TEXT = "Every effort moves"

In [36]:
from gpt_download import download_and_load_gpt2
from core.models.gpt import GPTModel

from core.data.utils import load_weights_into_gpt

# "gpt2-small (124M)" -> "124M": take the last space-separated token and drop
# the surrounding parentheses.
model_size = CHOOSE_MODEL.split(" ")[-1].strip("()")

# Keep the weights under the project root rather than a machine-specific
# absolute path. The directory really is spelled "Weigths" on disk, so the
# spelling is preserved to keep reusing the files already downloaded.
# NOTE(review): assumes the `.project-root` marker lives in the repository root,
# next to the `Weigths/` directory — confirm before merging.
weights_dir = root_path / "Weigths" / "gpt2"

settings, params = download_and_load_gpt2(
 model_size=model_size, models_dir=str(weights_dir)
)

File already exists and is up-to-date: C:\Users\WalterBuenodeBritoNe\Documents\GitHub\LLMPlayground\Weigths\gpt2\124M\checkpoint
File already exists and is up-to-date: C:\Users\WalterBuenodeBritoNe\Documents\GitHub\LLMPlayground\Weigths\gpt2\124M\encoder.json
File already exists and is up-to-date: C:\Users\WalterBuenodeBritoNe\Documents\GitHub\LLMPlayground\Weigths\gpt2\124M\hparams.json
File already exists and is up-to-date: C:\Users\WalterBuenodeBritoNe\Documents\GitHub\LLMPlayground\Weigths\gpt2\124M\model.ckpt.data-00000-of-00001
File already exists and is up-to-date: C:\Users\WalterBuenodeBritoNe\Documents\GitHub\LLMPlayground\Weigths\gpt2\124M\model.ckpt.index
File already exists and is up-to-date: C:\Users\WalterBuenodeBritoNe\Documents\GitHub\LLMPlayground\Weigths\gpt2\124M\model.ckpt.meta
File already exists and is up-to-date: C:\Users\WalterBuenodeBritoNe\Documents\GitHub\LLMPlayground\Weigths\gpt2\124M\vocab.bpe


In [37]:
# Build the network described by BASE_CONFIG, then hand it to
# `load_weights_into_gpt` together with the downloaded `params`
# (presumably copies the pretrained GPT-2 tensors into the model — confirm in core.data.utils).
model = GPTModel(BASE_CONFIG)
load_weights_into_gpt(model, params)
# Switch to evaluation mode; as the last expression of the cell, the returned
# model is displayed as its layer summary.
model.eval()

GPTModel(
  (emb_layer): Embedding(50257, 768)
  (pos_emb_layer): Embedding(1024, 768)
  (dropout): Dropout(p=0.0, inplace=False)
  (transformer_blocks): ModuleList(
    (0-11): 12 x TransformerBlock(
      (attn_layer): MultiHeadAttention(
        (Wq): Linear(in_features=768, out_features=768, bias=True)
        (Wk): Linear(in_features=768, out_features=768, bias=True)
        (Wv): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ff_layer): FeedForward(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (norm_layer1): LayerNorm()
      (norm_layer2): LayerNorm()
      (dropout): Dropout(p=0.0, inplace=False)
    )
  )
  (ln_f): LayerNorm()
  (out_head): Linear(in_features=768, out_features=50257,