In [2]:
import urllib.request
import zipfile
import os
from pathlib import Path

In [2]:

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Downloading the file
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) 


File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [3]:
! head -10 sms_spam_collection/SMSSpamCollection.tsv

ham	Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
ham	Ok lar... Joking wif u oni...
spam	Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's
ham	U dun say so early hor... U c already then say...
ham	Nah I don't think he goes to usf, he lives around here though
spam	FreeMsg Hey there darling it's been 3 week's now and no word back! I'd like some fun you up for it still? Tb ok! XxX std chgs to send, £1.50 to rcv
ham	Even my brother is not like to speak with me. They treat me like aids patent.
ham	As per your request 'Melle Melle (Oru Minnaminunginte Nurungu Vettam)' has been set as your callertune for all Callers. Press *9 to copy your friends Callertune
spam	WINNER!! As a valued network customer you have been selected to receivea £900 prize reward! To claim call 09061701461. Claim code KL341. Valid 12 hours only.
spam	H

In [6]:
import pandas as pd


df=pd.read_csv("sms_spam_collection/SMSSpamCollection.tsv", sep='\t', header=None, names=['label', 'content'])

In [7]:
df

Unnamed: 0,label,content
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
print(df["label"].value_counts())


label
ham     4825
spam     747
Name: count, dtype: int64


In [9]:
# we want to create a balanced dataset 

def create_balanced_dataset(df):
   
    df_spam= df[df['label'] == 'spam']
    df_ham= df[df['label'] == 'ham']

    N_sample=min(len(df_spam), len(df_ham))
                  
    df_spam_sampled = df_spam.sample(n=N_sample, random_state=111)
    df_ham_sampled = df_ham.sample(n=N_sample, random_state=111)

    df_balanced = pd.concat([df_ham_sampled, df_spam_sampled])

    return df_balanced


In [25]:
balanced_df= create_balanced_dataset(df)

In [26]:
print(balanced_df["label"].value_counts())

label
ham     747
spam    747
Name: count, dtype: int64


In [27]:
balanced_df["label"]=balanced_df["label"].map({"ham": 0, "spam": 1})

In [28]:
# train test validation split
from sklearn.model_selection import train_test_split
def split_data(df, test_size=0.2, val_size=0.1):
    train_df, test_df = train_test_split(df, test_size=test_size+val_size, random_state=111)
    test_df, val_df = train_test_split(test_df, test_size=test_size/(test_size+val_size), random_state=111)
    return train_df, val_df, test_df

In [29]:
train_df, val_df, test_df = split_data(balanced_df)
print(len(train_df), len(val_df), len(test_df))

1045 300 149


In [30]:
train_df.to_csv('train_data.csv')
val_df.to_csv('val_data.csv')
test_df.to_csv('test_data.csv')

In [32]:
data_tmp=pd.read_csv("train_data.csv")

In [33]:
print(data_tmp)

      Unnamed: 0  label                                            content
0           4578      1  Had your contract mobile 11 Mnths? Latest Moto...
1           1245      0                 Now? I'm going out 4 dinner soon..
2           1007      1  Panasonic & BluetoothHdset FREE. Nokia FREE. M...
3           1350      1  FREE2DAY sexy St George's Day pic of Jordan!Tx...
4            973      0              Yup i thk so until e shop closes lor.
...          ...    ...                                                ...
1040         507      0  Maybe westshore or hyde park village, the plac...
1041         305      1  SMS. ac Blind Date 4U!: Rodds1 is 21/m from Ab...
1042        1317      0               Why nothing. Ok anyway give me treat
1043         772      0  Lol! U drunkard! Just doing my hair at d momen...
1044        2146      1  Sunshine Hols. To claim ur med holiday send a ...

[1045 rows x 3 columns]


In [36]:
data_test=pd.read_csv("train_data.csv")
print(data_test)
labels= data_tmp['label'].tolist()
content= data_tmp['content'].astype(str).tolist()

      Unnamed: 0  label                                            content
0           4578      1  Had your contract mobile 11 Mnths? Latest Moto...
1           1245      0                 Now? I'm going out 4 dinner soon..
2           1007      1  Panasonic & BluetoothHdset FREE. Nokia FREE. M...
3           1350      1  FREE2DAY sexy St George's Day pic of Jordan!Tx...
4            973      0              Yup i thk so until e shop closes lor.
...          ...    ...                                                ...
1040         507      0  Maybe westshore or hyde park village, the plac...
1041         305      1  SMS. ac Blind Date 4U!: Rodds1 is 21/m from Ab...
1042        1317      0               Why nothing. Ok anyway give me treat
1043         772      0  Lol! U drunkard! Just doing my hair at d momen...
1044        2146      1  Sunshine Hols. To claim ur med holiday send a ...

[1045 rows x 3 columns]


In [38]:
print(labels[:10])
print(content[:10])

[1, 0, 1, 1, 0, 0, 0, 1, 0, 1]
['Had your contract mobile 11 Mnths? Latest Motorola, Nokia etc. all FREE! Double Mins & Text on Orange tariffs. TEXT YES for callback, no to remove from records', "Now? I'm going out 4 dinner soon..", 'Panasonic & BluetoothHdset FREE. Nokia FREE. Motorola FREE & DoubleMins & DoubleTxt on Orange contract. Call MobileUpd8 on 08000839402 or call 2optout', "FREE2DAY sexy St George's Day pic of Jordan!Txt PIC to 89080 dont miss out, then every wk a saucy celeb!4 more pics c PocketBabe.co.uk 0870241182716 £3/wk", 'Yup i thk so until e shop closes lor.', 'what is your account number?', 'No. Did you multimedia message them or e-mail?', "UpgrdCentre Orange customer, you may now claim your FREE CAMERA PHONE upgrade for your loyalty. Call now on 0207 153 9153. Offer ends 26th July. T&C's apply. Opt-out available", 'Aiyah u did ok already lar. E nydc at wheellock?', "Free entry in 2 a weekly comp for a chance to win an ipod. Txt POD to 80182 to get entry (std txt ra

In [None]:
import torch
import tiktoken
enc= tiktoken.get_encoding("gpt2")
X_test=[]
for text in content:
    tokens = enc.encode(text)
    X_test.append(tokens)

TypeError: expected string or buffer