In [77]:
import os
import urllib
import urllib.request
import zipfile
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset

In [20]:
# Fetch and unpack the UCI SMS Spam Collection unless the TSV is already on disk.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extract_path = "sms_spam_collection"
# Bound unconditionally: later cells read this path even when the download is
# skipped. (The spelling is kept as-is because those cells refer to this name.)
data_fite_path = Path(extract_path) / "SMSSpamCollection.tsv"

if not data_fite_path.exists():
    # Save the archive locally, then extract it into `extract_path`.
    with urllib.request.urlopen(url) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())
    with zipfile.ZipFile(zip_path, "r") as f:
        f.extractall(extract_path)

    # The archive member has no extension; rename it to the .tsv path used above.
    original_file_path = Path(extract_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_fite_path)

    os.remove(zip_path)
    print("file downloaded just now")
else:
    print("file already downloaded")

file downloaded just now


In [29]:
# Parse the tab-separated file, which has no header row, into two named columns.
df = pd.read_csv(
    data_fite_path,
    sep="\t",
    header=None,
    names=["Label", "Text"],
)
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [32]:
# Count how many rows carry each label value.
df["Label"].value_counts()

Label
ham     4825
spam     747
Name: count, dtype: int64

In [40]:
# Number of rows labelled "spam", as a plain Python int.
int(df["Label"].value_counts()["spam"])

747

In [47]:
# Undersample "ham" so both labels end up with the same number of rows.
num_spam = int((df["Label"] == "spam").sum())
# A fixed seed keeps the sampled ham rows (and everything derived from them)
# identical across kernel restarts.
ham_subset = df[df["Label"] == "ham"].sample(num_spam, random_state=123)
balanced_df = pd.concat((ham_subset, df[df["Label"] == "spam"])).reset_index(drop=True)
balanced_df["Label"].value_counts()

Label
ham     747
spam    747
Name: count, dtype: int64

In [48]:
# Display the balanced frame (ham rows first, then spam, after the index reset).
balanced_df

Unnamed: 0,Label,Text
0,ham,Are u coming to the funeral home
1,ham,"OH MR SHEFFIELD! You wanna play THAT game, oka..."
2,ham,Howz that persons story
3,ham,Also remember to get dobby's bowl from your car
4,ham,I forgot 2 ask ü all smth.. There's a card on ...
...,...,...
1489,spam,Want explicit SEX in 30 secs? Ring 02073162414...
1490,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1491,spam,Had your contract mobile 11 Mnths? Latest Moto...
1492,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [None]:
# ham = 0
# spam = 1

In [51]:
# Encode the string labels as integers: ham -> 0, spam -> 1.
# Values that are already encoded pass through unchanged, so re-running this
# cell is safe (a plain .map(dict) would turn the 0/1 values into NaN on a
# second run).
label_to_id = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(lambda label: label_to_id.get(label, label))
# Fail loudly if anything other than the two known classes slipped through.
assert balanced_df["Label"].isin(list(label_to_id.values())).all(), "unexpected label value"
balanced_df

Unnamed: 0,Label,Text
0,0,Are u coming to the funeral home
1,0,"OH MR SHEFFIELD! You wanna play THAT game, oka..."
2,0,Howz that persons story
3,0,Also remember to get dobby's bowl from your car
4,0,I forgot 2 ask ü all smth.. There's a card on ...
...,...,...
1489,1,Want explicit SEX in 30 secs? Ring 02073162414...
1490,1,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
1491,1,Had your contract mobile 11 Mnths? Latest Moto...
1492,1,REMINDER FROM O2: To get 2.50 pounds free call...


In [71]:
def random_split(df: pd.DataFrame, train_split, val_split, random_state=42):
    """Shuffle ``df`` and cut it into train, validation and test partitions.

    Args:
        df: Frame to split; it is not modified.
        train_split: Fraction of rows for the training partition, strictly
            between 0 and 1.
        val_split: Fraction of rows for the validation partition, strictly
            between 0 and 1.
        random_state: Seed for the shuffle. Defaults to 42, the value that
            was previously hard-coded.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple. The test partition receives
        whatever rows remain after the first two are taken.

    Raises:
        AssertionError: If either fraction is outside (0, 1), or if the two
            fractions together exceed 1.
    """
    assert 0 < train_split < 1
    assert 0 < val_split < 1
    # Without this check an oversized request silently yields a short
    # validation set and an empty test set.
    assert train_split + val_split <= 1, "train_split + val_split must not exceed 1"

    shuffled_df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    n = len(shuffled_df)
    # int() truncates, so rounding leftovers end up in the test partition.
    train_end = int(train_split * n)
    val_end = train_end + int(val_split * n)

    train_df = shuffled_df.iloc[:train_end]
    val_df = shuffled_df.iloc[train_end:val_end]
    test_df = shuffled_df.iloc[val_end:]

    return train_df, val_df, test_df

# 70% of the rows for training, 10% for validation; the remainder becomes the test set.
train_df, val_df, test_df = random_split(balanced_df, train_split=0.7, val_split=0.1)
tuple(len(part) for part in (train_df, val_df, test_df))

(1045, 149, 300)

In [72]:
# Write each partition to its own CSV file, without the row index column.
train_df.to_csv(path_or_buf="train.csv", index=None)
val_df.to_csv(path_or_buf="val.csv", index=None)
test_df.to_csv(path_or_buf="test.csv", index=None)

In [76]:
import tiktoken

# Load the "gpt2" encoding and decode token id 50256 to see which token it stands for.
tokenizer = tiktoken.get_encoding("gpt2")
tokenizer.decode([50256])

'<|endoftext|>'

In [79]:
class SpamDataset(Dataset):
    def __init__(self, csv_file, tokenizer, max_length = None, pad_token = 50256):
        
        self.df = pd.read_csv(csv_file)
        self.encoded_text = [tokenizer.encode(text) for text in self.df["Text"]]

        if max_length is None:
            self.max_length = self._longest_encoded_length()
        else:
            self.max_length = self.max_length
            [self.encoded_text[:self.max_length] for encoded_text in self.encoded_text]

        # padding
        self.encoded_text = [encoded_text + [pad_token] * abs(self.max_length - len(encoded_text)) for encoded_text in self.encoded_text]