In [1]:
import pandas as pd 
import tiktoken

In [2]:
# Load the SMS Spam Collection: a headerless, tab-separated file whose first
# column is the class label and whose second column is the message text.
# NOTE(review): this loads 5572 rows; the dataset is usually described as
# holding 5574 messages, so unbalanced quote characters may be merging a few
# lines. Consider quoting=csv.QUOTE_NONE -- confirm against the raw file.
df = pd.read_csv(
    'sms+spam+collection/SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
df

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [3]:
# Class balance of the freshly loaded data (rows per label).
raw_label_counts = df['label'].value_counts()
raw_label_counts

label
ham     4825
spam     747
Name: count, dtype: int64

In [4]:
# Balance the classes by undersampling the majority class: keep every spam
# message and draw an equally sized random sample of ham messages.
spam = df[df['label'] == 'spam']
reduced_ham = df[df['label'] == 'ham'].sample(len(spam), random_state=1)

# Recombine and shuffle so the two classes are interleaved; the fixed seeds
# keep the selection and the row order reproducible.
df = pd.concat([reduced_ham, spam])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# Only the last expression of a cell is displayed, so show the class counts
# here (a bare `df.head()` in the middle of the cell would be discarded).
df['label'].value_counts()


label
spam    747
ham     747
Name: count, dtype: int64

In [5]:
# Display the balanced, shuffled frame; pandas elides the middle rows of a
# long frame in the rendered output.
df

Unnamed: 0,label,message
0,spam,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD..."
1,spam,Panasonic & BluetoothHdset FREE. Nokia FREE. M...
2,spam,Do you want a new Video handset? 750 any time ...
3,spam,Hi if ur lookin 4 saucy daytime fun wiv busty ...
4,spam,09066362231 URGENT! Your mobile No 07xxxxxxxxx...
...,...,...
1489,spam,December only! Had your mobile 11mths+? You ar...
1490,spam,Loans for any purpose even if you have Bad Cre...
1491,spam,You have an important customer service announc...
1492,spam,URGENT! Your Mobile number has been awarded wi...


In [6]:
# Re-check the class balance after undersampling: both labels should have the
# same number of rows.
balanced_label_counts = df['label'].value_counts()
balanced_label_counts

label
spam    747
ham     747
Name: count, dtype: int64

In [7]:
# Integer targets for the two classes.
encode = {"ham": 0, "spam": 1}

# Map the string labels to integers. Values that are not keys of `encode`
# (for example labels that were already encoded by a previous run of this
# cell) pass through unchanged, so re-running the cell does not turn the whole
# column into NaN the way a plain dict-based `.map(encode)` would.
df['label'] = df['label'].map(lambda value: encode.get(value, value))

# Fail loudly on any label that is neither a known class name nor its code,
# instead of silently carrying a bad value into the train/val/test splits.
assert df['label'].isin(list(encode.values())).all(), "unexpected label value(s) in df['label']"

df.head()


Unnamed: 0,label,message
0,1,"URGENT, IMPORTANT INFORMATION FOR O2 USER. TOD..."
1,1,Panasonic & BluetoothHdset FREE. Nokia FREE. M...
2,1,Do you want a new Video handset? 750 any time ...
3,1,Hi if ur lookin 4 saucy daytime fun wiv busty ...
4,1,09066362231 URGENT! Your mobile No 07xxxxxxxxx...


In [9]:
# Shuffle all rows with a fixed seed, then renumber the index from 0 so that
# the positional slicing used for the splits follows the shuffled order.
shuffled = df.sample(frac=1, random_state=42)
df = shuffled.reset_index(drop=True)
df.head()



Unnamed: 0,label,message
0,1,Get your garden ready for summer with a FREE s...
1,1,"Thanks for your ringtone order, ref number R83..."
2,0,Convey my regards to him
3,0,Hey doc pls I want to get nice t shirt for my ...
4,0,"Sorry vikky, i'm Watching olave mandara movie ..."


In [10]:
df['label'].value_counts()

label
1    747
0    747
Name: count, dtype: int64

In [13]:
# Fractions of the (already shuffled) dataset assigned to each split.
train_ratio = 0.7
val_ratio = 0.1
test_ratio = 0.2
assert abs(train_ratio + val_ratio + test_ratio - 1.0) < 1e-9, "split ratios must sum to 1"

# Train and validation sizes are rounded down; the test split receives every
# remaining row, so no row is dropped by rounding. test_size is derived from
# that remainder so it always matches len(test_df).
train_size = int(train_ratio * len(df))
val_size = int(val_ratio * len(df))
test_size = len(df) - train_size - val_size

# Positional slicing (.iloc) makes it explicit that the split is by row
# position, independent of the index labels.
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:train_size + val_size]
test_df = df.iloc[train_size + val_size:]

# The three splits must partition the dataset exactly.
assert len(test_df) == test_size
assert len(train_df) + len(val_df) + len(test_df) == len(df)

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")



Train size: 1045
Validation size: 149
Test size: 300


In [18]:
# Write each split to its own CSV file next to the raw data. index=False keeps
# the DataFrame index out of the files.
split_files = {
    'sms+spam+collection/train.csv': train_df,
    'sms+spam+collection/val.csv': val_df,
    'sms+spam+collection/test.csv': test_df,
}
for csv_path, split_df in split_files.items():
    split_df.to_csv(csv_path, index=False)





In [2]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top so a fresh kernel exposes all dependencies up front.
from dataset import SpamDataset

# Tokenizer using tiktoken's 'gpt2' encoding.
gpt2_tokenizer = tiktoken.get_encoding('gpt2')

# Build the training dataset from the exported train split. How max_len=None
# is interpreted is decided inside SpamDataset (dataset.py, not shown here).
train_dataset = SpamDataset(
    csv_path='sms+spam+collection/train.csv',
    tokenizer=gpt2_tokenizer,
    max_len=None,
)

In [4]:
# Fetch the first item of the training dataset and unpack its two parts
# (presumably the encoded message and its label -- confirm in dataset.py).
first_item = train_dataset[0]
x, y = first_item

In [6]:
# Show both parts of the sample, one per line, separated by a 100-dash rule.
print(x, '-'*100, y, sep='\n')

tensor([ 3855,   534, 11376,  3492,   329,  3931,   351,   257, 17189,  6356,
          286,  3931, 34122,   290, 11904,  2861,  4248,  2091,    25,  1120,
          691,   351,   383, 33860,   805,   428,  3909,    13,  1675,  2245,
          467,    17,   407,   742,    13,  1073,    13,  2724, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256,
        50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256])
-----------------------------------------------------------------------------