## Fine-tuning for classification

### 1) Preparing the dataset

In [3]:
import os
import shutil
import urllib.error
import urllib.request
import zipfile
from pathlib import Path

import pandas as pd

In [None]:
# Remote location of the zipped dataset, followed by the local paths used for
# the downloaded archive, the extraction directory, and the renamed data file.
url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path).joinpath("SMSSpamCollection.tsv")

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    """Download a zip archive, extract it, and rename the extracted data file.

    The archive is expected to contain a top-level member named
    "SMSSpamCollection"; after extraction that file is renamed to
    ``data_file_path``. If ``data_file_path`` already exists, nothing is
    downloaded or extracted.

    Args:
        url: Address of the zip archive to fetch.
        zip_path: Local path (str or Path) the archive is written to.
        extracted_path: Directory (str or Path) the archive is extracted into.
        data_file_path: Final path (str or Path) of the renamed data file.

    Raises:
        urllib.error.URLError: If the URL cannot be opened (HTTPError is a
            subclass).
        zipfile.BadZipFile: If the downloaded file is not a valid zip archive.
        FileNotFoundError: If the extracted "SMSSpamCollection" file is missing.
    """
    # Accept plain strings as well as Path objects, like the other path arguments.
    data_file_path = Path(data_file_path)

    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Stream the response to disk in chunks rather than holding the whole
    # archive in memory at once.
    with urllib.request.urlopen(url) as response, open(zip_path, "wb") as out_file:
        shutil.copyfileobj(response, out_file)

    # Unpack every member of the archive into the extraction directory.
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # The extracted file has no extension; move it to the requested name.
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

# Try the primary URL first; on an HTTP/URL error or a timeout, report the
# failure and retry once against the backup mirror.
try:
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)
except (urllib.error.HTTPError, urllib.error.URLError, TimeoutError) as e:
    print(f"Primary URL failed: {e}. Trying backup URL...")
    # Rebinds the notebook-level `url`, so later cells see the backup address.
    url = "https://f001.backblazeb2.com/file/LLMs-from-scratch/sms%2Bspam%2Bcollection.zip"
    # Not wrapped in another try: a failure of the backup download propagates.
    download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path) 

File downloaded and saved as sms_spam_collection/SMSSpamCollection.tsv


In [5]:
# Parse the tab-delimited file. header=None treats the first line as data, so
# the two column names are supplied explicitly.
df = pd.read_csv(
    data_file_path,
    delimiter="\t",
    names=["Label", "Text"],
    header=None,
)

In [6]:
# Show the loaded DataFrame as the cell's output.
df

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [7]:
# Report how many rows carry each label in the raw data.
print(f"Number of spam messages: {len(df.loc[df['Label'] == 'spam'])}")
print(f"Number of non spam messages: {len(df.loc[df['Label'] == 'ham'])}")

Number of spam messages: 747
Number of non spam messages: 4825


In [8]:
def create_balanced_dataset(df, random_state=42):
    """Return a frame with equally many "ham" and "spam" rows.

    The larger class is randomly downsampled to the size of the smaller one.
    Ham rows come first in the result, followed by the spam rows.

    Args:
        df: DataFrame with a "Label" column holding "ham" / "spam" strings.
        random_state: Seed passed to ``DataFrame.sample`` so the selection is
            reproducible. Defaults to 42.

    Returns:
        A new DataFrame containing the balanced subset; ``df`` is not modified.
    """
    ham_rows = df[df["Label"] == "ham"]
    spam_rows = df[df["Label"] == "spam"]

    # Balance to the minority class, whichever it is. Sampling more rows than
    # a class has (without replacement) would raise ValueError.
    target_size = min(len(ham_rows), len(spam_rows))

    # Ham is always passed through sample(), even when no rows are dropped, so
    # the row order matches the previous behaviour for ham-majority inputs.
    ham_subset = ham_rows.sample(n=target_size, random_state=random_state)

    # Spam is left untouched unless it is the larger class.
    if len(spam_rows) > target_size:
        spam_subset = spam_rows.sample(n=target_size, random_state=random_state)
    else:
        spam_subset = spam_rows

    return pd.concat([ham_subset, spam_subset])

# Build the balanced frame and confirm both classes now have the same size.
balanced_df = create_balanced_dataset(df)
print(f"Num of spam messages: {len(balanced_df.loc[balanced_df['Label'] == 'spam'])}")
print(f"Num of non-spam messages: {len(balanced_df.loc[balanced_df['Label'] == 'ham'])}")

Num of spam messages: 747
Num of non-spam messages: 747


In [9]:
# Preview the first rows of the balanced frame.
balanced_df.head()

Unnamed: 0,Label,Text
3714,ham,If i not meeting ü all rite then i'll go home ...
1311,ham,"I.ll always be there, even if its just in spir..."
548,ham,"Sorry that took so long, omw now"
1324,ham,I thk 50 shd be ok he said plus minus 10.. Did...
3184,ham,Dunno i juz askin cos i got a card got 20% off...


In [10]:
# Encode the string labels as integers. Values that are not "ham"/"spam" (for
# example labels already encoded by an earlier run of this cell) pass through
# unchanged, so re-running the cell does not turn the whole column into NaN.
label_to_id = {"ham": 0, "spam": 1}
balanced_df["Label"] = balanced_df["Label"].map(lambda label: label_to_id.get(label, label))

In [11]:
# Preview the first rows again to check the encoded labels.
balanced_df.head()

Unnamed: 0,Label,Text
3714,0,If i not meeting ü all rite then i'll go home ...
1311,0,"I.ll always be there, even if its just in spir..."
548,0,"Sorry that took so long, omw now"
1324,0,I thk 50 shd be ok he said plus minus 10.. Did...
3184,0,Dunno i juz askin cos i got a card got 20% off...


In [13]:
def random_split(df, train_frac=0.7, valid_frac=0.2, random_state=42):
    """Shuffle ``df`` and cut it into train, validation, and test partitions.

    Args:
        df: DataFrame to split; it is not modified.
        train_frac: Fraction of rows assigned to the training partition.
        valid_frac: Fraction of rows assigned to the validation partition.
            The remaining rows form the test partition.
        random_state: Seed for the shuffle, so the split is reproducible.
            Defaults to 42.

    Returns:
        A ``(train, validation, test)`` tuple of DataFrames. Partition sizes
        are truncated to whole rows, so any rounding remainder ends up in the
        test partition.
    """
    # Shuffle all rows, then renumber them 0..n-1 so positions and labels agree.
    shuffled = df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    n_rows = len(shuffled)
    train_end = int(train_frac * n_rows)
    valid_end = train_end + int(valid_frac * n_rows)

    train_part = shuffled.iloc[:train_end]
    valid_part = shuffled.iloc[train_end:valid_end]
    test_part = shuffled.iloc[valid_end:]
    return train_part, valid_part, test_part

In [14]:
# Shuffle and partition the balanced data, then write each partition to disk.
train_df, valid_df, test_df = random_split(balanced_df)
# index=False is the documented boolean form; it keeps the row index out of the CSV.
train_df.to_csv("train.csv", index=False)
valid_df.to_csv("validation.csv", index=False)
test_df.to_csv("test.csv", index=False)