In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sms-spam-collection-dataset/spam.csv
/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv


In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from datasets import load_dataset, concatenate_datasets
from transformers import (
    RobertaTokenizerFast,
    RobertaForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    AutoConfig,
    TextClassificationPipeline,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

In [5]:
sms_file_path = '/kaggle/input/sms-spam-collection-dataset/spam.csv'
loaded_ds = load_dataset("SetFit/enron_spam", split=None)
concatenated_ds = concatenate_datasets([loaded_ds["train"], loaded_ds["test"]])
to_pandas_ds = pd.DataFrame(concatenated_ds)
to_pandas_ds=to_pandas_ds.drop(to_pandas_ds.columns[[0,3,4,5,6]], axis=1)
df_2 = to_pandas_ds.sample(frac=0.5, random_state=42)


df = pd.read_csv(sms_file_path, encoding='latin-1')
df=df.drop(df.columns[[2,3,4]], axis=1)
df['spam'] = df['v1'].apply(lambda x: 1 if x == 'spam' else 0)
df=df.drop(df.columns[[0]], axis=1)
df.columns = ['text', 'label']
final_df = pd.concat([df, df_2], axis=0, ignore_index=True)

print(final_df)

Repo card metadata block was not found. Setting CardData to empty.


                                                    text  label
0      Go until jurong point, crazy.. Available only ...      0
1                          Ok lar... Joking wif u oni...      0
2      Free entry in 2 a wkly comp to win FA Cup fina...      1
3      U dun say so early hor... U c already then say...      0
4      Nah I don't think he goes to usf, he lives aro...      0
...                                                  ...    ...
22425  agenda : ubs warburg / " energy " integration ...      0
22426  your file sleeps around man cheating -\nstart ...      1
22427  it works greatt hello , welcome to medzonli mo...      1
22428  are you a penny stox player ? mnei - the best ...      1
22429  re [ 10 ] the biggest tit b / \ bes | n the wo...      1

[22430 rows x 2 columns]


In [6]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



In [7]:
X = final_df["text"]
y=final_df["label"]
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

In [14]:
from datasets import Dataset

In [9]:
train_dataset = Dataset.from_pandas(pd.DataFrame({"text": X_train, "label": y_train}))
val_dataset = Dataset.from_pandas(pd.DataFrame({"text": X_val, "label": y_val}))
test_dataset = Dataset.from_pandas(pd.DataFrame({"text": X_test, "label": y_test}))

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True, max_length=256)

train_dataset = train_dataset.map(tokenize, batched=True)
val_dataset = val_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)


Map:   0%|          | 0/15701 [00:00<?, ? examples/s]

Map:   0%|          | 0/3364 [00:00<?, ? examples/s]

Map:   0%|          | 0/3365 [00:00<?, ? examples/s]

In [11]:
train_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
val_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])
test_dataset.set_format("torch", columns=["input_ids", "attention_mask", "label"])

In [12]:
num_labels = 2  
class_names = ["ham", "spam"]
print(f"number of labels: {num_labels}")
print(f"the labels: {class_names}")

id2label = {0: "ham", 1: "spam"}

config = AutoConfig.from_pretrained("roberta-base")
config.update({"id2label": id2label})

number of labels: 2
the labels: ['ham', 'spam']


In [13]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)
    return {
        "f1": f1_score(labels, predictions, average="binary"),
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average="binary"),
        "recall": recall_score(labels, predictions, average="binary"),
    }

In [14]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base',config=config)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="output",
    num_train_epochs=5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=250,
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="tensorboard",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Accuracy,Precision,Recall
1,0.0315,0.101173,0.972779,0.977408,0.979798,0.965861
2,0.0428,0.076403,0.980378,0.98365,0.983536,0.97724
3,0.0391,0.065416,0.986856,0.989001,0.985806,0.987909
4,0.0001,0.075085,0.987892,0.989893,0.989301,0.986486
5,0.0001,0.07764,0.98826,0.99019,0.988612,0.987909




TrainOutput(global_step=4910, training_loss=0.05256599639806766, metrics={'train_runtime': 2707.0586, 'train_samples_per_second': 29.0, 'train_steps_per_second': 1.814, 'total_flos': 1.03277667005184e+16, 'train_loss': 0.05256599639806766, 'epoch': 5.0})

In [16]:
trainer.evaluate()



{'eval_loss': 0.06541559100151062,
 'eval_f1': 0.9868561278863234,
 'eval_accuracy': 0.989001189060642,
 'eval_precision': 0.985805535841022,
 'eval_recall': 0.9879089615931721,
 'eval_runtime': 36.9957,
 'eval_samples_per_second': 90.929,
 'eval_steps_per_second': 5.703,
 'epoch': 5.0}

In [17]:
test_results = trainer.predict(test_dataset)

print(test_results.metrics)

predictions = test_results.predictions




{'test_loss': 0.07663601636886597, 'test_f1': 0.9828203292770222, 'test_accuracy': 0.9857355126300149, 'test_precision': 0.9856424982053122, 'test_recall': 0.9800142755174875, 'test_runtime': 37.1981, 'test_samples_per_second': 90.462, 'test_steps_per_second': 5.672}


In [19]:
pipeline = TextClassificationPipeline(model=model, tokenizer=tokenizer,device=0)
message='Todays Vodafone numbers ending with 4882 are selected to a receive a £350 award. If your number matches call 09064019014 to receive your £350 award.'
result = pipeline(message)
print(result)

[{'label': 'spam', 'score': 0.9991759657859802}]


In [2]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [24]:
trainer.push_to_hub("roberta_email_sms_spam_classifier")

training_args.bin:   0%|          | 0.00/5.11k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/coconutsc/output/commit/9316406e3df941878670276bda9579576d130326', commit_message='roberta_email_sms_spam_classifier', commit_description='', oid='9316406e3df941878670276bda9579576d130326', pr_url=None, pr_revision=None, pr_num=None)

In [2]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, RobertaTokenizerFast, TextClassificationPipeline

In [3]:
loaded_model = AutoModelForSequenceClassification.from_pretrained("coconutsc/roberta_email_sms_spam_classifier")
# tokenizer = AutoTokenizer.from_pretrained("coconutsc/output")

config.json:   0%|          | 0.00/788 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

In [9]:
tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
pipeline = TextClassificationPipeline(model=loaded_model, tokenizer=tokenizer,device=0)
message='hi how are you'
result = pipeline(message)
print(result)

[{'label': 'spam', 'score': 0.9870651960372925}]


In [11]:
test_set_path = '/kaggle/input/spam-mails-dataset/spam_ham_dataset.csv'
df_test = pd.read_csv(test_set_path)
df_test=df_test.drop(df_test.columns[[0,1]], axis=1)
df_test.columns = ['text', 'label']
# df_test = df_test.sample(frac=0.5, random_state=42)
print(df_test)


                                                   text  label
0     Subject: enron methanol ; meter # : 988291\r\n...      0
1     Subject: hpl nom for january 9 , 2001\r\n( see...      0
2     Subject: neon retreat\r\nho ho ho , we ' re ar...      0
3     Subject: photoshop , windows , office . cheap ...      1
4     Subject: re : indian springs\r\nthis deal is t...      0
...                                                 ...    ...
5166  Subject: put the 10 on the ft\r\nthe transport...      0
5167  Subject: 3 / 4 / 2000 and following noms\r\nhp...      0
5168  Subject: calpine daily gas nomination\r\n>\r\n...      0
5169  Subject: industrial worksheets for august 2000...      0
5170  Subject: important online banking alert\r\ndea...      1

[5171 rows x 2 columns]


In [12]:
results = pipeline(df_test["text"].tolist(), batch_size=16,padding=True,truncation=True)

In [13]:
df_test["predicted_label"] = [1 if res["label"] == "spam" else 0 for res in results]
print(df_test)

                                                   text  label  \
0     Subject: enron methanol ; meter # : 988291\r\n...      0   
1     Subject: hpl nom for january 9 , 2001\r\n( see...      0   
2     Subject: neon retreat\r\nho ho ho , we ' re ar...      0   
3     Subject: photoshop , windows , office . cheap ...      1   
4     Subject: re : indian springs\r\nthis deal is t...      0   
...                                                 ...    ...   
5166  Subject: put the 10 on the ft\r\nthe transport...      0   
5167  Subject: 3 / 4 / 2000 and following noms\r\nhp...      0   
5168  Subject: calpine daily gas nomination\r\n>\r\n...      0   
5169  Subject: industrial worksheets for august 2000...      0   
5170  Subject: important online banking alert\r\ndea...      1   

      predicted_label  
0                   0  
1                   0  
2                   0  
3                   1  
4                   0  
...               ...  
5166                0  
5167           

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

true_labels = df_test["label"]

accuracy = accuracy_score(true_labels, df_test["predicted_label"])
precision = precision_score(true_labels, df_test["predicted_label"])
recall = recall_score(true_labels, df_test["predicted_label"])
f1 = f1_score(true_labels, df_test["predicted_label"])

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F11 Score: {f1}")


Accuracy: 0.9696383678205376
Precision: 0.9955686853766618
Recall: 0.8992661774516344
F11 Score: 0.9449702067998597
