# Prepare data

- Xử lý email và đưa vào file csv

In [1]:
import pandas as pd
import email
import csv
from email import policy
import os
import re

# Sort file names in natural order (digit runs compare as numbers).
def natural_sort_key(s):
    """Return a sort key that orders digit runs in ``s`` numerically.

    The string is split on runs of digits; digit chunks become ``int`` and
    every other chunk is lower-cased, so "2_spam.txt" sorts before
    "10_spam.txt".
    """
    key = []
    for chunk in re.split(r'(\d+)', s):
        if chunk.isdigit():
            key.append(int(chunk))
        else:
            key.append(chunk.lower())
    return key

def pandas_escape(text):
    """Render ``text`` as a single, fully quoted CSV field.

    The value is placed in a one-cell DataFrame and serialised with pandas'
    CSV writer (every field quoted, backslash as escape character).
    """
    single_cell = pd.DataFrame([text])
    rendered = single_cell.to_csv(
        header=False,
        index=False,
        escapechar='\\',
        quoting=csv.QUOTE_ALL,
    )
    # to_csv terminates the row with a line break; strip() drops it along with
    # any other leading/trailing whitespace of the rendered row.
    return rendered.strip()

def _decode_part(part):
    """Return the text of a non-multipart message part as ``str``.

    The part's declared charset is used when there is one (UTF-8 otherwise).
    Unknown charsets or undecodable bytes fall back to UTF-8 with replacement
    characters instead of raising, and a missing payload yields ''.
    """
    raw = part.get_payload(decode=True)
    if raw is None:
        return ''
    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset)
    except (LookupError, UnicodeDecodeError):
        return raw.decode('utf-8', errors='replace')


def email_to_string(email_content):
    """Flatten a raw email into one CSV-escaped text field.

    Args:
        email_content: The raw email source as a string.

    Returns:
        The output of ``pandas_escape`` applied to a string made of the
        non-empty From/To/Subject/Date headers (one ``"Header: value"`` line
        each) followed by ``"Body: <text>"``. For multipart messages the body
        is the first ``text/plain`` part, or '' when there is none.
    """
    msg = email.message_from_string(email_content, policy=policy.default)

    headers = ['From', 'To', 'Subject', 'Date']
    email_data = ""
    for header in headers:
        value = msg.get(header, '')
        if value == '':
            continue
        email_data += f"{header}: {value}\n"

    body = ''
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body = _decode_part(part)
                break
    else:
        body = _decode_part(msg)

    # Only trailing newlines are removed; other whitespace is kept as-is.
    body = body.rstrip('\n')
    full_email = f"{email_data}Body: {body}"
    return pandas_escape(full_email)

def custom_data_to_csv(label, folder_path, csv_file):
    """Append one ``filename,label,text`` row per file in ``folder_path``.

    Args:
        label: Value written verbatim into the label column of every row.
        folder_path: Directory whose entries are read as raw emails, in
            natural file-name order.
        csv_file: Path of the CSV file to append to. It is opened in append
            mode, so any header must already have been written by the caller.

    Note:
        ``file_name`` and ``label`` are written without CSV escaping; only the
        email text is escaped (by ``email_to_string``).
    """
    file_names = sorted(os.listdir(folder_path), key=natural_sort_key)
    # Open the output once for the whole folder instead of once per email, and
    # pin the encoding so the result does not depend on the platform default.
    with open(csv_file, 'a', encoding='utf-8') as out:
        for file_name in file_names:
            file_path = os.path.join(folder_path, file_name)
            # errors='replace' keeps a single badly encoded email from
            # aborting the whole export.
            with open(file_path, 'r', encoding='utf-8', errors='replace') as f:
                email_str = email_to_string(f.read())
            out.write(f"{file_name},{label},{email_str}\n")

In [9]:
# Project root. Set the SPAM_PROJECT_ROOT environment variable to run the
# notebook from another location; the default is the path it was written for.
ROOT_DIR = os.environ.get("SPAM_PROJECT_ROOT", "/teamspace/studios/this_studio")

# Input folders with the raw email files.
spam_data_dir = os.path.join(ROOT_DIR, "data/TrainData/spam")
notspam_data_dir = os.path.join(ROOT_DIR, "data/TrainData/notspam")
test_data_dir = os.path.join(ROOT_DIR, "data/TestData_nolabel")

# CSV files generated from the folders above.
train_csv_file = os.path.join(ROOT_DIR, "BaiThi2/train_data.csv")
test_csv_file = os.path.join(ROOT_DIR, "BaiThi2/test_data.csv")

In [5]:
# from custom_data import custom_data_to_csv
# (import left disabled; custom_data_to_csv is defined earlier in this notebook)

# Recreate the training CSV from scratch: mode 'w' truncates any previous file
# and writes only the header row.
with open(train_csv_file, 'w') as csv_f:
    csv_f.write("filename,label,text\n")

# Append one row per email: all spam files first, then all non-spam files.
custom_data_to_csv("spam", spam_data_dir, train_csv_file)
custom_data_to_csv("notspam", notspam_data_dir, train_csv_file)

In [6]:
# Read the generated training CSV back to check that it parses cleanly.
df = pd.read_csv(train_csv_file)
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()

     filename label                                               text
0  0_spam.txt  spam  Subject: great part-time summer job !\nBody: *...
1  1_spam.txt  spam  Subject: auto insurance rates too high ?\nBody...
2  2_spam.txt  spam  Subject: advertsing ? legal ! ! offer smtp ! !...
3  3_spam.txt  spam  Subject: free trial membership\nBody: latest a...
4  4_spam.txt  spam  Subject: market millions , try free\nBody: mes...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 211 entries, 0 to 210
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  211 non-null    object
 1   label     211 non-null    object
 2   text      211 non-null    object
dtypes: object(3)
memory usage: 5.1+ KB
None


In [5]:
# errors='ignore' keeps this cell re-runnable: once 'filename' has been
# dropped, running it again is a no-op instead of raising KeyError.
df = df.drop(columns=['filename'], errors='ignore')
df

Unnamed: 0,label,text
0,spam,Subject: great part-time summer job !\nBody: *...
1,spam,Subject: auto insurance rates too high ?\nBody...
2,spam,Subject: advertsing ? legal ! ! offer smtp ! !...
3,spam,Subject: free trial membership\nBody: latest a...
4,spam,"Subject: market millions , try free\nBody: mes..."
...,...,...
206,notspam,Subject: english snow words\nBody: jonathan da...
207,notspam,Subject: news ippe ( 04 nov 94 )\nBody: n e w ...
208,notspam,Subject: translator\nBody: order forms transla...
209,notspam,"Subject: special issues\nBody: names , journal..."


# Load dataset

In [6]:
import os
import numpy as np
import pandas as pd
from datasets import Dataset

# Class name -> integer id, and the inverse lookup derived from it.
label2id = {"notspam": 0, "spam": 1}
id2label = dict(zip(label2id.values(), label2id.keys()))

# Training frame with only the label and text columns (file names dropped).
dataset_email = pd.read_csv(train_csv_file).drop(columns=["filename"])

def load_dataset(model_type: str = "") -> Dataset:
    """Build a shuffled 80/20 train/test split from ``dataset_email``.

    Args:
        model_type: When equal to "AutoModelForSequenceClassification" the
            string labels are converted to integer ids via ``label2id``;
            for any other value the labels stay strings.

    Returns:
        The result of ``Dataset.train_test_split`` with "train" and "test"
        splits (shown as a ``DatasetDict`` in this notebook's output).
    """
    # Work on a copy. Mapping labels in place on the shared frame meant a
    # second call re-mapped already converted ids, turning every label to NaN.
    frame = dataset_email.copy()

    frame["label"] = frame["label"].astype(str)
    if model_type == "AutoModelForSequenceClassification":
        # Convert labels to integers
        frame["label"] = frame["label"].map(label2id)

    frame["text"] = frame["text"].astype(str)
    dataset = Dataset.from_pandas(frame)
    dataset = dataset.shuffle(seed=42)
    # Seed the split too, so the held-out rows are the same on every run and
    # cannot overlap with rows a previously trained model has seen.
    dataset = dataset.train_test_split(test_size=0.2, seed=42)

    return dataset

In [7]:
# load_dataset is defined earlier in this notebook (no external import needed).
# Request integer labels, as required by the sequence-classification model.
train_dataset = load_dataset(model_type="AutoModelForSequenceClassification")
train_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 168
    })
    test: Dataset({
        features: ['label', 'text'],
        num_rows: 43
    })
})

# Initialize base model and tokenizer

In [10]:
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Class name -> integer id, and the inverse lookup derived from it.
label2id = {"notspam": 0, "spam": 1}
id2label = dict(zip(label2id.values(), label2id.keys()))

# Base checkpoint, and the local directory name used for the fine-tuned model
# (the part of MODEL_ID after the organisation prefix plus a task suffix).
MODEL_ID = "google/flan-t5-base"
REPOSITORY_ID = MODEL_ID.split('/')[1] + "-email-classification"

# Load the base checkpoint with a fresh classification head. Passing the label
# maps stores the class names in the model config, so a saved copy of the
# model carries "notspam"/"spam" instead of generic LABEL_0/LABEL_1 names.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_ID,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at google/flan-t5-base and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
# Quick sanity check: show the Python type of the training split's label column.
type(train_dataset['train']['label'])

list

In [22]:
import evaluate
import nltk
import numpy as np
from sklearn.metrics import precision_recall_fscore_support

# Load the "accuracy" metric through the evaluate library.
# NOTE(review): `metric` is not referenced anywhere else in the visible
# notebook; confirm whether it is still needed.
metric = evaluate.load("accuracy")

def tokenize_function(examples):
    """Tokenize the "text" field of a batch with the module-level tokenizer.

    Inputs are truncated and padded to the tokenizer's maximum length.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True, padding="max_length")
    return encoded

def compute_metrics(eval_pred) -> dict:
    """Compute binary classification metrics for an evaluation run.

    Args:
        eval_pred: A ``(logits, labels)`` pair. ``logits`` may itself be a
            tuple, in which case its first element is used.

    Returns:
        A dict with "precision", "recall" and "f1" (binary averaging) plus
        "accuracy".
    """
    logits, labels = eval_pred
    if isinstance(logits, tuple):
        # The model also returned hidden_states or attentions; keep the logits.
        logits = logits[0]
    predictions = np.argmax(logits, axis=-1)
    # zero_division=0 gives the same 0.0 scores as the default when no positive
    # class is predicted, but without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="binary", zero_division=0
    )
    accuracy = float(np.mean(np.asarray(predictions) == np.asarray(labels)))
    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

# Tokenize both splits in batches; the map adds the tokenizer's output columns
# next to the existing 'label' and 'text' columns.
tokenized_datasets = train_dataset.map(tokenize_function, batched=True)
print(tokenized_datasets)

# Despite the "small_" prefix these are the complete train/test splits,
# re-shuffled with a fixed seed; no rows are dropped here.
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42)
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42)

Map:   0%|          | 0/168 [00:00<?, ? examples/s]

Map:   0%|          | 0/43 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 168
    })
    test: Dataset({
        features: ['label', 'text', 'input_ids', 'attention_mask'],
        num_rows: 43
    })
})


[nltk_data] Downloading package punkt to
[nltk_data]     /teamspace/studios/this_studio/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Training

In [9]:
# Fine-tuning configuration; output_dir is the local REPOSITORY_ID folder.
training_args = TrainingArguments(
    output_dir=REPOSITORY_ID,
    # Optimisation
    num_train_epochs=5,
    learning_rate=3e-4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False,  # Overflows with fp16
    # Logging
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
    # Checkpointing
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=False,
)

# Trainer wired to the tokenized splits and the metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
)

- Train, then save the model

In [19]:
# Run fine-tuning with the arguments configured above; the returned
# TrainOutput is displayed as the cell result.
trainer.train()

Step,Training Loss


TrainOutput(global_step=42, training_loss=0.011568018368312291, metrics={'train_runtime': 68.5006, 'train_samples_per_second': 4.905, 'train_steps_per_second': 0.613, 'total_flos': 205221043077120.0, 'train_loss': 0.011568018368312291, 'epoch': 2.0})

In [None]:
# The evaluation section reloads the model with from_pretrained(REPOSITORY_ID),
# so the final weights must be written to that directory itself. Saving only
# the tokenizer left no model files there to load.
trainer.save_model(REPOSITORY_ID)
tokenizer.save_pretrained(REPOSITORY_ID)
print(trainer.evaluate())

# Evaluate

- Load model

In [11]:
import torch
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
# Reload the fine-tuned classifier from disk and place it on the GPU when one
# is available (CPU otherwise). The cell displays the moved model.
model = AutoModelForSequenceClassification.from_pretrained(REPOSITORY_ID)
model.to("cuda" if torch.cuda.is_available() else "cpu")

T5ForSequenceClassification(
  (transformer): T5Model(
    (shared): Embedding(32128, 768)
    (encoder): T5Stack(
      (embed_tokens): Embedding(32128, 768)
      (block): ModuleList(
        (0): T5Block(
          (layer): ModuleList(
            (0): T5LayerSelfAttention(
              (SelfAttention): T5Attention(
                (q): Linear(in_features=768, out_features=768, bias=False)
                (k): Linear(in_features=768, out_features=768, bias=False)
                (v): Linear(in_features=768, out_features=768, bias=False)
                (o): Linear(in_features=768, out_features=768, bias=False)
                (relative_attention_bias): Embedding(32, 12)
              )
              (layer_norm): T5LayerNorm()
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (1): T5LayerFF(
              (DenseReluDense): T5DenseGatedActDense(
                (wi_0): Linear(in_features=768, out_features=2048, bias=False)
                (wi_1): Linea

- Classify & Eval

In [7]:
from time import time
from typing import List, Tuple

import torch
from loguru import logger
from sklearn.metrics import classification_report
from tqdm.auto import tqdm
from transformers import AutoModelForSequenceClassification, AutoTokenizer

def classify(texts_to_classify: List[str], batch_size: int = 0) -> List[Tuple[str, float]]:
    """Classify texts with the module-level ``model`` and ``tokenizer``.

    Args:
        texts_to_classify: The texts to classify. A single string is treated
            as a one-element batch.
        batch_size: Number of texts per forward pass. ``0`` (the default)
            sends all texts through the model in one pass, as before.

    Returns:
        One ``(label, confidence)`` tuple per input text, in input order.
        ``label`` comes from ``id2label`` and ``confidence`` is the softmax
        probability of the predicted class. Empty input yields ``[]``.

    Raises:
        ValueError: If ``batch_size`` is negative.
    """
    if isinstance(texts_to_classify, str):
        texts = [texts_to_classify]
    else:
        texts = list(texts_to_classify)
    if not texts:
        # Nothing to tokenize; skip the model call entirely.
        return []
    if batch_size < 0:
        raise ValueError("batch_size must be >= 0")
    chunk_size = batch_size or len(texts)

    # Follow the model's actual device rather than guessing from CUDA
    # availability, so inputs and weights can never end up on different devices.
    device = model.device

    start = time()
    results: List[Tuple[str, float]] = []
    for offset in range(0, len(texts), chunk_size):
        # Tokenize this chunk, padding to its longest text (capped at 512 tokens)
        inputs = tokenizer(
            texts[offset : offset + chunk_size],
            return_tensors="pt",
            max_length=512,
            truncation=True,
            padding=True,
        ).to(device)

        # Get predictions
        with torch.no_grad():
            outputs = model(**inputs)

        # Turn logits into a probability distribution and keep, per text, the
        # most likely class together with its probability (certainty).
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
        confidences, predicted_classes = torch.max(probs, dim=1)

        # Map predicted class IDs to labels; confidences stay numpy scalars.
        predicted_labels = [id2label[class_id] for class_id in predicted_classes.cpu().tolist()]
        results.extend(zip(predicted_labels, confidences.cpu().numpy()))

    logger.debug(
        f"Classification of {len(texts_to_classify)} examples took {time() - start} seconds"
    )

    return results

def eval(batch_size: int = 16):
    """Evaluate the model on the held-out split and print a report.

    Runs ``classify`` over ``train_dataset["test"]`` in batches and prints
    scikit-learn's ``classification_report`` for the predicted labels.

    Note: this function shadows Python's built-in ``eval`` in this namespace.

    Args:
        batch_size: Texts per ``classify`` call; adjust to the GPU capacity.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be positive")

    # Read each column once; indexing the dataset inside the loop rebuilt the
    # whole column on every iteration.
    test_split = train_dataset["test"]
    texts = test_split["text"]
    label_ids = test_split["label"]

    predictions_list, labels_list = [], []
    batch_starts = range(0, len(test_split), batch_size)

    # The context manager closes the progress bar even if classify() raises.
    with tqdm(total=len(batch_starts), desc="Evaluating") as progress_bar:
        for i in batch_starts:
            batch_predictions = classify(texts[i : i + batch_size])

            predictions_list.extend(batch_predictions)
            labels_list.extend(id2label[label_id] for label_id in label_ids[i : i + batch_size])

            progress_bar.update(1)

    # classify() returns (label, confidence) pairs; the report needs labels only.
    report = classification_report(labels_list, [pair[0] for pair in predictions_list])
    print(report)

# Run the evaluation defined above (this is the notebook's own eval(), which
# shadows Python's built-in of the same name).
eval()

In [35]:
# Classify the whole held-out split in one call and keep the predictions next
# to the human-readable true labels.
texts = train_dataset['test']['text']
labels = train_dataset['test']['label']

predictions = classify(texts)

predictions_list = list(predictions)
labels_list = [id2label[label_id] for label_id in labels]

[32m2024-08-10 11:07:41.955[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mclassify[0m:[36m26[0m - [34m[1mClassification of 43 examples took 2.0632755756378174 seconds[0m


In [38]:
# Pair each true label with its prediction. Iterating with zip avoids binding
# a module-level `id`, which would shadow the built-in id() for later cells.
for actual_label, predicted in zip(labels_list, predictions_list):
    print(f"Actual Label: {actual_label}\n>>> Prediction: {predicted}")

Actual Label: notspam
>>> Prediction: ('notspam', 0.9999989)
Actual Label: notspam
>>> Prediction: ('notspam', 0.9999988)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999523)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999905)
Actual Label: notspam
>>> Prediction: ('notspam', 0.9999988)
Actual Label: notspam
>>> Prediction: ('notspam', 0.9999949)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999523)
Actual Label: notspam
>>> Prediction: ('notspam', 0.9999994)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999726)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999654)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999833)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999726)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999917)
Actual Label: notspam
>>> Prediction: ('notspam', 0.9999988)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999714)
Actual Label: notspam
>>> Prediction: ('notspam', 0.99999917)
Actual Label: 

# Classify test_dataset

In [10]:
# Recreate the test CSV with only the header row, then append every unlabeled
# email using -1 as a placeholder label.
with open(test_csv_file, 'w') as f:
    f.write("filename,label,text\n")
custom_data_to_csv(label=-1, folder_path=test_data_dir, csv_file=test_csv_file)

In [11]:
# Read the generated test CSV back to check that it parses cleanly.
df = pd.read_csv(test_csv_file)
print(df.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() only added a stray "None" line to the output.
df.info()

        filename  label                                               text
0  0_unknown.txt     -1  Subject: base generated adjuncts\nBody: does a...
1  1_unknown.txt     -1  Subject: basic journals\nBody: are facing majo...
2  2_unknown.txt     -1  Subject: query : tagalog philippine informants...
3  3_unknown.txt     -1  Subject: ancient vocal tract simulation\nBody:...
4  4_unknown.txt     -1  Subject: re : 3 . 386 chomsky , mac concordanc...
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 78 entries, 0 to 77
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  78 non-null     object
 1   label     78 non-null     int64 
 2   text      78 non-null     object
dtypes: int64(1), object(2)
memory usage: 2.0+ KB
None


In [15]:
# Convert on a new frame instead of aliasing `df`: the previous version
# assigned columns through the alias (mutating `df`) and then reused the name
# `test_dataset` for a different type of object.
test_frame = df.astype({"text": str, "filename": str})
test_dataset = Dataset.from_pandas(test_frame)
test_dataset

Dataset({
    features: ['filename', 'label', 'text'],
    num_rows: 78
})

In [18]:
texts = test_dataset['text']
filename = test_dataset['filename']
predictions = classify(texts)
predictions_list = list(predictions)

# Open result.csv once ('w' truncates any previous run) instead of reopening
# it for every row. Iterating with zip avoids binding a module-level `id`,
# which would shadow the built-in id().
with open("result.csv", "w") as f:
    for name, text, prediction in zip(filename, texts, predictions_list):
        # Each row is "<file name>,<predicted label>"; the confidence is only printed.
        f.write(f"{name},{prediction[0]}\n")
        print(f"{name}: {text}")
        print(f">>> {prediction}")

[32m2024-08-11 16:17:58.568[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mclassify[0m:[36m26[0m - [34m[1mClassification of 78 examples took 3.7677488327026367 seconds[0m


0_unknown.txt: Subject: base generated adjuncts
Body: does anyone references mechanisms are used encode restrictions base generation adjuncts gb ( . e why adverbs occur predicates adjectives referentials ) ? seems can't selection x - - bar theory is appropriate set restrictions . references lovely , recent ones . post directly & ' ll post summary list 's interest . david adger adger @ uk . ac . ed . cogsci
>>> ('notspam', 0.9999939)
1_unknown.txt: Subject: basic journals
Body: are facing major cuts library periodicals budget university alabama , are being asked eliminate linguistics journals . univ . alabama birmingham has b . . linguistics strong syntacticians , here tuscaloosa " ad hoc interdepartmental program " graduate level strengths applied linguistics , inter - actional sociolinguistics pragmatics . appreciate advice matter . particular , ' re wondering is consensus set essential journals undergraduate major . pragmatics
>>> ('notspam', 0.9999889)
2_unknown.txt: Subject: query 

In [18]:
# NOTE(review): this cell repeats the preceding result-writing cell verbatim;
# consider keeping only one copy.
texts = test_dataset['text']
filename = test_dataset['filename']
predictions = classify(texts)
predictions_list = list(predictions)

# Open result.csv once ('w' truncates any previous run) instead of reopening
# it for every row. Iterating with zip avoids binding a module-level `id`,
# which would shadow the built-in id().
with open("result.csv", "w") as f:
    for name, text, prediction in zip(filename, texts, predictions_list):
        # Each row is "<file name>,<predicted label>"; the confidence is only printed.
        f.write(f"{name},{prediction[0]}\n")
        print(f"{name}: {text}")
        print(f">>> {prediction}")

[32m2024-08-11 06:35:47.255[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mclassify[0m:[36m26[0m - [34m[1mClassification of 78 examples took 120.58920001983643 seconds[0m


0_unknown.txt: Subject: base generated adjuncts
Body: does anyone references mechanisms are used encode restrictions base generation adjuncts gb ( . e why adverbs occur predicates adjectives referentials ) ? seems can't selection x - - bar theory is appropriate set restrictions . references lovely , recent ones . post directly & ' ll post summary list 's interest . david adger adger @ uk . ac . ed . cogsci
>>> ('notspam', 0.99999905)
1_unknown.txt: Subject: basic journals
Body: are facing major cuts library periodicals budget university alabama , are being asked eliminate linguistics journals . univ . alabama birmingham has b . . linguistics strong syntacticians , here tuscaloosa " ad hoc interdepartmental program " graduate level strengths applied linguistics , inter - actional sociolinguistics pragmatics . appreciate advice matter . particular , ' re wondering is consensus set essential journals undergraduate major . pragmatics
>>> ('notspam', 0.9999987)
2_unknown.txt: Subject: query

In [45]:
# One aligned line per test file: name, predicted label, and confidence with
# 10 decimals. Iterating with zip avoids binding a module-level `id`, which
# would shadow the built-in id().
for name, (predicted_label, confidence) in zip(filename, predictions_list):
    formatted_num = "{:.10f}".format(confidence)
    print(f"{name.rjust(14)} >>> {predicted_label.rjust(7)} >>> {formatted_num.rjust(10)}")

 0_unknown.txt >>> notspam >>> 0.9999990463
 1_unknown.txt >>> notspam >>> 0.9999986887
 2_unknown.txt >>> notspam >>> 0.9999973774
 3_unknown.txt >>> notspam >>> 0.9999986887
 4_unknown.txt >>> notspam >>> 0.9999979734
 5_unknown.txt >>> notspam >>> 0.9999984503
 6_unknown.txt >>> notspam >>> 0.9999952316
 7_unknown.txt >>> notspam >>> 0.9999991655
 8_unknown.txt >>> notspam >>> 0.9999988079
 9_unknown.txt >>> notspam >>> 0.9999952316
10_unknown.txt >>> notspam >>> 0.9999982119
11_unknown.txt >>> notspam >>> 0.9999976158
12_unknown.txt >>> notspam >>> 0.9999996424
13_unknown.txt >>> notspam >>> 0.9999982119
14_unknown.txt >>> notspam >>> 0.9999985695
15_unknown.txt >>> notspam >>> 0.9999988079
16_unknown.txt >>> notspam >>> 0.9999979734
17_unknown.txt >>> notspam >>> 0.9999984503
18_unknown.txt >>> notspam >>> 0.9999980927
19_unknown.txt >>> notspam >>> 0.9999973774
20_unknown.txt >>> notspam >>> 0.9999983311
21_unknown.txt >>> notspam >>> 0.9999980927
22_unknown.txt >>> notspam >>> 0