<a href="https://colab.research.google.com/github/IgorCzudy/whatsapp_message_clasterization/blob/main/notebooks/GC_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
! pip install transformers datasets evaluate
! pip install accelerate -U



# Prepare data

In [16]:
import pandas as pd
# Fetch the three CSV files from the project repository, in this order:
# augmented music messages, augmented party messages, the full cleaned chat.
music_aug, party_aug, full = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://raw.githubusercontent.com/IgorCzudy/whatsapp_message_clasterization/main/finalData/music_augmented.csv",
        "https://raw.githubusercontent.com/IgorCzudy/whatsapp_message_clasterization/main/finalData/party_augmented.csv",
        "https://raw.githubusercontent.com/IgorCzudy/whatsapp_message_clasterization/main/CleanChats/FullChat.csv",
    )
)

In [17]:
# Lower-case the two column names that the following cells refer to.
column_names = {"Message": "message", "Section": "section"}
full = full.rename(columns=column_names)

In [18]:
# Drop the metadata columns that are not used below. errors="ignore" keeps the
# cell re-runnable: without it a second run raises KeyError because the columns
# are already gone from `full`.
full = full.drop(columns=["Date", "Time", "Sender"], errors="ignore")

# Keep only the rows whose section is one of the four classes of interest.
sections = ["Football", "Music", "Party", "General"]
final_df = full[full["section"].isin(sections)]

# Tag every row of each augmented file with its section.
music_aug["section"] = "Music"
party_aug["section"] = "Party"

# Stack the chat rows and the augmented rows; the index is reset in a later cell.
final_df = pd.concat([final_df, music_aug, party_aug])

In [19]:
# Discard fully identical rows, then renumber the index from 0.
final_df = final_df.drop_duplicates().reset_index(drop=True)

In [20]:
# Number of rows per section after de-duplication (shows the class balance).
final_df["section"].value_counts()

Football    3415
Party       2715
General     2636
Music       1480
Name: section, dtype: int64

In [21]:
# Integer class id for each section name.
section_to_id = {"General": 0, "Football": 1, "Party": 2, "Music": 3}

# Column names used by the tokenization and training cells.
final_df = final_df.rename(columns={"section": "label", "message": "text"})

# One replace() with a mapping instead of four separate calls. Values that are
# not keys of the mapping are left untouched, so re-running the cell on already
# encoded labels is a no-op. astype() makes the integer dtype explicit rather
# than relying on replace() to downcast the object column.
final_df["label"] = final_df["label"].replace(section_to_id).astype("int64")
final_df.head()

Unnamed: 0,text,label
0,Someone going to the beach this afternoon?,0
1,We'll be a few minutes late \nSiuuuuuuu,0
2,locatie: https://maps.google.com/?q=39.473291...,0
3,"Hii, I also will be a few minutes late for th...",0
4,Me too,0


In [22]:

# Summary statistics of the numeric column(s); here that is the encoded label.
final_df.describe()

Unnamed: 0,label
count,10246.0
mean,1.296604
std,1.006064
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,3.0


## Tokenize

In [23]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets import Dataset

# Tokenizer of the base checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")


def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with truncation enabled.

    Reads the module-level ``tokenizer`` at call time.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)


# pandas DataFrame -> datasets.Dataset, tokenized batch by batch.
data = Dataset.from_pandas(final_df)
tokenized_dataset = data.map(preprocess_function, batched=True)

# Collator built from the same tokenizer; it is handed to the Trainer later on.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/10246 [00:00<?, ? examples/s]

In [24]:
# Look at one tokenized row (index 1) to check the fields added by the tokenizer.
tokenized_dataset[1]


{'text': " We'll be a few minutes late \nSiuuuuuuu",
 'label': 0,
 'input_ids': [101,
  2057,
  1005,
  2222,
  2022,
  1037,
  2261,
  2781,
  2397,
  9033,
  2226,
  2226,
  2226,
  2226,
  2226,
  2226,
  2226,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
# Fixed seed so the shuffled 70/30 split, and therefore the evaluation numbers,
# are the same on every run.
SPLIT_SEED = 42

# NOTE(review): the split is random over all rows, including the augmented
# Music/Party rows. If those are variants of original messages, near-duplicates
# can land on both sides of the split — confirm.
splits = tokenized_dataset.train_test_split(test_size=0.3, shuffle=True, seed=SPLIT_SEED)
dataset_train, dataset_test = splits["train"], splits["test"]

# Model

In [37]:
# track accuracy during training
import numpy as np
import evaluate

# Metric objects are created once here and reused on every evaluation pass.
accuracy = evaluate.load("accuracy")
recall = evaluate.load("recall")
f1m = evaluate.load("f1")
precision = evaluate.load("precision")


def compute_metrics(eval_pred):
    """Return accuracy and macro-averaged precision, recall and F1.

    ``eval_pred`` unpacks into ``(predictions, labels)``; the predicted class of
    each row is the argmax of the predictions along axis 1.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    # Keyword arguments shared by the three macro-averaged metrics.
    macro_kwargs = {
        "predictions": predicted_ids,
        "references": references,
        "average": "macro",
    }

    return {
        "accuracy": accuracy.compute(predictions=predicted_ids, references=references)["accuracy"],
        "precision": precision.compute(**macro_kwargs)["precision"],
        "recall": recall.compute(**macro_kwargs)["recall"],
        "f1": f1m.compute(**macro_kwargs)["f1"],
    }

# Log in to publish the model to Hugging Face


In [27]:
from huggingface_hub import notebook_login

# Opens the Hugging Face login widget. The training cell sets push_to_hub=True,
# so the session presumably has to be authenticated before training — confirm
# that the token has write access.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [38]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

from transformers import set_seed

SEED = 42
# Seed before the model is built: the classification head is newly initialised
# when the checkpoint is loaded, which happens before the Trainer exists, so
# seeding only through TrainingArguments would leave that step unseeded.
set_seed(SEED)

# Class ids must match the integer labels assigned in the data-preparation cell.
id2label = {0: "GENERAL", 1: "FOOTBALL", 2: "PARTY", 3: "MUSIC"}
# Derived from id2label so the two mappings cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(id2label), id2label=id2label, label2id=label2id,
)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` (and Trainer's `tokenizer` to `processing_class`). The install
# cell is unpinned, so confirm these keywords against the installed version.
training_args = TrainingArguments(
    output_dir="whatsapp-group-classifier",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # No metric_for_best_model is given, so the "best" checkpoint is the one
    # with the lowest evaluation loss, not the highest accuracy/F1.
    load_best_model_at_end=True,
    push_to_hub=True,
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.434363,0.841574,0.864731,0.842684,0.851658
2,0.554600,0.460606,0.818803,0.854971,0.838875,0.839062
3,0.313600,0.512457,0.846779,0.868586,0.852836,0.859754
4,0.200800,0.603974,0.838647,0.855957,0.84698,0.85118
5,0.147500,0.732558,0.838647,0.859225,0.847554,0.852764
6,0.081100,0.786516,0.839297,0.853665,0.84831,0.850759


TrainOutput(global_step=2694, training_loss=0.2483255128640994, metrics={'train_runtime': 650.5749, 'train_samples_per_second': 66.145, 'train_steps_per_second': 4.141, 'total_flos': 1289158376092704.0, 'train_loss': 0.2483255128640994, 'epoch': 6.0})

Publishing to the Hugging Face Hub requires being logged in with an access token.

In [39]:
# Uncomment to upload the trained model to the Hugging Face Hub
# (requires being logged in with an access token).
#trainer.push_to_hub()

events.out.tfevents.1700927728.d7f5b7938fe3.174.6:   0%|          | 0.00/8.39k [00:00<?, ?B/s]

'https://huggingface.co/DTempo/whatsapp-group-classifier/tree/main/'

## Inference

In [40]:
from transformers import pipeline
import transformers
# Change model_id to the Hub repository you want to use.
# A separate name is used so `model` keeps referring to the trained network
# instead of being rebound to a string.
model_id = "Dtempo/whatsapp-group-classifier"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Pass the tokenizer loaded above so it is actually used, and bind the result to
# `classifier` so the imported `pipeline` function is not shadowed by an object.
classifier = transformers.pipeline("text-classification", model=model_id, tokenizer=tokenizer)

# One sample message per expected class: music, football, party.
sequences = classifier(["Hey everyone! There's a jazz concert at Jimmy Glass Jazz Bar this Friday night. Who's up for some  tunes?",
                        "Lest go do some running tomorrow ",
                        "Hello, any plans for tonight? I heard Bamboo Pub has a free entry until 00:30. What do you think?"])
for seq in sequences:
    print(f"Result: {seq}")


tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/913 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

Result: {'label': 'MUSIC', 'score': 0.9926152229309082}
Result: {'label': 'FOOTBALL', 'score': 0.9275616407394409}
Result: {'label': 'PARTY', 'score': 0.9916572570800781}
