<a href="https://colab.research.google.com/github/IgorCzudy/whatsapp_message_clasterization/blob/main/notebooks/GC_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
! pip install transformers datasets evaluate
! pip install accelerate -U



# Prepare data

In [17]:
import pandas as pd
# Augmented message sets for the "Music" and "Party" sections.
music_aug = pd.read_csv(
    "https://raw.githubusercontent.com/IgorCzudy/whatsapp_message_clasterization/main/finalData/music_augmented.csv"
)
party_aug = pd.read_csv(
    "https://raw.githubusercontent.com/IgorCzudy/whatsapp_message_clasterization/main/finalData/party_augmented.csv"
)

# The cleaned full chat export; it provides the Message / Section columns used below.
full = pd.read_csv(
    "https://raw.githubusercontent.com/IgorCzudy/whatsapp_message_clasterization/main/CleanChats/FullChat.csv"
)

In [18]:
# Lower-case the column names so they line up with the augmented frames.
# Rebinding (instead of inplace=True) avoids mutating the loaded frame in place.
full = full.rename(columns={"Message": "message", "Section": "section"})

In [19]:
# Sender and timestamp columns are not used for the classification task.
full = full.drop(["Date", "Time", "Sender"], axis="columns")

# Keep only the rows that belong to one of the four target sections.
sections = ["Football", "Music", "Party", "General"]
final_df = full.loc[full["section"].isin(sections)]

# Tag every augmented row with the section its file was generated for.
music_aug["section"] = "Music"
party_aug["section"] = "Party"

# Stack the original and augmented rows (the index is reset in a later cell).
final_df = pd.concat([final_df, music_aug, party_aug], axis=0)

In [20]:
# Drop fully identical rows, then renumber the index so it is contiguous again.
final_df = final_df.drop_duplicates().reset_index(drop=True)

In [21]:
# Class balance per section after merging the augmented data and dropping duplicates.
final_df["section"].value_counts()

Football    3415
Party       2715
General     2636
Music       1480
Name: section, dtype: int64

In [22]:
# Integer id of every section. The model's id2label / label2id mappings must use
# the same encoding.
SECTION_TO_ID = {"General": 0, "Football": 1, "Party": 2, "Music": 3}

# The Hugging Face tooling used below reads the columns `text` and `label`.
final_df = final_df.rename(columns={"section": "label", "message": "text"})

# One dictionary-driven replace instead of four separate passes. The explicit cast
# makes the integer dtype independent of pandas' deprecated silent downcasting in
# `replace`, and raises if a section name was left unmapped.
final_df["label"] = final_df["label"].replace(SECTION_TO_ID).astype("int64")
final_df.head()

Unnamed: 0,text,label
0,Someone going to the beach this afternoon?,0
1,We'll be a few minutes late \nSiuuuuuuu,0
2,locatie: https://maps.google.com/?q=39.473291...,0
3,"Hii, I also will be a few minutes late for th...",0
4,Me too,0


In [23]:

# Sanity check on the encoded labels: row count, value range and quartiles.
final_df.describe()

Unnamed: 0,label
count,10246.0
mean,1.296604
std,1.006064
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,3.0


## Tokenize

In [24]:
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from datasets import Dataset

def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch with the module-level ``tokenizer``.

    Truncation is enabled; no padding is requested here.
    """
    texts = examples["text"]
    return tokenizer(texts, truncation=True)

# Tokenizer of the same checkpoint that is fine-tuned in the model section.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Wrap the DataFrame as a Hugging Face Dataset and tokenize it batch-wise.
data = Dataset.from_pandas(final_df)
tokenized_dataset = data.map(preprocess_function, batched=True)

# No padding was applied during tokenization; the collator handles it when batches are built.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/10246 [00:00<?, ? examples/s]

In [25]:
# Inspect one tokenized example: original text, label, input_ids and attention_mask.
tokenized_dataset[1]


{'text': " We'll be a few minutes late \nSiuuuuuuu",
 'label': 0,
 'input_ids': [101,
  2057,
  1005,
  2222,
  2022,
  1037,
  2261,
  2781,
  2397,
  9033,
  2226,
  2226,
  2226,
  2226,
  2226,
  2226,
  2226,
  102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [26]:
# Hold out 30% of the examples for evaluation. The fixed seed keeps the shuffled
# train/test membership identical across kernel restarts.
SPLIT_SEED = 42
splits = tokenized_dataset.train_test_split(test_size=0.3, shuffle=True, seed=SPLIT_SEED)
dataset_train = splits["train"]
dataset_test = splits["test"]

# Model

In [27]:
# track accuracy during training
import numpy as np
import evaluate

# Accuracy metric object from the `evaluate` library, reused for every evaluation pass.
accuracy = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Return the accuracy for one evaluation pass.

    ``eval_pred`` unpacks into (model outputs, reference labels); the predicted
    class of each example is the arg-max over axis 1 of the model outputs.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [28]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments


# Class ids must match the integer encoding of the `label` column
# (General=0, Football=1, Party=2, Music=3) and cover exactly 0..num_labels-1.
# Otherwise predictions are reported under the wrong name, or have no name at all.
id2label = {0: "GENERAL", 1: "FOOTBALL", 2: "PARTY", 3: "MUSIC"}
# Derive the inverse mapping so the two dictionaries cannot drift apart.
label2id = {name: idx for idx, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=len(id2label), id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    # Where checkpoints are written.
    output_dir="whatsappmodel",
    # Optimisation hyper-parameters.
    num_train_epochs=4,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch; both strategies are set to the same
    # value, and the best checkpoint is reloaded when training finishes.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

# Wire the model, data splits, tokenizer/collator and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned TrainOutput is shown as the cell result.
trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.449892,0.836044
2,0.571500,0.435347,0.835719
3,0.343300,0.486174,0.844177
4,0.243500,0.523096,0.840273


TrainOutput(global_step=1796, training_loss=0.3578791225407862, metrics={'train_runtime': 218.709, 'train_samples_per_second': 131.17, 'train_steps_per_second': 8.212, 'total_flos': 432949947398976.0, 'train_loss': 0.3578791225407862, 'epoch': 4.0})