# Prepare data

In [1]:
import pandas as pd

music_aug = pd.read_csv("../finalData/music_augmented.csv")
party_aug = pd.read_csv("../finalData/party_augmented.csv")
full = pd.read_csv("../CleanChats/FullChat.csv")

In [2]:
full.rename(columns={"Message": "message", "Section": "section"}, inplace=True)

In [3]:
full = full.drop(columns=["Date", "Time", "Sender"])

sections = ["Football", "Music", "Party"]
final_df = full[full["section"].isin(sections)]

music_aug["section"] = "Music"
party_aug["section"] = "Party"

final_df = pd.concat([final_df, music_aug, party_aug])

In [4]:
# remove duplicates
final_df = final_df.drop_duplicates()
final_df = final_df.reset_index(drop=True)

In [5]:
final_df["section"].value_counts()

Football    3415
Party       2715
Music       1480
Name: section, dtype: int64

In [6]:
final_df.head()

Unnamed: 0,message,section
0,"hey guys, does anyone know a good football cl...",Football
1,Is anyone managing to play here?,Football
2,I'm also looking for a soccer team,Football
3,I applied for the university team,Football
4,At UCV,Football


In [7]:
final_df.describe()

Unnamed: 0,message,section
count,7610.0,7610
unique,7601.0,3
top,,Football
freq,3.0,3415


## Tokenize

In [17]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_dataset = tokenizer(final_df["message"].to_list(), truncation=True, padding="max_length", return_tensors="tf")

2023-11-22 11:09:02.816318: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-22 11:09:03.849123: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-22 11:09:03.856741: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-22 11:09:21.773919: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:266] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [22]:
tokenized_dataset["input_ids"]

<tf.Tensor: shape=(7610, 512), dtype=int32, numpy=
array([[  101,  4931,  4364, ...,     0,     0,     0],
       [  101,  2003,  3087, ...,     0,     0,     0],
       [  101,  1045,  1005, ...,     0,     0,     0],
       ...,
       [  101,  4931, 21416, ...,     0,     0,     0],
       [  101, 11685, 10253, ...,     0,     0,     0],
       [  101,  3693,  2149, ...,     0,     0,     0]], dtype=int32)>

In [23]:
from datasets import Dataset
dataset = Dataset.from_dict({
    "input_ids": tokenized_dataset["input_ids"],
    "attention_mask": tokenized_dataset["attention_mask"],
    "labels": final_df["section"]
})

In [30]:
splits = dataset.train_test_split(test_size=0.3, shuffle=True)
dataset_train = splits["train"]
dataset_test = splits["test"]

In [31]:
dataset_train

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5327
})

# Model

In [32]:
# track accuracy during training
import numpy as np
from datasets import load_metric

metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

  metric = load_metric("accuracy")


In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=len(final_df["section"].unique())
)

training_args = TrainingArguments(
    output_dir="../finetuned_model",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_test,
    compute_metrics=compute_metrics

)

trainer.train()