In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

import warnings
warnings.filterwarnings("ignore")

# Make sure to replace the path with the path to your specific file
file_path = 'Data.csv'
df = pd.read_csv(file_path)

def preprocess(df):

  df = df[['text','sentiment']]
  texts = df['text']

  # drop same texts
  df.drop_duplicates(subset = ['text'], inplace = True)

  df['sentiment'] = df['sentiment'].str.lower()
  df['sentiment'] = df['sentiment'].map({'sexist': 'sexist', 'insult': 'insult', 'racist': 'racist', 'profanity': 'profanity'
  , 'not-offensive': 'notoffensive', "notoffensive": "notoffensive", "not offensive" : "notoffensive","ınsult": "insult"})
    # Create dummy columns without dropping any columns
  df_dummies = pd.get_dummies(df['sentiment'], drop_first=False)

  # Concatenate the dummy columns with the original DataFrame
  df = pd.concat([df, df_dummies], axis=1)
  df = df.dropna(subset=['sentiment'])
  df.drop(columns = ['sentiment'],inplace = True)

  # Remove words that contain a '#' in them entirely
  df["text"] = df["text"].apply(lambda text: re.sub(r"#\S+", "", text))

  # Remove rows with less than 5 characters
  df = df[df['text'].str.len() > 5]

  # Remove rows with only punctuation
  df = df[~df['text'].str.contains(r'^[\W_]+$')]

  # Remove rows with only whitespace
  df = df[~df['text'].str.isspace()]

  # Remove rows with only digits
  df = df[~df['text'].str.isdigit()]

  # Apply the function to the "text" column
  df["text"] = df["text"].apply(lambda text: re.sub(r"@\S+", "", text))

  # Remove punctuation from the 'text' column
  punctuation_translator = str.maketrans('', '', string.punctuation)
  df["text"] = df["text"].apply(lambda text: text.translate(punctuation_translator))


  return df
  # df.to_csv("/content/drive/MyDrive/data/Preprocessed_data.csv", index=False)

df = preprocess(df)

In [3]:
df.head()

Unnamed: 0,text,insult,notoffensive,profanity,racist,sexist
0,hemen cep bank yapıyorum ozaman siteye çökücez...,False,True,False,False,False
1,geçmiş olsun fenerin anasini sik,False,False,True,False,False
2,migros adet bilet var ilgilenen varsa yazsın,False,True,False,False,False
3,çok hızlı gidenlere yavaş demek için geride du...,False,True,False,False,False
4,nolu ile fetöcü öğrencilerin tüm borcu silindi...,False,True,False,False,False


In [4]:
# Find the maximum length of the text column
max_length = df['text'].apply(len).max()
df = df[['text','insult','profanity','racist','sexist']]
print(f"The maximum length of the text column is: {max_length} characters")


The maximum length of the text column is: 397 characters


In [5]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

# Load the BERTurk tokenizer
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

class MultiLabelDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(self.labels[idx], dtype=torch.float)
        }

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

In [6]:
combined_labels = df[['sexist', 'racist', 'profanity', 'insult']].values.tolist()

# Split the dataset into training and validation/test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), combined_labels, test_size=0.2, random_state=42
)

# Create dataset instances
train_dataset = MultiLabelDataset(train_texts, train_labels, tokenizer, max_length=128)
test_dataset = MultiLabelDataset(test_texts, test_labels, tokenizer, max_length=128)


# Split test data into validation and test sets
validation_texts, final_test_texts, validation_labels, final_test_labels = train_test_split(
    test_texts, test_labels, test_size=0.5, random_state=42
)

# Create instances of the dataset class for validation and final test sets
validation_dataset = MultiLabelDataset(validation_texts, validation_labels, tokenizer, max_length=max_length)
final_test_dataset = MultiLabelDataset(final_test_texts, final_test_labels, tokenizer, max_length=max_length)


In [7]:
# Define batch sizes as per your requirement
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
validation_loader = DataLoader(validation_dataset, batch_size=64, shuffle=False)
test_loader = DataLoader(final_test_dataset, batch_size=64, shuffle=False)

In [8]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(
    "dbmdz/bert-base-turkish-uncased",
    num_labels=4  # Number of output labels
)


pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from transformers import AdamW, get_scheduler

optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3  # Assuming 3 epochs
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


In [10]:
import torch
import numpy as np
from tqdm.auto import tqdm
from torch.nn import BCEWithLogitsLoss
from sklearn.metrics import f1_score
# Initialize the device and loss function
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
criterion = BCEWithLogitsLoss()

progress_bar = tqdm(range(num_training_steps))
train_losses, val_losses = [], []


# Training loop
model.train()
for epoch in range(3):  # 3 epochs
    train_loss = 0
    num_correct = 0
    num_total = 0
    training_f1  = []
    validation_f1 = []
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass and calculate loss
        outputs = model(**batch)
        logits = outputs.logits
        loss = criterion(logits, batch['labels'])

        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()

        # Calculate accuracy per batch
        preds = torch.sigmoid(logits) > 0.5
        num_correct += (preds == batch['labels']).all(dim=1).sum().item()
        num_total += len(batch['labels'])
        training_f1.append(f1_score(preds.cpu().detach().numpy(),batch["labels"].cpu().detach().numpy(),average = "weighted"))

        progress_bar.update(1)

    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = num_correct / num_total

    # Validation loop
    model.eval()
    val_loss = 0
    num_correct = 0
    num_total = 0

    for batch in validation_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.no_grad():
            outputs = model(**batch)

        logits = outputs.logits
        val_loss += criterion(logits, batch['labels']).item()

        preds = torch.sigmoid(logits) > 0.5
        num_correct += (preds == batch['labels']).all(dim=1).sum().item()
        num_total += len(batch['labels'])
        validation_f1.append(f1_score(preds.cpu().detach().numpy(),batch["labels"].cpu().detach().numpy(),average = "weighted"))
    avg_val_loss = val_loss / len(validation_loader)
    val_accuracy = num_correct / num_total

    # Print results and update tracking lists
    print(f"Epoch {epoch + 1}:")
    print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}, Training F1 Score: {np.array(training_f1).mean()}")
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}, Validation F1 Score: {np.array(validation_f1).mean()}")

    train_losses.append(avg_train_loss)
    val_losses.append(avg_val_loss)

    model.train()

  0%|          | 0/8496 [00:00<?, ?it/s]

Epoch 1:
Training Loss: 0.0808, Training Accuracy: 0.9032, Training F1 Score: 0.8261616072168362
Validation Loss: 0.0608, Validation Accuracy: 0.9204, Validation F1 Score: 0.8541045250187208
Epoch 2:
Training Loss: 0.0523, Training Accuracy: 0.9308, Training F1 Score: 0.8630365447669178
Validation Loss: 0.0545, Validation Accuracy: 0.9230, Validation F1 Score: 0.8439140426076153
Epoch 3:
Training Loss: 0.0378, Training Accuracy: 0.9503, Training F1 Score: 0.8945652599104089
Validation Loss: 0.0586, Validation Accuracy: 0.9227, Validation F1 Score: 0.8437894614667618


In [12]:
# Test the model with the test data and calculate the F1 score, test accuracy, precision, recall, confusion matrix and matthews correlation coefficient

from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, matthews_corrcoef

model.eval()

test_loss = 0
num_correct = 0
num_total = 0

predictions = []
actuals = []

for batch in test_loader:
    batch = {k: v.to(device) for k, v in batch.items()}

    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    test_loss += criterion(logits, batch['labels']).item()

    preds = torch.sigmoid(logits) > 0.5
    num_correct += (preds == batch['labels']).all(dim=1).sum().item()
    num_total += len(batch['labels'])

    predictions.extend(preds.cpu().detach().numpy())
    actuals.extend(batch['labels'].cpu().detach().numpy())


avg_test_loss = test_loss / len(test_loader)
test_accuracy = num_correct / num_total

predictions = np.array(predictions)
actuals = np.array(actuals)

f1 = f1_score(actuals, predictions, average='weighted')

precision = precision_score(actuals, predictions, average='weighted')

recall = recall_score(actuals, predictions, average='weighted')

conf_matrix = confusion_matrix(actuals.argmax(axis=1), predictions.argmax(axis=1))

mcc = matthews_corrcoef(actuals.argmax(axis=1), predictions.argmax(axis=1))

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, MCC: {mcc:.4f}")

print("Confusion Matrix:")
print(conf_matrix)

# Save the model
torch.save(model.state_dict(), "model2_v1.pth")

Test Loss: 0.0640, Test Accuracy: 0.9227, Test F1 Score: 0.8381, Precision: 0.8394, Recall: 0.8405, MCC: 0.8352
Confusion Matrix:
[[3865   32   44   94]
 [  32  100    3    2]
 [  26    2 1009   15]
 [ 120   16   34  269]]
