In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
import string

import warnings
# Silence every warning category for the rest of the session
warnings.simplefilter("ignore")

# Location of the labelled dataset; point this at your own copy of the file
file_path = 'Data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)

def preprocess(df):
  """Clean the raw text/sentiment table and one-hot encode the labels.

  Steps, in order:
    1. keep only the ``text`` and ``sentiment`` columns; drop rows with a
       missing ``text`` and rows whose ``text`` repeats an earlier row;
    2. lower-case ``sentiment``, fold its spelling variants onto five
       canonical labels and drop rows whose label is not recognised;
    3. replace ``sentiment`` with one 0/1 integer column per label;
    4. strip ``#hashtag`` tokens, then drop rows whose text is 5 characters
       or shorter, or consists only of punctuation, whitespace or digits;
    5. strip ``@mention`` tokens and ASCII punctuation, then drop rows that
       are left with no visible characters.

  Args:
    df: DataFrame with at least a string column ``text`` and a string column
      ``sentiment``. It is not modified.

  Returns:
    A new DataFrame with columns ``text``, ``insult``, ``notoffensive``,
    ``profanity``, ``racist`` and ``sexist`` (always all five label columns,
    in that order). Rows keep their original index labels.
  """
  # Spelling variants seen in the raw labels -> canonical label
  label_aliases = {
      'sexist': 'sexist', 'insult': 'insult', 'racist': 'racist', 'profanity': 'profanity',
      'not-offensive': 'notoffensive', "notoffensive": "notoffensive",
      "not offensive": "notoffensive", "ınsult": "insult",
  }
  label_columns = ['insult', 'notoffensive', 'profanity', 'racist', 'sexist']

  # Every step below returns a new frame, so the caller's frame is never
  # touched and no write lands on a slice of it.
  df = (
      df[['text', 'sentiment']]
      .dropna(subset=['text'])          # a missing text would crash the regex steps
      .drop_duplicates(subset=['text'])
  )

  # Normalise the labels; anything outside the alias table becomes NaN and is dropped
  df = df.assign(sentiment=df['sentiment'].str.lower().map(label_aliases))
  df = df.dropna(subset=['sentiment'])

  # One 0/1 column per label. dtype=int keeps the values numeric on every
  # pandas version; reindex guarantees all five columns exist even when a
  # label never occurs in the data.
  df_dummies = (
      pd.get_dummies(df['sentiment'], drop_first=False, dtype=int)
      .reindex(columns=label_columns, fill_value=0)
  )
  df = pd.concat([df.drop(columns=['sentiment']), df_dummies], axis=1)

  # Remove words that contain a '#' in them entirely
  df = df.assign(text=df['text'].str.replace(r"#\S+", "", regex=True))

  text = df['text']
  keep = (
      (text.str.len() > 5)                  # drop rows of 5 characters or fewer
      & ~text.str.contains(r'^[\W_]+$')     # drop rows with only punctuation/symbols
      & ~text.str.isspace()                 # drop rows with only whitespace
      & ~text.str.isdigit()                 # drop rows with only digits
  )
  df = df[keep]

  # Remove @mentions, then ASCII punctuation
  punctuation_translator = str.maketrans('', '', string.punctuation)
  cleaned = (
      df['text']
      .str.replace(r"@\S+", "", regex=True)
      .str.translate(punctuation_translator)
  )
  df = df.assign(text=cleaned)

  # The filters above ran before mentions/punctuation were stripped, so a row
  # such as "@user1 @user2" passes them and only now becomes blank.
  df = df[df['text'].str.strip() != '']

  return df
  # df.to_csv("/content/drive/MyDrive/data/Preprocessed_data.csv", index=False)

# Replace the raw frame with its cleaned, one-hot-labelled version
df = df.pipe(preprocess)

In [2]:
# Preview the first five cleaned rows
df.head(n=5)

Unnamed: 0,text,insult,notoffensive,profanity,racist,sexist
0,hemen cep bank yapıyorum ozaman siteye çökücez...,0,1,0,0,0
1,geçmiş olsun fenerin anasini sik,0,0,1,0,0
2,migros adet bilet var ilgilenen varsa yazsın,0,1,0,0,0
3,çok hızlı gidenlere yavaş demek için geride du...,0,1,0,0,0
4,nolu ile fetöcü öğrencilerin tüm borcu silindi...,0,1,0,0,0


In [3]:
# Longest cleaned text, measured in characters
longest_text_chars = int(df['text'].str.len().max())

# This value is later passed to the tokenizer as its max_length (a token
# budget). BERT-base checkpoints have 512 position embeddings, so a longer
# sequence would fail in the forward pass; cap it here. For the current data
# (397 characters) the cap changes nothing.
max_length = min(longest_text_chars, 512)

# Only the text and the binary "not offensive" target are needed from here on
df = df[['text','notoffensive']]
print(f"The maximum length of the text column is: {longest_text_chars} characters")


The maximum length of the text column is: 397 characters


In [4]:
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
from torch.utils.data import Dataset, DataLoader

# Load the BERTurk tokenizer from the "dbmdz/bert-base-turkish-uncased"
# checkpoint; its files are downloaded when not already available locally
# (see the progress output below this cell).
tokenizer = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

# Define a custom dataset class
class TextDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Indexable sequence of strings.
        labels: Indexable sequence of integer-like class labels, aligned with
            ``texts`` by position.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding='max_length', max_length=..., return_tensors='pt')`` and
            returning a mapping with ``'input_ids'`` and ``'attention_mask'``
            tensors.
        max_length: Length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the encoded text and label at position ``idx``.

        Returns:
            Dict with ``'input_ids'`` and ``'attention_mask'`` (each with the
            leading batch axis removed) and ``'labels'`` (a scalar
            ``torch.long`` tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )
        return {
            # squeeze(0) removes only the leading batch axis added by
            # return_tensors='pt'. A bare squeeze() would also collapse the
            # sequence axis when max_length == 1 and hand the collate step a
            # 0-dim tensor.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Hold out 20% of the rows; random_state pins the shuffle so the split is repeatable
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(),
    df['notoffensive'].tolist(),
    test_size=0.2,
    random_state=42,
)

# Wrap each split in a TextDataset
train_dataset, test_dataset = (
    TextDataset(split_texts, split_labels, tokenizer, max_length=max_length)
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)


Downloading tokenizer_config.json: 100%|██████████| 59.0/59.0 [00:00<00:00, 29.1kB/s]
Downloading config.json: 100%|██████████| 385/385 [00:00<00:00, 141kB/s]
Downloading vocab.txt: 100%|██████████| 263k/263k [00:00<00:00, 669kB/s]


In [5]:
# Halve the held-out data: one half for validation, the other for the final test
validation_texts, final_test_texts, validation_labels, final_test_labels = train_test_split(
    test_texts,
    test_labels,
    test_size=0.5,
    random_state=42,
)

# Wrap the validation and final test halves in TextDataset instances
validation_dataset, final_test_dataset = (
    TextDataset(split_texts, split_labels, tokenizer, max_length=max_length)
    for split_texts, split_labels in (
        (validation_texts, validation_labels),
        (final_test_texts, final_test_labels),
    )
)


In [6]:
# Training batches are small and reshuffled; adjust the batch sizes to your hardware
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# Evaluation batches are larger and keep a fixed order
validation_loader, test_loader = (
    DataLoader(dataset=eval_dataset, batch_size=64, shuffle=False)
    for eval_dataset in (validation_dataset, final_test_dataset)
)


In [7]:
from transformers import AutoModelForSequenceClassification

# Load the pretrained "dbmdz/bert-base-turkish-uncased" weights with a 2-class
# sequence-classification head. As the message printed below this cell says,
# classifier.weight / classifier.bias are newly initialised, so the model must
# be fine-tuned before its predictions mean anything.
model = AutoModelForSequenceClassification.from_pretrained("dbmdz/bert-base-turkish-uncased", num_labels=2)


Downloading pytorch_model.bin: 100%|██████████| 445M/445M [08:31<00:00, 870kB/s] 
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dbmdz/bert-base-turkish-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# transformers' own AdamW is deprecated and cannot be imported from newer
# releases; torch.optim.AdamW is the supported replacement.
from torch.optim import AdamW
from transformers import get_scheduler

# eps=1e-6 and weight_decay=0.0 are the defaults of the deprecated
# transformers.AdamW; torch's defaults (1e-8 and 0.01) would otherwise change
# the training configuration.
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
num_training_steps = len(train_loader) * 3  # Assuming 3 epochs
# Linear decay from the initial learning rate over all training steps, no warm-up
lr_scheduler = get_scheduler(
    "linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)


In [None]:
import torch
from tqdm.auto import tqdm

# Run on the GPU when one is visible, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# One progress-bar tick per optimizer step; per-epoch metrics are kept for later plotting
progress_bar = tqdm(range(num_training_steps))
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

model.train()
for epoch in range(3):  # 3 epochs
    # ---------------- training pass ----------------
    train_loss = 0
    num_correct = 0
    num_total = 0

    for batch in train_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        # The batch carries 'labels', so the model output includes the loss
        outputs = model(**batch)
        loss = outputs.loss

        # Backpropagate, update the weights, advance the LR schedule, then clear gradients
        loss.backward()
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        train_loss += loss.item()
        predictions = outputs.logits.argmax(dim=-1)
        num_correct += predictions.eq(batch['labels']).sum().item()
        num_total += batch['labels'].size(0)

        progress_bar.update(1)

    # Loss is averaged over batches, accuracy over samples
    avg_train_loss = train_loss / len(train_loader)
    train_accuracy = num_correct / num_total

    # ---------------- validation pass ----------------
    model.eval()
    val_loss = 0
    num_correct = 0
    num_total = 0

    with torch.no_grad():
        for batch in validation_loader:
            batch = {name: tensor.to(device) for name, tensor in batch.items()}
            outputs = model(**batch)

            val_loss += outputs.loss.item()
            predictions = outputs.logits.argmax(dim=-1)
            num_correct += predictions.eq(batch['labels']).sum().item()
            num_total += batch['labels'].size(0)

    avg_val_loss = val_loss / len(validation_loader)
    val_accuracy = num_correct / num_total

    # Report this epoch's metrics
    print(f"Epoch {epoch + 1}:")
    print(f"Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}")
    print(f"Validation Loss: {avg_val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}")

    # Keep the history for later plotting
    train_losses.append(avg_train_loss)
    train_accuracies.append(train_accuracy)
    val_losses.append(avg_val_loss)
    val_accuracies.append(val_accuracy)

    # Back to training mode for the next epoch
    model.train()


  0%|          | 0/8496 [00:00<?, ?it/s]

Epoch 1:
Training Loss: 0.1910, Training Accuracy: 0.9252
Validation Loss: 0.1583, Validation Accuracy: 0.9375
Epoch 2:
Training Loss: 0.1273, Training Accuracy: 0.9492
Validation Loss: 0.1630, Validation Accuracy: 0.9361
Epoch 3:
Training Loss: 0.0767, Training Accuracy: 0.9700
Validation Loss: 0.1993, Validation Accuracy: 0.9363


In [None]:
# Persist only the weights (the state dict), not the whole model object
torch.save(obj=model.state_dict(), f="/content/drive/MyDrive/model1_v1.pth")

In [10]:
# map_location="cpu" lets a checkpoint written on a GPU machine load on a
# CPU-only one; load_state_dict copies the values into the model's existing
# parameters, so the model stays on whatever device it already occupies.
# NOTE(review): the save cell writes "/content/drive/MyDrive/model1_v1.pth"
# but this reads a relative "model1_v1.pth" -- confirm the file is copied here.
model.load_state_dict(torch.load("model1_v1.pth", map_location="cpu"))

<All keys matched successfully>

In [20]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix, matthews_corrcoef
# Evaluate on the held-out test set with the model in inference mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

criterion = torch.nn.CrossEntropyLoss()
test_loss = 0
num_correct = 0
num_total = 0

# Per-sample predictions and ground-truth labels, gathered across batches
predictions = []
actuals = []

with torch.no_grad():
    for batch in test_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        logits = outputs.logits
        test_loss += criterion(logits, batch['labels']).item()

        preds = logits.argmax(dim=-1)

        num_correct += preds.eq(batch['labels']).sum().item()
        num_total += batch['labels'].size(0)

        predictions.extend(preds.cpu().numpy())
        actuals.extend(batch['labels'].cpu().numpy())

# Loss is averaged over batches, accuracy over samples
avg_test_loss = test_loss / len(test_loader)
test_accuracy = num_correct / num_total

predictions = np.asarray(predictions)
actuals = np.asarray(actuals)


In [24]:

# Support-weighted F1, precision and recall, computed in that order
f1, precision, recall = (
    metric(actuals, predictions, average='weighted')
    for metric in (f1_score, precision_score, recall_score)
)

# Rows are true classes, columns are predicted classes
conf_matrix = confusion_matrix(actuals, predictions)

mcc = matthews_corrcoef(actuals, predictions)

print(f"Test Loss: {avg_test_loss:.4f}, Test Accuracy: {test_accuracy:.4f}, Test F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, MCC: {mcc:.4f}")

print("Confusion Matrix:")
print(conf_matrix)

Test Loss: 0.2089, Test Accuracy: 0.9375, Test F1 Score: 0.9373, Precision: 0.9372, Recall: 0.9375, MCC: 0.8475
Confusion Matrix:
[[1452  191]
 [ 163 3857]]
