In [26]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from tqdm import tqdm
from transformers import BertForSequenceClassification, AdamW
from sklearn.metrics import precision_score, recall_score, f1_score
from tqdm import tqdm
import torch
import warnings


In [3]:

# 1. Loading the Dataset
data = pd.read_csv('/kaggle/input/sonnnnnn/vodafone_data.csv', delimiter=';')

In [10]:
# Displaying the first few rows
print("Original Data:")
print(data.head())


Original Data:
                                         Explanation  Target
0  Vodafone'a ıyy diyen hayatında hiç vodafone ku...       1
1  Her yerde çekiyor diye geçtiğim güne lanet ols...       0
2  Vodafone benim için bitti artık her yerde çeki...       0
3  Vodafone Sizin de içine mi sızdılar yoksa bili...       0
4  Saf olmamak gerekir. Güvenlik açığı demek olay...       0


In [11]:
# 2. Downloading the Turkish Stop Words List
nltk.download('stopwords')
stop_words = set(stopwords.words('turkish'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# 3. Cleaning Function
def clean_text(text):
    if isinstance(text, str):  # If the input is a string, clean it
        # Remove special characters (keep numbers)
        text = re.sub(r'[^\w\s]', '', text)
        # Convert to lowercase
        text = text.lower()
        # Remove stop words
        text = " ".join([word for word in text.split() if word not in stop_words])
    else:
        text = str(text)  # If the input is not a string, convert it to a string
    return text

# Clean the text in the 'Explanation' column
data['Explanation'] = data['Explanation'].apply(clean_text)

# Display the cleaned data
print("Cleaned Data:")
print(data.head())


Cleaned Data:
                                         Explanation  Target
0  vodafonea ıyy diyen hayatında vodafone kullanm...       1
1  yerde çekiyor geçtiğim güne lanet olsun çekiyo...       0
2  vodafone benim bitti artık yerde çekiyor aldık...       0
3  vodafone sizin içine mi sızdılar yoksa bilinçl...       0
4  saf olmamak gerekir güvenlik açığı demek olayı...       0


In [15]:
# 4. Splitting the Dataset into Training, Validation, and Test Sets
X = data['Explanation']  # Text data
y = data['Target']  # Labels (0, 1, 2)

# Splitting into training and test sets (70% training, 30% test)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Splitting the test set into validation and test sets (15% validation, 15% test)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


In [18]:
# 5. Formatting for BERT
# Loading the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization and formatting function
def bert_format(texts, labels, max_len=128):
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded_dict = tokenizer.encode_plus(
            text,  # Text to tokenize
            add_special_tokens=True,  # Adding [CLS] and [SEP]
            max_length=max_len,  # Maximum length
            padding='max_length',  # Padding to the max length
            truncation=True,  # Truncation
            return_attention_mask=True,  # Creating attention mask
            return_tensors='pt',  # Returning PyTorch tensors
        )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])

    # Converting to tensors
    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels.values)

    return input_ids, attention_masks, labels

# Converting the training, validation, and test sets to BERT format
train_inputs, train_masks, train_labels = bert_format(X_train, y_train)
val_inputs, val_masks, val_labels = bert_format(X_val, y_val)
test_inputs, test_masks, test_labels = bert_format(X_test, y_test)

# Creating DataLoaders
batch_size = 16

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)


In [29]:
# 6. Training the Model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Starting training
epochs = 4

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")

    # Training mode
    model.train()
    total_loss = 0
    total_correct = 0

    for step, batch in enumerate(tqdm(train_dataloader)):
        batch_inputs, batch_masks, batch_labels = batch

        model.zero_grad()

        outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)
        loss = outputs.loss
        logits = outputs.logits
        total_loss += loss.item()

        # Get predictions and calculate accuracy
        predictions = torch.argmax(logits, dim=1)
        correct = (predictions == batch_labels).sum().item()
        total_correct += correct

        loss.backward()
        optimizer.step()

    avg_train_loss = total_loss / len(train_dataloader)
    avg_train_accuracy = total_correct / len(X_train)
    print(f"Average Training Loss: {avg_train_loss}")
    print(f"Average Training Accuracy: {avg_train_accuracy}")

    # Validation mode
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    total_eval_correct = 0

    all_preds = []
    all_labels = []

    for batch in val_dataloader:
        batch_inputs, batch_masks, batch_labels = batch

        with torch.no_grad():
            outputs = model(batch_inputs, attention_mask=batch_masks, labels=batch_labels)

        loss = outputs.loss
        logits = outputs.logits

        total_eval_loss += loss.item()

        # Get predictions and calculate accuracy
        predictions = torch.argmax(logits, dim=1)
        correct = (predictions == batch_labels).sum().item()
        total_eval_correct += correct

        all_preds.extend(predictions.cpu().numpy())
        all_labels.extend(batch_labels.cpu().numpy())

    avg_val_loss = total_eval_loss / len(val_dataloader)
    avg_val_accuracy = total_eval_correct / len(X_val)
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    f1 = f1_score(all_labels, all_preds, average='weighted')

    print(f"Validation Loss: {avg_val_loss}")
    print(f"Validation Accuracy: {avg_val_accuracy}")
    print(f"Validation Precision: {precision}")
    print(f"Validation Recall: {recall}")
    print(f"Validation F1 Score: {f1}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/4


100%|██████████| 212/212 [19:13<00:00,  5.44s/it]


Average Training Loss: 0.7502650156054856
Average Training Accuracy: 0.6614429331756357


  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.7019997433475826
Validation Accuracy: 0.6882758620689655
Validation Precision: 0.5749364491308065
Validation Recall: 0.6882758620689655
Validation F1 Score: 0.6111047986089106
Epoch 2/4


100%|██████████| 212/212 [19:06<00:00,  5.41s/it]


Average Training Loss: 0.6198162399935272
Average Training Accuracy: 0.7409816676522768


  _warn_prf(average, modifier, msg_start, len(result))


Validation Loss: 0.6901961733465609
Validation Accuracy: 0.72
Validation Precision: 0.5969323940180448
Validation Recall: 0.72
Validation F1 Score: 0.6523276842063999
Epoch 3/4


100%|██████████| 212/212 [18:55<00:00,  5.36s/it]


Average Training Loss: 0.5071898913889561
Average Training Accuracy: 0.7903607332939089
Validation Loss: 0.6175025416457135
Validation Accuracy: 0.76
Validation Precision: 0.7743671285398821
Validation Recall: 0.76
Validation F1 Score: 0.7327264037574498
Epoch 4/4


100%|██████████| 212/212 [18:57<00:00,  5.36s/it]


Average Training Loss: 0.37615716782451236
Average Training Accuracy: 0.8518628030751035
Validation Loss: 0.6767186690931735
Validation Accuracy: 0.7655172413793103
Validation Precision: 0.7590840619764833
Validation Recall: 0.7655172413793103
Validation F1 Score: 0.7531716069004789


In [30]:
# 7. Modeli Kaydetme
model.save_pretrained('model1')
tokenizer.save_pretrained('tokenizer1')

('tokenizer1/tokenizer_config.json',
 'tokenizer1/special_tokens_map.json',
 'tokenizer1/vocab.txt',
 'tokenizer1/added_tokens.json')

In [31]:
8.
def predict(text, model, tokenizer, max_len=128):
    encoded_dict = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']

    model.eval()
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_class = torch.argmax(logits, dim=1).item()

    return predicted_class

# Kullanıcıdan giriş alma ve tahmin yapma
user_input = input("Tahmin yapmak için bir metin girin: ")
prediction = predict(user_input, model, tokenizer)
print(f"Tahmin edilen sınıf: {prediction}")

Tahmin yapmak için bir metin girin:  vodafone berbat


Tahmin edilen sınıf: 0
