In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast
import matplotlib.pyplot as plt
from PIL import Image

# Select the compute device: use the GPU when one is available, otherwise fall
# back to the CPU so the notebook still runs on CPU-only machines.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")



import kagglehub
import os
# Download the latest version of the dataset; the call returns the local
# directory that holds the downloaded files.
path = kagglehub.dataset_download("nelgiriyewithana/mcdonalds-store-reviews")

print("Path to dataset files:", path)
print(os.listdir(path))

# Build the CSV location from the returned directory instead of hard-coding an
# absolute cache path, which is only valid for one user, machine and dataset
# version.
file_path = os.path.join(path, "McDonald_s_Reviews.csv")
df = pd.read_csv(file_path, encoding="ISO-8859-1")
df.head()






In [None]:
# Convert the textual ratings ("1 star", "2 stars", ...) to integers by pulling
# out the number. Unlike `replace` followed by a cast, this does not rely on
# pandas silently downcasting an object column (deprecated in recent releases)
# and it tolerates stray whitespace around the text.
# Casting to `str` first keeps the cell safe to re-run once the column already
# holds integers.
df['rating'] = (
    df['rating']
    .astype(str)
    .str.extract(r'(\d+)', expand=False)
    .astype(int)
)
df.head()


In [None]:
from matplotlib import pyplot as plt
import seaborn as sns
# Horizontal bar chart of how many reviews received each rating.
rating_counts = df.groupby('rating').size()
ax = rating_counts.plot(kind='barh', color=sns.palettes.mpl_palette('Dark2'))
# Hide the top and right borders of the chart.
ax.spines[['top', 'right']].set_visible(False)



In [None]:

from wordcloud import WordCloud, ImageColorGenerator
# Prefer the 'cleaned_review' column when it exists; otherwise fall back to the
# 'review' column so the cell also runs on a freshly loaded dataframe.
text_column = 'cleaned_review' if 'cleaned_review' in df.columns else 'review'

# Build the cloud from the full text of every review. `str(series)` would only
# yield pandas' truncated preview (a handful of rows plus the
# "Name / Length / dtype" footer), so the reviews are joined into one string.
all_review_text = " ".join(df[text_column].dropna().astype(str))

wordcloud = WordCloud(max_font_size=50, max_words=100, background_color="white").generate(all_review_text)
plt.figure()
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()



In [None]:

from collections import Counter

def term_frequency(text):
    """Calculates the term frequency for a given text.

    Args:
        text: The text to analyse; words are obtained by splitting on
            whitespace. Non-string values (for example ``None`` or the ``NaN``
            pandas uses for missing cells) are treated as empty text.

    Returns:
        dict: Maps every distinct word to its share of all words in ``text``
        (occurrences / total number of words). Empty when there are no words.
    """
    if not isinstance(text, str):
        # Missing values have no `.split()`; report "no words" instead of
        # raising AttributeError in the middle of a DataFrame `.apply`.
        return {}

    words = text.split()
    word_counts = Counter(words)
    total_words = len(words)
    # Use a different variable name to avoid conflict with external 'tf'
    term_freq = {word: count / total_words for word, count in word_counts.items()}
    return term_freq


# Prefer the 'cleaned_review' column when it exists; otherwise fall back to the
# 'review' column so the cell also runs on a freshly loaded dataframe.
text_column = 'cleaned_review' if 'cleaned_review' in df.columns else 'review'

# Missing reviews are replaced by empty text so that every value can be split
# into words.
df['term_frequency'] = df[text_column].fillna("").apply(term_frequency)

print(df[[text_column, 'term_frequency']].head())



import matplotlib.pyplot as plt

# Flatten the per-review dictionaries into a single list of words. A review
# contributes each of its words once (dictionary keys are unique), so the
# counts below are the number of reviews in which a word appears.
all_words = [word for review_tf in df['term_frequency'] for word in review_tf.keys()]

word_counts = Counter(all_words)
# Keep the ten words with the highest counts, largest first.
ranked_words = sorted(word_counts.items(), key=lambda pair: pair[1], reverse=True)
top_words_all = dict(ranked_words[:10])

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(top_words_all.keys(), top_words_all.values())
ax.set_xlabel("Words")
ax.set_ylabel("Total Frequency")
ax.set_title("Top 10 Words Across All Reviews")
# Slant the word labels so they do not overlap.
plt.xticks(rotation=45, ha='right')
fig.tight_layout()
plt.show()


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset, random_split
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, f1_score
import re
import nltk
import contractions
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import kagglehub
import os

# Fetch the NLTK resources used below: the stop-word list, the tokenizer data
# and WordNet for the lemmatizer.
for nltk_resource in ('stopwords', 'punkt_tab', 'wordnet'):
    nltk.download(nltk_resource)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Download the dataset and read the reviews CSV from the returned directory.
path = kagglehub.dataset_download("nelgiriyewithana/mcdonalds-store-reviews")
file_path = os.path.join(path, "McDonald_s_Reviews.csv")
df = pd.read_csv(file_path, encoding="ISO-8859-1")

# Empty reviews and missing data are removed.
# The rating text (e.g. "4 stars") is first reduced to its integer value.
df['rating'] = df['rating'].astype(str).str.extract(r'(\d+)', expand=False).astype(int)
df.dropna(subset=['review'], inplace=True)
# `.copy()` makes the filtered frame independent of the frame it was sliced
# from, so the column assignments made further down do not trigger pandas'
# SettingWithCopyWarning.
df = df[df['review'].str.strip() != ""].copy()

# Lemmatizer and stop words
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one raw review into a space-separated string of lemmas.

    Steps, in order: expand contractions (e.g. "don't" -> "do not"),
    lower-case, remove every character that is not a letter, digit or
    whitespace, collapse whitespace, tokenize, drop stop words and tokens
    that are not purely alphabetic, and lemmatize what is left.
    """
    expanded = contractions.fix(text)
    lowered = expanded.lower()
    # Remove special characters.
    without_specials = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)
    # Replace runs of whitespace with one space and trim both ends.
    normalised = re.sub(r'\s+', ' ', without_specials).strip()

    lemmas = []
    for token in word_tokenize(normalised):
        # Skip stop words and anything that is not purely alphabetic.
        if token in stop_words or not token.isalpha():
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return " ".join(lemmas)

# Replace every raw review with its cleaned version.
cleaned_reviews = df['review'].apply(clean_text)
df['review'] = cleaned_reviews

# Labelling: ratings 1-2 -> class 0, rating 3 -> class 1, ratings 4-5 -> class 2.
rating_to_label = {1: 0, 2: 0, 3: 1, 4: 2, 5: 2}
df['label'] = df['rating'].map(rating_to_label)

# Write the cleaned and labelled data to disk.
df.to_csv("cleaned_review.csv", index=False)

# DATA TOKENIZATION

# Load the tokenizer for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def encode_texts(texts, tokenizer, max_length=256):
    """Convert texts into the fixed-length tensor inputs BERT understands.

    Args:
        texts: The texts to encode: a pandas Series / NumPy array (anything
            that offers ``tolist()``) or any other iterable of strings.
        tokenizer: A callable Hugging Face tokenizer.
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer's output for the whole batch. The rest of this notebook
        reads ``input_ids`` and ``attention_mask`` from it.
    """
    # Convert to a plain list.
    text_list = texts.tolist() if hasattr(texts, "tolist") else list(texts)
    # Call the tokenizer directly: `batch_encode_plus` is deprecated in favour
    # of `__call__`, which takes the same arguments for a list of texts.
    return tokenizer(
        text_list,
        max_length=max_length,
        truncation=True,        # cut sequences that exceed max_length
        padding='max_length',   # pad shorter sequences up to max_length
        return_tensors='pt',
    )

encodings = encode_texts(df['review'], tokenizer)
labels = torch.tensor(df['label'].values)

# Bundle the three tensors needed for BERT: token ids, attention mask, labels.
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'], labels)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
# Seed the split so the same reviews land in the training and validation sets
# on every run; with an unseeded split the reported metrics are not comparable
# between runs.
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# The data is handed to the model in batches, piece by piece.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Model definition
class SentimentModel(nn.Module):
    """Wrapper around ``BertForSequenceClassification`` that returns only the logits."""

    def __init__(self, num_labels=3):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained(
            'bert-base-uncased',
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask):
        """Run the wrapped model and return the ``logits`` of its output."""
        model_output = self.bert(input_ids, attention_mask=attention_mask)
        return model_output.logits

# Instantiate the classifier on the selected device, together with the
# optimizer and the loss used for training.
model = SentimentModel().to(device)
optimizer = optim.AdamW(model.parameters(), lr=2e-5) #optimizes the model weights
criterion = nn.CrossEntropyLoss()#loss function

# Model training
# Computes the loss for every batch and accuracy / F1 for every epoch.
def train(model, train_loader, val_loader, optimizer, criterion, epochs=3):
    """Train ``model`` and print training/validation metrics after every epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``; its
            output is passed to ``criterion`` and arg-maxed over dim 1.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            batches used for the parameter updates.
        val_loader: Batches handed to ``evaluate`` at the end of every epoch.
        optimizer: Optimizer that updates the parameters of ``model``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        epochs: Number of passes over ``train_loader``.

    Returns:
        None. One summary line per epoch is printed.
    """
    for epoch in range(epochs):
        model.train()  # training mode, so the layers behave as intended while fitting
        total_loss, train_preds, train_labels = 0, [], []
        for batch in train_loader:
            input_ids, attention_mask, labels = [x.to(device) for x in batch]
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)  # cross-entropy
            loss.backward()  # compute the gradients of the loss
            optimizer.step()
            total_loss += loss.item()

            # Collect the predictions made during training. The arg-max is
            # taken once per batch; the previously accumulated running count
            # of correct predictions was never read and has been removed.
            batch_preds = outputs.detach().argmax(dim=1)
            train_preds.extend(batch_preds.cpu().numpy())
            train_labels.extend(labels.cpu().numpy())

        # Training accuracy and F1 score
        train_acc = accuracy_score(train_labels, train_preds)
        train_f1 = f1_score(train_labels, train_preds, average='weighted')

        # Validation accuracy and F1 score
        val_acc, val_f1 = evaluate(model, val_loader)

        print(f"Epoch {epoch + 1}: Loss: {total_loss / len(train_loader):.4f}, "
              f"Train Accuracy: {train_acc:.4f}, Train F1: {train_f1:.4f}, "
              f"Val Accuracy: {val_acc:.4f}, Val F1: {val_f1:.4f}")

# Model evaluation
def evaluate(model, val_loader):
    """Return ``(accuracy, weighted F1)`` of ``model`` over ``val_loader``.

    The model is switched to evaluation mode and every forward pass runs with
    gradient tracking disabled.
    """
    model.eval()
    gold_labels, predicted_labels = [], []  # true labels and model predictions
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)
            # Forward pass, then take the highest-scoring class per sample.
            batch_logits = model(input_ids, attention_mask)
            predicted_labels.extend(batch_logits.argmax(dim=1).cpu().numpy())
            gold_labels.extend(labels.cpu().numpy())

    # Accuracy and weighted F1 over the whole loader.
    return (
        accuracy_score(gold_labels, predicted_labels),
        f1_score(gold_labels, predicted_labels, average='weighted'),
    )

# Train the model
train(model, train_loader, val_loader, optimizer, criterion, epochs=3)

# Save the model's weights (its state_dict) to Google Drive.
# Drive is mounted at /content/drive before the file is written there.
from google.colab import drive
drive.mount('/content/drive')
torch.save(model.state_dict(), '/content/drive/My Drive/bert_model2.pt')


In [None]:
Epoch 1: Loss: 0.5212, Train Accuracy: 0.7971, Train F1: 0.7749, Val Accuracy: 0.8251, Val F1: 0.8155
Epoch 2: Loss: 0.4016, Train Accuracy: 0.8444, Train F1: 0.8348, Val Accuracy: 0.8329, Val F1: 0.8230
Epoch 3: Loss: 0.3011, Train Accuracy: 0.8856, Train F1: 0.8819, Val Accuracy: 0.8371, Val F1: 0.8316
Mounted at /content/drive