<a href="https://colab.research.google.com/github/LeylaY1996/sentiment-analysis-aws-reviews/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Veri setini yükle
csv_file_path = '/content/Reviews.csv'  # Dosya yolunuza göre güncelleyin
df = pd.read_csv(csv_file_path, on_bad_lines='skip',quoting=3)  # daha yeni versiyonlar için
# Veri setinin boyutunu yazdır
print("Orijinal veri seti boyutu:", df.shape)

# %20 oranında rastgele bir alt küme seç
sampled_df = df.sample(frac=0.2, random_state=42)  # random_state ile tekrarlanabilirlik sağlar

# Yeni veri setinin boyutunu yazdır
print("Yeni veri seti boyutu:", sampled_df.shape)

# İstediğiniz gibi veriyi kaydedin veya kullanın
sampled_df.to_csv('sampled_reviews.csv', index=False)  # Yeni dosyayı kaydet

Orijinal veri seti boyutu: (2851, 10)
Yeni veri seti boyutu: (570, 10)


In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# NLTK stopwords ve punctuation yükle
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
punctuation = string.punctuation

def clean_text(text):
    """Metni temizler: küçük harfe çevirir, noktalama işaretlerini kaldırır ve durak kelimeleri filtreler."""
    text = text.lower()  # Küçük harfe çevir
    text = ''.join([char for char in text if char not in punctuation])  # Noktalama işaretlerini kaldır
    text = ' '.join([word for word in text.split() if word not in stop_words])  # Durak kelimeleri filtrele
    return text

def prepare_data(csv_file):
    """Veri setini hazırlar: yükler, temizler ve etiketler."""

    # Veri setini yükle
    df = pd.read_csv(csv_file)

    # Gerekli sütunları seçin (örneğin: 'Text' ve 'Score')
    df = df[['Text', 'Score']]

    # Skorları ikili hale getirin (örneğin: 1-3 olumsuz, 4-5 olumlu)
    df['Sentiment'] = df['Score'].apply(lambda x: 1 if x >= 4 else 0)

    # Yorumları temizle
    df['cleaned_review'] = df['Text'].apply(clean_text)

    # Sadece temizlenmiş yorumlar ve duygu etiketlerini içeren DataFrame döndür
    return df[['cleaned_review', 'Sentiment']]

# Kullanım örneği
csv_file_path = '/content/sampled_reviews.csv'  # Dosya yolunuza göre güncelleyin
prepared_data = prepare_data(csv_file_path)

print(prepared_data.head())  # İlk 5 satırı yazdır

                                      cleaned_review  Sentiment
0  natural real food see difference cat loves muc...          1
1  happy product cooked potatoes flat didnt look ...          0
2  jelly tastes heavenly wont find anything good ...          1
3  wonderful dark chocolate flavor much deeper fl...          1
4  excellent service goat milk delivered premium ...          1


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

def svm_sentiment_analysis(df):
    """Verilen veri setinde SVM ile duygu analizi yapar ve sonuçları döner."""

    # Veriyi eğitim ve test setine ayır
    X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['Sentiment'], test_size=0.2, random_state=42)

    # TF-IDF vektörleştirme
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # SVM modelini oluştur ve eğit
    model = SVC(kernel='linear')
    model.fit(X_train_tfidf, y_train)

    # Tahmin yap
    y_pred = model.predict(X_test_tfidf)

    # Sonuçları değerlendir
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    # Karmaşıklık matrisi
    cm = confusion_matrix(y_test, y_pred)

    return accuracy, report, cm

# SVM analizi
accuracy, report, cm = svm_sentiment_analysis(prepared_data)

print("Accuracy:", accuracy)
print(report)
print("Confusion Matrix:\n", cm)

Accuracy: 0.8245614035087719
              precision    recall  f1-score   support

           0       1.00      0.09      0.17        22
           1       0.82      1.00      0.90        92

    accuracy                           0.82       114
   macro avg       0.91      0.55      0.53       114
weighted avg       0.86      0.82      0.76       114

Confusion Matrix:
 [[ 2 20]
 [ 0 92]]


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

def naive_bayes_sentiment_analysis(df):
    """Verilen veri setinde Naive Bayes ile duygu analizi yapar ve sonuçları döner."""

    # Veriyi eğitim ve test setine ayır
    X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['Sentiment'], test_size=0.2, random_state=42)

    # TF-IDF vektörleştirme
    vectorizer = TfidfVectorizer()
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # Naive Bayes modelini oluştur ve eğit
    model = MultinomialNB()
    model.fit(X_train_tfidf, y_train)

    # Tahmin yap
    y_pred = model.predict(X_test_tfidf)

    # Sonuçları değerlendir
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, zero_division=1)

    # Karmaşıklık matrisi
    cm = confusion_matrix(y_test, y_pred)

    return accuracy, report, cm

# Naive Bayes analizi
accuracy, report, cm = naive_bayes_sentiment_analysis(prepared_data)

print("Accuracy:", accuracy)
print(report)
print("Confusion Matrix:\n", cm)

Accuracy: 0.8070175438596491
              precision    recall  f1-score   support

           0       1.00      0.00      0.00        22
           1       0.81      1.00      0.89        92

    accuracy                           0.81       114
   macro avg       0.90      0.50      0.45       114
weighted avg       0.84      0.81      0.72       114

Confusion Matrix:
 [[ 0 22]
 [ 0 92]]


In [12]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer

# RNN model fonksiyonu
def rnn_sentiment_analysis(df):
    """Verilen veri setinde RNN (LSTM) ile duygu analizi yapar ve sonuçları döner."""

    # Etiketleri sayısal değerlere dönüştür
    label_encoder = LabelEncoder()
    df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])

    # Veriyi eğitim ve test setine ayır
    X_train, X_test, y_train, y_test = train_test_split(df['cleaned_review'], df['Sentiment'], test_size=0.2, random_state=42)

    # Metni sayısal verilere dönüştür
    vectorizer = TfidfVectorizer(max_features=5000)  # max_features ile öznitelik sayısını sınırlayın
    X_train_tfidf = vectorizer.fit_transform(X_train).toarray()
    X_test_tfidf = vectorizer.transform(X_test).toarray()

    # RNN için veriyi uygun biçime getirin
    max_length = 100  # Maximum uzunluk
    X_train_padded = pad_sequences(X_train_tfidf, maxlen=max_length)
    X_test_padded = pad_sequences(X_test_tfidf, maxlen=max_length)

    # Modeli oluştur
    model = Sequential()
    model.add(Embedding(input_dim=5000, output_dim=128))  # input_length kaldırıldı
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    # Modeli derle
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Modeli eğit
    model.fit(X_train_padded, y_train, epochs=5, batch_size=64, validation_data=(X_test_padded, y_test))

    # Test verisi üzerinde tahmin yap
    y_pred = (model.predict(X_test_padded) > 0.5).astype("int32").flatten()

    # Sonuçları değerlendir
    accuracy = np.mean(y_pred == y_test)
    report = classification_report(y_test, y_pred, zero_division=1)
    cm = confusion_matrix(y_test, y_pred)

    return accuracy, report, cm

# RNN analizi
accuracy, report, cm = rnn_sentiment_analysis(prepared_data)
print("Accuracy:", accuracy)
print(report)
print("Confusion Matrix:\n", cm)

Epoch 1/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 268ms/step - accuracy: 0.8368 - loss: 0.5881 - val_accuracy: 0.8070 - val_loss: 0.5055
Epoch 2/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 212ms/step - accuracy: 0.8007 - loss: 0.5226 - val_accuracy: 0.8070 - val_loss: 0.4907
Epoch 3/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 216ms/step - accuracy: 0.8222 - loss: 0.4681 - val_accuracy: 0.8070 - val_loss: 0.5067
Epoch 4/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 241ms/step - accuracy: 0.8285 - loss: 0.4686 - val_accuracy: 0.8070 - val_loss: 0.4925
Epoch 5/5
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 403ms/step - accuracy: 0.8172 - loss: 0.4829 - val_accuracy: 0.8070 - val_loss: 0.4917
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 196ms/step
Accuracy: 0.8070175438596491
              precision    recall  f1-score   support

           0       1.00      0.00      0.00    

In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Örnek bir duygu sözlüğü
positive_words = {'good', 'great', 'excellent', 'happy', 'love', 'wonderful', 'amazing', 'fantastic'}
negative_words = {'bad', 'terrible', 'horrible', 'sad', 'hate', 'awful', 'disappointing', 'poor'}

def lexicon_based_sentiment_analysis(df):
    """Verilen veri setinde lexicon-based duygu analizi yapar."""

    def get_sentiment(text):
        """Metindeki kelimeleri analiz ederek duygu puanı döndürür."""
        words = text.split()
        score = 0

        for word in words:
            if word in positive_words:
                score += 1  # Pozitif kelime varsa puanı artır
            elif word in negative_words:
                score -= 1  # Negatif kelime varsa puanı azalt

        return 1 if score > 0 else 0  # Pozitif ise 1, olumsuz ise 0 döndür

    # Her bir yorum için duygu analizi yap
    df['Lexicon_Sentiment'] = df['cleaned_review'].apply(get_sentiment)

    # Doğruluğu hesapla
    accuracy = accuracy_score(df['Sentiment'], df['Lexicon_Sentiment'])

    # Sınıflandırma raporu ve karmaşıklık matrisi
    report = classification_report(df['Sentiment'], df['Lexicon_Sentiment'])
    cm = confusion_matrix(df['Sentiment'], df['Lexicon_Sentiment'])

    return df[['cleaned_review', 'Sentiment', 'Lexicon_Sentiment']], accuracy, report, cm

# Lexicon-based analizi
lexicon_results, lexicon_accuracy, lexicon_report, lexicon_cm = lexicon_based_sentiment_analysis(prepared_data)

# Sonuçları yazdır
print("Lexicon-Based Accuracy:", lexicon_accuracy)
print(lexicon_report)
print("Confusion Matrix:\n", lexicon_cm)
print(lexicon_results.head())

Lexicon-Based Accuracy: 0.6228070175438597
              precision    recall  f1-score   support

           0       0.29      0.76      0.42       101
           1       0.92      0.59      0.72       469

    accuracy                           0.62       570
   macro avg       0.60      0.68      0.57       570
weighted avg       0.81      0.62      0.67       570

Confusion Matrix:
 [[ 77  24]
 [191 278]]
                                      cleaned_review  Sentiment  \
0  natural real food see difference cat loves muc...          1   
1  happy product cooked potatoes flat didnt look ...          0   
2  jelly tastes heavenly wont find anything good ...          1   
3  wonderful dark chocolate flavor much deeper fl...          1   
4  excellent service goat milk delivered premium ...          1   

   Lexicon_Sentiment  
0                  0  
1                  1  
2                  1  
3                  1  
4                  1  


In [7]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Örnek bir özellik ve ona bağlı duygu sözlüğü
aspects = {
    'hız': {'good', 'great', 'fast', 'quick', 'excellent', 'amazing', 'slow', 'not fast'},
    'kalite': {'bad', 'poor', 'terrible', 'awful', 'good', 'excellent', 'not good'}
}

def aspect_based_sentiment_analysis(df):
    """Verilen veri setinde aspect-based duygu analizi yapar."""

    def get_aspect_sentiment(text):
        """Metindeki her bir özellik için duygu puanı döndürür."""
        words = text.split()
        aspect_scores = {aspect: 0 for aspect in aspects.keys()}

        for word in words:
            for aspect, keywords in aspects.items():
                if word in keywords:
                    # Pozitif kelime için puanı artır
                    aspect_scores[aspect] += 1 if 'not' not in word else -1

        # Her bir bileşen için 1 (olumlu) veya 0 (olumsuz) döndür
        return int(aspect_scores['hız'] > 0), int(aspect_scores['kalite'] > 0)

    # Her bir yorum için aspect-based duygu analizi yap
    df[['Aspect_Hız', 'Aspect_Kalite']] = df['cleaned_review'].apply(get_aspect_sentiment).apply(pd.Series)

    # Doğruluk hesapla
    accuracy_hız = accuracy_score(df['Sentiment'], df['Aspect_Hız'])
    accuracy_kalite = accuracy_score(df['Sentiment'], df['Aspect_Kalite'])

    # Sınıflandırma raporu ve karmaşıklık matrisleri
    report_hız = classification_report(df['Sentiment'], df['Aspect_Hız'])
    cm_hız = confusion_matrix(df['Sentiment'], df['Aspect_Hız'])

    report_kalite = classification_report(df['Sentiment'], df['Aspect_Kalite'])
    cm_kalite = confusion_matrix(df['Sentiment'], df['Aspect_Kalite'])

    return df[['cleaned_review', 'Sentiment', 'Aspect_Hız', 'Aspect_Kalite']], accuracy_hız, accuracy_kalite, report_hız, report_kalite, cm_hız, cm_kalite

# Aspect-based analizi
aspect_results, aspect_accuracy_hız, aspect_accuracy_kalite, report_hız, report_kalite, cm_hız, cm_kalite = aspect_based_sentiment_analysis(prepared_data)

# Sonuçları yazdır
print("Aspect-Based Accuracy (Hız):", aspect_accuracy_hız)
print(report_hız)
print("Confusion Matrix (Hız):\n", cm_hız)

print("Aspect-Based Accuracy (Kalite):", aspect_accuracy_kalite)
print(report_kalite)
print("Confusion Matrix (Kalite):\n", cm_kalite)

print(aspect_results.head())

Aspect-Based Accuracy (Hız): 0.5333333333333333
              precision    recall  f1-score   support

           0       0.24      0.77      0.37       101
           1       0.91      0.48      0.63       469

    accuracy                           0.53       570
   macro avg       0.58      0.63      0.50       570
weighted avg       0.79      0.53      0.58       570

Confusion Matrix (Hız):
 [[ 78  23]
 [243 226]]
Aspect-Based Accuracy (Kalite): 0.356140350877193
              precision    recall  f1-score   support

           0       0.19      0.78      0.30       101
           1       0.85      0.26      0.40       469

    accuracy                           0.36       570
   macro avg       0.52      0.52      0.35       570
weighted avg       0.73      0.36      0.39       570

Confusion Matrix (Kalite):
 [[ 79  22]
 [345 124]]
                                      cleaned_review  Sentiment  Aspect_Hız  \
0  natural real food see difference cat loves muc...          1       