<a href="https://colab.research.google.com/github/LeylaY1996/sentiment-analysis-aws-reviews/blob/main/Untitled2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the full reviews data set
csv_file_path = '/content/Reviews.csv'  # Adjust to where your copy of the file lives
# quoting=3 is csv.QUOTE_NONE (quote characters are treated as ordinary text) and
# on_bad_lines='skip' (available in newer pandas versions) drops rows that cannot be parsed.
# NOTE(review): with quoting disabled, a quoted review that contains commas no longer parses
# as one field, so such rows may be skipped silently - confirm the row count printed below
# matches what you expect from the source file.
df = pd.read_csv(csv_file_path, on_bad_lines='skip',quoting=3)  # for newer pandas versions
# Print the size of the loaded data set
print("Orijinal veri seti boyutu:", df.shape)

# Draw a random 20% subset
sampled_df = df.sample(frac=0.2, random_state=42)  # fixed random_state makes the sample repeatable

# Print the size of the sampled data set
print("Yeni veri seti boyutu:", sampled_df.shape)

# Persist the sample so later cells can reload it
sampled_df.to_csv('sampled_reviews.csv', index=False)  # written relative to the current working directory

Orijinal veri seti boyutu: (48487, 10)
Yeni veri seti boyutu: (9697, 10)


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# Download the NLTK stop-word corpus, then build the two module-level lookups
# that clean_text() reads: the English stop-word set and the punctuation characters.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  # set gives fast membership tests
punctuation = string.punctuation

def clean_text(text):
    """Normalize one review for bag-of-words style models.

    The text is lower-cased, every character contained in the module-level
    ``punctuation`` string is removed, and tokens found in the module-level
    ``stop_words`` set are dropped.

    Args:
        text: Raw review text. ``None`` and float NaN (what pandas yields for
            an empty CSV cell) are treated as an empty review; any other
            non-string value is converted with ``str()`` before cleaning.

    Returns:
        str: The remaining tokens joined by single spaces ('' if nothing is left).
    """
    # NaN is the only value that is not equal to itself, so this detects a
    # missing cell without needing an extra import.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    lowered = text.lower()
    # str.translate deletes all punctuation in one pass instead of a
    # per-character Python loop; the result is identical.
    without_punctuation = lowered.translate(str.maketrans('', '', punctuation))
    return ' '.join(word for word in without_punctuation.split() if word not in stop_words)

def prepare_data(csv_file):
    """Load a reviews CSV and return cleaned text with binary sentiment labels.

    Args:
        csv_file: Path or file-like object accepted by ``pandas.read_csv``.
            The file must contain the columns 'Text' and 'Score'.

    Returns:
        pandas.DataFrame: A new, independent frame with two columns:
        'cleaned_review' (output of ``clean_text``) and 'Sentiment'
        (1 when Score >= 4, otherwise 0). Rows whose 'Text' or 'Score' is
        missing are dropped and the index is renumbered from 0.

    Raises:
        KeyError: If the 'Text' or 'Score' column is absent.
    """
    raw = pd.read_csv(csv_file)

    # A row without text cannot be cleaned and a row without a rating cannot be
    # labelled (NaN >= 4 is False, which would silently mark it negative).
    # .copy() makes the frame independent so the column assignments below do
    # not raise SettingWithCopyWarning.
    reviews = (
        raw[['Text', 'Score']]
        .dropna(subset=['Text', 'Score'])
        .reset_index(drop=True)
        .copy()
    )

    # Binarize the rating: 4-5 -> positive (1), everything lower -> negative (0).
    reviews['Sentiment'] = (reviews['Score'] >= 4).astype(int)

    reviews['cleaned_review'] = reviews['Text'].apply(clean_text)

    # Return a copy so callers can add columns without touching a slice.
    return reviews[['cleaned_review', 'Sentiment']].copy()

# Usage example
# The sampling step saves its output as 'sampled_reviews.csv' relative to the
# current working directory, so read it back through the same relative path.
# The previous absolute '/content/...' path only matched when the working
# directory happened to be /content.
csv_file_path = 'sampled_reviews.csv'  # Adjust to where your sampled file lives
prepared_data = prepare_data(csv_file_path)

print(prepared_data.head())  # Show the first 5 rows

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


                                      cleaned_review  Sentiment
0  369 per box stop shop also everyone keeps sayi...          1
1  nice sweet treat nutritional value fantastic t...          1
2  good deal cook using agave nectar due low suga...          1
3  great deal via subscribe save set 2 boxes mont...          1
4  mix spoonful wellness food toppermixer wellnes...          1


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

def svm_sentiment_analysis(df):
    """Train a linear-kernel SVM on TF-IDF features and score it on a hold-out split.

    Args:
        df: DataFrame with a 'cleaned_review' text column and a 'Sentiment' label column.

    Returns:
        tuple: ``(accuracy, report)`` - the value returned by ``accuracy_score`` and the
        value returned by ``classification_report`` for the 20% test split.
    """
    texts, labels = df['cleaned_review'], df['Sentiment']

    # 80/20 split with a fixed seed so repeated runs use the same partition.
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, test_size=0.2, random_state=42
    )

    # The vocabulary and IDF weights are learned from the training texts only;
    # the test texts are projected onto that same vocabulary.
    tfidf = TfidfVectorizer()
    train_features = tfidf.fit_transform(train_texts)
    test_features = tfidf.transform(test_texts)

    classifier = SVC(kernel='linear')
    classifier.fit(train_features, train_labels)

    predictions = classifier.predict(test_features)

    return (
        accuracy_score(test_labels, predictions),
        classification_report(test_labels, predictions),
    )

# Run the SVM analysis on the prepared reviews
accuracy, report = svm_sentiment_analysis(prepared_data)

# Show the accuracy, then the report returned by the function
print("Accuracy:", accuracy)
print(report)

Accuracy: 0.9072164948453608
              precision    recall  f1-score   support

           0       0.80      0.56      0.66       311
           1       0.92      0.97      0.95      1629

    accuracy                           0.91      1940
   macro avg       0.86      0.77      0.80      1940
weighted avg       0.90      0.91      0.90      1940



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

def naive_bayes_sentiment_analysis(df):
    """Train a multinomial Naive Bayes classifier on TF-IDF features and score it.

    Args:
        df: DataFrame with a 'cleaned_review' text column and a 'Sentiment' label column.

    Returns:
        tuple: ``(accuracy, report)`` - the value returned by ``accuracy_score`` and the
        value returned by ``classification_report`` for the 20% test split.
    """
    # Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
    split = train_test_split(
        df['cleaned_review'], df['Sentiment'], test_size=0.2, random_state=42
    )
    reviews_train, reviews_test, labels_train, labels_test = split

    # Fit the TF-IDF vocabulary on the training reviews, then reuse it for the test reviews.
    tfidf = TfidfVectorizer()
    features_train = tfidf.fit_transform(reviews_train)
    features_test = tfidf.transform(reviews_test)

    nb_model = MultinomialNB()
    nb_model.fit(features_train, labels_train)

    predicted = nb_model.predict(features_test)

    accuracy = accuracy_score(labels_test, predicted)
    report = classification_report(labels_test, predicted)
    return accuracy, report

# Run only the Naive Bayes analysis on the prepared reviews
accuracy, report = naive_bayes_sentiment_analysis(prepared_data)

# Show the accuracy, then the report returned by the function
print("Accuracy:", accuracy)
print(report)

Accuracy: 0.8402061855670103
              precision    recall  f1-score   support

           0       1.00      0.00      0.01       311
           1       0.84      1.00      0.91      1629

    accuracy                           0.84      1940
   macro avg       0.92      0.50      0.46      1940
weighted avg       0.87      0.84      0.77      1940



In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Veri setini hazırla (daha önce hazırlandığını varsayıyoruz)
# prepared_data, temizlenmiş yorumlar ve etiketler içermektedir

def rnn_sentiment_analysis(df, vocab_size=5000, max_length=100, epochs=5, batch_size=64):
    """Train an Embedding + LSTM classifier on the reviews and return test accuracy.

    Each review is encoded as a sequence of integer word ids, which is the
    input an ``Embedding`` layer looks up. TF-IDF weights are not usable here:
    they are floats below 1, and ``pad_sequences`` casts its input to int32,
    so the network would receive (almost) all-zero rows and could only learn
    the majority class.

    Args:
        df: DataFrame with a 'cleaned_review' text column and a 'Sentiment'
            column holding 0/1 labels.
        vocab_size: Size of the embedding table. Id 0 is reserved for padding
            and id 1 for out-of-vocabulary words, so the ``vocab_size - 2``
            most frequent training words get their own id.
        max_length: Number of tokens per review after padding/truncation.
        epochs: Number of training epochs.
        batch_size: Mini-batch size used for training.

    Returns:
        Fraction of test reviews whose thresholded prediction (> 0.5) matches
        the label.
    """
    from collections import Counter  # stdlib; imported locally so the block is self-contained

    # Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_review'], df['Sentiment'], test_size=0.2, random_state=42
    )

    # Build the vocabulary from the training reviews only, so nothing about the
    # test reviews leaks into the encoding.
    word_counts = Counter(word for text in X_train for word in text.split())
    word_index = {
        word: idx
        for idx, (word, _) in enumerate(word_counts.most_common(vocab_size - 2), start=2)
    }

    def encode(texts):
        """Map each review to a list of word ids (1 for unknown words)."""
        return [[word_index.get(word, 1) for word in text.split()] for text in texts]

    # Pad (with 0) or truncate every review to exactly max_length ids.
    X_train_padded = pad_sequences(encode(X_train), maxlen=max_length)
    X_test_padded = pad_sequences(encode(X_test), maxlen=max_length)

    # Plain arrays avoid any index alignment between pandas and NumPy objects.
    y_train_arr = np.asarray(y_train)
    y_test_arr = np.asarray(y_test)

    # Build the model. `input_length` is omitted: it is optional and deprecated
    # in recent Keras releases; the sequence length comes from the data.
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=128))
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # The test split doubles as validation data, so the val_* numbers logged
    # per epoch are computed on the same rows as the final accuracy.
    model.fit(
        X_train_padded,
        y_train_arr,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_test_padded, y_test_arr),
    )

    # Sigmoid output -> hard 0/1 prediction.
    y_pred = (model.predict(X_test_padded) > 0.5).astype("int32")

    accuracy = np.mean(y_pred.flatten() == y_test_arr)

    return accuracy

# Run the RNN (LSTM) analysis on the prepared reviews and show its accuracy
accuracy = rnn_sentiment_analysis(prepared_data)
print("Accuracy:", accuracy)

Epoch 1/5




[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 302ms/step - accuracy: 0.7938 - loss: 0.4970 - val_accuracy: 0.8397 - val_loss: 0.4406
Epoch 2/5
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 323ms/step - accuracy: 0.8233 - loss: 0.4677 - val_accuracy: 0.8397 - val_loss: 0.4524
Epoch 3/5
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 336ms/step - accuracy: 0.8174 - loss: 0.4787 - val_accuracy: 0.8397 - val_loss: 0.4403
Epoch 4/5
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 305ms/step - accuracy: 0.8238 - loss: 0.4674 - val_accuracy: 0.8397 - val_loss: 0.4466
Epoch 5/5
[1m122/122[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 305ms/step - accuracy: 0.8215 - loss: 0.4710 - val_accuracy: 0.8397 - val_loss: 0.4408
[1m61/61[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 61ms/step
Accuracy: 0.8396907216494846


In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score

# A small example sentiment lexicon: two sets of lower-case English words
# that the lexicon-based scorer looks up token by token.
positive_words = {'good', 'great', 'excellent', 'happy', 'love', 'wonderful', 'amazing', 'fantastic'}
negative_words = {'bad', 'terrible', 'horrible', 'sad', 'hate', 'awful', 'disappointing', 'poor'}

def lexicon_based_sentiment_analysis(df):
    """Label each review by counting lexicon hits and compare with the true labels.

    Uses the module-level ``positive_words`` and ``negative_words`` sets.
    The input frame is left untouched; the prediction column is added to a
    new frame.

    Args:
        df: DataFrame with a 'cleaned_review' text column and a 'Sentiment' label column.

    Returns:
        tuple: ``(results, accuracy)`` where ``results`` is a new DataFrame with
        the columns 'cleaned_review', 'Sentiment' and 'Lexicon_Sentiment', and
        ``accuracy`` is ``accuracy_score`` of the predictions against 'Sentiment'.
    """

    def get_sentiment(text):
        """Return 1 when positive hits outnumber negative hits, otherwise 0."""
        # Positive membership is checked first, so a word present in both sets
        # counts as positive.
        score = sum(
            1 if word in positive_words else -1 if word in negative_words else 0
            for word in text.split()
        )
        # Ties and reviews without any lexicon word fall to 0.
        return 1 if score > 0 else 0

    # .assign builds a new frame, so the caller's DataFrame does not gain a
    # column as a side effect (and no SettingWithCopyWarning is raised when
    # the caller passed in a slice).
    results = df[['cleaned_review', 'Sentiment']].assign(
        Lexicon_Sentiment=df['cleaned_review'].apply(get_sentiment)
    )

    accuracy = accuracy_score(results['Sentiment'], results['Lexicon_Sentiment'])

    return results, accuracy

# Run the lexicon-based analysis on the prepared reviews
lexicon_results, lexicon_accuracy = lexicon_based_sentiment_analysis(prepared_data)

# Print the accuracy and the first rows of the result frame
print("Lexicon-Based Accuracy:", lexicon_accuracy)
print(lexicon_results.head())

Lexicon-Based Accuracy: 0.6329792719397752
                                      cleaned_review  Sentiment  \
0  369 per box stop shop also everyone keeps sayi...          1   
1  nice sweet treat nutritional value fantastic t...          1   
2  good deal cook using agave nectar due low suga...          1   
3  great deal via subscribe save set 2 boxes mont...          1   
4  mix spoonful wellness food toppermixer wellnes...          1   

   Lexicon_Sentiment  
0                  0  
1                  1  
2                  1  
3                  1  
4                  1  


In [None]:
# Example aspects and the keyword set attached to each one.
# Keys are Turkish: 'hız' = speed, 'kalite' = quality.
# NOTE(review): the 'kalite' set mixes words such as 'bad'/'poor'/'terrible'/'awful'
# with 'good'/'excellent'; set membership alone carries no polarity, so confirm that
# the scoring code treats these keywords the way you intend.
aspects = {
    'hız': {'good', 'great', 'fast', 'quick', 'excellent', 'amazing'},
    'kalite': {'bad', 'poor', 'terrible', 'awful', 'good', 'excellent'}
}

def aspect_based_sentiment_analysis(df):
    """Score every review per aspect and compare each aspect flag with the true label.

    Relies on names defined in earlier notebook cells: the ``aspects`` keyword
    dictionary, the ``negative_words`` lexicon set, ``pd`` and
    ``accuracy_score``. The input frame is left untouched.

    A keyword hit raises the aspect's score by 1, unless the word is also in
    ``negative_words``, in which case it lowers the score by 1. Previously
    every keyword counted as positive, because the decrement branch looked for
    keywords starting with 'not ', which no keyword does and which a single
    whitespace-split token can never match.

    Args:
        df: DataFrame with a 'cleaned_review' text column and a 'Sentiment' label column.

    Returns:
        tuple: ``(results, accuracy_hız, accuracy_kalite)`` where ``results`` is a
        new DataFrame with the columns 'cleaned_review', 'Sentiment',
        'Aspect_Hız' and 'Aspect_Kalite' (1 when the aspect score is > 0,
        otherwise 0), and the two accuracies are ``accuracy_score`` of each
        aspect column against 'Sentiment'.
    """

    def get_aspect_sentiment(text):
        """Return the (hız, kalite) 0/1 flags for one review."""
        aspect_scores = dict.fromkeys(aspects, 0)

        for word in text.split():
            for aspect, keywords in aspects.items():
                if word in keywords:
                    # Polarity comes from the shared negative lexicon.
                    aspect_scores[aspect] += -1 if word in negative_words else 1

        return int(aspect_scores['hız'] > 0), int(aspect_scores['kalite'] > 0)

    # Build both flag columns in one go; this avoids the slow
    # .apply(pd.Series) expansion and keeps the caller's frame unchanged.
    flags = pd.DataFrame(
        [get_aspect_sentiment(text) for text in df['cleaned_review']],
        columns=['Aspect_Hız', 'Aspect_Kalite'],
        index=df.index,
    )
    results = pd.concat([df[['cleaned_review', 'Sentiment']], flags], axis=1)

    # Each aspect flag is compared with the overall review sentiment.
    accuracy_hız = accuracy_score(results['Sentiment'], results['Aspect_Hız'])
    accuracy_kalite = accuracy_score(results['Sentiment'], results['Aspect_Kalite'])

    return results, accuracy_hız, accuracy_kalite

# Run the aspect-based analysis on the prepared reviews
aspect_results, aspect_accuracy_hız, aspect_accuracy_kalite = aspect_based_sentiment_analysis(prepared_data)

# Print the per-aspect accuracies and the first rows of the result frame
print("Aspect-Based Accuracy (Hız):", aspect_accuracy_hız)
print("Aspect-Based Accuracy (Kalite):", aspect_accuracy_kalite)
print(aspect_results.head())

Aspect-Based Accuracy (Hız): 0.5266577291945963
Aspect-Based Accuracy (Kalite): 0.33051459214189954
                                      cleaned_review  Sentiment  Aspect_Hız  \
0  369 per box stop shop also everyone keeps sayi...          1           0   
1  nice sweet treat nutritional value fantastic t...          1           0   
2  good deal cook using agave nectar due low suga...          1           1   
3  great deal via subscribe save set 2 boxes mont...          1           1   
4  mix spoonful wellness food toppermixer wellnes...          1           0   

   Aspect_Kalite  
0              0  
1              0  
2              1  
3              0  
4              0  
