Import Library

In [1]:
import pandas as pd
import os
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np  # Tambahkan import numpy

# Unduh stopwords jika belum
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/rebecca/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Read and Print Data

In [2]:
# Baca file CSV
df = pd.read_csv('new-review-data/origin-data/google-play-rev-gen-2.csv')

print(df.columns)

# Tampilkan isi kolom 'Nama'
df.head(10)

Index(['id', 'title', 'avatar', 'rating', 'snippet', 'likes', 'date',
       'iso_date', 'response'],
      dtype='object')


Unnamed: 0,id,title,avatar,rating,snippet,likes,date,iso_date,response
0,e5384431-56f9-43fa-a32a-53296afc7f66,Seraphim,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,"While I've reviewed this before, I decided to ...",88,"October 09, 2024",2024-10-09T00:08:20Z,
1,6a73081f-3490-47ba-89fa-83744cb20940,TWOSTORE !,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,Very fun but I wish there was more fighting in...,35,"October 12, 2024",2024-10-12T06:39:01Z,
2,3a3d4c90-0b6e-45dc-b1e6-014659055bbf,A G,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,"Fun game, I enjoy the story. There is tons to ...",11,"October 30, 2024",2024-10-30T18:01:37Z,
3,99015538-1d26-4bd9-a02f-37bc2a361d1a,Astra,https://play-lh.googleusercontent.com/a-/ALV-U...,5.0,This game is phenomenal. The art style and sce...,77,"October 18, 2024",2024-10-18T19:11:12Z,
4,541b3b4d-97f6-42e0-9c68-059a63e1e67f,Angela Williams,https://play-lh.googleusercontent.com/a-/ALV-U...,2.0,"I love this game. However, it is incredibly la...",14,"October 14, 2024",2024-10-14T03:04:43Z,
5,ad484b6a-1b9c-42ab-9cee-9df6e28f12d4,Valerie,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,I used to be obsessed with this game but I hav...,99,"October 13, 2024",2024-10-13T14:02:08Z,
6,fb25cdf6-40d8-44e8-b1b9-d439ebd88565,Daniel “Chotara” Ricciardi,https://play-lh.googleusercontent.com/a-/ALV-U...,5.0,Highly recommend. I have been playing the game...,65,"October 18, 2024",2024-10-18T15:20:11Z,
7,36e821d3-9441-4eaa-94a7-9c7b9b7463b5,Amy,https://play-lh.googleusercontent.com/a-/ALV-U...,4.0,Hello! I absolutely love this game! It has inc...,29,"October 29, 2024",2024-10-29T00:27:08Z,
8,4b1e6dcb-d251-450e-9be6-358b4bb8e9d6,Feitan Desy,https://play-lh.googleusercontent.com/a-/ALV-U...,4.0,I've been playing this game since January 16th...,64,"October 08, 2024",2024-10-08T23:13:09Z,
9,27525772-1c0a-40e4-8321-4c5f0a0f7c64,Olivia Staringer,https://play-lh.googleusercontent.com/a-/ALV-U...,1.0,"Uninteresting characters, uninteresting dialog...",14,"November 11, 2024",2024-11-11T22:58:07Z,


remove unused column label

In [4]:
import pandas as pd
import os

# Membaca data dari file CSV
df = pd.read_csv('new-review-data/origin-data/google-play-rev-gen-2.csv')

# Daftar kolom yang ingin disimpan
columns_to_keep = ['rating', 'snippet', 'likes', 'date', 'iso_date', 'response']

# Memeriksa apakah kolom yang diinginkan ada dalam DataFrame
existing_columns = [col for col in columns_to_keep if col in df.columns]

# Memilih kolom yang ada
filtered_df = df[existing_columns]

# Membuat direktori jika belum ada
output_dir = 'new-review-data/filter-snippet'
os.makedirs(output_dir, exist_ok=True)

# Menyimpan data baru ke file CSV
output_file = os.path.join(output_dir, 'filtered_reviews_gensin.csv')
filtered_df.to_csv(output_file, index=False)

print(f"New data will be saved to {output_file}")

New data will be saved to new-review-data/filter-snippet/filtered_reviews_gensin.csv


Preprocessing clean text with Spacy

In [5]:
import pandas as pd
import os
import spacy
import re

# Muat model bahasa spaCy (pastikan sudah menginstal spaCy dan model bahasa)
nlp = spacy.load("en_core_web_sm")

# Fungsi preprocessing menggunakan spaCy
def preprocess_text(text):
    # Memeriksa jika teks kosong atau NaN, dan mengembalikan string kosong
    if not text or pd.isna(text):
        return ''
    
    # Menghapus karakter khusus dan angka
    text = re.sub(r'[^a-zA-Z\s]', '', str(text))
    
    # Tokenisasi dengan spaCy
    doc = nlp(text.lower())
    
    # Lemmatization dan menghapus stopwords serta tanda baca
    processed_tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
    
    # Gabungkan kembali tokens yang telah diproses menjadi teks yang bersih
    return ' '.join(processed_tokens)

# Baca dataset filtered_reviews.csv
input_file = 'new-review-data/filter-snippet/filtered_reviews_gensin.csv'
df = pd.read_csv(input_file)

# Pastikan kolom 'snippet' ada di dalam dataset
if 'snippet' in df.columns:
    # Terapkan preprocessing pada kolom 'Snippet'
    df['cleaned_snippet'] = df['snippet'].apply(preprocess_text)  # Pastikan nama kolom case-sensitive

    # Hapus kolom 'Snippet' yang asli
    df = df.drop(columns=['snippet'])

    # Buat direktori baru untuk menyimpan hasil
    output_dir = 'new-review-data/filter-snippet'
    os.makedirs(output_dir, exist_ok=True)

    # Simpan dataset yang diperbarui
    output_file = os.path.join(output_dir, 'filtered_snippet_reviews_gensin.csv')
    df.to_csv(output_file, index=False)

    print(f"New data will be processed with cleaned text to new data {output_file}")
else:
    print("The 'snippet' column is missing in the input file.")

New data will be processed with cleaned text to new data new-review-data/filter-snippet/filtered_snippet_reviews_gensin.csv


Keyword Extraction TF-IDF

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# File paths
input_file = 'new-review-data/filter-snippet/filtered_snippet_reviews_gensin.csv'
output_dir = 'new-review-data/keywords'
os.makedirs(output_dir, exist_ok=True)

# Load the dataset
try:
    df = pd.read_csv(input_file)
except FileNotFoundError:
    raise FileNotFoundError(f"Input file {input_file} not found.")
except pd.errors.EmptyDataError:
    raise ValueError(f"Input file {input_file} is empty or cannot be read.")

# Ensure the 'cleaned_snippet' column exists
if 'cleaned_snippet' not in df.columns:
    raise ValueError("Column 'cleaned_snippet' is not found in the dataset.")

# Extract cleaned snippets and drop any missing or empty text entries
text_data = df['cleaned_snippet'].dropna().tolist()
text_data = [text for text in text_data if text.strip()]  # Remove empty strings

# If no valid text data is found, raise an exception
if not text_data:
    raise ValueError("No valid text data found in 'cleaned_snippet' column.")

# Initialize TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # Limit to top 1000 features

# Fit and transform the text data
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Get feature names (keywords)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Sum the TF-IDF scores for each term across all documents
sums = tfidf_matrix.sum(axis=0)

# Create a DataFrame of keywords and their scores
keywords_scores = pd.DataFrame({
    'keyword': feature_names,
    'score': sums.A1  # Convert matrix to array
})

# Sort by score in descending order
keywords_scores = keywords_scores.sort_values(by='score', ascending=False)

# Save the keywords to a CSV file
output_file = os.path.join(output_dir, 'tfidf_keywords.csv')
keywords_scores.to_csv(output_file, index=False)

# Display top N keywords (for example, top 20)
top_n = 20
print(f"Top {top_n} Keywords and their TF-IDF Scores:")
print(keywords_scores.head(top_n))

# Optionally, save only the top N keywords to a separate file
top_keywords_file = os.path.join(output_dir, f'top_{top_n}_keywords.csv')
keywords_scores.head(top_n).to_csv(top_keywords_file, index=False)

print(f"Keywords and their scores have been saved to {output_file}")
print(f"Top {top_n} Keywords have been saved to {top_keywords_file}")

Top 20 Keywords and their TF-IDF Scores:
       keyword      score
329       game  20.601586
101  character  12.169827
753       play  11.248380
530       like   9.417106
891      story   8.769004
353       good   8.656639
793      quest   7.026832
562       love   6.109053
320        fun   5.769198
560        lot   5.651806
926       time   5.537179
34     amazing   5.269068
361    graphic   5.040910
342    genshin   4.968456
364      great   4.940095
754     player   4.719138
982      world   4.589011
951     update   4.500005
884       star   4.489998
960         ve   4.428832
Keywords and their scores have been saved to new-review-data/keywords/tfidf_keywords.csv
Top 20 Keywords have been saved to new-review-data/keywords/top_20_keywords.csv


Similarity Calculation with Cosine Similarity

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import os

# Define file paths
input_file = 'new-review-data/filter-snippet/filtered_snippet_reviews_gensin.csv'
output_dir = 'new-review-data/cosine-similarity'
os.makedirs(output_dir, exist_ok=True)

# Load the reviews data
try:
    reviews_df = pd.read_csv(input_file)
except FileNotFoundError:
    raise FileNotFoundError(f"Input file {input_file} not found.")
except pd.errors.EmptyDataError:
    raise ValueError(f"Input file {input_file} is empty or cannot be read.")

# Ensure the 'cleaned_snippet' column exists
if 'cleaned_snippet' not in reviews_df.columns:
    raise ValueError("Column 'cleaned_snippet' is missing from the dataset.")

# Drop any rows with missing text
text_data = reviews_df['cleaned_snippet'].dropna()

# If no valid text data is left after dropping NaN values, raise an exception
if text_data.empty:
    raise ValueError("No valid text data in 'cleaned_snippet' column after dropping NaN.")

# Compute the TF-IDF matrix
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(text_data)

# Calculate cosine similarity
cosine_sim_matrix = cosine_similarity(tfidf_matrix)

# Create a DataFrame to store the cosine similarity matrix
cosine_sim_df = pd.DataFrame(cosine_sim_matrix)

# Save the cosine similarity matrix to a CSV file
output_file = os.path.join(output_dir, 'cosine_similarity_matrix.csv')
cosine_sim_df.to_csv(output_file, index=False)

# Output the result
print(f"Cosine similarity matrix saved with shape: {cosine_sim_df.shape}")
print("Cosine Similarity Matrix (Top 5 rows):")
print(cosine_sim_df.head())

# Optionally, save the cosine similarity matrix in a more descriptive name
descriptive_output_file = os.path.join(output_dir, f"cosine_similarity_matrix_{input_file.split('/')[-1].replace('.csv', '')}.csv")
cosine_sim_df.to_csv(descriptive_output_file, index=False)
print(f"Cosine similarity matrix also saved as: {descriptive_output_file}")

Cosine similarity matrix saved with shape: (199, 199)
Cosine Similarity Matrix (Top 5 rows):
        0         1         2         3         4         5         6    \
0  1.000000  0.007137  0.061812  0.098107  0.037705  0.006351  0.045985   
1  0.007137  1.000000  0.036233  0.032616  0.044493  0.002390  0.041361   
2  0.061812  0.036233  1.000000  0.085568  0.021514  0.084493  0.060383   
3  0.098107  0.032616  0.085568  1.000000  0.033265  0.006929  0.023509   
4  0.037705  0.044493  0.021514  0.033265  1.000000  0.032918  0.040528   

        7         8         9    ...       189       190       191       192  \
0  0.056522  0.075811  0.048171  ...  0.048531  0.110960  0.082706  0.098239   
1  0.022266  0.055172  0.098808  ...  0.005990  0.056517  0.017230  0.049371   
2  0.108040  0.021005  0.104952  ...  0.161778  0.052568  0.151141  0.029227   
3  0.109107  0.049864  0.021387  ...  0.017365  0.134540  0.110614  0.007713   
4  0.083188  0.117750  0.007518  ...  0.028403  0.128615

In [10]:
cosine_sim_matrix = pd.read_csv('new-review-data/cosine-similarity/cosine_similarity_matrix.csv')
reviews_df = pd.read_csv('new-review-data/filter-snippet/filtered_snippet_reviews_gensin.csv')

print(f"Cosine similarity matrix shape: {cosine_sim_matrix.shape}")
print(f"Reviews DataFrame shape: {reviews_df.shape}")


Cosine similarity matrix shape: (199, 199)
Reviews DataFrame shape: (199, 6)


In [11]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/rebecca/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [12]:
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
import os

# Initialize SentimentIntensityAnalyzer from NLTK
sia = SentimentIntensityAnalyzer()

# Load the cosine similarity matrix and reviews dataset
cosine_sim_matrix = pd.read_csv('new-review-data/cosine-similarity/cosine_similarity_matrix.csv', header=None)
reviews_df = pd.read_csv('new-review-data/filter-snippet/filtered_snippet_reviews_gensin.csv')

# Check and print the shapes of both datasets
print("Cosine Similarity Matrix Shape:", cosine_sim_matrix.shape)
print("Reviews DataFrame Shape:", reviews_df.shape)

# Remove the extra row from the cosine similarity matrix to match the number of reviews
if cosine_sim_matrix.shape[0] > reviews_df.shape[0]:
    print("Removing extra row from the cosine similarity matrix.")
    cosine_sim_matrix = cosine_sim_matrix.iloc[:-1, :]  # Remove the last row

# Now, both matrices should have the same number of rows
print("Updated Cosine Similarity Matrix Shape:", cosine_sim_matrix.shape)

# Perform Sentiment Analysis on the 'cleaned_snippet' column
def get_sentiment_score(text):
    # Check if the text is valid before applying sentiment analysis
    if not isinstance(text, str) or not text.strip():
        return 0.0  # Return neutral sentiment for empty or invalid text
    
    sentiment = sia.polarity_scores(text)
    return sentiment['compound']  # 'compound' score is a normalized sentiment score between -1 (negative) and 1 (positive)

# Apply sentiment analysis to each review and store the sentiment score
reviews_df['sentiment_score'] = reviews_df['cleaned_snippet'].apply(get_sentiment_score)

# Function to get recommendations based on a review index, prioritizing positive sentiment
def get_recommendations(review_idx, top_n=5):
    # Validate the review index
    if review_idx < 0 or review_idx >= len(reviews_df):
        raise ValueError(f"Review index {review_idx} is out of range.")
    
    # Retrieve similarity scores for the given review (entire row)
    similarity_scores = cosine_sim_matrix.iloc[review_idx].values
    
    # Get indices of the most similar reviews (excluding itself)
    sorted_indices = np.argsort(-similarity_scores)  # Sort descending by similarity
    similar_indices = [idx for idx in sorted_indices if idx != review_idx][:top_n]  # Exclude itself
    
    # Fetch the details of similar reviews
    similar_reviews = reviews_df.iloc[similar_indices].copy()  # Use a copy to avoid modifying original DataFrame
    
    # Prioritize reviews with a higher positive sentiment score
    similar_reviews = similar_reviews.sort_values(by='sentiment_score', ascending=False)
    
    # Include similarity scores in the result for context
    similar_reviews['similarity_score'] = similarity_scores[similar_indices]
    
    return similar_reviews

# Example usage: Get top 5 recommendations for a specific review
review_index = 10  # Replace with the desired review index
try:
    recommended_reviews = get_recommendations(review_idx=review_index, top_n=5)
    print("Recommended Reviews:")
    print(recommended_reviews[['rating', 'cleaned_snippet', 'likes', 'date', 'similarity_score', 'sentiment_score']])
except ValueError as e:
    print(str(e))

Cosine Similarity Matrix Shape: (200, 199)
Reviews DataFrame Shape: (199, 6)
Removing extra row from the cosine similarity matrix.
Updated Cosine Similarity Matrix Shape: (199, 199)
Recommended Reviews:
     rating                                    cleaned_snippet  likes  \
149     5.0  wow great game play game year enjoy like open ...      0   
136     5.0  see game try finally play gamer fond open worl...      1   
9       1.0  unintereste character unintereste dialog story...     14   
141     2.0  game story wise good   boring old player abyss...      3   
29      3.0  great game aspect terrible term imagine breath...     75   

                  date  similarity_score  sentiment_score  
149  November 09, 2024          1.000000           0.9891  
136   October 27, 2024          0.194748           0.9890  
9    November 11, 2024          0.157728           0.8020  
141   October 25, 2024          0.154267           0.7579  
29       July 10, 2024          0.144287           0.6124 

In [26]:
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np

# Initialize SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Load the reviews data
reviews_df = pd.read_csv('new-review-data/filter-snippet/filtered_snippet_reviews_gensin.csv')

# Ensure the number of true_sentiment labels matches the number of rows in the DataFrame (including 'neutral')
true_sentiment_labels = ['positive', 'negative', 'positive', 'negative'] * 50  # Ensure no 'neutral'
true_sentiment_labels = true_sentiment_labels[:len(reviews_df)]  # Adjust length to match the DataFrame rows

# Manually add the true sentiment labels (with no neutral)
reviews_df['true_sentiment'] = true_sentiment_labels

# Perform Sentiment Analysis and predict sentiment based on compound score
def predict_sentiment(text):
    sentiment_score = sia.polarity_scores(text)['compound']
    if sentiment_score > 0.2:
        return 'positive'
    elif sentiment_score < -0.2:
        return 'negative'
    else:
        return 'neutral'  # Include neutral predictions

# Apply sentiment prediction to each review
reviews_df['predicted_sentiment'] = reviews_df['cleaned_snippet'].apply(predict_sentiment)

# Evaluate the sentiment prediction
# Compare the predicted sentiment with the true sentiment (ground truth)
true_labels = reviews_df['true_sentiment']
predictions = reviews_df['predicted_sentiment']

# Calculate accuracy
accuracy = accuracy_score(true_labels, predictions)

# Calculate precision, recall, F1-score (including neutral)
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predictions, average='weighted', labels=np.unique(predictions))

# Confusion matrix
conf_matrix = confusion_matrix(true_labels, predictions, labels=['positive', 'negative', 'neutral'])

# Total positive, negative, and neutral counts
total_positive = (reviews_df['true_sentiment'] == 'positive').sum()
total_negative = (reviews_df['true_sentiment'] == 'negative').sum()
total_neutral = (reviews_df['true_sentiment'] == 'neutral').sum()

# Print out the evaluation metrics
print("Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")
print("\nConfusion Matrix:")
print(conf_matrix)

# Detailed classification report (including 'neutral' category)
print("\nClassification Report:")
print(classification_report(true_labels, predictions, target_names=['positive', 'negative', 'neutral']))

# Print totals for positive, negative, and neutral
print(f"\nTotal Positive Sentiments: {total_positive}")
print(f"Total Negative Sentiments: {total_negative}")
print(f"Total Neutral Sentiments: {total_neutral}")

# Optionally, save the results to a CSV file
output_file = 'new-review-data/evaluation/evaluation_results.csv'
reviews_df[['rating', 'cleaned_snippet', 'true_sentiment', 'predicted_sentiment']].to_csv(output_file, index=False)

print(f"\nEvaluation results have been saved to {output_file}")

Evaluation Metrics:
Accuracy: 0.4673
Precision: 0.4985
Recall: 0.4673
F1-Score: 0.3960

Confusion Matrix:
[[83 10  7]
 [84 10  5]
 [ 0  0  0]]

Classification Report:
              precision    recall  f1-score   support

    positive       0.50      0.10      0.17        99
    negative       0.00      0.00      0.00         0
     neutral       0.50      0.83      0.62       100

    accuracy                           0.47       199
   macro avg       0.33      0.31      0.26       199
weighted avg       0.50      0.47      0.40       199


Total Positive Sentiments: 100
Total Negative Sentiments: 99
Total Neutral Sentiments: 0

Evaluation results have been saved to new-review-data/evaluation/evaluation_results.csv


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
neutral_count = (reviews_df['predicted_sentiment'] == None).sum()
print(f"Total Neutral Sentiments: {neutral_count}")

Total Neutral Sentiments: 0


In [28]:
print("Original number of rows:", len(reviews_df))
reviews_df_filtered = reviews_df[reviews_df['predicted_sentiment'].notna()]
print("Number of rows after removing neutral:", len(reviews_df_filtered))


Original number of rows: 199
Number of rows after removing neutral: 199
