In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [2]:
# Membaca dataset dengan pengecekan encoding
df = pd.read_csv('google-play-rev-gen-2.csv', encoding='utf-8')

# Memeriksa beberapa baris pertama
df.head(10)

Unnamed: 0,id,title,avatar,rating,snippet,likes,date,iso_date,response
0,e5384431-56f9-43fa-a32a-53296afc7f66,Seraphim,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,"While I've reviewed this before, I decided to ...",88,"October 09, 2024",2024-10-09T00:08:20Z,
1,6a73081f-3490-47ba-89fa-83744cb20940,TWOSTORE !,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,Very fun but I wish there was more fighting in...,35,"October 12, 2024",2024-10-12T06:39:01Z,
2,3a3d4c90-0b6e-45dc-b1e6-014659055bbf,A G,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,"Fun game, I enjoy the story. There is tons to ...",11,"October 30, 2024",2024-10-30T18:01:37Z,
3,99015538-1d26-4bd9-a02f-37bc2a361d1a,Astra,https://play-lh.googleusercontent.com/a-/ALV-U...,5.0,This game is phenomenal. The art style and sce...,77,"October 18, 2024",2024-10-18T19:11:12Z,
4,541b3b4d-97f6-42e0-9c68-059a63e1e67f,Angela Williams,https://play-lh.googleusercontent.com/a-/ALV-U...,2.0,"I love this game. However, it is incredibly la...",14,"October 14, 2024",2024-10-14T03:04:43Z,
5,ad484b6a-1b9c-42ab-9cee-9df6e28f12d4,Valerie,https://play-lh.googleusercontent.com/a-/ALV-U...,3.0,I used to be obsessed with this game but I hav...,99,"October 13, 2024",2024-10-13T14:02:08Z,
6,fb25cdf6-40d8-44e8-b1b9-d439ebd88565,Daniel “Chotara” Ricciardi,https://play-lh.googleusercontent.com/a-/ALV-U...,5.0,Highly recommend. I have been playing the game...,65,"October 18, 2024",2024-10-18T15:20:11Z,
7,36e821d3-9441-4eaa-94a7-9c7b9b7463b5,Amy,https://play-lh.googleusercontent.com/a-/ALV-U...,4.0,Hello! I absolutely love this game! It has inc...,29,"October 29, 2024",2024-10-29T00:27:08Z,
8,4b1e6dcb-d251-450e-9be6-358b4bb8e9d6,Feitan Desy,https://play-lh.googleusercontent.com/a-/ALV-U...,4.0,I've been playing this game since January 16th...,64,"October 08, 2024",2024-10-08T23:13:09Z,
9,27525772-1c0a-40e4-8321-4c5f0a0f7c64,Olivia Staringer,https://play-lh.googleusercontent.com/a-/ALV-U...,1.0,"Uninteresting characters, uninteresting dialog...",14,"November 11, 2024",2024-11-11T22:58:07Z,


In [3]:
print(df.columns)

Index(['id', 'title', 'avatar', 'rating', 'snippet', 'likes', 'date',
       'iso_date', 'response'],
      dtype='object')


Labeling Data

In [3]:
def classify_rating(rating):
    if rating in [4, 5]:
        return 'positive'
    elif rating in [1, 2]:
        return 'negative'
    elif rating == 3:
        return 'neutral'
    return 'unknown'

df['label'] = df['rating'].apply(classify_rating)


# Display the first few rows to check the new 'label' column
print(df[['rating', 'label']].head(10))

   rating     label
0     3.0   neutral
1     3.0   neutral
2     3.0   neutral
3     5.0  positive
4     2.0  negative
5     3.0   neutral
6     5.0  positive
7     4.0  positive
8     4.0  positive
9     1.0  negative


In [4]:
# Kalkulasi jumlah data per kelas
label_counts = df['label'].value_counts()

# Tampilkan jumlah data per label
print(label_counts)

# Jika ingin menghitung jumlah spesifik untuk bintang
rating_counts = df['rating'].value_counts()
print("\nJumlah data berdasarkan rating:")
print(rating_counts)

label
positive    122
negative     44
neutral      33
Name: count, dtype: int64

Jumlah data berdasarkan rating:
rating
5.0    80
4.0    42
3.0    33
1.0    23
2.0    21
Name: count, dtype: int64


Cleaning Text

In [5]:
# import pandas as pd
import spacy

# 3. Cleaning Text Menggunakan spaCy
nlp = spacy.load("en_core_web_sm")  # Pastikan model sudah diunduh: python -m spacy download en_core_web_sm

def clean_text_spacy(text):
    doc = nlp(text.lower())
    tokens = [
        token.lemma_  # Gunakan lemmatization
        for token in doc
        if not token.is_stop and not token.is_punct and not token.like_num and token.is_alpha
    ]
    return ' '.join(tokens)

# Bersihkan kolom 'snippet' dan tambahkan kolom baru
df['cleaned_snippet'] = df['snippet'].fillna('').apply(clean_text_spacy)

# Tampilkan beberapa hasil
print(df[['snippet', 'cleaned_snippet']].head(10))

                                             snippet  \
0  While I've reviewed this before, I decided to ...   
1  Very fun but I wish there was more fighting in...   
2  Fun game, I enjoy the story. There is tons to ...   
3  This game is phenomenal. The art style and sce...   
4  I love this game. However, it is incredibly la...   
5  I used to be obsessed with this game but I hav...   
6  Highly recommend. I have been playing the game...   
7  Hello! I absolutely love this game! It has inc...   
8  I've been playing this game since January 16th...   
9  Uninteresting characters, uninteresting dialog...   

                                     cleaned_snippet  
0  review decide edit response current opinion ga...  
1  fun wish fighting quest bunch talk run come qu...  
2  fun game enjoy story ton ton content clear bor...  
3  game phenomenal art style scenery stunning yes...  
4  love game incredibly laggy point unplayable gr...  
5  obsess game play month boring repetitive gamep... 

In [6]:
# 2. Feature Extraction dengan TF-IDF
tfidf = TfidfVectorizer(max_features=5000)  # Batasi jumlah fitur ke 5000 kata paling relevan
X = tfidf.fit_transform(df['cleaned_snippet']).toarray()
y = df['label']

In [7]:
# 3. Splitting Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [8]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split data sebelum oversampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Oversampling dengan SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Distribusi label setelah oversampling:")
print(pd.Series(y_train_resampled).value_counts())


Distribusi label setelah oversampling:
label
neutral     98
positive    98
negative    98
Name: count, dtype: int64


In [37]:
from sklearn.svm import SVC

# Model SVM dengan class_weight balanced
svm_model = SVC(kernel='rbf', C=1, gamma=0.1, random_state=42)
svm_model.fit(X_train_resampled, y_train_resampled)

# Evaluasi model
y_pred = svm_model.predict(X_test)

In [38]:
from sklearn.metrics import classification_report, f1_score, balanced_accuracy_score

# Evaluasi
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("F1-Score (Weighted):", f1_score(y_test, y_pred))
print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

    negative       0.33      0.33      0.33         9
     neutral       0.00      0.00      0.00         7
    positive       0.68      0.88      0.76        24

    accuracy                           0.60        40
   macro avg       0.34      0.40      0.37        40
weighted avg       0.48      0.60      0.53        40



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


ValueError: Target is multiclass but average='binary'. Please choose another average setting, one of [None, 'micro', 'macro', 'weighted'].

In [58]:
# # 4. Modelling dengan SVM (Support Vector Machine)
# # Grid Search untuk optimasi parameter
# parameters = {
#     'C': [0.1, 1, 10],  # Regularisasi
#     'kernel': ['linear', 'rbf'],  # Linear dan RBF kernel
#     'gamma': [0.01, 0.1, 1]  # Gamma untuk kernel RBF
# }

# svm_model = SVC(random_state=42)
# grid_search = GridSearchCV(svm_model, parameters, cv=3, scoring='accuracy', verbose=1, n_jobs=-1)
# grid_search.fit(X_train, y_train)

# # Model terbaik dari GridSearch
# best_model = grid_search.best_estimator_


In [59]:
# # 5. Evaluasi Model
# y_pred = best_model.predict(X_test)
# print("Best Parameters:", grid_search.best_params_)
# print("\nClassification Report:")
# print(classification_report(y_test, y_pred))
