In [53]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import warnings

warnings.filterwarnings('ignore')

In [54]:
# Load the dataset
data = pd.read_csv('review_data_clean.csv')
data.head()

Unnamed: 0,content,content_clean,score,label,content_length,content_clean_length
0,It's cool. pretty addicted to it try it! There...,cool pretti addict tri there mani thing,-1,Negative,15,7
1,fayjzugz GC hi hd for dust be so hd si,fayjzugz gc hi hd dust hd si,1,Positive,10,7
2,Nice game,nice game,6,Positive,2,2
3,Unlimited hero event continue. Bd hole do this...,unlimit hero event continu bd hole hero max,0,Neutral,10,8
4,So nice game,nice game,6,Positive,3,2


In [55]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14567 entries, 0 to 14566
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   content               14566 non-null  object
 1   content_clean         14567 non-null  object
 2   score                 14567 non-null  int64 
 3   label                 14567 non-null  object
 4   content_length        14567 non-null  int64 
 5   content_clean_length  14567 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 683.0+ KB


In [56]:
# Menghapus feature
data.drop(['content', 'content_length', 'score','content_clean_length'], axis=1, inplace=True)
data.head()

Unnamed: 0,content_clean,label
0,cool pretti addict tri there mani thing,Negative
1,fayjzugz gc hi hd dust hd si,Positive
2,nice game,Positive
3,unlimit hero event continu bd hole hero max,Neutral
4,nice game,Positive


In [57]:
x = data['content_clean'].values
y = data['label'].values

In [58]:
# Split data menjadi test dan train
review_train, review_test, label_train, label_test = train_test_split(x, y, test_size=0.2)

In [59]:
review_train

array(['coc boleh tak kasih diskaun diamond', 'inshal', 'nice game', ...,
       'fun game play highli recommend', 'otp come id', 'best man'],
      dtype=object)

In [60]:
review_test

array(['game polit bargain chip fire new commun manag', 'good',
       'hero troop get dumber much level want rage fun game', ...,
       'cupiri', 'great game', 'experi much better'], dtype=object)

In [61]:
# Tokenisasi
review_train_texts = [' '.join(review.split()) for review in review_train]
review_test_texts = [' '.join(review.split()) for review in review_test]

# Menggunakan CountVectorizer
count_vectorizer = CountVectorizer(max_features=200)
X_train_counts = count_vectorizer.fit_transform(review_train_texts)
X_test_counts = count_vectorizer.transform(review_test_texts)

# Menggunakan TF-IDF Transformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

# Membuat rata-rata vektor
def get_average_vector(tokens_list, vectorizer):
    vectors = vectorizer.transform([' '.join(tokens) for tokens in tokens_list])
    avg_vec = np.mean(vectors.toarray(), axis=0)
    return avg_vec

review_train_tokens = [tokens.split() for tokens in review_train_texts]
review_test_tokens = [tokens.split() for tokens in review_test_texts]

X_train_vectorized = get_average_vector(review_train_tokens, count_vectorizer)
X_test_vectorized = get_average_vector(review_test_tokens, count_vectorizer)


In [62]:
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    
    print(f"Model: {model_name}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")
    print("Classification Report:")
    print(classification_report(y_test, model.predict(X_test)))
    print("\n")

models = {
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(),
    "AdaBoost": AdaBoostClassifier(),
    "GradientBoosting": GradientBoostingClassifier()
}

# Definisikan grid parameter untuk masing-masing model
param_grids = {
    "RandomForest": {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]},
    "SVM": {'C': [1, 10, 100], 'gamma': ['scale', 'auto']},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "GradientBoosting": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}
}

# Train dan evaluate semua model dengan TF-IDF setelah tuning
print("=== Evaluasi dengan TF-IDF ===")
best_models_tfidf = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train_tfidf, label_train)
    
    best_model = grid_search.best_estimator_
    best_models_tfidf[model_name] = best_model
    
    train_and_evaluate(best_model, X_train_tfidf, label_train, X_test_tfidf, label_test, model_name)
    print(f"Best parameters: {grid_search.best_params_}")
    print("\n")

# Train dan evaluate semua model dengan CountVectorizer setelah tuning
print("=== Evaluasi dengan CountVectorizer ===")
best_models_counts = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=-1)
    grid_search.fit(X_train_counts, label_train)
    
    best_model = grid_search.best_estimator_
    best_models_counts[model_name] = best_model
    
    train_and_evaluate(best_model, X_train_counts, label_train, X_test_counts, label_test, model_name)
    print(f"Best parameters: {grid_search.best_params_}")
    print("\n")

=== Evaluasi dengan TF-IDF ===
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Model: RandomForest
Train Accuracy: 0.9830086672959752
Test Accuracy: 0.9416609471516816
Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.54      0.67       200
     Neutral       0.89      0.95      0.92       699
    Positive       0.97      0.98      0.97      2015

    accuracy                           0.94      2914
   macro avg       0.91      0.82      0.85      2914
weighted avg       0.94      0.94      0.94      2914



Best parameters: {'max_depth': None, 'n_estimators': 200}


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Model: SVM
Train Accuracy: 0.9498841500042907
Test Accuracy: 0.9444063143445436
Classification Report:
              precision    recall  f1-score   support

    Negative       0.91      0.59      0.72       200
     Neutral       0.87      0.97      0.92       699
    Positive       0.97     

In [63]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader



ModuleNotFoundError: No module named 'torch'