In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load the dataset
data = pd.read_csv('review_data_clean.csv')
data.head()

Unnamed: 0,content,content_clean,score,label,content_length,content_clean_length
0,It's cool. pretty addicted to it try it! There...,cool pretti addict tri there mani thing,-1,Negative,15,7
1,fayjzugz GC hi hd for dust be so hd si,fayjzugz gc hi hd dust hd si,1,Positive,10,7
2,Nice game,nice game,6,Positive,2,2
3,Unlimited hero event continue. Bd hole do this...,unlimit hero event continu bd hole hero max,0,Neutral,10,8
4,So nice game,nice game,6,Positive,3,2


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14567 entries, 0 to 14566
Data columns (total 6 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   content               14567 non-null  object
 1   content_clean         14567 non-null  object
 2   score                 14567 non-null  int64 
 3   label                 14567 non-null  object
 4   content_length        14567 non-null  int64 
 5   content_clean_length  14567 non-null  int64 
dtypes: int64(3), object(3)
memory usage: 683.0+ KB


In [4]:
# Menghapus feature
data.drop(['content', 'content_length', 'score','content_clean_length'], axis=1, inplace=True)
data.head()

Unnamed: 0,content_clean,label
0,cool pretti addict tri there mani thing,Negative
1,fayjzugz gc hi hd dust hd si,Positive
2,nice game,Positive
3,unlimit hero event continu bd hole hero max,Neutral
4,nice game,Positive


In [5]:
x = data['content_clean'].values
y = data['label'].values

In [6]:
# Split data menjadi test dan train
review_train, review_test, label_train, label_test = train_test_split(x, y, test_size=0.2)

In [7]:
# Tokenisasi
review_train_texts = [' '.join(review.split()) for review in review_train]
review_test_texts = [' '.join(review.split()) for review in review_test]

In [8]:
# Menggunakan CountVectorizer
count_vectorizer = CountVectorizer(max_features=200)
X_train_counts = count_vectorizer.fit_transform(review_train_texts)
X_test_counts = count_vectorizer.transform(review_test_texts)

# Menggunakan TF-IDF Transformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)


In [9]:
# Membuat rata-rata vektor
def get_average_vector(tokens_list, vectorizer):
    vectors = vectorizer.transform([' '.join(tokens) for tokens in tokens_list])
    avg_vec = np.mean(vectors.toarray(), axis=0)
    return avg_vec

review_train_tokens = [tokens.split() for tokens in review_train_texts]
review_test_tokens = [tokens.split() for tokens in review_test_texts]

X_train_vectorized = get_average_vector(review_train_tokens, count_vectorizer)
X_test_vectorized = get_average_vector(review_test_tokens, count_vectorizer)

In [10]:
# Function untuk training model
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    model.fit(X_train, y_train)
    
    train_accuracy = accuracy_score(y_train, model.predict(X_train))
    test_accuracy = accuracy_score(y_test, model.predict(X_test))
    
    print(f"Model: {model_name}")
    print(f"Train Accuracy: {train_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")
    print("Classification Report:")
    print(classification_report(y_test, model.predict(X_test)))
    print("\n")

# Definisikan model machine learning yang akan digunakan
models = {
    "RandomForest": RandomForestClassifier(),
    "SVM": SVC(),
    "AdaBoost": AdaBoostClassifier(),
    "GradientBoosting": GradientBoostingClassifier()
}

# Definisikan grid parameter untuk masing-masing model
param_grids = {
    "RandomForest": {'n_estimators': [100, 200, 300], 'max_depth': [None, 10, 20]},
    "SVM": {'C': [1, 10, 100], 'gamma': ['scale', 'auto']},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "GradientBoosting": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}
}

In [12]:
# Train dan evaluasi semua model dengan TF-IDF setelah tuning
print("=== Evaluasi dengan TF-IDF ===")
best_models_tfidf = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=1)
    grid_search.fit(X_train_tfidf, label_train)
    
    best_model = grid_search.best_estimator_
    best_models_tfidf[model_name] = best_model
    
    train_and_evaluate(best_model, X_train_tfidf, label_train, X_test_tfidf, label_test, model_name)
    print(f"Best parameters: {grid_search.best_params_}")
    print("\n")

=== Evaluasi dengan TF-IDF ===
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Model: RandomForest
Train Accuracy: 0.9822363339912469
Test Accuracy: 0.9450926561427591
Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.49      0.63       188
     Neutral       0.89      0.96      0.93       661
    Positive       0.97      0.98      0.97      2065

    accuracy                           0.95      2914
   macro avg       0.91      0.81      0.84      2914
weighted avg       0.94      0.95      0.94      2914



Best parameters: {'max_depth': None, 'n_estimators': 200}


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Model: SVM
Train Accuracy: 0.9486827426413799
Test Accuracy: 0.9468085106382979
Classification Report:
              precision    recall  f1-score   support

    Negative       0.87      0.55      0.67       188
     Neutral       0.89      0.97      0.92       661
    Positive       0.97     

In [14]:
# Train dan evaluasi semua model dengan CountVectorizer setelah tuning
print("=== Evaluasi dengan CountVectorizer ===")
best_models_counts = {}
for model_name, model in models.items():
    grid_search = GridSearchCV(model, param_grid=param_grids[model_name], cv=5, scoring='accuracy', verbose=1, n_jobs=1)
    grid_search.fit(X_train_counts, label_train)
    
    best_model = grid_search.best_estimator_
    best_models_counts[model_name] = best_model
    
    train_and_evaluate(best_model, X_train_counts, label_train, X_test_counts, label_test, model_name)
    print(f"Best parameters: {grid_search.best_params_}")
    print("\n")

=== Evaluasi dengan CountVectorizer ===
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Model: RandomForest
Train Accuracy: 0.9822363339912469
Test Accuracy: 0.9450926561427591
Classification Report:
              precision    recall  f1-score   support

    Negative       0.84      0.53      0.65       188
     Neutral       0.90      0.96      0.93       661
    Positive       0.97      0.98      0.97      2065

    accuracy                           0.95      2914
   macro avg       0.90      0.82      0.85      2914
weighted avg       0.94      0.95      0.94      2914



Best parameters: {'max_depth': None, 'n_estimators': 300}


Fitting 5 folds for each of 6 candidates, totalling 30 fits
Model: SVM
Train Accuracy: 0.959409594095941
Test Accuracy: 0.9519560741249142
Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.57      0.69       188
     Neutral       0.89      0.98      0.93       661
    Positive       0

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, GlobalMaxPooling1D

# Define LSTM model
lstm_model = Sequential([
    Embedding(input_dim=200, output_dim=100),  # Adjust input_dim and output_dim as needed
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPooling1D(),
    Dense(64, activation='relu'),
    Dense(3, activation='softmax')  # num_classes = 3 for Positive, Neutral, Negative
])

# Compile LSTM model
lstm_model.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\Asus\anaconda3\envs\base-env\Lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\Asus\anaconda3\envs\base-env\Lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c:\Users\Asus\anaconda3\envs\base-env\Lib\site-packages\ipykernel\kernelapp.py", line 739, in start
    self.io_loop.start()
  File 

ImportError: 
A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.0.0 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.



ImportError: numpy.core._multiarray_umath failed to import

ImportError: numpy.core.umath failed to import

In [None]:
import tensorflow_hub as hub

# Example with BERT from TensorFlow Hub
bert_model = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3", trainable=True)

# Build model using BERT
input_text = tf.keras.layers.Input(shape=(), dtype=tf.string)
preprocessor = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")
encoder_inputs = preprocessor(input_text)
encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4", trainable=True)
outputs = encoder(encoder_inputs)
net = outputs['pooled_output']
net = tf.keras.layers.Dense(3, activation='softmax')(net)

bert_model = tf.keras.Model(input_text, net)
bert_model.compile(optimizer='adam',
                   loss='sparse_categorical_crossentropy',
                   metrics=['accuracy'])

In [None]:
class AccuracyThresholdCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        if logs['accuracy'] > 0.93 and logs['val_accuracy'] > 0.93:
            print(f"\nReached 93% accuracy, stopping training.")
            self.model.stop_training = True

# Example usage:
callbacks = [AccuracyThresholdCallback()]

In [None]:
import matplotlib.pyplot as plt

def plot_history(history):
    plt.figure(figsize=(12, 6))
    plt.plot(history.history['accuracy'], label='Train Accuracy')
    plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
    plt.title('Model Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.grid(True)
    plt.show()

# Example usage:
history = lstm_model.fit(X_train_tfidf, label_train, epochs=10, validation_data=(X_test_tfidf, label_test), callbacks=callbacks)
plot_history(history)