In [1]:
import pandas as pd
import numpy as np
import warnings

In [2]:
import os 
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [3]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression, SGDClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import NearestCentroid
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [4]:
df = pd.read_excel('/Users/docongluong/Downloads/food_clean.xlsx', index_col=None)
df.head()

Unnamed: 0,review_text_clean,label
0,gà tắm mắm phô mai kéo sợi siêu ngon giá mềm,1
1,gà bq hàn phô mai kéo sợi siêu ngon giá mềm,1
2,gà với khoai tây quá mặn,0
3,mình vừa đặt 1 phần gà 92k và vô cùng thất vọn...,0
4,đồ ăn chuẩn vị hàn quốc ngon giá cả hợp lí,1


In [5]:
text_data = np.array(df['review_text_clean'])
text_data

array(['gà tắm mắm  phô mai kéo sợi siêu ngon  giá mềm',
       'gà bq hàn  phô mai kéo sợi siêu ngon  giá mềm',
       'gà với khoai tây quá mặn', ...,
       'quán nằm trong đường trần quang diệu  mà phải đi sâu xíu  nếu không có ai giới thiệu chắc cũng không biết được  nhưng vì quán ngon và cũng khá nổi tiếng trong khu vực bình thuỷ nên là bạn bè hay rủ rê mình lại đấy \nđồ ăn ngon  nêm nếm đậm đà  đặc biệt ở đây phục vụ rất nhiều các món làm từ mực   đặc sản của quán  cũng là gương mặt thương hiệu của quán  ngoài ra  quán còn phục vụ rất nhiều món khác thích hợp cho những buổi nhâm nhi cùng bạn bè hay gia đình  \nvề giá cả thì phải chăng và hợp lý so với thị trường  \nnói chung là  ngon  ',
       'trời mưa lạnh thèm ăn bún bò  lần đầu lại đây mua ăn  lúc đó là 5rưỡi chiều  mua một tái nạm đem về  hết chả  nước để trong bọc nguội ngắt luôn  8 lát thịt vừa tái vừa nạm  35k  không hành tây  về nhà nấu sôi rồi ngồi ăn thấy hết thèm  hết ngon miệng  không biết có phải tại đến không đún

In [6]:
tfidf = TfidfVectorizer(max_features=None)
bag_of_words = tfidf.fit_transform(text_data)  # Sparse matrix

In [7]:
X = bag_of_words
X.shape

(31534, 17241)

In [8]:
y = np.array(df['label'])
y.shape

(31534,)

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_train[:5], y_train[:5]

(<5x17241 sparse matrix of type '<class 'numpy.float64'>'
 	with 272 stored elements in Compressed Sparse Row format>,
 array([1, 1, 1, 1, 0]))

In [10]:
def build_n_evaluate_model(model, X_train, y_train, X_test, y_test):
    # Train model
    model.fit(X_train, y_train)
    
    # Predictions on training and test sets
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate accuracy and metrics
    train_accuracy = accuracy_score(y_train, y_train_pred)
    test_accuracy = accuracy_score(y_test, y_test_pred)
    report = classification_report(y_test, y_test_pred, output_dict=True)
    
    recall = report['1']['recall']
    precision = report['1']['precision']
    f1 = report['1']['f1-score']
    
    print(f'-- Accuracy train: {train_accuracy:.4f}')
    print(f'-- Accuracy test: {test_accuracy:.4f}')
    print(f'-- Recall: {recall:.4f}, Precision: {precision:.4f}, F1: {f1:.4f}')
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, y_test_pred))
    print("Classification report:")
    print(classification_report(y_test, y_test_pred))

In [11]:
model1 = LogisticRegression(class_weight='balanced')
build_n_evaluate_model(model1, X_train, y_train, X_test, y_test)

-- Accuracy train: 0.9032
-- Accuracy test: 0.8850
-- Recall: 0.8921, Precision: 0.9514, F1: 0.9208
Confusion Matrix:
[[2049  323]
 [ 765 6324]]
Classification report:
              precision    recall  f1-score   support

           0       0.73      0.86      0.79      2372
           1       0.95      0.89      0.92      7089

    accuracy                           0.89      9461
   macro avg       0.84      0.88      0.86      9461
weighted avg       0.90      0.89      0.89      9461



In [13]:
import lightgbm as lgb
scale_pos_weight = (len(y_train) - sum(y_train)) / sum(y_train)

# Initialize LGBMClassifier with scale_pos_weight parameter before SMOTE
model2 = lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight)

# Train the model on the original imbalanced data
model2.fit(X_train, y_train)
# Evaluate the model using the `build_n_evaluate_model()` function on the resampled data
build_n_evaluate_model(model2, X_train, y_train, X_test, y_test)

[LightGBM] [Info] Number of positive: 16432, number of negative: 5641
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.208109 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 215978
[LightGBM] [Info] Number of data points in the train set: 22073, number of used features: 2887
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.744439 -> initscore=1.069169
[LightGBM] [Info] Start training from score 1.069169
[LightGBM] [Info] Number of positive: 16432, number of negative: 5641
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.184737 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 215978
[LightGBM] [Info] Number of data points in the train set: 22073, number of used features: 2887
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.744439 -> initscore=1.069169

In [14]:
model3 = RandomForestClassifier(n_estimators=100, random_state=42)
build_n_evaluate_model(model3, X_train, y_train, X_test, y_test)

-- Accuracy train: 0.9994
-- Accuracy test: 0.8700
-- Recall: 0.9819, Precision: 0.8633, F1: 0.9188
Confusion Matrix:
[[1270 1102]
 [ 128 6961]]
Classification report:
              precision    recall  f1-score   support

           0       0.91      0.54      0.67      2372
           1       0.86      0.98      0.92      7089

    accuracy                           0.87      9461
   macro avg       0.89      0.76      0.80      9461
weighted avg       0.87      0.87      0.86      9461



In [15]:
from sklearn.svm import SVC
model4 = SVC(class_weight='balanced', kernel='linear')
build_n_evaluate_model(model4, X_train, y_train, X_test, y_test)

-- Accuracy train: 0.9206
-- Accuracy test: 0.8850
-- Recall: 0.8983, Precision: 0.9455, F1: 0.9213
Confusion Matrix:
[[2005  367]
 [ 721 6368]]
Classification report:
              precision    recall  f1-score   support

           0       0.74      0.85      0.79      2372
           1       0.95      0.90      0.92      7089

    accuracy                           0.89      9461
   macro avg       0.84      0.87      0.85      9461
weighted avg       0.89      0.89      0.89      9461



In [25]:
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import keras
from keras import layers
# Step 1: Train the model on unbalanced data (Original training data)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = {i: class_weights[i] for i in range(len(class_weights))}

# Define the MLP Neural Network model
model_nn = keras.Sequential([
    layers.Dense(128, activation='relu', input_dim=X_train.shape[1]),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')  # Binary classification
])

# Compile the model
model_nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with class weights (on unbalanced data)
model_nn.fit(X_train, y_train, epochs=10, batch_size=32, class_weight=class_weight_dict, verbose=1)

# Evaluate the model on unbalanced test set
y_pred_nn = (model_nn.predict(X_test) > 0.5).astype("int32")
accuracy_nn = accuracy_score(y_test, y_pred_nn)
conf_matrix_nn = confusion_matrix(y_test, y_pred_nn)
class_report_nn = classification_report(y_test, y_pred_nn)

print("** Model on Unbalanced Data **")
print(f"Accuracy: {accuracy_nn:.4f}")
print("Confusion Matrix:")
print(conf_matrix_nn)
print("Classification Report:")
print(class_report_nn)


2025-01-17 03:41:21.173311: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 14ms/step - accuracy: 0.8459 - loss: 0.4355
Epoch 2/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 15ms/step - accuracy: 0.9141 - loss: 0.2446
Epoch 3/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 14ms/step - accuracy: 0.9389 - loss: 0.1774
Epoch 4/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.9613 - loss: 0.1174
Epoch 5/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 14ms/step - accuracy: 0.9755 - loss: 0.0780
Epoch 6/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9840 - loss: 0.0512
Epoch 7/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 12ms/step - accuracy: 0.9884 - loss: 0.0347
Epoch 8/10
[1m690/690[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 13ms/step - accuracy: 0.9931 - loss: 0.0235
Epoch 9/10
[1m690/690[0m [

In [26]:
Counter(y_train)

Counter({1: 16432, 0: 5641})

In [27]:
sm = SMOTE()
X_resample, y_resample = sm.fit_resample(X_train, y_train)

In [28]:
Counter(y_resample)

Counter({1: 16432, 0: 16432})

In [29]:
model1_o = LogisticRegression(class_weight='balanced')
model1_o.fit(X_resample, y_resample)
build_n_evaluate_model(model1_o, X_resample, y_resample, X_test, y_test)

-- Accuracy train: 0.9246
-- Accuracy test: 0.8860
-- Recall: 0.9072, Precision: 0.9386, F1: 0.9226
Confusion Matrix:
[[1951  421]
 [ 658 6431]]
Classification report:
              precision    recall  f1-score   support

           0       0.75      0.82      0.78      2372
           1       0.94      0.91      0.92      7089

    accuracy                           0.89      9461
   macro avg       0.84      0.86      0.85      9461
weighted avg       0.89      0.89      0.89      9461



In [35]:
scale_pos_weight = (len(y_resample) - sum(y_resample)) / sum(y_resample)

# Initialize LGBMClassifier with scale_pos_weight parameter
model2_o = lgb.LGBMClassifier(scale_pos_weight=scale_pos_weight)

# Train the model using the resampled data from SMOTE
model2_o.fit(X_resample, y_resample)

# Evaluate the model using the `build_n_evaluate_model()` function
build_n_evaluate_model(model2_o, X_resample, y_resample, X_test, y_test)

[LightGBM] [Info] Number of positive: 16432, number of negative: 16432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.383440 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 270198
[LightGBM] [Info] Number of data points in the train set: 32864, number of used features: 3503
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 16432, number of negative: 16432
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.327846 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 270198
[LightGBM] [Info] Number of data points in the train set: 32864, number of used features: 3503
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
-- Accuracy train: 0.9387
-- Accuracy test: 0.8864

In [36]:
model3_o = RandomForestClassifier(n_estimators=100, random_state=42)
model3_o.fit(X_resample, y_resample)
build_n_evaluate_model(model3_o, X_resample, y_resample, X_test, y_test)

-- Accuracy train: 0.9995
-- Accuracy test: 0.8808
-- Recall: 0.9571, Precision: 0.8917, F1: 0.9233
Confusion Matrix:
[[1548  824]
 [ 304 6785]]
Classification report:
              precision    recall  f1-score   support

           0       0.84      0.65      0.73      2372
           1       0.89      0.96      0.92      7089

    accuracy                           0.88      9461
   macro avg       0.86      0.80      0.83      9461
weighted avg       0.88      0.88      0.88      9461



In [37]:
model4_o = SVC(class_weight='balanced', kernel='linear')
model4_o.fit(X_resample, y_resample)
build_n_evaluate_model(model4_o, X_resample, y_resample, X_test, y_test)

-- Accuracy train: 0.9399
-- Accuracy test: 0.8892
-- Recall: 0.9161, Precision: 0.9348, F1: 0.9253
Confusion Matrix:
[[1919  453]
 [ 595 6494]]
Classification report:
              precision    recall  f1-score   support

           0       0.76      0.81      0.79      2372
           1       0.93      0.92      0.93      7089

    accuracy                           0.89      9461
   macro avg       0.85      0.86      0.86      9461
weighted avg       0.89      0.89      0.89      9461



In [39]:
# Step 3: Retrain the model on SMOTE-resampled data
model_nn.fit(X_resample, y_resample, epochs=10, batch_size=32, class_weight=class_weight_dict, verbose=1)

# Evaluate the model after SMOTE
y_pred_nn_smote = (model_nn.predict(X_test) > 0.5).astype("int32")
accuracy_nn_smote = accuracy_score(y_test, y_pred_nn_smote)
conf_matrix_nn_smote = confusion_matrix(y_test, y_pred_nn_smote)
class_report_nn_smote = classification_report(y_test, y_pred_nn_smote)

print("\n** Model after SMOTE **")
print(f"Accuracy: {accuracy_nn_smote:.4f}")
print("Confusion Matrix:")
print(conf_matrix_nn_smote)
print("Classification Report:")
print(class_report_nn_smote)

Epoch 1/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - accuracy: 0.9941 - loss: 0.0208
Epoch 2/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12ms/step - accuracy: 0.9965 - loss: 0.0113
Epoch 3/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - accuracy: 0.9965 - loss: 0.0105
Epoch 4/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - accuracy: 0.9968 - loss: 0.0091
Epoch 5/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 12ms/step - accuracy: 0.9970 - loss: 0.0084
Epoch 6/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 13ms/step - accuracy: 0.9973 - loss: 0.0087
Epoch 7/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - accuracy: 0.9974 - loss: 0.0070
Epoch 8/10
[1m1027/1027[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - accuracy: 0.9985 - loss: 0.0055
Epoch 9/

In [40]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import numpy as np

class KerasClassifierWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, model, epochs=10, batch_size=32):
        self.model = model
        self.epochs = epochs
        self.batch_size = batch_size

    def fit(self, X, y):
        # Fit the Keras model
        self.model.fit(X, y, epochs=self.epochs, batch_size=self.batch_size, verbose=0)
        return self

    def predict(self, X):
        # Make predictions using the Keras model
        return np.argmax(self.model.predict(X), axis=1)

# Wrap your MLP model with the custom wrapper
model_nn_sklearn = KerasClassifierWrapper(model_nn, epochs=10, batch_size=32)

base_models = [
    ('lr', model1_o),  # Logistic Regression
    ('lgbm', model2_o),  # LightGBM
    ('rf', model3_o),  # Random Forest
    ('svc', model4_o),  # Support Vector Classifier
    ('mlp', model_nn_sklearn)   # MLP wrapped with the custom KerasClassifierWrapper
]

# Define the meta-model (Logistic Regression)
meta_model = LogisticRegression()

# Create the Stacking Classifier
stacking_model = StackingClassifier(estimators=base_models, final_estimator=meta_model)

# Train the stacking model on the resampled data (after SMOTE)
stacking_model.fit(X_resample, y_resample)

# Evaluate the stacking model
y_pred_stacking = stacking_model.predict(X_test)
accuracy_stacking = accuracy_score(y_test, y_pred_stacking)
conf_matrix_stacking = confusion_matrix(y_test, y_pred_stacking)
class_report_stacking = classification_report(y_test, y_pred_stacking)

# Print the results
print("\n** Stacking Model after SMOTE **")
print(f"Accuracy: {accuracy_stacking:.4f}")
print("Confusion Matrix:")
print(conf_matrix_stacking)
print("Classification Report:")
print(class_report_stacking)


[LightGBM] [Info] Number of positive: 16432, number of negative: 16432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.330333 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 270198
[LightGBM] [Info] Number of data points in the train set: 32864, number of used features: 3503
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 13145, number of negative: 13146
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.266886 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 238429
[LightGBM] [Info] Number of data points in the train set: 26291, number of used features: 3188
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499981 -> initscore=-0.000076
[LightGBM] [Info] Start training from score -0.000076
[LightGBM] [Info] Number of positive: 13145, number of nega