In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

import matplotlib.pyplot as plt
from wordcloud import WordCloud


In [2]:
file_path = "fake_reviews_dataset - fake_reviews_dataset.csv"
df = pd.read_csv(file_path)

df.head()


Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,category,rating,label,text_
1,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
2,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
3,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
4,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."


In [3]:
df.columns = ["category", "rating", "authenticity", "review"]

df = df[df["category"] != "category"]
df = df.dropna()

df.head()


Unnamed: 0,category,rating,authenticity,review
1,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
2,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
3,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
4,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."
5,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...


In [4]:
df.info()
df["category"].value_counts()
df["authenticity"].value_counts()


<class 'pandas.core.frame.DataFrame'>
Index: 40432 entries, 1 to 40432
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   category      40432 non-null  object
 1   rating        40432 non-null  object
 2   authenticity  40432 non-null  object
 3   review        40432 non-null  object
dtypes: object(4)
memory usage: 1.5+ MB


authenticity
CG    20216
OR    20216
Name: count, dtype: int64

In [5]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf.fit_transform(df["review"]).toarray()

y_category = df["category"]


In [6]:
le = LabelEncoder()
y_cat_encoded = le.fit_transform(y_category)

y_cat_onehot = to_categorical(y_cat_encoded)


In [7]:
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(
    X, y_cat_onehot, test_size=0.2, random_state=42
)


In [8]:
cat_model = Sequential()

cat_model.add(Dense(512, activation='relu', input_shape=(5000,)))
cat_model.add(Dropout(0.3))

cat_model.add(Dense(256, activation='relu'))
cat_model.add(Dropout(0.3))

cat_model.add(Dense(y_cat_onehot.shape[1], activation='softmax'))

cat_model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

cat_model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [9]:
history_cat = cat_model.fit(
    X_train_c, y_train_c,
    epochs=5,
    batch_size=256,
    validation_split=0.1
)


Epoch 1/5
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 52ms/step - accuracy: 0.6201 - loss: 1.2133 - val_accuracy: 0.7703 - val_loss: 0.6505
Epoch 2/5
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 46ms/step - accuracy: 0.8193 - loss: 0.5196 - val_accuracy: 0.7737 - val_loss: 0.6384
Epoch 3/5
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 42ms/step - accuracy: 0.8624 - loss: 0.3921 - val_accuracy: 0.7641 - val_loss: 0.6706
Epoch 4/5
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 48ms/step - accuracy: 0.8928 - loss: 0.3120 - val_accuracy: 0.7555 - val_loss: 0.7220
Epoch 5/5
[1m114/114[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 45ms/step - accuracy: 0.9160 - loss: 0.2485 - val_accuracy: 0.7493 - val_loss: 0.7981


In [11]:
y_pred_prob = cat_model.predict(X_test_c)
y_pred = np.argmax(y_pred_prob, axis=1)
y_true = np.argmax(y_test_c, axis=1)

# If you want to see the predicted category names:
y_pred_labels = le.inverse_transform(y_pred)
y_true_labels = le.inverse_transform(y_true)

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_true, y_pred, target_names=le.classes_))


[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step
Accuracy: 0.7617163348584147

Classification Report:



AttributeError: 'LabelEncoder' object has no attribute 'transform'