In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('/content/fake_reviews_dataset - fake_reviews_dataset.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3
0,category,rating,label,text_
1,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
2,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
3,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
4,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."


In [4]:
df.columns = df.iloc[0]
df = df.reset_index(drop=True)
df = df.iloc[1:]

In [5]:
df.head()

Unnamed: 0,category,rating,label,text_
1,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
2,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
3,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
4,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."
5,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...


In [6]:
df.shape

(40432, 4)

In [7]:
df.isnull().sum()

Unnamed: 0_level_0,0
0,Unnamed: 1_level_1
category,0
rating,0
label,0
text_,0


In [8]:
df['text_'][1]

'Love this!  Well made, sturdy, and very comfortable.  I love it!Very pretty'

In [9]:
df['category'].value_counts()

Unnamed: 0_level_0,count
category,Unnamed: 1_level_1
Kindle_Store_5,4730
Books_5,4370
Pet_Supplies_5,4254
Home_and_Kitchen_5,4056
Electronics_5,3988
Sports_and_Outdoors_5,3946
Tools_and_Home_Improvement_5,3858
Clothing_Shoes_and_Jewelry_5,3848
Toys_and_Games_5,3794
Movies_and_TV_5,3588


In [10]:
df = df.drop_duplicates()

In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [12]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [13]:
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    words = text.split()

    words = [word for word in words if word not in stop_words]

    words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(words)

In [14]:
df['text_'] = df['text_'].apply(preprocess_text)

In [15]:
df['text_'][1]

'love well made sturdy comfortable love itvery pretty'

In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
tfidf = TfidfVectorizer(max_features=5000)

In [18]:
tfidf_matrix = tfidf.fit_transform(df['text_'])

In [19]:
tfidf_matrix.shape

(40419, 5000)

In [20]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['category'] = le.fit_transform(df['category'])

In [21]:
X = tfidf_matrix
y = df['rating']

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42,shuffle=True)

In [23]:
history = Sequential([
    layers.Dense(32, activation='relu', input_shape=(X.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

In [24]:
history.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['accuracy'])
history.summary()

In [25]:
history.fit(X_train,y_train,epochs=10,batch_size=32,validation_data=(X_test,y_test),verbose=1)

ValueError: Invalid dtype: object

# Task
Create a binary classification model to detect fake reviews. This involves encoding the 'label' column (where 'CG' is 0 for fake and 'OR' is 1 for genuine) as the target variable and using the `tfidf_matrix` as features. Split the data into training and testing sets, then define, compile, and train a Keras Sequential model for binary classification. Finally, evaluate the model's performance using accuracy, precision, recall, F1-score, and a confusion matrix to summarize its ability to detect fake reviews.

## Prepare Target Variable

### Subtask:
Encode the 'label' column into numerical values (0 and 1) for binary classification, where 'CG' can be 0 (fake) and 'OR' can be 1 (genuine).


**Reasoning**:
The subtask requires encoding the 'label' column into numerical values (0 and 1) for binary classification, with 'CG' as 0 and 'OR' as 1. The `LabelEncoder` from `sklearn.preprocessing` is suitable for this, and it has already been imported in a previous cell.



# Task
The user has approved the plan. I will proceed with generating the code to address the first subtask: Prepare Target Variable.

**User provided response:**

```python
df['label'] = df['label'].map({'CG': 0, 'OR': 1})
```

## Prepare Target Variable

### Subtask:
Encode the 'label' column into numerical values (0 and 1) for binary classification, where 'CG' can be 0 (fake) and 'OR' can be 1 (genuine). This ensures the target variable is ready for analysis and model training.


**Reasoning**:
To encode the 'label' column as requested, with 'CG' as 0 and 'OR' as 1, I will use the `map` function on the DataFrame column.



In [None]:
df['label'] = df['label'].map({'CG': 0, 'OR': 1})
print("Encoded 'label' column head:")
print(df['label'].head())
print("Encoded 'label' column value counts:")
print(df['label'].value_counts())

**Reasoning**:
The previous attempt to encode the 'label' column resulted in all NaN values, indicating an issue with the DataFrame's state or the content of the 'label' column. To ensure accurate encoding and proceed with the task, I will re-initialize the DataFrame from the original CSV, re-apply all necessary preprocessing steps (column renaming, row dropping, text preprocessing, and category encoding), and then correctly map the 'label' column to numerical values (0 for 'CG', 1 for 'OR'). This will ensure the target variable is properly prepared for model training.



In [None]:
df = pd.read_csv('/content/fake_reviews_dataset - fake_reviews_dataset.csv')

# Re-apply previous column renaming and row dropping steps
df.columns = df.iloc[0]
df = df.reset_index(drop=True)
df = df.iloc[1:]

# Re-apply text preprocessing
df['text_'] = df['text_'].apply(preprocess_text)

# Re-apply category encoding
df['category'] = le.fit_transform(df['category'])

# Now, apply the correct encoding for the 'label' column
df['label'] = df['label'].map({'CG': 0, 'OR': 1})

print("Encoded 'label' column head after correction:")
print(df['label'].head())
print("\nEncoded 'label' column value counts after correction:")
print(df['label'].value_counts())
print("\nDataFrame head after correction:")
print(df.head())

## Analyze Category-wise Label Distribution

### Subtask:
Group the DataFrame by 'category' and the encoded 'label' to count the occurrences of fake (0) and genuine (1) reviews within each category. This will provide a tabular summary of review types per category.


**Reasoning**:
To analyze the category-wise label distribution, I will group the DataFrame by 'category' and 'label', count the occurrences using `size()`, and then reset the index for a clean tabular summary. Finally, I will display the head of the result.



In [None]:
category_label_distribution = df.groupby(['category', 'label']).size().reset_index(name='count')
print("Category-wise label distribution (head):")
print(category_label_distribution.head())

## Visualize Category-wise Label Distribution

### Subtask:
Create a stacked bar chart or similar visualization to show the proportion of fake vs. genuine reviews for each product category. Include appropriate labels, title, and a legend to clearly distinguish between fake and genuine reviews.


**Reasoning**:
To visualize the distribution of fake vs. genuine reviews by category, I will first pivot the `category_label_distribution` DataFrame to prepare it for plotting. Then, I will create a stacked bar chart, adding appropriate labels, a title, and a legend for clarity.



In [None]:
category_label_pivot = category_label_distribution.pivot(index='category', columns='label', values='count')

plt.figure(figsize=(12, 7))
category_label_pivot.plot(kind='bar', stacked=True, figsize=(12, 7))
plt.title('Distribution of Fake vs. Genuine Reviews by Category', fontsize=16)
plt.xlabel('Product Category', fontsize=12)
plt.ylabel('Number of Reviews', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.legend(title='Review Type', labels=['Fake (0)', 'Genuine (1)'])
plt.tight_layout()
plt.show()
print("Stacked bar chart showing fake vs. genuine review distribution by category.")

## Define Features and Target

### Subtask:
Set the `X` variable to the already processed `tfidf_matrix` (features) and the `y` variable to the encoded 'label' column (target).


**Reasoning**:
To define the features and target variables for the model, I will assign the `tfidf_matrix` to `X` and the `df['label']` to `y`.



In [None]:
X = tfidf_matrix
y = df['label']

print("Shape of X (features):")
print(X.shape)
print("Head of y (target):")
print(y.head())

## Split Data

### Subtask:
Split the data into training and testing sets (X_train, X_test, y_train, y_test) to prepare for model training and evaluation, ensuring a random state for reproducibility.


**Reasoning**:
To ensure consistency between the features (`X`) and the target variable (`y`) for data splitting, I need to re-align the DataFrame `df` by applying `drop_duplicates()` to match the number of samples used for `tfidf_matrix` generation. After making `df` consistent, I will re-assign `X` and `y` to ensure both have the same number of rows before splitting.



In [None]:
df = df.drop_duplicates()

# Re-transform text_ using the fitted tfidf vectorizer
tfidf_matrix = tfidf.transform(df['text_'])

X = tfidf_matrix
y = df['label']

print(f"Shape of X (features) after consistency check: {X.shape}")
print(f"Shape of y (target) after consistency check: {y.shape}")

**Reasoning**:
I will split the data into training and testing sets using `train_test_split` with the specified parameters and then print the shapes of the resulting sets to verify the split.



In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(f"Shape of y_test: {y_test.shape}")

## Define Binary Classification Model

### Subtask:
Define a Keras Sequential model appropriate for binary classification, with a single output unit and a 'sigmoid' activation function.


**Reasoning**:
I will define a Keras Sequential model for binary classification as instructed, adding a dense layer with ReLU activation, a dropout layer, and a final dense layer with sigmoid activation, using `X_train.shape[1]` for the input shape.



In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    layers.Dense(32, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dropout(0.5),
    layers.Dense(1, activation='sigmoid')
])

print("Keras Sequential model defined successfully.")

**Reasoning**:
The Keras Sequential model has been defined, and now it needs to be compiled with an appropriate optimizer, loss function, and metrics for binary classification.



In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()
print("Keras Sequential model compiled successfully.")

**Reasoning**:
Now that the model is defined and compiled, I will train it using the `fit` method with the training and validation data, specifying the number of epochs and batch size.



In [None]:
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)
print("Model training complete.")

In [9]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

print("GPUs Available:", tf.config.list_physical_devices('GPU'))

df = pd.read_csv('/content/fake_reviews_dataset - fake_reviews_dataset.csv')
df.columns = df.iloc[0]
df = df.reset_index(drop=True)
df = df.iloc[1:]

df['label'] = df['label'].map({'CG': 0, 'OR': 1})

X_text = df['text_']
y = df['label']

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text, y, test_size=0.2, random_state=42, stratify=y
)

tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_test_tfidf = tfidf.transform(X_test_text)

tfidf_model = Sequential([
    Dense(32, activation='relu', input_shape=(X_train_tfidf.shape[1],)),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

tfidf_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
tfidf_model.fit(X_train_tfidf, y_train, epochs=5, batch_size=32, verbose=1)
tfidf_preds = (tfidf_model.predict(X_test_tfidf) > 0.5).astype(int)

max_words = 10000
max_len = 200

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train_text)

X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_test_seq = tokenizer.texts_to_sequences(X_test_text)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post')

embedding_model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

embedding_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
embedding_model.fit(X_train_pad, y_train, epochs=5, batch_size=32, verbose=1)
embedding_preds = (embedding_model.predict(X_test_pad) > 0.5).astype(int)

def evaluate_model(name, y_true, y_pred):
    print(f"\n{name} PERFORMANCE")
    print("-" * 30)
    print("Accuracy :", accuracy_score(y_true, y_pred))
    print("Precision:", precision_score(y_true, y_pred))
    print("Recall   :", recall_score(y_true, y_pred))
    print("F1-score :", f1_score(y_true, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))

evaluate_model("TF-IDF Model", y_test, tfidf_preds)
evaluate_model("Embedding Model", y_test, embedding_preds)


GPUs Available: [PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m480s[0m 473ms/step - accuracy: 0.8106 - loss: 0.4831
Epoch 2/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 135ms/step - accuracy: 0.9154 - loss: 0.2228
Epoch 3/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 68ms/step - accuracy: 0.9290 - loss: 0.1860
Epoch 4/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 53ms/step - accuracy: 0.9382 - loss: 0.1666
Epoch 5/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 48ms/step - accuracy: 0.9428 - loss: 0.1520
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 117ms/step
Epoch 1/5




[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 4ms/step - accuracy: 0.7115 - loss: 0.5299
Epoch 2/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8618 - loss: 0.3082
Epoch 3/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8801 - loss: 0.2669
Epoch 4/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.8945 - loss: 0.2483
Epoch 5/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.9125 - loss: 0.2071
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step

TF-IDF Model PERFORMANCE
------------------------------
Accuracy : 0.9065166316310127
Precision: 0.9041062208015737
Recall   : 0.9094731634924561
F1-score : 0.9067817509247842
Confusion Matrix:
 [[3654  390]
 [ 366 3677]]

Embedding Model PERFORMANCE
------------------------------
Accuracy : 0.8825275132929393
Precision: 0.8218