In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Download NLTK stop words and tokenizer
nltk.download('stopwords')
nltk.download('punkt')

# Load the dataset (make sure to adjust the file path to where you saved the dataset)
try:
    df = pd.read_csv('https://raw.githubusercontent.com/SayamAlt/Fake-Reviews-Detection/main/fake%20reviews%20dataset.csv')
    print("Dataset loaded successfully.")
except FileNotFoundError:
    print("File not found. Please ensure the file path is correct.")
    exit()

# Print the column names to inspect the DataFrame
print("Columns in the dataset:", df.columns)

# For demonstration, we'll assume 'text_' contains the review and 'label' is 1 for genuine and 0 for fake.
# Adjust this based on your dataset's actual column names.
# Example: if your column name is different, change 'text_' to the correct column name.
if 'text_' not in df.columns:
    print("The column 'text_' does not exist in the dataset. Please check the column names.")
    exit()

# Creating synthetic labels for demonstration purposes
np.random.seed(42)  # For reproducibility
df['label'] = np.random.randint(0, 2, df.shape[0])
print("Creating synthetic labels for demonstration purposes.")

# Preprocess function to clean the review text
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    words = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

# Apply the preprocess function to the review text
df['text_'] = df['text_'].astype(str).apply(preprocess_text)
print("Text preprocessing completed.")

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['text_'], df['label'], test_size=0.3, random_state=42)
print("Data splitting into training and testing sets completed.")

# Vectorize the text data using TF-IDF
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("Text vectorization using TF-IDF completed.")

# Train the Logistic Regression classifier
classifier = LogisticRegression(max_iter=200)
classifier.fit(X_train_tfidf, y_train)
print("Model training completed.")

# Predict on the test data
y_pred = classifier.predict(X_test_tfidf)
print("Model prediction completed.")

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:\n", classification_rep)

# Save the preprocessed DataFrame to a CSV file (optional)
df.to_csv('preprocessed_amazon_reviews.csv', index=False)
print("Preprocessed data saved to 'preprocessed_amazon_reviews.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KETHANA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KETHANA\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset loaded successfully.
Columns in the dataset: Index(['category', 'rating', 'label', 'text_'], dtype='object')
Creating synthetic labels for demonstration purposes.
Text preprocessing completed.
Data splitting into training and testing sets completed.
Text vectorization using TF-IDF completed.
Model training completed.
Model prediction completed.
Accuracy: 0.5032976092333058
Classification Report:
               precision    recall  f1-score   support

           0       0.51      0.50      0.50      6107
           1       0.50      0.51      0.51      6023

    accuracy                           0.50     12130
   macro avg       0.50      0.50      0.50     12130
weighted avg       0.50      0.50      0.50     12130

Preprocessed data saved to 'preprocessed_amazon_reviews.csv'.
