In [3]:
#Import Libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score
import joblib

In [5]:
# Load dataset
data = pd.read_csv('C:/Users/JOE/Desktop/Final Project/fake reviews dataset.csv')
print(data.columns)


Index(['category', 'rating', 'label', 'text_'], dtype='object')


In [6]:
# --- Text Preprocessing ---
# Download necessary NLTK packages
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize stop words and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Clean the input text by tokenizing, removing stopwords, and lemmatizing.
    Args:
        text (str): Input text.
    Returns:
        str: Cleaned text.
    """
    words = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    cleaned_words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(cleaned_words)

# Apply the cleaning function to the 'text_' column (adjust column name as needed)
data['cleaned_text'] = data['text_'].apply(clean_text)

# Preview the cleaned data
print(data[['text_', 'cleaned_text']].head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\JOE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\JOE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\JOE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                               text_  \
0  Love this!  Well made, sturdy, and very comfor...   
1  love it, a great upgrade from the original.  I...   
2  This pillow saved my back. I love the look and...   
3  Missing information on how to use it, but it i...   
4  Very nice set. Good quality. We have had the s...   

                                    cleaned_text  
0  love well made sturdy comfortable love pretty  
1   love great upgrade original mine couple year  
2        pillow saved back love look feel pillow  
3    missing information use great product price  
4            nice set good quality set two month  


In [7]:
# --- Classification (Traditional Machine Learning) ---
# Split data into features (X) and target labels (y)
X_train, X_test, y_train, y_test = train_test_split(data['cleaned_text'], data['label'], test_size=0.2, random_state=42)

# Initialize the vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train_vect, y_train)

# Evaluate the model
y_pred = model.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Save the model and vectorizer
joblib.dump(model, 'text_classifier_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
print("Model and vectorizer saved.")

Accuracy: 0.8430814888092
Classification Report:
              precision    recall  f1-score   support

          CG       0.82      0.88      0.85      4016
          OR       0.87      0.81      0.84      4071

    accuracy                           0.84      8087
   macro avg       0.84      0.84      0.84      8087
weighted avg       0.84      0.84      0.84      8087

Model and vectorizer saved.


In [9]:
# --- Classification (Deep Learning with LSTM) ---
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

# Prepare data for LSTM
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Tokenize and pad sequences
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
max_sequence_length = 100
X_train_seq_pad = pad_sequences(X_train_seq, maxlen=max_sequence_length)
X_test_seq_pad = pad_sequences(X_test_seq, maxlen=max_sequence_length)


# Build the LSTM model
model_lstm = Sequential([
    Embedding(input_dim=5000, output_dim=128),  # Removed input_length
    LSTM(128),
    Dense(1, activation='sigmoid')
])

# Compile and train the model
model_lstm.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_lstm.fit(X_train_seq_pad, y_train, validation_data=(X_test_seq_pad, y_test), epochs=5, batch_size=32)

# Evaluate the LSTM model
y_pred_lstm = (model_lstm.predict(X_test_seq_pad) > 0.5).astype("int32")
print("Accuracy (LSTM):", accuracy_score(y_test, y_pred_lstm))
print("Classification Report (LSTM):")
print(classification_report(y_test, y_pred_lstm))


Epoch 1/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 65ms/step - accuracy: 0.7981 - loss: 0.4243 - val_accuracy: 0.8907 - val_loss: 0.2630
Epoch 2/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 64ms/step - accuracy: 0.9160 - loss: 0.2045 - val_accuracy: 0.9084 - val_loss: 0.2305
Epoch 3/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 62ms/step - accuracy: 0.9407 - loss: 0.1494 - val_accuracy: 0.9085 - val_loss: 0.2469
Epoch 4/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m65s[0m 64ms/step - accuracy: 0.9594 - loss: 0.1069 - val_accuracy: 0.9094 - val_loss: 0.2555
Epoch 5/5
[1m1011/1011[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m62s[0m 61ms/step - accuracy: 0.9689 - loss: 0.0846 - val_accuracy: 0.9044 - val_loss: 0.2781
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 24ms/step
Accuracy (LSTM): 0.9044144923952022
Classification Report (LSTM):
              precision    recall

In [10]:
# --- Topic Modeling (Using LDA) ---
from sklearn.decomposition import LatentDirichletAllocation

# Transform text with TF-IDF
X_tfidf = vectorizer.fit_transform(data['cleaned_text'])

# Apply LDA for topic modeling
lda = LatentDirichletAllocation(n_components=5, random_state=42)
lda.fit(X_tfidf)

# Display the top words in each topic
n_top_words = 10
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    print(f"Topic #{topic_idx + 1}:")
    print(", ".join([feature_names[i] for i in topic.argsort()[-n_top_words:]]))

Topic #1:
price, good, speaker, get, product, use, great, one, sound, work
Topic #2:
good, work, small, little, nice, light, size, comfortable, great, fit
Topic #3:
son, little, product, cat, toy, one, bought, great, dog, love
Topic #4:
one, blade, show, saw, great, watch, acting, film, good, movie
Topic #5:
reading, enjoyed, author, good, well, series, character, story, read, book
