In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [29]:
# 🔒 1. LOCK ALL RANDOMNESS
import numpy as np
import random
import os

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
os.environ['PYTHONHASHSEED'] = str(SEED)


**Step 1: Load and basic inception**

In [30]:
df = pd.read_csv("Reviews.csv")
df.head(10)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,4,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,5,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,5,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,5,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,5,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


In [31]:
# Shape of the dataset
print(f'Total columns: {df.shape[1]}\nTotal rows: {df.shape[0]}')

Total columns: 10
Total rows: 568454


In [33]:
# Word count
df['reviewwordcount'] = df['Text'].apply(lambda x: len(str(x).split()))

# Duplicates
df['duplicatetext'] = df.duplicated(subset='Text', keep=False)

# User review count
df['userreviewcount'] = df['UserId'].map(df['UserId'].value_counts())

# Helpfulness ratio
df['helpfulnessratio'] = df.apply(lambda x: x['HelpfulnessNumerator'] / x['HelpfulnessDenominator']
                                  if x['HelpfulnessDenominator'] != 0 else 0, axis=1)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,reviewwordcount,duplicatetext,userreviewcount,helpfulnessratio
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,False,1,1.0
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,False,1,0.0
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,94,False,3,1.0
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,41,True,3,1.0
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,27,False,2,0.0


**Step 2: Create Labels for Fake vs. Genuine Reviews**

***Heuristics for Labeling***

We’ll use the following logic:

***Mark as Fake (label = 0) if:***

User has only 1 review in the dataset

OR HelpfulnessDenominator == 0 (nobody rated it useful)

OR the review is < 20 words long

OR review text is identical to others (copied/pasted)

***Mark as Genuine (label = 1) if:***

User has multiple reviews

AND review is longer and detailed

AND some helpful votes



In [36]:
def smart_label(row):
    suspicious_score = 0

    # Each signal adds to the suspicion score
    if row['userreviewcount'] == 1:
        suspicious_score += 1
    if row['reviewwordcount'] < 20:
        suspicious_score += 1
    if row['helpfulnessratio'] < 0.1:
        suspicious_score += 1
    if row['duplicatetext']:
        suspicious_score += 1

    # Try threshold of 2 first (you can experiment with 2 or 3)
    if suspicious_score >= 2:
        return 0  # fake
    else:
        return 1  # genuine

In [37]:
df['label'] = df.apply(smart_label, axis=1)
df.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,reviewwordcount,duplicatetext,userreviewcount,helpfulnessratio,label
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,False,1,1.0,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,False,1,0.0,0
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,94,False,3,1.0,1
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,41,True,3,1.0,1
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,27,False,2,0.0,1


In [38]:
# Exploring label 
print(f'{df['label'].value_counts()}')
print(f'Fake review: {df['label'].value_counts(normalize=True)[0]:.2%}\nGenuine review: {df['label'].value_counts(normalize=True)[1]:.2%}')

label
1    339280
0    229174
Name: count, dtype: int64
Fake review: 40.32%
Genuine review: 59.68%


**Step 3: Cleaning the Review Text**

In [39]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /Users/yasir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/yasir/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/yasir/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [40]:
# Cleaning feature names
df.columns = df.columns.str.strip().str.lower().str.replace("_","")
df.head()

Unnamed: 0,id,productid,userid,profilename,helpfulnessnumerator,helpfulnessdenominator,score,time,summary,text,reviewwordcount,duplicatetext,userreviewcount,helpfulnessratio,label
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,False,1,1.0,1
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,False,1,0.0,0
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,94,False,3,1.0,1
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,41,True,3,1.0,1
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,27,False,2,0.0,1


In [41]:
# Unimportant words
stop_words = set(stopwords.words("english"))
print(stop_words)

{'here', 'it', "haven't", 'there', "they'll", 'will', 'about', 'but', 'they', "isn't", 'is', 'all', "she'll", 'itself', 've', 'our', "i've", 'were', 'who', 'against', 'off', 'below', 'as', 'didn', 'how', 'on', 'to', "mightn't", 'doesn', "she's", 'with', 'if', 'don', 'an', "couldn't", 'your', 'm', 'very', 'by', "shan't", 'any', 'only', 'my', 'once', "they're", 'mustn', 'hers', 'than', 'mightn', 'more', 'which', "they'd", 'yourself', 'am', 'can', "didn't", 'now', 'theirs', "we'd", 'during', 'such', 'then', "i'd", 'hadn', "you've", 'other', 'the', "they've", "we're", 'weren', 'yourselves', 'not', "he'd", 'be', 'been', "wouldn't", 'has', "you're", 'their', 'before', 'out', 'had', 'some', "doesn't", 'wasn', 'under', 'a', 'further', 'y', 'for', 'his', 'wouldn', "hadn't", 'do', 'does', 'doing', 'have', 'between', "wasn't", 'or', "mustn't", 'shan', 'where', "hasn't", 'o', "don't", "you'll", 'these', "that'll", 'themselves', 'down', 's', "shouldn't", 'ourselves', 'ma', 'just', 'and', 'd', 'her'

In [45]:
Lemmatizer = WordNetLemmatizer()

def clean_text(text):
    #converts to lower case
    text = text.lower()
    #removes HTML tags
    text = re.sub(r'<.*?>',' ',text)
    #removes URL
    text = re.sub(r'http\S+|www\S+|https\S+', ' ', text)
    #removes punctuation
    text = text.translate(str.maketrans(" ", " ", string.punctuation))
    #removes numbers
    text = re.sub(r'\d+', " ",text)
    #tokinize(splitting into single word) and remove stop words
    tokens = nltk.word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words and len(t)>2]
    #Lemmatizing (bring word to basic form, running=run)
    tokens = [Lemmatizer.lemmatize(t) for t in tokens]
    cleaned = " ".join(tokens)
    return cleaned

In [46]:
# applying function to the data
df['cleantext'] = df['text'].apply(clean_text)
df[['text', 'cleantext']].sample(5)


Unnamed: 0,text,cleantext
165256,Having tried a couple of other brands of glute...,tried couple brand glutenfree sandwich cooky b...
231465,My cat loves these treats. If ever I can't fin...,cat love treat ever cant find house pop top bo...
427827,A little less than I expected. It tends to ha...,little le expected tends muddy taste expected ...
433954,"First there was Frosted Mini-Wheats, in origin...",first frosted miniwheats original size frosted...
70260,and I want to congratulate the graphic artist ...,want congratulate graphic artist putting entir...


In [47]:
df.head()

Unnamed: 0,id,productid,userid,profilename,helpfulnessnumerator,helpfulnessdenominator,score,time,summary,text,reviewwordcount,duplicatetext,userreviewcount,helpfulnessratio,label,cleantext
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,48,False,1,1.0,1,bought several vitality canned dog food produc...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,31,False,1,0.0,0,product arrived labeled jumbo salted peanutsth...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,94,False,3,1.0,1,confection around century light pillowy citrus...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,41,True,3,1.0,1,looking secret ingredient robitussin believe f...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...,27,False,2,0.0,1,great taffy great price wide assortment yummy ...


**Step 4: Feature Engineering — TF-IDF & n-grams**

In [48]:
# TF-IDF vectorization
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 3),
    analyzer='word',
    sublinear_tf=True,
    stop_words='english'
)

X_tfidf = vectorizer.fit_transform(df['cleantext'])

# Target variable
y = df['label']

In [59]:
from sklearn.preprocessing import StandardScaler

# Step 1: Extract metadata features
X_meta = df[['reviewwordcount', 'helpfulnessratio', 'userreviewcount']].fillna(0)

# Step 2: Create and fit the scaler
scaler = StandardScaler()
X_meta_scaled = scaler.fit_transform(X_meta)


In [60]:
from scipy.sparse import hstack

# Combine TF-IDF and scaled metadata
X = hstack([X_tfidf, X_meta_scaled])

In [62]:
print("Feature matrix shape:", X.shape)
print("Target shape:", y.shape)

Feature matrix shape: (568454, 5003)
Target shape: (568454,)


In [63]:
feature_names = vectorizer.get_feature_names_out()
print("Sample features:", feature_names[:20])


Sample features: ['ability' 'able' 'able buy' 'able eat' 'able order' 'absolute'
 'absolute favorite' 'absolutely' 'absolutely delicious' 'absolutely love'
 'absolutely loved' 'absorb' 'absorbed' 'acai' 'accept' 'acceptable'
 'access' 'accident' 'accidentally' 'according']


In [64]:
df = df.dropna(subset=['cleantext', 'label'])

**Step 5: SMOTE, SVM Model Training, Tuning & Evaluation**

In [65]:
#Train test split
X_train,X_test,y_train,y_test = train_test_split(
    X,y,
    test_size=0.2,
    stratify=y,
    random_state=42
)
# SMOTE for imbalanced class
smote = SMOTE(random_state=42)
X_train_bal,y_train_bal = smote.fit_resample(X_train,y_train)
print(f'Before smote:\n{y_train.value_counts()}')
print(f'After smote:\n{y_train_bal.value_counts()}')

Before smote:
label
1    271424
0    183339
Name: count, dtype: int64
After smote:
label
1    271424
0    271424
Name: count, dtype: int64


In [66]:
# Model
svm_bal = LinearSVC(
    class_weight='balanced',
    max_iter=10000
)
svm_bal.fit(X_train_bal,y_train_bal)



In [67]:
# Prediction
y_predict_bal = svm_bal.predict(X_test)


In [68]:
# Evaluation
print(f'Classification report:\n{classification_report(y_test,y_predict_bal)}')
print(f'Confusion matrix:\n{confusion_matrix(y_test,y_predict_bal)}')
print(f'ROC-ACC score:\n{roc_auc_score(y_test,y_predict_bal)}')

Classification report:
              precision    recall  f1-score   support

           0       0.76      0.94      0.84     45835
           1       0.95      0.80      0.87     67856

    accuracy                           0.85    113691
   macro avg       0.85      0.87      0.85    113691
weighted avg       0.87      0.85      0.86    113691

Confusion matrix:
[[42932  2903]
 [13634 54222]]
ROC-ACC score:
0.8678693160166406


In [69]:
import joblib

joblib.dump(svm_bal, 'svm_model_best.pkl')
joblib.dump(vectorizer, 'vectorizer_best.pkl')
joblib.dump(scaler, 'scaler_best.pkl')
df.to_csv('labeled_data_final.csv', index=False)

