In [14]:
# =======================================================================
# Install and Import Libraries
# =======================================================================
!pip install scikit-learn pandas nltk --upgrade --quiet

import pandas as pd
import joblib
import nltk
from nltk.corpus import stopwords
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report

# --- Download stopwords package ---
nltk.download('stopwords', quiet=True)

# =======================================================================
# Dataset Configuration
# =======================================================================
bucket = "fake-review-dataset-penguining"
train_file_path = "train/train.csv"
test_file_path = "test/test.csv"


# =======================================================================
# Load Train and Test Datasets
# =======================================================================
s3_train_path = f"s3://{bucket}/{train_file_path}"
s3_test_path = f"s3://{bucket}/{test_file_path}"

try:
    print(f"Loading training data from {s3_train_path}...")
    train_df = pd.read_csv(s3_train_path)

    print(f"Loading testing data from {s3_test_path}...")
    test_df = pd.read_csv(s3_test_path)

    # --- DIAGNOSTIC STEP ---
    print(f"\nColumns in your training file are: {train_df.columns.tolist()}")
    print(f"Columns in your testing file are: {test_df.columns.tolist()}")

    text_column_name = 'Review_text'
    label_column_name = 'Is_fake'

    # Clean and prepare both dataframes
    train_df = train_df.dropna(subset=[text_column_name, label_column_name])
    train_df[label_column_name] = train_df[label_column_name].astype(int)

    test_df = test_df.dropna(subset=[text_column_name, label_column_name])
    test_df[label_column_name] = test_df[label_column_name].astype(int)

    print("\nTraining and testing data loaded successfully.")

except FileNotFoundError as e:
    print(f"\nFATAL ERROR: File not found. The path '{e.filename}' is incorrect.")
    print("Please check your bucket name and file paths in the configuration section.")
    sys.exit()
except KeyError:
    print(f"\nFATAL ERROR: A required column was not found.")
    print("Please check the 'Columns in your ... file' output above and correct the column names in the script.")
    sys.exit()


# =======================================================================
# Define Features, Preprocess, and Train the Model
# =======================================================================
# Assign the data directly from files
X_train = train_df[text_column_name]
y_train = train_df[label_column_name]
X_test = test_df[text_column_name]
y_test = test_df[label_column_name]

# --- Create a Combined Stop Words List for English and Malay ---
english_stop_words = list(stopwords.words('english'))
malay_stop_words = [
    "ada", "adalah", "akan", "aku", "anda", "apa", "atau", "bahawa", "banyak", "dan", "dengan",
    "di", "dia", "ini", "itu", "jadi", "jika", "juga", "kamu", "kami", "ke", "kepada", "kerana",
    "ketika", "kita", "lagi", "lain", "maka", "mereka", "pada", "pula", "saja", "saya", "seperti",
    "sudah", "telah", "tetapi", "tidak", "untuk", "yang"
]
combined_stop_words = english_stop_words + malay_stop_words

# --- Create the scikit-learn pipeline ---
model_pipeline = Pipeline(steps=[
    ('vectorizer', TfidfVectorizer(max_features=5000, stop_words=combined_stop_words)),
    ('classifier', LogisticRegression(max_iter=1000, class_weight='balanced'))
])

# --- Train the model on the training data ---
print("\nTraining the model...")
model_pipeline.fit(X_train, y_train)
print("Model training complete!")


# =======================================================================
# Evaluate the Model and Save the Final Artifact
# =======================================================================
# --- Evaluate the model on the unseen testing data ---
print("\nEvaluating model performance on the test set...")
predictions = model_pipeline.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print(f"Model Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, predictions, target_names=['Real (0)', 'Fake (1)']))

# --- Save the final trained model pipeline ---
joblib.dump(model_pipeline, 'model_pipeline.joblib')

print("-" * 50)
print("Success! Trained model has been saved as 'model_pipeline.joblib'.")

Loading training data from s3://fake-review-dataset-penguining/train/train.csv...
Loading testing data from s3://fake-review-dataset-penguining/test/test.csv...

Columns in your training file are: ['Is_fake', 'Review_text']
Columns in your testing file are: ['Is_fake', 'Review_text']

Training and testing data loaded successfully.

Training the model...
Model training complete!

Evaluating model performance on the test set...
Model Accuracy: 0.9420

Classification Report:
              precision    recall  f1-score   support

    Real (0)       0.98      0.95      0.96      3177
    Fake (1)       0.78      0.92      0.85       666

    accuracy                           0.94      3843
   macro avg       0.88      0.93      0.90      3843
weighted avg       0.95      0.94      0.94      3843

--------------------------------------------------
Success! Trained model has been saved as 'model_pipeline.joblib'.


## User activity model training

In [16]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

# --- Download stopwords package ---
nltk.download('stopwords', quiet=True)

# =======================================================================
# Dataset Configuration
# =======================================================================
bucket = "fake-review-dataset-penguining"
train_file_path = "train/useractivity_train.csv"
test_file_path = "test/useractivity_test.csv"

s3_train_path = f"s3://{bucket}/{train_file_path}"
s3_test_path = f"s3://{bucket}/{test_file_path}"

# =======================================================================
# Load Data
# =======================================================================
print(f"Loading training data from {s3_train_path}...")
train_df = pd.read_csv(s3_train_path)

print(f"Loading testing data from {s3_test_path}...")
test_df = pd.read_csv(s3_test_path)

print("\n✅ Data loaded successfully!")
print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# =======================================================================
# Text Preprocessing
# =======================================================================
english_stop_words = list(stopwords.words('english'))
malay_stop_words = [
    "ada","adalah","akan","aku","anda","apa","atau","bahawa","banyak","dan","dengan",
    "di","dia","ini","itu","jadi","jika","juga","kamu","kami","ke","kepada","kerana",
    "ketika","kita","lagi","lain","maka","mereka","pada","pula","saja","saya","seperti",
    "sudah","telah","tetapi","tidak","untuk","yang"
]
combined_stop_words = english_stop_words + malay_stop_words

text_column = "review_text"
train_df = train_df.dropna(subset=[text_column])
test_df = test_df.dropna(subset=[text_column])

# =======================================================================
# Feature Engineering (numeric)
# =======================================================================
# Convert published_at_date to datetime
for df in [train_df, test_df]:
    df["published_at_date"] = pd.to_datetime(df["published_at_date"], errors="coerce")

# Compute review frequency per reviewer (reviews/day)
def compute_review_frequency(df):
    reviewer_stats = df.groupby("reviewer_id")["published_at_date"].agg(["min", "max", "count"])
    reviewer_stats["active_days"] = (reviewer_stats["max"] - reviewer_stats["min"]).dt.days + 1
    reviewer_stats["reviews_per_day"] = reviewer_stats["count"] / reviewer_stats["active_days"].replace(0, 1)
    return reviewer_stats["reviews_per_day"]

train_df = train_df.merge(compute_review_frequency(train_df), on="reviewer_id", how="left")
test_df = test_df.merge(compute_review_frequency(test_df), on="reviewer_id", how="left")

# Replace missing numeric with 0
numeric_features = ["rating", "total_number_of_reviews_by_reviewer", "is_local_guide", "reviews_per_day"]
for col in numeric_features:
    train_df[col] = train_df[col].fillna(0)
    test_df[col] = test_df[col].fillna(0)

# Standardize numeric features
scaler = StandardScaler()
X_train_num = scaler.fit_transform(train_df[numeric_features])
X_test_num = scaler.transform(test_df[numeric_features])

# =======================================================================
# TF-IDF Vectorization
# =======================================================================
vectorizer = TfidfVectorizer(max_features=5000, stop_words=combined_stop_words)
X_train_tfidf = vectorizer.fit_transform(train_df[text_column])
X_test_tfidf = vectorizer.transform(test_df[text_column])

# =======================================================================
# Combine Text + Numeric Features
# =======================================================================
X_train = hstack([X_train_tfidf, X_train_num])
X_test = hstack([X_test_tfidf, X_test_num])

print("✅ Combined feature matrix shapes:")
print("Train:", X_train.shape)
print("Test:", X_test.shape)

# =======================================================================
# KMeans Clustering
# =======================================================================
n_clusters = 5  # tune this later
kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)

print("\nClustering training data...")
train_clusters = kmeans.fit_predict(X_train)
test_clusters = kmeans.predict(X_test)

# Attach results
train_df["cluster"] = train_clusters
test_df["cluster"] = test_clusters

print("\n✅ Clustering complete!")
print(train_df[["reviewer_id","review_text","rating","total_number_of_reviews_by_reviewer","is_local_guide","reviews_per_day","cluster"]].head(10))

# =======================================================================
# Save Results
# =======================================================================
output_train = "train/useractivity_train_clustered.csv"
output_test = "test/useractivity_test_clustered.csv"

train_df.to_csv(f"s3://{bucket}/{output_train}", index=False)
test_df.to_csv(f"s3://{bucket}/{output_test}", index=False)

print("\nClustered datasets saved:")
print("Train →", output_train)
print("Test  →", output_test)

# =======================================================================
# Cluster Profiling / Summary
# =======================================================================
cluster_summary = train_df.groupby("cluster").agg({
    "rating": "mean",
    "total_number_of_reviews_by_reviewer": "mean",
    "is_local_guide": "mean",  # proportion of local guides
    "reviews_per_day": "mean",
    "review_id": "count"       # cluster size
}).rename(columns={
    "rating": "avg_rating",
    "total_number_of_reviews_by_reviewer": "avg_total_reviews",
    "is_local_guide": "pct_local_guides",
    "reviews_per_day": "avg_reviews_per_day",
    "review_id": "cluster_size"
}).reset_index()

print("\n📊 Cluster Summary:")
print(cluster_summary)


Loading training data from s3://fake-review-dataset-penguining/train/useractivity_train.csv...
Loading testing data from s3://fake-review-dataset-penguining/test/useractivity_test.csv...

✅ Data loaded successfully!
Train shape: (13116, 16)
Test shape: (3280, 16)
✅ Combined feature matrix shapes:
Train: (13116, 5004)
Test: (3280, 5004)

Clustering training data...

✅ Clustering complete!
    reviewer_id                                        review_text  rating  \
0  1.040000e+20  Recommended untuk bersantai di pagi hari. Roti...     5.0   
1  1.080000e+20  Senang parking... Bersebelahan dengan public b...     5.0   
2  1.120000e+20  We had an amazing lunch at here and the staff ...     5.0   
3  1.050000e+20  The BEST eggs I've ever eaten in a hotel resta...     5.0   
4  1.060000e+20                                  Best place to eat     4.0   
5  1.170000e+20                                  Beli kek kat sini     5.0   
6  1.130000e+20  It was great. The staff was excellent and very

In [17]:
# =======================================================================
# Extra Analysis: Account Creation Date & Reviewer Profiling
# =======================================================================

# 1. Get account creation date per reviewer (earliest review date)
reviewer_creation = train_df.groupby("reviewer_id")["published_at_date"].min().reset_index()
reviewer_creation = reviewer_creation.rename(columns={"published_at_date": "account_creation_date"})
train_df = train_df.merge(reviewer_creation, on="reviewer_id", how="left")

# 2. Extract top keywords per cluster
def top_keywords_per_cluster(X_tfidf, labels, vectorizer, top_n=10):
    """Return top keywords for each cluster based on TF-IDF weights"""
    terms = vectorizer.get_feature_names_out()
    cluster_keywords = {}
    for cluster_id in sorted(set(labels)):
        cluster_center = X_tfidf[labels == cluster_id].mean(axis=0)
        top_indices = np.array(cluster_center).ravel().argsort()[-top_n:][::-1]
        cluster_keywords[cluster_id] = [terms[i] for i in top_indices]
    return cluster_keywords

print("\n🔑 Top Keywords Per Cluster:")
keywords = top_keywords_per_cluster(X_train_tfidf, train_clusters, vectorizer, top_n=10)
for cluster_id, words in keywords.items():
    print(f"Cluster {cluster_id}: {', '.join(words)}")

# 3. Flag suspicious reviewers
# Thresholds: adjust as needed
freq_threshold = 1.0   # > 1 review per day on average
reviews_threshold = 200  # > 200 total reviews
recent_threshold_days = 30  # account created less than 30 days ago

latest_date = train_df["published_at_date"].max()

suspicious_flags = []
for _, row in train_df.iterrows():
    is_suspicious = False
    reasons = []
    
    # Too frequent
    if row["reviews_per_day"] > freq_threshold:
        is_suspicious = True
        reasons.append("high_frequency")
        
    # Too many total reviews
    if row["total_number_of_reviews_by_reviewer"] > reviews_threshold:
        is_suspicious = True
        reasons.append("too_many_reviews")
    
    # Very new account
    if (latest_date - row["account_creation_date"]).days < recent_threshold_days:
        is_suspicious = True
        reasons.append("new_account")
    
    suspicious_flags.append(",".join(reasons) if is_suspicious else "normal")

train_df["suspicion_flag"] = suspicious_flags

# 4. Save updated file with suspicious flags
output_train_flagged = "train/useractivity_train_flagged.csv"
train_df.to_csv(f"s3://{bucket}/{output_train_flagged}", index=False)

print("\n🚩 Suspicious accounts flagged and saved:")
print("Train →", output_train_flagged)

# Show some flagged examples
print("\nExamples of suspicious reviewers:")
print(train_df[train_df["suspicion_flag"] != "normal"][["reviewer_id","account_creation_date","reviews_per_day","total_number_of_reviews_by_reviewer","suspicion_flag"]].head(10))



🔑 Top Keywords Per Cluster:
Cluster 0: food, good, sedap, nice, makanan, place, shi, nasi, taste, harga
Cluster 1: food, good, sedap, nice, nasi, place, delicious, makanan, great, ok
Cluster 2: good, sedap, food, service, nice, makanan, best, delicious, terbaik, great
Cluster 3: food, good, nice, ok, delicious, de, shi, sedap, place, price
Cluster 4: tak, order, service, food, makanan, bad, nasi, lambat, dah, sampai

🚩 Suspicious accounts flagged and saved:
Train → train/useractivity_train_flagged.csv

Examples of suspicious reviewers:
     reviewer_id account_creation_date  reviews_per_day  \
12  1.180000e+20   2015-07-09 13:31:14         0.180623   
16  1.130000e+20   2017-07-11 05:40:06         0.236886   
22  1.150000e+20   2017-01-03 08:20:01         0.229415   
26  1.040000e+20   2015-09-30 06:41:48         0.191326   
29  1.090000e+20   2017-09-03 16:36:26         0.266757   
39  1.180000e+20   2015-07-09 13:31:14         0.180623   
43  1.140000e+20   2017-02-18 12:27:33      

In [18]:
def predict_user_cluster(new_user, vectorizer, scaler, kmeans):
    """
    new_user: dict with keys
        - review_text
        - rating
        - total_number_of_reviews_by_reviewer
        - is_local_guide
        - published_at_date (datetime)
        - reviewer_id
    """
    # --- Preprocess text ---
    text_vec = vectorizer.transform([new_user["review_text"]])
    
    # --- Compute reviews_per_day ---
    # assume new_user includes first review date for account_creation_date
    active_days = max((new_user["latest_review_date"] - new_user["account_creation_date"]).days, 1)
    reviews_per_day = new_user["total_number_of_reviews_by_reviewer"] / active_days
    
    # --- Numeric features ---
    num_features = np.array([[
        new_user["rating"],
        new_user["total_number_of_reviews_by_reviewer"],
        new_user["is_local_guide"],
        reviews_per_day
    ]])
    num_scaled = scaler.transform(num_features)
    
    # --- Combine text + numeric ---
    X_new = hstack([text_vec, num_scaled])
    
    # --- Predict cluster ---
    cluster_id = kmeans.predict(X_new)[0]
    
    return cluster_id, reviews_per_day


In [19]:
from datetime import datetime

# Example random user
new_user = {
    "review_text": "Great service, highly recommend!",
    "rating": 5,
    "total_number_of_reviews_by_reviewer": 150,
    "is_local_guide": 1,
    "account_creation_date": datetime(2025, 8, 1),
    "latest_review_date": datetime(2025, 9, 20),
    "reviewer_id": "random_123"
}

cluster_id, reviews_per_day = predict_user_cluster(new_user, vectorizer, scaler, kmeans)

print(f"User belongs to cluster {cluster_id}")
print(f"Reviews per day: {reviews_per_day:.2f}")

# Optionally check suspicion rules again
if reviews_per_day > 1 or new_user["total_number_of_reviews_by_reviewer"] > 200:
    print("🚩 Suspicious activity detected")
else:
    print("✅ Looks normal")


User belongs to cluster 1
Reviews per day: 3.00
🚩 Suspicious activity detected




In [20]:
import joblib

joblib.dump(kmeans, "kmeans_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")
joblib.dump(scaler, "scaler.pkl")


['scaler.pkl']