# DBSCAN Anomaly Detection on TripAdvisor Reviews

1. Loads the provided `tripadvisor_reviews.csv`
2. Scales numeric features
3. Fits a DBSCAN model
4. Marks anomalies as DBSCAN noise points (`label = -1`)


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score


## 1) Load data

In [2]:
data_path = "tripadvisor_reviews.csv"  # load dataset
df = pd.read_csv(data_path)

df.head()


Unnamed: 0,user_id,avg_museum_rating,avg_park_rating,avg_restaurant_rating,avg_nightlife_rating
0,User 1,1.93,2.8,3.29,1.62
1,User 2,2.02,3.2,3.66,1.64
2,User 3,2.22,1.8,1.54,1.53
3,User 4,1.45,2.8,1.29,1.57
4,User 5,1.51,2.2,2.18,1.57


## 2) Prepare features (drop `user_id`) and scale

In [3]:
feature_cols = [c for c in df.columns if c != "user_id"] # exclude user_id
X = df[feature_cols].values
# dbscan based on distance, always need to be scaled
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

print("Rows:", len(df))
print("Features:", feature_cols)


Rows: 980
Features: ['avg_museum_rating', 'avg_park_rating', 'avg_restaurant_rating', 'avg_nightlife_rating']


## 3) Fit DBSCAN with defined parameters

In [4]:
dbscan = DBSCAN(eps=0.8, min_samples=2)
labels = dbscan.fit_predict(X_scaled)
print(labels)
n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
n_noise = int(np.sum(labels == -1))




[ 0  0  0  0  0  0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  0  0  0  0
  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  1  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  2
  0  0  0  0  0  0  0  0  0  0 -1  0  3  0  4  0 -1  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  2  0  0  0  0  0
  0  0  0  5  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0 -1  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 -1  0  1  0  0  0  0  0  0  0  0  0  0 -1
  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0 -1 -1  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  0  0  0  0  0  0  6  0
  0  0  0  0  0  0  0  4  0  0  7  0  0  0  0  0  0  0  0  0  8  0  0  0
  0  0  0  0  0  0  0  0  0  0 -1  0  0  0  0  0  9

In [5]:
#print("Clusters found:", n_clusters)
print("Anomalies (noise, label=-1):", n_noise, f"({n_noise/len(labels):.2%})")

Anomalies (noise, label=-1): 42 (4.29%)


## 4) Export anomaly list

This table gives the original rows and the DBSCAN label.

In [6]:
out = df.copy()
out["dbscan_label"] = labels
out["is_anomaly"] = labels == -1

anomalies = out[out["is_anomaly"]].copy()
print("Anomaly rows:", len(anomalies))
anomalies.head(20)


Anomaly rows: 42


Unnamed: 0,user_id,avg_museum_rating,avg_park_rating,avg_restaurant_rating,avg_nightlife_rating,dbscan_label,is_anomaly
25,User 26,1.61,3.84,3.8,1.48,-1,True
60,User 61,2.06,2.92,4.11,1.55,-1,True
82,User 83,1.7,2.52,1.45,2.65,-1,True
88,User 89,1.64,3.36,1.14,2.68,-1,True
108,User 109,2.76,1.04,1.43,1.39,-1,True
171,User 172,2.9,1.4,1.45,1.48,-1,True
175,User 176,1.42,2.88,1.26,2.05,-1,True
199,User 200,2.25,3.16,1.22,2.79,-1,True
226,User 227,1.9,3.24,1.27,2.78,-1,True
239,User 240,3.08,1.48,1.18,1.15,-1,True
