**1. Imports**

In [2]:
import pandas as pd

**2. Load Dataset**

In [31]:
# Directory holding the raw BookCrossing CSV files. Both reads below derive
# their paths from this single constant, so it is the only line to edit when
# the notebook is run on another machine.
DATA_DIR = "/home/martimsbaltazar/Desktop/tese/datasets/BookCrossing"

# Ratings.csv is ';'-separated and latin-1 encoded. Expected columns: User-ID, ISBN, Rating
ratings = pd.read_csv(f"{DATA_DIR}/Ratings.csv", sep=";", encoding="latin-1")

# Users.csv is ','-separated. Expected columns: User-ID, Age
# NOTE(review): the saved output shows a pandas warning pointing at this read
# (possibly a mixed-dtype warning) -- confirm before pinning dtypes, since
# changing the inferred dtype of User-ID would affect the later isin() matches.
users = pd.read_csv(f"{DATA_DIR}/Users.csv", sep=",")

# Check if columns were correctly parsed
print("Ratings columns:", ratings.columns.tolist())
print("Users columns:", users.columns.tolist())

Ratings columns: ['User-ID', 'ISBN', 'Rating']
Users columns: ['User-ID', 'Age']


  users = pd.read_csv("/home/martimsbaltazar/Desktop/tese/datasets/BookCrossing/Users.csv", sep =",")         # Columns: User-ID, Age


**3. Filter users with valid age**

In [32]:
n_users_before = len(users)
print(f"Users before age filtering: {n_users_before}")

# Drop users with no recorded age first, so the numeric conversion below only
# sees populated values.
users = users.dropna(subset=["Age"])

# Coerce 'Age' to a numeric dtype; entries that cannot be parsed become NaN.
users = users.assign(Age=pd.to_numeric(users["Age"], errors="coerce"))

# Keep only rows whose age parsed successfully and is strictly positive
# (zero and negative ages are rejected along with unparseable ones).
has_age = users["Age"].notna()
users = users[has_age & (users["Age"] > 0)]

# 'Age' is deliberately left in whatever numeric dtype to_numeric produced;
# no integer cast is applied here.

print(f"Users after age filtering: {len(users)}")


Users before age filtering: 278859
Users after age filtering: 166739


**4. Filter ratings to keep only those with valid users**

In [33]:
print(f"Ratings before age filtering: {len(ratings)}")

# Keep only the ratings whose User-ID is present in the age-filtered `users` frame.
rated_by_kept_user = ratings["User-ID"].isin(users["User-ID"])
ratings = ratings.loc[rated_by_kept_user]

print(f"Ratings after age filtering: {len(ratings)}")


Ratings before age filtering: 1149780
Ratings after age filtering: 790570


**5. Keep users with at least 5 ratings**

In [None]:
print(f"Users before min ratings filtering: {len(users)}")

# Ratings per user: the index holds the User-ID, the value is that user's rating count.
user_rating_counts = ratings["User-ID"].value_counts()

# Index of the User-IDs that have five or more ratings.
eligible_users = user_rating_counts.loc[lambda counts: counts >= 5].index

print(f"Users after min ratings filtering: {len(eligible_users)}")


Users before min ratings filtering: 166739
Users after min ratings filtering: 14680


**6. Reduce to the 6000 users with the most ratings**

In [35]:
# Cap the selection at 6000 users by taking the leading entries of the index
# (fewer if the index is already shorter than that).
n_selected = min(len(eligible_users), 6000)
eligible_users = eligible_users[:n_selected]
print(f"Selected users: {n_selected}")


Selected users: 6000


**7. Filter ratings for selected users**

In [36]:
# Restrict the ratings to those made by the selected users.
filtered_ratings = ratings.loc[ratings["User-ID"].isin(eligible_users)]

**8. Limit to 4000 most-rated ISBNs**

In [37]:
# Count ratings per ISBN and keep the first 4000 entries of that ranking.
isbn_counts = filtered_ratings["ISBN"].value_counts()
top_items = isbn_counts.iloc[:4000].index

# Drop every rating whose ISBN is outside the retained set.
filtered_ratings = filtered_ratings.loc[filtered_ratings["ISBN"].isin(top_items)]

**9. Final filter — users still with at least 5 ratings**

In [38]:
# Re-count ratings per user now that the item set has been reduced, and keep
# the users who still have five or more ratings.
final_user_counts = filtered_ratings["User-ID"].value_counts()
final_users = final_user_counts.loc[lambda counts: counts >= 5].index

# Apply the same user set to both the ratings and the users frame.
filtered_ratings = filtered_ratings.loc[filtered_ratings["User-ID"].isin(final_users)]
filtered_users = users.loc[users["User-ID"].isin(final_users)]


**10. Save new dataset**

In [39]:
# Write both frames as CSV files (relative paths, no index column).
for frame, csv_name in (
    (filtered_ratings, "filtered_ratings.csv"),
    (filtered_users, "filtered_users.csv"),
):
    frame.to_csv(csv_name, index=False)

# Report the size of the saved dataset, one line per figure.
print(
    "✅ Dataset successfully filtered and saved.",
    f"Users: {len(filtered_users)}",
    f"Items: {filtered_ratings['ISBN'].nunique()}",
    f"Ratings: {len(filtered_ratings)}",
    sep="\n",
)

✅ Dataset successfully filtered and saved.
Users: 4393
Items: 4000
Ratings: 163396
