In [2]:
# Import pandas package
import pandas as pd

In [3]:
# Load the product table from CSV.
# low_memory=False matches the options used when this same file is read again
# in the review-loading cell, so both loads parse the file the same way.
products = pd.read_csv("archive/product_info.csv", low_memory=False)

# Display (rows, columns) as the cell output
products.shape

(8494, 27)

In [4]:
# Paths of the review CSV files, in the order they are stacked
review_files = [
    "archive/reviews_0-250.csv",
    "archive/reviews_250-500.csv",
    "archive/reviews_500-750.csv",
    "archive/reviews_750-1250.csv",
    "archive/reviews_1250-end.csv",
]

# Read every file with low_memory=False, then stack the frames into one
# table whose index is renumbered from scratch (ignore_index=True)
review_frames = [pd.read_csv(path, low_memory=False) for path in review_files]
reviews = pd.concat(review_frames, ignore_index=True)

# Drop the "Unnamed: 0" column when the files carried one;
# errors="ignore" turns this into a no-op when the column is absent
reviews = reviews.drop(columns=["Unnamed: 0"], errors="ignore")

# Read the product table, also with low_memory=False
products = pd.read_csv("archive/product_info.csv", low_memory=False)

# Report (rows, columns) of both tables
print("Reviews:", reviews.shape)
print("Products:", products.shape)

Reviews: (1094411, 18)
Products: (8494, 27)


In [5]:
# Strip the "Unnamed: 0" column from both tables.
# errors="ignore" makes each drop a no-op for a table that lacks the column.
reviews = reviews.drop(columns=["Unnamed: 0"], errors="ignore")
products = products.drop(columns=["Unnamed: 0"], errors="ignore")


In [6]:
# Duplicate diagnostics for the reviews table
print("Total rows:", len(reviews))

# Rows that repeat an earlier row in every column
print("Duplicate rows (full row duplicates):", reviews.duplicated().sum())

# This value is a boolean column-membership check, not a duplicate count,
# so the label states exactly that.
print("'Unnamed: 0' column still present:", "Unnamed: 0" in reviews.columns)

# Rows whose (author_id, product_id) pair repeats an earlier row
print("Duplicate author_id + product_id:", reviews.duplicated(subset=["author_id", "product_id"]).sum())


Total rows: 1094411
Duplicate rows (full row duplicates): 224
Duplicate review IDs (if Unnamed: 0 existed): False
Duplicate author_id + product_id: 5525


In [7]:
# Left-join product attributes onto every review via 'product_id'.
# Columns that exist in both tables get a "_review" / "_product" suffix.
merged = reviews.merge(
    products,
    on="product_id",
    how="left",
    suffixes=("_review", "_product")
)

# A left join keeps every review row, so the result can only be longer than
# `reviews` if some product_id occurs more than once in `products`, which
# would silently duplicate reviews. Fail loudly instead.
assert len(merged) == len(reviews), (
    "left merge changed the row count: product_id is not unique in products"
)

print("Merged:", merged.shape)


Merged: (1094411, 44)


In [8]:
# Columns retained for the output dataset, in display order
keep_columns = [
    "product_id",
    "product_name_product",
    "brand_name_product",
    "price_usd_product",
    "primary_category",
    "secondary_category",
    "ingredients",
    "rating_review",
    "review_text",
]

# Narrow the merged table to just those columns
merged = merged[keep_columns]

# Preview the first rows
merged.head()


Unnamed: 0,product_id,product_name_product,brand_name_product,price_usd_product,primary_category,secondary_category,ingredients,rating_review,review_text
0,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,Cleansers,"['Water (Aqua), Dipropylene Glycol, Peg-6 Capr...",5,I use this with the Nudestix “Citrus Clean Bal...
1,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",1,I bought this lip mask after reading the revie...
2,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,My review title says it all! I get so excited ...
3,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,I’ve always loved this formula for a long time...
4,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,"If you have dry cracked lips, this is a must h..."


In [9]:
# Convert the review rating to a numeric column (values that cannot be parsed
# become NaN via errors="coerce") and round it to 2 decimals
rating_numeric = pd.to_numeric(merged["rating_review"], errors="coerce")
merged = merged.assign(rating_review=rating_numeric.round(2))

# Write the result to CSV without the index, then preview the first rows
merged.to_csv("merged_clean.csv", index=False)
merged.head()

Unnamed: 0,product_id,product_name_product,brand_name_product,price_usd_product,primary_category,secondary_category,ingredients,rating_review,review_text
0,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,Cleansers,"['Water (Aqua), Dipropylene Glycol, Peg-6 Capr...",5,I use this with the Nudestix “Citrus Clean Bal...
1,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",1,I bought this lip mask after reading the revie...
2,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,My review title says it all! I get so excited ...
3,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,I’ve always loved this formula for a long time...
4,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,"If you have dry cracked lips, this is a must h..."


In [10]:
# Number of rows per primary_category value
category_counts = merged["primary_category"].value_counts()
category_counts


primary_category
Skincare    1094411
Name: count, dtype: int64