In [2]:
# Import pandas package
import pandas as pd

In [3]:
# Load product information
products = pd.read_csv("archive/product_info.csv")
products.shape

(8494, 27)

In [4]:
# Load and combine review files

review_files = [
    "archive/reviews_0-250.csv",
    "archive/reviews_250-500.csv",
    "archive/reviews_500-750.csv",
    "archive/reviews_750-1250.csv",
    "archive/reviews_1250-end.csv"
]

reviews_list = [pd.read_csv(f) for f in review_files]
reviews = pd.concat(reviews_list, ignore_index=True)

if "Unnamed: 0" in reviews.columns:
    reviews = reviews.drop(columns=["Unnamed: 0"])

reviews.shape


  reviews_list = [pd.read_csv(f) for f in review_files]
  reviews_list = [pd.read_csv(f) for f in review_files]
  reviews_list = [pd.read_csv(f) for f in review_files]


(1094411, 18)

In [5]:
reviews.info()
reviews.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1094411 entries, 0 to 1094410
Data columns (total 18 columns):
 #   Column                    Non-Null Count    Dtype  
---  ------                    --------------    -----  
 0   author_id                 1094411 non-null  object 
 1   rating                    1094411 non-null  int64  
 2   is_recommended            926423 non-null   float64
 3   helpfulness               532819 non-null   float64
 4   total_feedback_count      1094411 non-null  int64  
 5   total_neg_feedback_count  1094411 non-null  int64  
 6   total_pos_feedback_count  1094411 non-null  int64  
 7   submission_time           1094411 non-null  object 
 8   review_text               1092967 non-null  object 
 9   review_title              783757 non-null   object 
 10  skin_tone                 923872 non-null   object 
 11  eye_color                 884783 non-null   object 
 12  skin_type                 982854 non-null   object 
 13  hair_color                8

Unnamed: 0,author_id,rating,is_recommended,helpfulness,total_feedback_count,total_neg_feedback_count,total_pos_feedback_count,submission_time,review_text,review_title,skin_tone,eye_color,skin_type,hair_color,product_id,product_name,brand_name,price_usd
0,1741593524,5,1.0,1.0,2,0,2,2023-02-01,I use this with the Nudestix “Citrus Clean Bal...,Taught me how to double cleanse!,,brown,dry,black,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0
1,31423088263,1,0.0,,0,0,0,2023-03-21,I bought this lip mask after reading the revie...,Disappointed,,,,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
2,5061282401,5,1.0,,0,0,0,2023-03-21,My review title says it all! I get so excited ...,New Favorite Routine,light,brown,dry,blonde,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
3,6083038851,5,1.0,,0,0,0,2023-03-20,I’ve always loved this formula for a long time...,Can't go wrong with any of them,,brown,combination,black,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0
4,47056667835,5,1.0,,0,0,0,2023-03-20,"If you have dry cracked lips, this is a must h...",A must have !!!,light,hazel,combination,,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0


In [6]:
#Merge datasets
merged = reviews.merge(
    products,
    on="product_id",
    how="left",
    suffixes=("_review", "_product")
)

merged.shape

(1094411, 44)

In [7]:
# Keep relevant columns
merged = merged[
    [
        "product_id",
        "product_name_product",
        "brand_name_product",
        "price_usd_product",
        "primary_category",
        "secondary_category",
        "ingredients",
        "rating_review",
        "review_text"
    ]
]

merged.head()


Unnamed: 0,product_id,product_name_product,brand_name_product,price_usd_product,primary_category,secondary_category,ingredients,rating_review,review_text
0,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,Cleansers,"['Water (Aqua), Dipropylene Glycol, Peg-6 Capr...",5,I use this with the Nudestix “Citrus Clean Bal...
1,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",1,I bought this lip mask after reading the revie...
2,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,My review title says it all! I get so excited ...
3,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,I’ve always loved this formula for a long time...
4,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,"If you have dry cracked lips, this is a must h..."


In [8]:
# Save cleaned dataset with ratings rounded to 2 decimals
merged["rating_review"] = pd.to_numeric(merged["rating_review"], errors="coerce").round(2)
merged.to_csv("merged_clean.csv", index=False)
merged.head()

Unnamed: 0,product_id,product_name_product,brand_name_product,price_usd_product,primary_category,secondary_category,ingredients,rating_review,review_text
0,P504322,Gentle Hydra-Gel Face Cleanser,NUDESTIX,19.0,Skincare,Cleansers,"['Water (Aqua), Dipropylene Glycol, Peg-6 Capr...",5,I use this with the Nudestix “Citrus Clean Bal...
1,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",1,I bought this lip mask after reading the revie...
2,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,My review title says it all! I get so excited ...
3,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,I’ve always loved this formula for a long time...
4,P420652,Lip Sleeping Mask Intense Hydration with Vitam...,LANEIGE,24.0,Skincare,Lip Balms & Treatments,"['Diisostearyl Malate, Hydrogenated Polyisobut...",5,"If you have dry cracked lips, this is a must h..."
