## Data Cleaning 

In [1]:
import pandas as pd
from datetime import datetime

# Load the data
data = pd.read_csv("amazon_uk_shoes_products_kaggle.csv")  

# Standardize the date format
def parse_review_date(date_str):
    try:
        # Example: "Reviewed on 25 November 2023"
        return datetime.strptime(date_str.replace("Reviewed on ", ""), "%d %B %Y")
    except Exception as e:
        print(f"Error parsing date: {e}")
        return None

data['review_date'] = data['review_date'].apply(parse_review_date)

# Convert review_rating to integers
data['review_rating'] = pd.to_numeric(data['review_rating'], errors='coerce')

# Standardize verified_purchase
data['verified_purchase'] = data['verified_purchase'].apply(lambda x: str(x).strip().lower() == 'true')

# Remove duplicates
data = data.drop_duplicates(subset='uniq_id')

# Handle missing values (example: dropping rows with missing key fields)
data = data.dropna(subset=['url', 'product_name', 'review_text', 'review_rating'])

# Clean text fields
text_columns = ['product_name', 'reviewer_name', 'review_title', 'review_text']
for col in text_columns:
    data[col] = data[col].str.strip().replace(r'\s+', ' ', regex=True)

# Preview the cleaned data
print(data.head())

# Save cleaned data
data.to_csv("amazon_uk_shoes_products_kaggle_cleaned.csv", index=False)


Error parsing date: time data 'Reviewed in the United States on 2 June 2020' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in the United States on 28 October 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in the United States on 20 January 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in the United States on 22 April 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in Italy on 2 April 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in Italy on 27 September 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in Italy on 8 April 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in Italy on 5 July 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in Italy on 9 November 2021' does not match format '%d %B %Y'
Error parsing date: time data 'Reviewed in Italy

In [2]:
data

Unnamed: 0,url,product_name,reviewer_name,review_title,review_text,review_rating,verified_purchase,review_date,helpful_count,uniq_id,scraped_at
0,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Jocelyn McSayles,Love em,Love these. Was looking for converses and thes...,5,True,,2 people found this helpful,36eae4e5-2894-5279-a0b7-d2b330e2b814,24/12/21 02:26
1,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Kenia Rivera,The plastic ripped,"The shoes are very cute, but after the 2nd day...",2,True,,,f4778bb8-3070-5cb1-b5aa-ffce41a97b57,24/12/21 02:26
2,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Chris Souza,Good quality,Good quality,5,True,,,db5a7525-d40b-5265-84d8-df4f29837a3b,24/12/21 02:26
3,https://www.amazon.co.uk/dp/B07SBX32T5,Klasified Women's Transparent Clear Sneaker Sh...,Amazon Customer,Good,Great,5,True,,,75a42851-6462-54b5-988a-27d336221943,24/12/21 02:26
4,https://www.amazon.co.uk/dp/B08SW434MG,"GUESS Women's Bradly Gymnastics Shoe, White, 7 UK",Graziella,PERFETTE!!,Ho scelto il modello bianco con rifinitura die...,5,True,,2 people found this helpful,232dee43-849e-5d06-ba05-efb3f4814714,24/12/21 02:26
...,...,...,...,...,...,...,...,...,...,...,...
6818,https://www.amazon.co.uk/dp/B07TPYWFVN,"Clarks Vennor Wing, Men’s Low-Top Sneakers, Bl...",mauti72,Schick und leicht,"Schicker Schuh, läuft sich gut.",5,True,,,0850eae1-fa2f-59e6-bf30-ad49151bfa20,24/12/21 02:29
6819,https://www.amazon.co.uk/dp/B07TPYWFVN,"Clarks Vennor Wing, Men’s Low-Top Sneakers, Bl...",Charles Lechesnier,EXCELLENT,Mieux que je ne l'imaginais. Très bonne taille...,5,True,,,4bf117ed-ea7c-517c-967c-2aee3f80ed29,24/12/21 02:29
6820,https://www.amazon.co.uk/dp/B084WB2D93,"Rohde Men's Tivoli-H Mule, 82 Anthracite, 12.5 UK",Rebecca Lützenkirchen,Einfach schöne Hausschuhe,Habe sie als Geschenk gekauft und sie sind seh...,5,True,,,5b129eb2-a438-5377-9c46-217a177615b2,24/12/21 02:29
6821,https://www.amazon.co.uk/dp/B084WB2D93,"Rohde Men's Tivoli-H Mule, 82 Anthracite, 12.5 UK",Sergej Friedel,Langlebig.,Trage diese Hausschuhe fast zwei Monate jeden ...,5,True,,,91144305-98db-5a55-8ec4-16a253beb811,24/12/21 02:29
