In [97]:
import time
from datetime import datetime
from pathlib import Path

import pandas as pd

In [98]:
# Step 1: Load raw data
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere until it is pointed at a configurable data directory.
raw_file = r"C:\Users\HP\Documents\ecommerce_price_analytics\data\raw\product_prices_raw.csv"
df = pd.read_csv(raw_file)

# Step 2: Inspect data
# DataFrame.info() writes its report itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
df.info()
# Leave the preview as the cell's last expression so Jupyter renders it richly.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105 entries, 0 to 104
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   url     105 non-null    object 
 1   title   105 non-null    object 
 2   price   105 non-null    float64
 3   date    105 non-null    object 
 4   source  105 non-null    object 
dtypes: float64(1), object(4)
memory usage: 4.2+ KB
None
                                                 url  \
0  https://www.amazon.in/Samsung-Galaxy-Silver-12...   
1  https://www.amazon.in/Mi-Vacuum-Mop-Powerful-L...   
2  https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...   
3  https://www.amazon.in/Samsung-Galaxy-Silver-12...   
4  https://www.amazon.in/Samsung-Galaxy-Silver-12...   

                                               title    price        date  \
0  Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...  23000.0  2025-09-04   
1  MI Xiaomi Robot Vacuum-Mop 2i, 2200 Pa Powerfu...   4999.0  2025-09-04   
2  PHILIPS A

In [99]:
# Missing-value audit: report how many NaNs each column holds
print(df.isna().sum())
# Keep only the rows whose price is present
df = df[df['price'].notna()]


url       0
title     0
price     0
date      0
source    0
dtype: int64


In [100]:
# Steps 3-4: fix data types, then handle missing values.
#
# The coercion runs *before* the null drop on purpose: errors='coerce' turns
# unparsable prices into NaN, so dropping nulls first would let those rows
# slip through. Building the result with .assign() yields a fresh frame, which
# avoids the SettingWithCopyWarning that assigning columns onto a dropna()
# result can raise, and leaves `df` untouched so the cell can be re-run safely.
df_clean = (
    df
    .assign(
        # Non-numeric prices become NaN and are removed by the dropna below.
        price=lambda frame: pd.to_numeric(frame['price'], errors='coerce'),
        # Unparsable dates become NaT; those rows are kept, as before.
        date=lambda frame: pd.to_datetime(frame['date'], errors='coerce'),
    )
    .dropna(subset=['title', 'price'])
)

In [101]:
# Step 5: keep only the first row seen for each (url, date) pair
df_clean = df_clean[~df_clean.duplicated(subset=['url', 'date'], keep='first')]

In [102]:
# Step 6 (optional): renumber the rows 0..n-1 now that some may have been removed
df_clean = df_clean.set_axis(pd.RangeIndex(len(df_clean)), axis=0)

In [103]:
# Show the cleaned frame as the cell's output to eyeball the result of Steps 3-6.
df_clean

Unnamed: 0,url,title,price,date,source
0,https://www.amazon.in/Samsung-Galaxy-Silver-12...,"Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...",23000.0,2025-09-04,scraped
1,https://www.amazon.in/Mi-Vacuum-Mop-Powerful-L...,"MI Xiaomi Robot Vacuum-Mop 2i, 2200 Pa Powerfu...",4999.0,2025-09-04,scraped
2,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"PHILIPS Air Fryer NA120/00, uses up to 90% les...",4459.0,2025-09-04,scraped
3,https://www.amazon.in/Samsung-Galaxy-Silver-12...,"Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...",22530.0,2025-08-01,dummy
4,https://www.amazon.in/Samsung-Galaxy-Silver-12...,"Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...",22614.0,2025-08-02,dummy
...,...,...,...,...,...
100,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"PHILIPS Air Fryer NA120/00, uses up to 90% les...",8585.0,2025-08-30,dummy
101,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"PHILIPS Air Fryer NA120/00, uses up to 90% les...",8205.0,2025-08-31,dummy
102,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"PHILIPS Air Fryer NA120/00, uses up to 90% les...",8100.0,2025-09-01,dummy
103,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"PHILIPS Air Fryer NA120/00, uses up to 90% les...",7965.0,2025-09-02,dummy


In [44]:
# Preview the first rows of the cleaned frame. A bare trailing expression gets
# Jupyter's rich table rendering, whereas print() only emits wrapped plain text.
df_clean.head(10)

                                                 url  \
0  https://www.amazon.in/Samsung-Galaxy-Silver-12...   
1  https://www.amazon.in/Test-Exclusive_2020_1144...   
2  https://www.amazon.in/HP-i3-1215U-Anti-Glare-1...   
3  https://www.amazon.in/Lenovo-IdeaPad-Warranty-...   
4  https://www.amazon.in/Bluetooth-Brightness-Wat...   
5  https://www.amazon.in/Noise-ColorFit-Oximeter-...   
6  https://www.amazon.in/Sony-WHCH510-WH-CH510-Wi...   
7  https://www.amazon.in/JBL-C100TWS-Wireless-Hea...   
8  https://www.amazon.in/Mi-Vacuum-Mop-Powerful-L...   
9  https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...   

                                               title    price  \
0  Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...  23000.0   
1  Redmi Note 10 (Frost White, 6GB RAM, 128GB Sto...  11397.0   
2  HP 15s, 12th Gen Intel Core i3-1215U (8GB DDR4...  32990.0   
3  Lenovo IdeaPad 3 11th Gen Intel Core i3 15.6" ...  37100.0   
4  GOBOULT Drift+ Smart Watch 1.85''HD Screen, Bl...   119

In [81]:
# Show the cleaned frame again.
# NOTE(review): this cell's execution count (In [81]) is lower than the cleaning
# cells above it, and its saved output shows different prices/titles than the
# earlier display of df_clean — the output looks stale; confirm with
# Restart Kernel -> Run All.
df_clean

Unnamed: 0,url,title,price,date,source
0,https://www.amazon.in/Samsung-Galaxy-Silver-12...,"Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...",23000,2025-09-04,scraped
1,https://www.amazon.in/Mi-Vacuum-Mop-Powerful-L...,"MI Xiaomi Robot Vacuum-Mop 2i, 2200 Pa Powerfu...",4999,2025-09-04,scraped
2,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"PHILIPS Air Fryer NA120/00, uses up to 90% les...",4459,2025-09-04,scraped
3,https://www.amazon.in/Samsung-Galaxy-Silver-12...,"Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...",11486,2025-08-01,dummy
4,https://www.amazon.in/Samsung-Galaxy-Silver-12...,"Samsung Galaxy M14 5G (ICY Silver,6GB,128GB St...",11528,2025-08-02,dummy
...,...,...,...,...,...
100,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"GOBOULT Drift+ Smart Watch 1.85''HD Screen, Bl...",13798,2025-08-30,dummy
101,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"GOBOULT Drift+ Smart Watch 1.85''HD Screen, Bl...",13762,2025-08-31,dummy
102,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"GOBOULT Drift+ Smart Watch 1.85''HD Screen, Bl...",13399,2025-09-01,dummy
103,https://www.amazon.in/PHILIPS-Fryer-NA120-00-T...,"GOBOULT Drift+ Smart Watch 1.85''HD Screen, Bl...",13732,2025-09-02,dummy


In [109]:
# Step 7: Save cleaned data to processed folder
# NOTE(review): this is an absolute, machine-specific path; consider deriving it
# from a configurable data directory.
processed_file = r"C:\Users\HP\Documents\ecommerce_price_analytics\data\processed\product_prices_clean.csv"
# to_csv does not create missing folders, so make sure the target directory
# exists first (this is a no-op when it is already there).
Path(processed_file).parent.mkdir(parents=True, exist_ok=True)
# index=False keeps the positional row index out of the file.
df_clean.to_csv(processed_file, index=False)