In [6]:
import pandas as pd
# Load dataset (kept untouched as `raw_df` so the cleaning below has explicit lineage)
raw_df = pd.read_csv("Ecommerce_dataset.csv")

# Initial exploration.
# `info()` prints by itself; `head()` and `isnull().sum()` only return values,
# and a notebook cell displays just its *last* bare expression, so they are
# printed explicitly here -- otherwise their results are silently discarded.
raw_df.info()
print(raw_df.head())
print(raw_df.isnull().sum())

PRICE_COL = 'Final_Price(Rs.)'

# Cleaning pipeline: every step returns a new frame, `raw_df` is never mutated.
df = (
    raw_df
    # Standardize category *before* de-duplicating, so rows that differ only in
    # category casing / surrounding whitespace are recognised as duplicates.
    .assign(Category=lambda d: d['Category'].str.lower().str.strip())
    # Remove duplicate rows. This runs while Purchase_Date is still present, so
    # purchases that differ only by date are kept as distinct rows.
    .drop_duplicates()
    # Drop rows with missing critical values
    .dropna(subset=['User_ID', 'Product_ID'])
    # Drop unnecessary column
    .drop(columns=['Purchase_Date'])
    # Convert price and keep strictly positive prices only.
    # NaN > 0 evaluates to False, so rows with a missing final price are
    # removed by this filter as well.
    .astype({PRICE_COL: float})
    .loc[lambda d: d[PRICE_COL] > 0]
)

# Create interaction dataset: one row per retained purchase, each counted as
# a single interaction.
df_clean = df[['User_ID', 'Product_ID', 'Category', PRICE_COL]].copy()
df_clean['interaction'] = 1

# Sanity checks on the invariants established by the pipeline above.
assert df_clean[['User_ID', 'Product_ID']].notna().all().all()
assert (df_clean[PRICE_COL] > 0).all()

# User-item interaction matrix: rows are users, columns are products, each cell
# is the number of retained purchases of that product by that user (0 if none).
user_item_matrix = df_clean.pivot_table(
    index='User_ID',
    columns='Product_ID',
    values='interaction',
    aggfunc='sum',
    fill_value=0
)

# Save outputs (the matrix keeps its index so User_ID is written as a column)
df_clean.to_csv("clean_ecommerce_data.csv", index=False)
user_item_matrix.to_csv("user_item_matrix.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3664 entries, 0 to 3663
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   User_ID           3664 non-null   object 
 1   Product_ID        3664 non-null   object 
 2   Category          3664 non-null   object 
 3   Price (Rs.)       3661 non-null   float64
 4   Discount (%)      3663 non-null   float64
 5   Final_Price(Rs.)  3662 non-null   float64
 6   Payment_Method    3664 non-null   object 
 7   Purchase_Date     3664 non-null   object 
dtypes: float64(3), object(5)
memory usage: 229.1+ KB
