In [18]:
import pandas as pd

# Load your dataset
df = pd.read_csv("file.csv")

# Convert 'Timestamp' column to datetime format (auto-detects format)
df['Timestamp'] = pd.to_datetime(df['Timestamp'], errors='coerce')

# Standardize to ISO 8601 format (YYYY-MM-DDTHH:MM:SS)
df['Timestamp'] = df['Timestamp'].dt.strftime('%Y-%m-%dT%H:%M:%S')

print("✅ Timestamps standardized and saved.")

✅ Timestamps standardized and saved.


In [19]:
# Normalize product and category names: strip whitespace, lowercase, title case
df['Product Name'] = df['Product Name'].str.strip().str.title()
df['Product Category'] = df['Product Category'].str.strip().str.title()


In [20]:
from difflib import get_close_matches

# Get unique known categories after normalization
known_categories = df['Product Category'].dropna().unique().tolist()

# Function to correct typos using fuzzy matching
def correct_category(cat, known_list):
    if pd.isna(cat): return cat
    match = get_close_matches(cat, known_list, n=1, cutoff=0.8)
    return match[0] if match else cat

# Apply the correction to each row
df['Product Category'] = df['Product Category'].apply(lambda x: correct_category(x, known_categories))


In [21]:
# --- Fix Fulfillment Status ---
df['Fulfillment Status'] = df['Fulfillment Status'].fillna("Unknown")

# --- Fix Order Value ---
df['Order Value'] = df['Order Value'].fillna(0.0)

# --- Fix Product Category ---
df['Product Category'] = df['Product Category'].fillna("Uncategorized")

In [22]:
# 1️⃣ Remove invalid (blank or < 3-word) reviews
def is_valid_review(text):
    if pd.isna(text): return False
    return len(text.strip().split()) >= 2

df = df[df['Review Content'].apply(is_valid_review)]

# 2️⃣ Fill missing ratings using product-wise average
df['Rating'] = df.groupby('Product Name')['Rating'].transform(
    lambda x: x.fillna(x.mean())
)

# 3️⃣ If rating is still NaN (e.g., all ratings missing for that product), fill with global average
global_avg = df['Rating'].mean()
df['Rating'] = df['Rating'].fillna(global_avg)

# 4️⃣ (Optional) Round ratings to 1 decimal place
df['Rating'] = df['Rating'].round(1)

# Save cleaned version
df.to_csv("cleaned.csv", index=False)

print("✅ Missing ratings filled using product-wise averages. Invalid reviews removed.")


✅ Missing ratings filled using product-wise averages. Invalid reviews removed.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Rating'] = df.groupby('Product Name')['Rating'].transform(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Rating'] = df['Rating'].fillna(global_avg)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Rating'] = df['Rating'].round(1)
