In [3]:
import pandas as pd
import numpy as np
import os

In [8]:
# Step 1: Build the synthetic shopping dataset.
# The global seed makes every draw below reproducible; the draws are kept in
# the order category -> price -> quantity -> customer -> gender so the
# generated values stay the same from run to run.
np.random.seed(42)

item_names = ["item_%d" % idx for idx in range(1, 101)]
categories = np.random.choice(
    ['electronics', 'clothing', 'groceries', 'home'], size=len(item_names)
)
prices = np.random.uniform(5, 500, size=len(item_names)).round(2)
quantities = np.random.randint(1, 20, size=len(item_names))
customers = np.random.choice(
    ['John', 'Alice', 'Bob', 'Jane', 'Charlie'], size=len(item_names)
)
# Gender is deliberately generated with inconsistent spellings and casing.
genders = np.random.choice(
    ['M', 'male', 'F', 'female', 'm', 'f'], size=len(item_names)
)

# Pair each column label with its data; insertion order fixes column order.
shopping_df = pd.DataFrame(
    dict(
        zip(
            ['Item Name', 'Category', 'Price', 'Quantity', 'Customer', 'Gender'],
            [item_names, categories, prices, quantities, customers, genders],
        )
    )
)
print(shopping_df.head())

  Item Name     Category   Price  Quantity Customer Gender
0    item_1    groceries  484.94         1     John      m
1    item_2         home  388.69         1     John      m
2    item_3  electronics  470.05        19     Jane      M
3    item_4    groceries  447.94         2     Jane      m
4    item_5    groceries  300.96        12  Charlie      m


In [4]:
# Step 2: Save the Original Dataset (before cleaning)
def save_dataset(df, file_path):
    """Write ``df`` to ``file_path`` as CSV, creating the parent folder if needed.

    Args:
        df: DataFrame to export. The index is not written to the file.
        file_path: Destination CSV path. May be a bare file name, in which
            case the file is written relative to the current working
            directory and no folder is created.
    """
    parent_dir = os.path.dirname(file_path)
    # dirname() returns '' for a bare file name and os.makedirs('') raises
    # FileNotFoundError, so only create a folder when the path names one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    df.to_csv(file_path, index=False)
    print(f"Dataset saved successfully to {file_path}")



In [6]:
# Destination for the untouched, pre-cleaning dataset.
# NOTE(review): this is an absolute, machine-specific path; consider deriving
# it from a configurable data directory so the notebook runs elsewhere.
original_file_path = r"C:\Users\2217116\Desktop\L7-DataPreparation\Week 4 Labs\Week 4 Labs\original_shopping_data.csv"

# Persist the raw frame before any cleaning is applied.
save_dataset(df=shopping_df, file_path=original_file_path)

Dataset saved successfully to C:\Users\2217116\Desktop\L7-DataPreparation\Week 4 Labs\Week 4 Labs\original_shopping_data.csv


In [9]:
# Step 3: Clean the dataset
def clean_shopping_data(df):
    """Return a cleaned copy of the shopping data; ``df`` itself is left untouched.

    Cleaning steps, applied in order:
      1. Every object-dtype column is stripped of surrounding whitespace and
         lower-cased.
      2. ``Gender`` is collapsed to ``'Male'`` / ``'Female'`` from the
         lower-cased spellings ``m``, ``male``, ``f``, ``female``. Values
         outside that mapping become NaN (``Series.map`` behaviour).
      3. ``Quantity`` is cast to ``int`` and ``Price`` to ``float``.

    Args:
        df: DataFrame containing at least ``Gender``, ``Quantity`` and
            ``Price`` columns.

    Returns:
        A new DataFrame with the cleaning applied.
    """
    # Work on a copy so the caller's frame (the "original" dataset) is not
    # silently overwritten by the cleaning steps.
    cleaned = df.copy()

    for col in cleaned.select_dtypes(include='object').columns:
        cleaned[col] = cleaned[col].str.strip().str.lower()

    # Keys are lower-case because the loop above has already lower-cased Gender.
    gender_labels = {'m': 'Male', 'male': 'Male', 'f': 'Female', 'female': 'Female'}
    cleaned['Gender'] = cleaned['Gender'].map(gender_labels)

    cleaned['Quantity'] = cleaned['Quantity'].astype(int)
    cleaned['Price'] = cleaned['Price'].astype(float)
    return cleaned

cleaned_shopping_df = clean_shopping_data(shopping_df)
# Show the frame returned by the cleaning step rather than the input frame,
# so the output is the cleaned data regardless of whether the call modifies
# its argument.
print(cleaned_shopping_df)

   Item Name     Category   Price  Quantity Customer  Gender
0     item_1    groceries  484.94         1     john    Male
1     item_2         home  388.69         1     john    Male
2     item_3  electronics  470.05        19     jane    Male
3     item_4    groceries  447.94         2     jane    Male
4     item_5    groceries  300.96        12  charlie    Male
..       ...          ...     ...       ...      ...     ...
95   item_96     clothing   23.26         5    alice    Male
96   item_97     clothing  306.73        16    alice  Female
97   item_98         home  253.83        19     john    Male
98   item_99     clothing   30.48         4    alice    Male
99  item_100  electronics  142.93         3     john    Male

[100 rows x 6 columns]


In [8]:
# Step 4: Persist the cleaned dataset alongside the original export.
# NOTE(review): absolute, machine-specific path; consider building it from a
# configurable data directory.
cleaned_file_path = r"C:\Users\2217116\Desktop\L7-DataPreparation\Week 4 Labs\Week 4 Labs\cleaned_shopping_data.csv"
save_dataset(df=cleaned_shopping_df, file_path=cleaned_file_path)

Dataset saved successfully to C:\Users\2217116\Desktop\L7-DataPreparation\Week 4 Labs\Week 4 Labs\cleaned_shopping_data.csv


In [9]:
# Step 5: Preview the cleaned dataset (heading line, then the first 10 rows).
print(
    "First 10 rows of the cleaned dataset:",
    cleaned_shopping_df.head(10),
    sep="\n",
)

First 10 rows of the cleaned dataset:
  Item Name     Category   Price  Quantity Customer  Gender
0    item_1    groceries  484.94         1     john    Male
1    item_2         home  388.69         1     john    Male
2    item_3  electronics  470.05        19     jane    Male
3    item_4    groceries  447.94         2     jane    Male
4    item_5    groceries  300.96        12  charlie    Male
5    item_6         home  461.33         6  charlie  Female
6    item_7  electronics   48.80         4      bob    Male
7    item_8  electronics  102.01        11     jane  Female
8    item_9    groceries   27.39        17     john  Female
9   item_10     clothing  166.04         6  charlie    Male
