In [2]:
# In notebooks/Data_Analytics.ipynb

# COMMENTS: As per the assignment, we need to add clear comments.
# This first step is to import the necessary libraries. 
# pandas is for data manipulation and analysis.
import pandas as pd

# REASONING: We need to load the dataset to begin our analysis.
# The '..' in the file path means we go up one directory from 'notebooks'
# to the main project folder, and then down into the 'data' folder.
# If the file is missing we print a friendly hint and then RE-RAISE the error,
# so the cell stops right here. Without the re-raise the code below would
# either crash with "NameError: name 'df' is not defined" (fresh kernel) or
# silently preview a stale `df` left over from an earlier run.

try:
    # Make sure your dataset is named 'furniture_dataset.csv'
    df = pd.read_csv('../data/furniture_dataset.csv')
except FileNotFoundError:
    print("Error: The file 'furniture_dataset.csv' was not found in the 'data' folder.")
    print("Please make sure you have downloaded it and placed it there.")
    raise
else:
    # Only reached when read_csv succeeded.
    print("Dataset loaded successfully!")

# REASONING: Display the first few rows to understand the columns and data structure.
# df.head() is a quick way to preview the data.
print("\nFirst 5 rows of the dataset:")
display(df.head())

# REASONING: Use df.info() to get a concise summary of the DataFrame.
# This is crucial for checking data types (e.g., is 'price' a number or text?)
# and identifying columns with missing values.
print("\nDataset Information:")
df.info()

Dataset loaded successfully!

First 5 rows of the dataset:


Unnamed: 0,title,brand,description,price,categories,images,manufacturer,package_dimensions,country_of_origin,material,color,uniq_id
0,"GOYMFK 1pc Free Standing Shoe Rack, Multi-laye...",GOYMFK,"multiple shoes, coats, hats, and other items E...",$24.99,"['Home & Kitchen', 'Storage & Organization', '...",['https://m.media-amazon.com/images/I/416WaLx1...,GOYMFK,"2.36""D x 7.87""W x 21.6""H",China,Metal,White,02593e81-5c09-5069-8516-b0b29f439ded
1,"subrtex Leather ding Room, Dining Chairs Set o...",subrtex,subrtex Dining chairs Set of 2,,"['Home & Kitchen', 'Furniture', 'Dining Room F...",['https://m.media-amazon.com/images/I/31SejUEW...,Subrtex Houseware INC,"18.5""D x 16""W x 35""H",,Sponge,Black,5938d217-b8c5-5d3e-b1cf-e28e340f292e
2,Plant Repotting Mat MUYETOL Waterproof Transpl...,MUYETOL,,$5.98,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/41RgefVq...,MUYETOL,"26.8""L x 26.8""W",,Polyethylene,Green,b2ede786-3f51-5a45-9a5b-bcf856958cd8
3,"Pickleball Doormat, Welcome Doormat Absorbent ...",VEWETOL,The decorative doormat features a subtle textu...,$13.99,"['Patio, Lawn & Garden', 'Outdoor Décor', 'Doo...",['https://m.media-amazon.com/images/I/61vz1Igl...,Contrence,"24""L x 16""W",,Rubber,A5589,8fd9377b-cfa6-5f10-835c-6b8eca2816b5
4,JOIN IRON Foldable TV Trays for Eating Set of ...,JOIN IRON Store,Set of Four Folding Trays With Matching Storag...,$89.99,"['Home & Kitchen', 'Furniture', 'Game & Recrea...",['https://m.media-amazon.com/images/I/41p4d4VJ...,,"18.9""D x 14.2""W x 26""H",,Iron,Grey Set of 4,bdc9aa30-9439-50dc-8e89-213ea211d66a



Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312 entries, 0 to 311
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   title               312 non-null    object
 1   brand               312 non-null    object
 2   description         159 non-null    object
 3   price               215 non-null    object
 4   categories          312 non-null    object
 5   images              312 non-null    object
 6   manufacturer        205 non-null    object
 7   package_dimensions  306 non-null    object
 8   country_of_origin   125 non-null    object
 9   material            218 non-null    object
 10  color               265 non-null    object
 11  uniq_id             312 non-null    object
dtypes: object(12)
memory usage: 29.4+ KB


In [3]:
# Per-column tally of missing entries in `df`; `isna` is the same check as
# `isnull`. Heading and table are emitted on separate lines by one print call.
print("Count of missing values in each column:", df.isna().sum(), sep="\n")

Count of missing values in each column:
title                   0
brand                   0
description           153
price                  97
categories              0
images                  0
manufacturer          107
package_dimensions      6
country_of_origin     187
material               94
color                  47
uniq_id                 0
dtype: int64


In [6]:
# In your Data_Analytics.ipynb

# --- Data Cleaning and Preprocessing ---
# NOTE: every step below is written so that the cell can be re-run on an
# already-cleaned `df` without raising (the second run is a no-op).

# REASONING: Replace NaN in the text columns that feed the generated
# descriptions with readable placeholders, so the sentences never contain "nan".
df = df.fillna(value={
    'brand': 'Unknown Brand',
    'material': 'Unknown Material',
    'color': 'Unknown Color',
})

# REASONING: Impute missing descriptions using other available data.
# Vectorized: build the sentence only for the rows whose description is
# missing and assign it back through the same boolean mask (aligned on index).
needs_description = df['description'].isna()
generated_description = (
    "This is a " + df.loc[needs_description, 'title'].astype(str)
    + " from " + df.loc[needs_description, 'brand'].astype(str)
    + ". It is made of " + df.loc[needs_description, 'material'].astype(str)
    + " and comes in a " + df.loc[needs_description, 'color'].astype(str)
    + " color."
)
df.loc[needs_description, 'description'] = generated_description

print("Missing descriptions have been imputed.")

# REASONING: Fill missing values in other non-critical text columns.
df = df.fillna(value={
    'manufacturer': 'Unknown',
    'country_of_origin': 'Unknown',
})

# --- FIX FOR THE PRICE COLUMN ---
# REASONING: The 'price' column is a string ('object') type due to the '$' symbol.
# To perform mathematical operations like calculating the median, we must first
# convert it to a numeric type. We do this by:
# 1. Casting to text first, so the .str accessor also works when this cell is
#    re-run and 'price' is already float64 (missing values become the text
#    'nan' or stay missing, and end up as NaN again in step 3).
# 2. Removing '$', thousands separators (',') and whitespace, so a value such
#    as '$1,299.99' is parsed instead of being discarded.
# 3. Converting the cleaned column to a numeric type. 'errors='coerce'' will
#    turn any remaining problematic values into NaN (Not a Number), which we
#    can then handle.
price_text = df['price'].astype(str).str.replace(r'[$,\s]', '', regex=True)
df['price'] = pd.to_numeric(price_text, errors='coerce')

# REASONING: Now that 'price' is a numeric column, we can calculate the median
# and fill any remaining missing values (the original NaNs and any new ones from 'coerce').
# The median is taken BEFORE rows are dropped below, i.e. over all rows.
median_price = df['price'].median()
df['price'] = df['price'].fillna(median_price)

print("Price column cleaned and missing values imputed.")

# REASONING: Finally, drop the few rows with missing package dimensions.
# Reassignment instead of inplace=True; the original index labels are kept
# (they are NOT renumbered), so label-based lookups keep working.
df = df.dropna(subset=['package_dimensions'])

# --- VERIFICATION ---
# REASONING: Always verify the cleaning steps. This should now show 0 missing values.
print("\nMissing values after cleaning:")
print(df.isnull().sum())

# Fail loudly if any column handled in this cell still contains a missing value.
cleaned_columns = [
    'brand', 'material', 'color', 'description',
    'manufacturer', 'country_of_origin', 'price', 'package_dimensions',
]
assert df[cleaned_columns].notna().all().all(), "Cleaning left missing values behind"

# REASONING: We should also check the data types to confirm 'price' is now a number (e.g., float64).
print("\nData types after cleaning:")
df.info()

Missing descriptions have been imputed.
Price column cleaned and missing values imputed.

Missing values after cleaning:
title                 0
brand                 0
description           0
price                 0
categories            0
images                0
manufacturer          0
package_dimensions    0
country_of_origin     0
material              0
color                 0
uniq_id               0
dtype: int64

Data types after cleaning:
<class 'pandas.core.frame.DataFrame'>
Index: 306 entries, 0 to 311
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   title               306 non-null    object 
 1   brand               306 non-null    object 
 2   description         306 non-null    object 
 3   price               306 non-null    float64
 4   categories          306 non-null    object 
 5   images              306 non-null    object 
 6   manufacturer        306 non-null    object 
 7   pack