# Jewelry Analysis

In [13]:
import pandas as pd
import numpy as np
import plotly.express as px

In [15]:
# Load the raw order data from a CSV file located relative to the working directory
data = pd.read_csv("jewellery.csv")

### Data Cleansing

In [16]:
# Preview the first three rows to see which columns were loaded
data.head(n=3)

Unnamed: 0,Order datetime,Order ID,Purchased product ID,Quantity,Category ID,Category alias,Brand ID,Price in USD,User ID,Product gender,Main color,Main metal,main gem
0,2018-12-01 11:40:29,1.92472e+18,1.8422e+18,1,1.80683e+18,jewelry.earring,0.0,561.51,1.51592e+18,,red,gold,diamond
1,2018-12-02 13:53:42,1.92551e+18,1.84221e+18,1,1.80683e+18,jewelry.pendant,1.0,54.66,1.51592e+18,f,white,gold,sapphire
2,2018-12-02 17:44:02,1.92563e+18,1.83557e+18,1,1.80683e+18,jewelry.pendant,0.0,88.9,1.51592e+18,f,red,gold,diamond


In [17]:
# Discard identifier columns, quantity, and product gender; the remaining
# columns are the ones inspected and transformed in the following cells.
# NOTE(review): 'Quantity' is dropped here — if 'Price in USD' is a per-unit
# price, any later sum of prices ignores multi-item orders; confirm.
data = data.drop(
    labels=[
        'Order ID',
        'Purchased product ID',
        'Quantity',
        'Category ID',
        'Brand ID',
        'User ID',
        'Product gender',
    ],
    axis="columns",
)

In [18]:
# Check for missing values: compare each column's non-null count against the
# total number of entries, and review the inferred dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10060 entries, 0 to 10059
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Order datetime  10060 non-null  object 
 1   Category alias  10060 non-null  object 
 2   Price in USD    10060 non-null  float64
 3   Main color      10060 non-null  object 
 4   Main metal      10060 non-null  object 
 5   main gem        10060 non-null  object 
dtypes: float64(1), object(5)
memory usage: 471.7+ KB


In [19]:
# List the distinct category labels to see how they are formatted
data['Category alias'].unique()

array(['jewelry.earring', 'jewelry.pendant', 'jewelry.necklace',
       'jewelry.ring', 'jewelry.brooch', 'jewelry.bracelet'], dtype=object)

In [20]:
# Cleanse Category column: strip the literal "jewelry." prefix from each alias.
# regex=False is explicit because the default differs between pandas versions,
# and when treated as a regex the "." would match any character rather than
# only a literal dot.
data['Category alias'] = data['Category alias'].str.replace("jewelry.", "", regex=False)
data['Category alias'].unique()

array(['earring', 'pendant', 'necklace', 'ring', 'brooch', 'bracelet'],
      dtype=object)

In [35]:
# Parse the order timestamps (read from the CSV as strings) into datetime64 values.
# pd.to_datetime is used instead of .astype(np.datetime64): the unit-less
# numpy dtype is rejected by newer pandas releases, whereas to_datetime
# works across versions and enables the .dt accessor used below.
data['Order datetime'] = pd.to_datetime(data['Order datetime'])

In [49]:
# Add the calendar year and the month name of each order as new columns,
# then shorten the verbose category and price column labels.
data = (
    data
    .assign(**{
        'Order year': data['Order datetime'].dt.year,
        'Order month': data['Order datetime'].dt.month_name(),
    })
    .rename(columns={'Category alias': 'Category', 'Price in USD': 'Price'})
)
data.head(3)

Unnamed: 0,Order datetime,Category,Price,Main color,Main metal,main gem,Order year,Order month
0,2018-12-01 11:40:29,earring,561.51,red,gold,diamond,2018,December
1,2018-12-02 13:53:42,pendant,54.66,white,gold,sapphire,2018,December
2,2018-12-02 17:44:02,pendant,88.9,red,gold,diamond,2018,December


In [48]:
# List the distinct years present in the 'Order year' column
data['Order year'].unique()

array([2018, 2019, 2020], dtype=int64)

In [60]:
# Sum of 'Price' per month for orders placed in 2020, as a one-column frame.
# groupby sorts its keys, so the month names appear alphabetically
# (February before January) rather than in calendar order.
df = (
    data.loc[data['Order year'] == 2020]
    .groupby('Order month')['Price']
    .sum()
    .to_frame()
)
df

Unnamed: 0_level_0,Price
Order month,Unnamed: 1_level_1
February,121124.73
January,431415.95
