In [None]:
import pandas as pd

In [None]:
# Both catalogue files are shared as Google Drive "view" links. The file id is
# the second-to-last path segment of such a link; it is appended to the
# download endpoint below and the resulting URL is handed to pd.read_csv.
download_prefix = "https://drive.google.com/uc?export=download&id="

# products_cl.csv
url = "https://drive.google.com/file/d/1s7Lai4NSlsYjGEPg1QSOUJobNYVsZBOJ/view?usp=sharing"
path = download_prefix + url.rsplit("/", 2)[1]
products_df = pd.read_csv(path)

# brands_cl.csv
url = "https://drive.google.com/file/d/1XGyabaa4mAkjixMk3XPgx_14OoSse3rs/view?usp=sharing"
path = download_prefix + url.rsplit("/", 2)[1]
brands_df = pd.read_csv(path)


In [None]:
import os

# Resolve the leading "~" of each local export path to the current user's
# home directory; both files are expected in the Downloads folder.
orders_path, orderlines_path = map(
    os.path.expanduser,
    ("~/Downloads/orders_qu.csv", "~/Downloads/orderlines_qu.csv"),
)



In [None]:
# Display the products table read from products_cl.csv (the notebook renders
# the value of a cell's last expression).
products_df

In [None]:
# Load the orders and order-lines exports from the local paths held in
# orders_path and orderlines_path.
orders_df, orderlines_df = (
    pd.read_csv(csv_path) for csv_path in (orders_path, orderlines_path)
)


In [None]:
# Print the column names, dtypes and non-null counts of the order-lines table.
orderlines_df.info()

In [None]:
# Print the column names, dtypes and non-null counts of the orders table.
orders_df.info()

In [None]:
# Parse the date columns into datetime64 values so the .dt accessor and
# min/max aggregations used further down operate on real timestamps.
orders_df['created_date'] = orders_df['created_date'].pipe(pd.to_datetime)
orderlines_df['date'] = orderlines_df['date'].pipe(pd.to_datetime)



### 1. Time Period Covered by the Dataset

In [None]:
# Span covered by the data: the earliest and latest order creation timestamps
# found in the orders table.
time_period = orders_df['created_date'].agg(['min', 'max'])
print("Time Period Covered by the Dataset:", time_period, sep="\n")




### 2. Overall Revenue for the Time Period

In [None]:
# Overall revenue: the sum of 'total_paid' over every row of the orders table
# (orders_df, not the order lines), rounded to two decimal places.
overall_revenue = round(orders_df['total_paid'].sum(), 2)
overall_revenue

### 3. Seasonal Patterns in Sales
Analyzing monthly sales to identify any seasonal patterns.

In [None]:

import plotly.express as px

# Tag every order with its calendar month (this adds a 'month' column to
# orders_df), then total the amount paid per month.
orders_df['month'] = orders_df['created_date'].dt.to_period('M')
monthly_revenue = (
    orders_df
    .groupby('month')['total_paid']
    .sum()
    .reset_index()
    # Periods are converted to their string form so Plotly can use them as x values.
    .assign(month=lambda frame: frame['month'].astype(str))
)

# Line chart of revenue per month, one marker per month.
fig = px.line(
    monthly_revenue,
    x='month',
    y='total_paid',
    title='Monthly Revenue Over Time',
    labels={'month': 'Month', 'total_paid': 'Total Revenue (€)'},
    markers=True,
)

# Layout tweaks: axis titles, a single hover box per x position, a light
# template and a fixed figure size.
layout_settings = {
    'xaxis_title': 'Month',
    'yaxis_title': 'Total Revenue (€)',
    'hovermode': 'x unified',
    'template': 'plotly_white',
    'width': 800,
    'height': 500,
}
fig.update_layout(**layout_settings)

fig.show()


### 4. Most Sold Products
Identify the top 10 most sold products based on quantity.

In [None]:
# Attach the product attributes (including the name) to every order line.
# A left join keeps order lines whose SKU has no match in products_df.
orderlines_merged = orderlines_df.merge(products_df, on='sku', how='left')

# Units sold per product name, largest first.
top_selling_products = (
    orderlines_merged
    .groupby('name')['product_quantity']
    .sum()
    .reset_index()
    .sort_values(by='product_quantity', ascending=False)
)

print("\nTop 10 Most Sold Products:", top_selling_products.head(10), sep="\n")

### 5. Products that Generate the Most Revenue
Identify the top 10 products that generate the most revenue.

In [None]:
# Sum of 'unit_price_total' per product name, largest first.
top_revenue_products = (
    orderlines_merged
    .groupby('name', as_index=False)['unit_price_total']
    .sum()
    .sort_values(by='unit_price_total', ascending=False)
)

print("\nTop 10 Products that Generate the Most Revenue:", top_revenue_products.head(10), sep="\n")

### 6. Average Order Value
Calculating the average order value to understand customer spending behavior.

In [None]:
# Average Order Value (AOV): the arithmetic mean of 'total_paid' across the
# rows of the orders table.
order_totals = orders_df['total_paid']
average_order_value = order_totals.mean()
print(f"Average Order Value: €{average_order_value:.2f}")


In [None]:
# Summary statistics of the products table (describe() defaults to the
# numeric columns when any are present).
products_df.describe()

### 7. Impact of Price on Sales Volume and Revenue

In [None]:
# Define price ranges for analysis
def price_range(price):
    """Return the label of the price bucket that ``price`` falls into.

    Buckets are closed on the right: '0-50' covers everything up to and
    including 50, '50-100' covers (50, 100], '100-200' covers (100, 200],
    '200-500' covers (200, 500] and '500+' covers everything above 500.

    Parameters
    ----------
    price : float
        Price to classify.

    Returns
    -------
    str or None
        The bucket label, or None when ``price`` is missing (NaN/None).
    """
    # Every comparison against NaN is False, so without this guard a missing
    # price would fall through all branches and be labelled '500+'.
    if pd.isna(price):
        return None

    # Upper bounds are tested in ascending order, so the first match wins.
    buckets = ((50, '0-50'), (100, '50-100'), (200, '100-200'), (500, '200-500'))
    for upper_bound, label in buckets:
        if price <= upper_bound:
            return label
    return '500+'

# Bucket every order line by the 'price' column of the merged frame.
orderlines_merged['price_range'] = orderlines_merged['price'].apply(price_range)

# Display order of the buckets; reused for sorting and for the chart x axes.
price_range_order = ['0-50', '50-100', '100-200', '200-500', '500+']

# Units sold and revenue per price bucket.
# NOTE(review): the "most revenue" cell sums 'unit_price_total' while this cell
# sums 'total_price' - confirm that both columns exist in the merged frame and
# which of them is the intended revenue figure.
price_analysis = orderlines_merged.groupby('price_range').agg(
    total_sales_volume=('product_quantity', 'sum'),
    total_revenue=('total_price', 'sum'),
).reset_index()

# An ordered categorical makes the sort follow bucket order instead of
# alphabetical order of the labels.
price_analysis['price_range'] = pd.Categorical(
    price_analysis['price_range'], categories=price_range_order, ordered=True
)
price_analysis = price_analysis.sort_values('price_range')

# Largest value of each metric; the matching bar is highlighted in the charts.
max_sales_volume = price_analysis['total_sales_volume'].max()
max_revenue = price_analysis['total_revenue'].max()

# Colour label per row: 'orange' for the maximum, 'blue' for everything else.
highlight_labels = {True: 'orange', False: 'blue'}
price_analysis['color_volume'] = (
    price_analysis['total_sales_volume'].eq(max_sales_volume).map(highlight_labels)
)
price_analysis['color_revenue'] = (
    price_analysis['total_revenue'].eq(max_revenue).map(highlight_labels)
)


def _price_range_bar(data, value_column, color_column, title, value_label, base_color):
    """Build a bar chart of ``value_column`` per price range.

    Rows labelled 'orange' in ``color_column`` are drawn in orange, rows
    labelled 'blue' are drawn in ``base_color``. Returns the Plotly figure.
    """
    fig = px.bar(
        data,
        x='price_range',
        y=value_column,
        title=title,
        labels={'price_range': 'Price Range (€)', value_column: value_label},
        color=color_column,
        # The keys must equal the labels stored in the colour column; a label
        # without a matching key is coloured from the default palette instead.
        color_discrete_map={'blue': base_color, 'orange': 'orange'},
        # Colouring splits the bars into one trace per colour label, so the
        # bucket order is pinned explicitly rather than left to trace order.
        category_orders={'price_range': price_range_order},
    )
    fig.update_layout(
        width=800, height=600,  # figure size in pixels
        xaxis_title='Price Range (€)',
        yaxis_title=value_label,
        hovermode='x unified',
        template='plotly_white',
        showlegend=False,  # the colour labels carry no information for the reader
    )
    return fig


# Visualize the impact of price ranges on sales volume.
# 'darkblue' / 'skyblue' are valid CSS colour names standing in for the
# original 'deep blue' / 'sky blue', which are not.
fig_volume = _price_range_bar(
    price_analysis, 'total_sales_volume', 'color_volume',
    'Impact of Price on Sales Volume', 'Total Sales Volume', 'darkblue',
)
fig_volume.show()

# Visualize the impact of price ranges on revenue.
fig_revenue = _price_range_bar(
    price_analysis, 'total_revenue', 'color_revenue',
    'Impact of Price on Revenue', 'Total Revenue (€)', 'skyblue',
)
fig_revenue.show()

In [None]:
# Show the distinct values of the products' 'desc' column.
products_df['desc'].unique()

In [None]:
# Print every distinct product description on its own line, in order of
# first appearance in products_df.
for description in products_df['desc'].drop_duplicates():
    print(description)

In [None]:
# Print the first ten entries of the products' 'desc' column.
first_descriptions = products_df['desc'].iloc[:10]
print(first_descriptions)

In [None]:
# Print a random sample of product descriptions. The seed is fixed so that
# re-running the notebook shows the same rows, and the sample size is capped
# at the column length because sample() raises when asked for more rows than
# exist (without replacement).
desc_column = products_df['desc']
print(desc_column.sample(n=min(10, len(desc_column)), random_state=42))