In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px 

##############################################################

from sklearn.preprocessing import LabelEncoder,PolynomialFeatures,StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso,Ridge 
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Colour palettes (hex strings) available to the charts in this notebook.
# `colors10` holds ten colours; every other palette holds four.
colors10 = [
    '#387478', '#4682B4', '#32CD32', '#FFD700', '#001F3F',
    '#B17457', '#F2E5BF', '#DA8359', '#FFD09B', '#A66E38',
]
blue_1 = ['#2D4356', '#435B66', '#A76F6F', '#EAB2A0']
blue_2 = ['#0C134F', '#1D267D', '#2D4263', '#347474']
green1 = ['#1A1A19', '#31511E', '#859F3D', '#88C273']
brown1 = ['#A79277', '#D1BB9E', '#EAD8C0', '#FFF2E1']
yel_gre1 = ['#F3CA52', '#F6E9B2', '#0A6847', '#7ABA78']
red_tel = ['#C96868', '#FADFA1', '#FFF4EA', '#7EACB5']
cofee = ['#EAC696', '#C8AE7D', '#765827', '#65451F']
pastel = ['#B5C0D0', '#CCD3CA', '#B4B4B8', '#B3A398']
retro = ['#060047', '#B3005E', '#E90064', '#FF5F9E']
# NOTE(review): '#77ACF1' appears twice in `white_blue` — confirm this is intended.
white_blue = ['#04009A', '#77ACF1', '#77ACF1', '#C0FEFC']
cold_blue = ['#240750', '#344C64', '#577B8D', '#57A6A1']
cold_green = ['#006769', '#40A578', '#9DDE8B', '#E6FF94']
happy = ['#D2E0FB', '#F9F3CC', '#D7E5CA', '#8EACCD']
sky = ['#00A9FF', '#89CFF3', '#A0E9FF', '#CDF5FD']
grad_brown = ['#8D7B68', '#A4907C', '#C8B6A6', '#F1DEC9']
grad_black = ['#2C3333', '#2E4F4F', '#0E8388', '#CBE4DE']
grad_green = ['#439A97', '#62B6B7', '#97DECE', '#CBEDD5']
grad_blue = ['#164863', '#427D9D', '#9BBEC8', '#DDF2FD']
night = ['#003C43', '#135D66', '#77B0AA', '#E3FEF7']


In [None]:
# Load the dataset from 'retail.csv' (path is relative to the working directory).
df = pd.read_csv('retail.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

In [None]:
# List the column names available in the dataset.
df.columns

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics of the dataset's columns.
df.describe()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:

# Parse 'Order Date' and 'Ship Date' as datetimes, then derive the day, month
# and year of each into '<prefix> Day' / '<prefix> Month' / '<prefix> Year'.
# NOTE(review): no explicit `format=` / `dayfirst=` is given, so parsing relies
# on pandas' inference — confirm it matches the date layout used in the CSV.
for prefix in ('Order', 'Ship'):
    parsed = pd.to_datetime(df[f'{prefix} Date'])
    df[f'{prefix} Date'] = parsed
    df[f'{prefix} Day'] = parsed.dt.day
    df[f'{prefix} Month'] = parsed.dt.month
    df[f'{prefix} Year'] = parsed.dt.year


In [None]:
# Preview the data again, now including the derived day / month / year columns.
df.head()

In [None]:
# Smallest 'Sales' value in the dataset.
df["Sales"].min()

In [None]:
# Show the row(s) holding the smallest 'Sales' value. Comparing against the
# computed minimum (instead of a hard-coded 0.444) keeps the cell correct if
# the data changes and avoids depending on an exact float literal.
df[df['Sales'] == df['Sales'].min()]

In [None]:
# Re-check dtypes after the date conversion and the added date-part columns.
df.info()

In [None]:
# df.to_csv("sales_with_date_split.csv", index=False)  # index=False if you don't want to save the index

# EDA 

In [None]:
# Total 'Sales' for each 'Segment' (only the 'Sales' column is aggregated).
df.groupby("Segment")["Sales"].sum()

In [None]:
# Number of rows per order day-of-month. Columns are named explicitly so the
# cell does not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'Order Day' columns
# instead of 'Order Day' / 'count').
order_day_counts = (
    df['Order Day']
    .value_counts()
    .rename_axis('Order Day')
    .reset_index(name='count')
)

# 'Order Day' is an integer column, so plotly colours it on a continuous scale
# and silently ignores `color_discrete_sequence`; pass the palette as the
# continuous scale so it is actually applied.
fig = px.bar(
    order_day_counts,
    x='Order Day',
    y='count',
    color='Order Day',
    color_continuous_scale=night,
)
fig.update_layout(
    title="Order Day of the sales .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:
# Number of rows per ship day-of-month. Columns are named explicitly so the
# cell does not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'Ship Day' columns
# instead of 'Ship Day' / 'count').
ship_day_counts = (
    df['Ship Day']
    .value_counts()
    .rename_axis('Ship Day')
    .reset_index(name='count')
)

# 'Ship Day' is an integer column, so plotly colours it on a continuous scale
# and silently ignores `color_discrete_sequence`; pass the palette as the
# continuous scale so it is actually applied.
fig = px.bar(
    ship_day_counts,
    x='Ship Day',
    y='count',
    color='Ship Day',
    color_continuous_scale=brown1,
)
fig.update_layout(
    title="Ship Day of the data .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:
# Number of rows per order month. Columns are named explicitly so the cell
# does not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'Order Month' columns
# instead of 'Order Month' / 'count').
order_month_counts = (
    df['Order Month']
    .value_counts()
    .rename_axis('Order Month')
    .reset_index(name='count')
)

# 'Order Month' is an integer column, so plotly colours it on a continuous
# scale and silently ignores `color_discrete_sequence`; pass the palette as
# the continuous scale so it is actually applied.
fig = px.bar(
    order_month_counts,
    x='Order Month',
    y='count',
    color='Order Month',
    color_continuous_scale=blue_1,
)
fig.update_layout(
    title="Order Month of the sales .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:
# Number of rows per ship month. Columns are named explicitly so the cell
# does not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'Ship Month' columns
# instead of 'Ship Month' / 'count').
ship_month_counts = (
    df['Ship Month']
    .value_counts()
    .rename_axis('Ship Month')
    .reset_index(name='count')
)

# 'Ship Month' is an integer column, so plotly colours it on a continuous
# scale and silently ignores `color_discrete_sequence`; pass the palette as
# the continuous scale so it is actually applied.
fig = px.bar(
    ship_month_counts,
    x='Ship Month',
    y='count',
    color='Ship Month',
    color_continuous_scale=blue_2,
)
fig.update_layout(
    title="Ship Month of the data .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:
# Number of rows per order year, in chronological order. Columns are named
# explicitly so the cell does not depend on the pandas version, and the year
# is cast to str: with an integer colour column plotly would use a continuous
# scale and silently ignore `color_discrete_sequence`.
order_year_counts = (
    df['Order Year']
    .value_counts()
    .sort_index()
    .rename_axis('Order Year')
    .reset_index(name='count')
    .astype({'Order Year': str})
)

fig = px.bar(
    order_year_counts,
    x='Order Year',
    y='count',
    color='Order Year',
    color_discrete_sequence=colors10,
)
# Numeric-looking strings would otherwise be auto-detected as a numeric/date axis.
fig.update_xaxes(type='category')
fig.update_layout(
    title="Order Year of the sales .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:
# Number of rows per ship year, in chronological order. Columns are named
# explicitly so the cell does not depend on the pandas version, and the year
# is cast to str: with an integer colour column plotly would use a continuous
# scale and silently ignore `color_discrete_sequence`.
ship_year_counts = (
    df['Ship Year']
    .value_counts()
    .sort_index()
    .rename_axis('Ship Year')
    .reset_index(name='count')
    .astype({'Ship Year': str})
)

fig = px.bar(
    ship_year_counts,
    x='Ship Year',
    y='count',
    color='Ship Year',
    color_discrete_sequence=night,
)
# Numeric-looking strings would otherwise be auto-detected as a numeric/date axis.
fig.update_xaxes(type='category')
fig.update_layout(
    title="Ship Year of the data .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:
# Number of rows per 'Ship Mode'. Columns are named explicitly so the cell
# does not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'Ship Mode' columns
# instead of 'Ship Mode' / 'count').
ship_mode_counts = (
    df['Ship Mode']
    .value_counts()
    .rename_axis('Ship Mode')
    .reset_index(name='count')
)

fig = px.bar(
    ship_mode_counts,
    x='Ship Mode',
    y='count',
    color='Ship Mode',
    color_discrete_sequence=night,
)
fig.update_layout(
    title="Ship Mode of the data .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:

# Number of rows per product 'Category'. Columns are named explicitly so the
# cell does not depend on the pandas version.
category_counts = (
    df['Category']
    .value_counts()
    .rename_axis('Category')
    .reset_index(name='count')
)

# The counts are already aggregated, so draw them with px.bar like the other
# count charts; px.histogram would re-aggregate them and label the axis
# "sum of count".
fig = px.bar(
    category_counts,
    x='Category',
    y='count',
    color='Category',
    color_discrete_sequence=grad_blue,
)
fig.update_layout(
    title="Category of the Product .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:

# Number of rows per 'Sub_Category'. Columns are named explicitly so the cell
# does not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'Sub_Category' columns
# instead of 'Sub_Category' / 'count').
sub_category_counts = (
    df['Sub_Category']
    .value_counts()
    .rename_axis('Sub_Category')
    .reset_index(name='count')
)

fig = px.bar(
    sub_category_counts,
    x='Sub_Category',
    y='count',
    color='Sub_Category',
    color_discrete_sequence=colors10,
)
fig.update_layout(
    title="Sub-Category of the Product .",
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:

# Number of rows per 'Segment'. Columns are named explicitly so the cell does
# not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'Segment' columns instead
# of 'Segment' / 'count').
segment_counts = (
    df['Segment']
    .value_counts()
    .rename_axis('Segment')
    .reset_index(name='count')
)

fig = px.bar(
    segment_counts,
    x='Segment',
    y='count',
    color='Segment',
    color_discrete_sequence=night,
)
fig.update_layout(
    title="Segment of the sales process .",  # fixed missing space ("Segmentof")
    title_x=0.5,  # Centers the title
    title_font=dict(size=20),  # Changes the font size
)
fig.show()

In [None]:
# Frequency table of cities with the 100 most frequent and the 100 least
# frequent dropped (value_counts() orders rows by descending count).
# NOTE(review): the slice is empty when there are 200 or fewer distinct cities.
city_counts = (
    df['City']
    .value_counts()
    .reset_index()
    .set_axis(['City', 'count'], axis=1)
    .iloc[100:-100]
)

# Flat treemap: one tile per city, sized by its row count.
fig = px.treemap(
    city_counts,
    path=['City'],
    values='count',
    color='City',
    color_discrete_sequence=cold_green,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Cities of Sales (After First 100 and Up to Last 100)",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


In [None]:
# Frequency table restricted to the 100 least frequent cities
# (value_counts() orders rows by descending count, so these are the tail).
city_counts = (
    df['City']
    .value_counts()
    .reset_index()
    .set_axis(['City', 'count'], axis=1)
    .tail(100)
)

# Flat treemap: one tile per city, sized by its row count.
fig = px.treemap(
    city_counts,
    path=['City'],
    values='count',
    color='City',
    color_discrete_sequence=yel_gre1,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Last 100 Cities of Sales",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


In [None]:
# Frequency table restricted to the 100 most frequent cities
# (value_counts() orders rows by descending count, so these are the head).
city_counts = (
    df['City']
    .value_counts()
    .reset_index()
    .set_axis(['City', 'count'], axis=1)
    .head(100)
)

# Flat treemap: one tile per city, sized by its row count.
fig = px.treemap(
    city_counts,
    path=['City'],
    values='count',
    color='City',
    color_discrete_sequence=cofee,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Top 100 Cities of Sales",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


In [None]:
# Number of rows per 'State'. Columns are named explicitly so the cell does
# not depend on the pandas version (before 2.0,
# `value_counts().reset_index()` produced 'index' / 'State' columns instead of
# 'State' / 'count').
state_counts = (
    df['State']
    .value_counts()
    .rename_axis('State')
    .reset_index(name='count')
)

# Single-level sunburst: one sector per state, sized by its row count.
fig = px.sunburst(
    state_counts,
    path=['State'],
    values='count',
    color='State',
    color_discrete_sequence=sky,
)

fig.update_layout(
    title="Sales by State (Sunburst)",
    title_x=0.5,
    title_font=dict(size=20),
)

fig.show()


In [None]:

# Row count per 'Region', as a two-column frame ('Region', 'count').
region_counts = (
    df['Region']
    .value_counts()
    .reset_index()
    .set_axis(['Region', 'count'], axis=1)
)

# Pie chart: one slice per region, sized by its row count.
fig = px.pie(
    region_counts,
    names='Region',
    values='count',
    color='Region',
    color_discrete_sequence=white_blue,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Region of the Sales Process",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


In [None]:
# Scatter of 'Sales' against 'Quantity'; marker size follows 'Sales'.
# The colour column is 'Quantity' (assumed numeric — TODO confirm). For a
# numeric colour column plotly uses a continuous scale and silently ignores
# `color_discrete_sequence`, so the palette is passed as the continuous scale
# to make it take effect.
fig = px.scatter(
    df,
    x='Quantity',
    y='Sales',
    color='Quantity',
    color_continuous_scale=colors10,
    size='Sales',  # Adjusts size based on sales for added insight
    labels={'Sales':'Sales', 'Quantity':'Quantity'}
)

fig.update_layout(
    title="Sales vs. Quantity",
    title_x=0.5,
    title_font=dict(size=20)
)

fig.show()


In [None]:
# Bubble chart: 'Sales' (y) for each 'State' (x), with marker size taken from
# 'Quantity'. No colour column is given, so only the palette is supplied.
scatter_options = dict(
    x='State',
    y='Sales',
    size='Quantity',
    color_discrete_sequence=white_blue,
    labels={'Sales':'Sales','State':'State'},
)
fig = px.scatter(df, **scatter_options)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Sales vs.State as Bubble Size",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


In [None]:
# Row count per 'Product Name', most frequent first.
product_counts = (
    df['Product Name']
    .value_counts()
    .reset_index()
    .set_axis(['Product Name', 'count'], axis=1)
    .sort_values('count', ascending=False)
)

# Flat treemap: one tile per product, sized by its row count.
fig = px.treemap(
    product_counts,
    path=['Product Name'],
    values='count',
    color='Product Name',
    color_discrete_sequence=colors10,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Product Order Counts (Treemap)",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()



In [None]:
# Row count per 'Product Name', most frequent first.
product_counts = (
    df['Product Name']
    .value_counts()
    .reset_index()
    .set_axis(['Product Name', 'count'], axis=1)
    .sort_values('count', ascending=False)
)

# Keep only the 100 most frequent products.
top_product_counts = product_counts.head(100)

# Flat treemap: one tile per product, sized by its row count.
fig = px.treemap(
    top_product_counts,
    path=['Product Name'],
    values='count',
    color='Product Name',
    color_discrete_sequence=blue_2,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Top 100 Product Names by Order Count (Treemap)",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


In [None]:
# Row count per 'Product Name', most frequent first.
product_counts = (
    df['Product Name']
    .value_counts()
    .reset_index()
    .set_axis(['Product Name', 'count'], axis=1)
    .sort_values('count', ascending=False)
)

# Keep only the 100 least frequent products (the tail of the sorted table).
last_product_counts = product_counts.tail(100)

# Flat treemap: one tile per product, sized by its row count.
fig = px.treemap(
    last_product_counts,
    path=['Product Name'],
    values='count',
    color='Product Name',
    color_discrete_sequence=colors10,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Last 100 Product Names by Order Count (Treemap)",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()

In [None]:
# Row count per 'Product Name', most frequent first.
product_counts = (
    df['Product Name']
    .value_counts()
    .reset_index()
    .set_axis(['Product Name', 'count'], axis=1)
    .sort_values('count', ascending=False)
)

# Drop the 100 most frequent and the 100 least frequent products.
# NOTE(review): the slice is empty when there are 200 or fewer distinct products.
middle_product_counts = product_counts.iloc[100:-100]

# Flat treemap: one tile per product, sized by its row count.
fig = px.treemap(
    middle_product_counts,
    path=['Product Name'],
    values='count',
    color='Product Name',
    color_discrete_sequence=colors10,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Middle Range Product Names by Order Count (Treemap)",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()

# Analysis to get more insights for sales

In [None]:
# Re-list the available columns before building the sales aggregations.
df.columns

In [None]:
# Total 'Sales' for every (Category, Sub_Category, Product Name, Customer Name)
# combination, returned as a flat DataFrame with the keys as regular columns.
group_keys = ['Category', 'Sub_Category', 'Product Name', 'Customer Name']
sales_per_category = df.groupby(group_keys, as_index=False)['Sales'].sum()
sales_per_category

# Sales by Category

# What is the total sales for each Category, Sub-Category and Product Name?

In [None]:
# Three-level treemap (Category > Sub_Category > Product Name) sized by
# 'Sales' and coloured by the top-level 'Category'.
fig = px.treemap(
    sales_per_category,
    path=['Category', 'Sub_Category','Product Name'],
    values='Sales',
    color='Category',
    color_discrete_sequence=colors10,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Sales by Category and Sub-Category and Product Name",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


# What is the total sales for each Category and Sub-Category?

In [None]:
# Two-level sunburst (Category > Sub_Category) sized by 'Sales' and coloured
# by 'Category'.
fig = px.sunburst(
    sales_per_category,
    path=['Category', 'Sub_Category'],
    values='Sales',
    color='Category',
    color_discrete_sequence=colors10,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Sales Distribution per Category and Sub-Category",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()


# order by Customer 

# "What Category did each customer purchase?"

In [None]:
# Row count per 'Customer Name', most frequent first.
# NOTE(review): this table is not used by the treemap below.
Customer_counts = (
    df['Customer Name']
    .value_counts()
    .reset_index()
    .set_axis(['Customer Name', 'count'], axis=1)
    .sort_values('count', ascending=False)
)

# Two-level treemap (Category > Customer Name) sized by 'Sales' and coloured
# by 'Category'.
fig = px.treemap(
    sales_per_category,
    path=['Category','Customer Name'],
    values='Sales',
    color='Category',
    color_discrete_sequence=grad_black,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Sales Treemap by Customer Name",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()

# What are the sales for each 'Region', 'State' and 'City'?

In [None]:
# Three-level treemap (Region > State > City) sized by 'Sales' and coloured by
# 'Region'.
fig = px.treemap(
    df,
    path=['Region','State','City'],
    values='Sales',
    color='Region',
    color_discrete_sequence=cold_blue,
)

# Centred title at font size 20.
fig.update_layout(
    title=dict(
        text="Sales Treemap by state and city and region",
        x=0.5,
        font=dict(size=20),
    )
)

fig.show()

# What are the sales for each 'State', 'Category' and 'Sub-Category'?

In [None]:
# Three-level treemap (State > Category > Sub_Category) sized by 'Sales' and
# coloured by 'State'.
fig = px.treemap(
    df,
    path=['State','Category','Sub_Category'],
    values='Sales',
    color='State',
    color_discrete_sequence=yel_gre1,
)

# The previous title was copied from the Category/Sub-Category chart and left
# out the top-level 'State' grouping shown here.
fig.update_layout(
    title="Sales Distribution per State, Category and Sub-Category",
    title_x=0.5,
    title_font=dict(size=20)
)

fig.show()
