In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import plotly.subplots as sp
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objects as go
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
pd.set_option('display.max_columns', None)

## Loading Data

In [None]:
# Load the raw sales export; every later cell reads from `Sales`.
# (display.max_columns is already set to None in the imports cell.)
Sales = pd.read_csv('sales_data.csv')

# Preview the first rows instead of rendering the whole frame.
Sales.head()

In [None]:
def category(column):
    """Translate a French category label to English.

    'Vêtements' becomes 'Clothing' and 'Électronique' becomes 'Electronics';
    any other value is returned unchanged.
    """
    translations = (
        ('Vêtements', 'Clothing'),
        ('Électronique', 'Electronics'),
    )
    for french, english in translations:
        if column == french:
            return english
    return column


In [None]:
# Inspect the column labels of the loaded frame.
Sales.columns

In [None]:
# Translate the French labels in 'catégorie' to English, value by value, via category().
# NOTE: a later cell renames this column to 'Category'; re-running this cell
# after that rename raises KeyError because 'catégorie' no longer exists.
Sales['catégorie']=Sales['catégorie'].apply(category)

In [None]:
# DataFrame.replace returns a new frame and leaves `Sales` untouched, so the
# result must be assigned back for the substitution to persist.
Sales = Sales.replace('Vêtements', 'Clothing')
Sales.head()

In [None]:
# Give the category column an English name; the frame is modified in place.
Sales.rename(columns={'catégorie': 'Category'}, inplace=True)

In [None]:
# List the distinct category labels to confirm the translation took effect.
Sales['Category'].unique()

## Data Cleaning

In [None]:
def missing_values_analysis(Sales):
    """Summarise missing values per column.

    Only columns that contain at least one null are reported. The result is a
    DataFrame indexed by column name with two columns: 'Missing Values' (null
    count) and 'Ratio' (percentage of rows that are null, rounded to two
    decimals), ordered by ascending null count.
    """
    na_columns = []
    for col in Sales.columns:
        if Sales[col].isnull().sum() > 0:
            na_columns.append(col)

    # Count the nulls once and derive both the counts and the percentages from it.
    null_counts = Sales[na_columns].isnull().sum()
    n_miss = null_counts.sort_values(ascending=True)
    ratio = (null_counts / Sales.shape[0] * 100).sort_values(ascending=True)

    return pd.concat(
        [n_miss, np.round(ratio, 2)],
        axis=1,
        keys=['Missing Values', 'Ratio'],
    )

def check_dataframe(Sales, head=5, tail=5):
    """Print a quick structural report for a DataFrame.

    Prints, in order: the shape, the column dtypes, the missing-value table
    from ``missing_values_analysis``, the number of duplicated rows, and the
    0 / 5 / 50 / 99 / 100 % quantiles of the numeric columns.

    Parameters
    ----------
    Sales : pandas.DataFrame
        Frame to describe.
    head, tail : int, optional
        Accepted for backward compatibility; not used by the report.

    Returns
    -------
    None
    """
    print('SHAPE'.center(82,'~'))
    print('Rows:{}'.format(Sales.shape[0]))
    print('Columns:{}'.format(Sales.shape[1]))
    print('TYPES')
    print(Sales.dtypes)
    print("".center(82,'~'))
    print(missing_values_analysis(Sales))
    print('DUPLICATED VALUES'.center(83,'~'))
    print(Sales.duplicated().sum())
    print('QUANTILES'.center(82,'~'))
    # numeric_only=True: quantiles are undefined for text columns, and pandas
    # >= 2.0 raises TypeError on them instead of silently dropping them.
    print(Sales.quantile([0, 0.05, 0.50, 0.99,1], numeric_only=True).T)

# Print the structural report (shape, dtypes, missing values, duplicates, quantiles) for the sales data.
check_dataframe(Sales)

In [None]:
# Parse 'Order Date' into datetime values so the .dt accessor can be used on it later.
Sales['Order Date'] = pd.to_datetime(Sales['Order Date'])

In [None]:
def check_class(dataframe):
    """Count the distinct values in every column.

    Returns a DataFrame with one row per column of ``dataframe``: 'Variables'
    holds the column name and 'Classes' its number of unique values. Rows are
    ordered from most to fewest unique values and re-indexed from 0.
    """
    unique_counts = [dataframe[name].nunique() for name in dataframe.columns]
    summary = pd.DataFrame({'Variables': dataframe.columns,
                            'Classes': unique_counts})
    return summary.sort_values('Classes', ascending=False).reset_index(drop=True)

# Show how many distinct values each column of the sales data has.
check_class(Sales)

## EDA

In [None]:
# Pairwise relationships between the columns of Sales that seaborn can plot
# (one panel per pair of variables, with each variable's own distribution on the diagonal).
sns.pairplot(Sales)

In [None]:
# Hex colours shared by the plotly charts below: the first entry colours the
# bars, and the full list is passed to the pie chart.
constraints = ['#B34D22', '#EBE00C']

def categorical_variable_summary(Sales, Category):
    """Show a count bar chart and a percentage pie chart for one categorical column.

    Parameters
    ----------
    Sales : pandas.DataFrame
        Frame that contains the column to summarise.
    Category : str
        Name of the column whose value frequencies are plotted; also used as
        the figure title and the trace name.

    Returns
    -------
    None
        The figure is rendered with ``display``.
    """
    # Count once and reuse, so both traces share the same ordering and values.
    counts = Sales[Category].value_counts()
    count_values = counts.values.tolist()

    fig = make_subplots(rows=1, cols=2,  # 1 row, 2 columns
                        subplot_titles=('Countplot', 'Percentage'),
                        specs=[[{'type': 'xy'}, {'type': 'domain'}]])

    fig.add_trace(go.Bar(y=count_values,
                         x=[str(label) for label in counts.index],
                         # textfont/textposition only take effect when text is
                         # supplied, so label each bar with its count.
                         text=count_values,
                         textfont=dict(size=14),
                         name=Category,
                         textposition='auto',
                         showlegend=False,
                         marker=dict(color=constraints[0])),  # first colour in 'constraints'
                  row=1, col=1)

    fig.add_trace(go.Pie(labels=counts.index,
                         values=counts.values,
                         textfont=dict(size=20),
                         textposition='auto',
                         showlegend=False,
                         name=Category,
                         marker=dict(colors=constraints)),
                  row=1, col=2)

    fig.update_layout(title={'text': Category,
                             'y': 0.9,
                             'x': 0.5,
                             'xanchor': 'center',
                             'yanchor': 'top'},
                      template='plotly_white')

    # Use display() to show the plot in Jupyter Notebook
    display(fig)

In [None]:
# Plot order counts and percentage share for each product category.
categorical_variable_summary(Sales,'Category')

In [None]:
# Products under Alimentation.
# .copy() makes an independent frame, so adding columns to it later neither
# triggers pandas' SettingWithCopyWarning nor depends on `Sales`.
Alimentation = Sales.loc[Sales['Category'] == 'Alimentation'].copy()

# Preview the subset instead of rendering every row.
Alimentation.head()


In [None]:
# Summary statistics for the Alimentation rows.
Alimentation.describe()

In [None]:
# Number of Alimentation orders per product.
# Count once so the bar heights, bar labels and x positions stay in sync.
product_counts = Alimentation['Product'].value_counts()
product_count_values = product_counts.values.tolist()

fig = go.Figure()

fig.add_trace(go.Bar(
    y=product_count_values,
    x=[str(product) for product in product_counts.index],
    # textfont/textposition only take effect when text is supplied.
    text=product_count_values,
    textfont=dict(size=14),
    name='Product',
    textposition='auto',
    showlegend=False,
    marker=dict(color=constraints[0])
))

# A figure should stand on its own when the notebook is skimmed.
fig.update_layout(title='Orders per Alimentation product',
                  xaxis_title='Product',
                  yaxis_title='Number of orders')

# Display the figure
fig.show()



In [None]:
# Column dtypes and non-null counts for the Alimentation rows.
Alimentation.info()

In [None]:
# Preview the Alimentation rows rather than rendering the whole frame again.
Alimentation.head()

In [None]:
# Extract the month name from 'Order Date' to check Alimentation turnover by month.
# assign() builds a new frame instead of writing into a filtered slice of
# `Sales`, which avoids SettingWithCopyWarning and keeps the cell re-runnable.
Alimentation = Alimentation.assign(Month=Alimentation['Order Date'].dt.month_name())


In [None]:
# Number of Alimentation orders per month, sorted ascending by count
# (so the month with the most orders is the last entry).
Most = Alimentation['Month'].value_counts().sort_values()

Most



In [None]:
# Bar chart of Alimentation order counts per month (ascending by count).
# The title is set once here; the original passed a second title to
# Most.plot() that was immediately overwritten.
ax = Most.plot(kind="bar")
ax.set_title("Orders By each Month")
ax.set_xlabel("Month")
ax.set_ylabel("Number of orders")
# Rotate the x-labels by 30 degrees, keeping each label centred under its bar
plt.xticks(rotation=30, horizontalalignment="center")
# Render explicitly so the cell does not end on a Text(...) repr.
plt.show()

In [None]:
# December is taken as the month with the most Alimentation orders;
# keep only that month's rows.
Most_Ord = Alimentation[Alimentation['Month'].eq('December')]

# Number of December orders for each product.
Product_ = Most_Ord['Product'].value_counts()
Product_

In [None]:
# Total turnover for each product ordered in December.
# The original call passed x="", which is not a column of Most_Ord, so
# plotly express rejects it; the title also described months, not products.
# NOTE(review): assumes 'turnover' is a numeric column of the sales data — confirm.
december_turnover = Most_Ord.groupby('Product', as_index=False)['turnover'].sum()

# One bar per product: the x axis is categorical, so bars read better than a line.
fig = px.bar(december_turnover, x='Product', y='turnover',
             title='Turnover of each product in December')
fig.show()

In [None]:
# TODO: compute the margin for each product ordered in December.

In [None]:
# 