In [1]:
!pip install pandas plotly dash jupyter-dash scikit-learn


Collecting dash
  Downloading dash-2.18.1-py3-none-any.whl.metadata (10 kB)
Collecting jupyter-dash
  Downloading jupyter_dash-0.4.2-py3-none-any.whl.metadata (3.6 kB)
Collecting dash-html-components==2.0.0 (from dash)
  Downloading dash_html_components-2.0.0-py3-none-any.whl.metadata (3.8 kB)
Collecting dash-core-components==2.0.0 (from dash)
  Downloading dash_core_components-2.0.0-py3-none-any.whl.metadata (2.9 kB)
Collecting dash-table==5.0.0 (from dash)
  Downloading dash_table-5.0.0-py3-none-any.whl.metadata (2.4 kB)
Collecting retrying (from dash)
  Downloading retrying-1.3.4-py3-none-any.whl.metadata (6.9 kB)
Collecting ansi2html (from jupyter-dash)
  Downloading ansi2html-1.9.2-py3-none-any.whl.metadata (3.7 kB)
Collecting jedi>=0.16 (from ipython->jupyter-dash)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading dash-2.18.1-py3-none-any.whl (7.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.5/7.5 MB[0m [31m31.4 MB/s[0m eta [3

In [3]:
import pandas as pd

# Load the sales dataset
url = '/SuperStoreOrders.csv'  # Change this URL if needed
df = pd.read_csv(url)

# Display the first few rows
df.head()


Unnamed: 0,order_id,order_date,ship_date,ship_mode,customer_name,segment,state,country,market,region,...,category,sub_category,product_name,sales,quantity,discount,profit,shipping_cost,order_priority,year
0,AG-2011-2040,1/1/2011,6/1/2011,Standard Class,Toby Braunhardt,Consumer,Constantine,Algeria,Africa,Africa,...,Office Supplies,Storage,"Tenex Lockers, Blue",408,2,0.0,106.14,35.46,Medium,2011
1,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Office Supplies,Supplies,"Acme Trimmer, High Speed",120,3,0.1,36.036,9.72,Medium,2011
2,HU-2011-1220,1/1/2011,5/1/2011,Second Class,Annie Thurman,Consumer,Budapest,Hungary,EMEA,EMEA,...,Office Supplies,Storage,"Tenex Box, Single Width",66,4,0.0,29.64,8.17,High,2011
3,IT-2011-3647632,1/1/2011,5/1/2011,Second Class,Eugene Moren,Home Office,Stockholm,Sweden,EU,North,...,Office Supplies,Paper,"Enermax Note Cards, Premium",45,3,0.5,-26.055,4.82,High,2011
4,IN-2011-47883,1/1/2011,8/1/2011,Standard Class,Joseph Holt,Consumer,New South Wales,Australia,APAC,Oceania,...,Furniture,Furnishings,"Eldon Light Bulb, Duo Pack",114,5,0.1,37.77,4.7,Medium,2011


In [4]:
# Summary statistics
summary = df.describe()
print(summary)


           quantity      discount        profit  shipping_cost          year
count  51290.000000  51290.000000  51290.000000   51290.000000  51290.000000
mean       3.476545      0.142908     28.641740      26.375915   2012.777208
std        2.278766      0.212280    174.424113      57.296804      1.098931
min        1.000000      0.000000  -6599.978000       0.000000   2011.000000
25%        2.000000      0.000000      0.000000       2.610000   2012.000000
50%        3.000000      0.000000      9.240000       7.790000   2013.000000
75%        5.000000      0.200000     36.810000      24.450000   2014.000000
max       14.000000      0.850000   8399.976000     933.570000   2014.000000


In [11]:
import plotly.express as px

# 1. Sales over time
fig1 = px.line(df, x='discount', y='profit', title='Sales Over Time')
fig1.show()

# 2. Sales by Category
fig2 = px.bar(df, x='year', y='profit', title='Sales by Category')
fig2.show()

# 3. Sales Distribution
fig3 = px.histogram(df, x='profit', title='Sales Distribution')
fig3.show()


In [12]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Convert 'profit' to datetime
df['profit'] = pd.to_datetime(df['profit'])

# Convert dates to timestamp
df['profit'] = df['profit'].map(pd.Timestamp.timestamp)

# Define features (X) and target (y)
X = df[['profit', 'quantity', 'discount']]
y = df['shipping_cost']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [13]:
# Create and train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Calculate the mean squared error
mse = mean_squared_error(y_test, y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 2833.380068652626


In [15]:
#Create an Interactive Dashboard
from jupyter_dash import JupyterDash
from dash import dcc, html, Input, Output

# Initialize the Dash app
app = JupyterDash(__name__)

# Define the layout of the dashboard
app.layout = html.Div([
    html.H1('Sales Analysis Dashboard'),
    dcc.Dropdown(
        id='profit-dropdown',
        options=[{'label': cat, 'value': cat} for cat in df['profit'].unique()],
        multi=True,
        value=[df['profit'].unique()[0]]  # Default value
    ),
    dcc.Graph(id='sales-time'),
    dcc.Graph(id='sales-profit'),
    dcc.Graph(id='sales-distribution'),
])


In [16]:
#Add Callbacks for Interactivity:
@app.callback(
    Output('sales-time', 'figure'),
    Output('sales-category', 'figure'),
    Output('sales-distribution', 'figure'),
    Input('category-dropdown', 'value')
)
def update_graph(selected_categories):
    filtered_df = df[df['Category'].isin(selected_categories)]

    # Sales Over Time
    sales_time_fig = px.line(filtered_df, x='OrderDate', y='Sales', title='Sales Over Time')

    # Sales by Category
    sales_category_fig = px.bar(filtered_df, x='Category', y='Sales', title='Sales by Category')

    # Sales Distribution
    sales_distribution_fig = px.histogram(filtered_df, x='Sales', title='Sales Distribution')

    return sales_time_fig, sales_category_fig, sales_distribution_fig


In [17]:
# Run the app
app.run_server(mode='inline')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>