In [1]:
pip install pandas plotly openpyxl



In [4]:

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go



In [5]:

# Load dataset (adjust the path as needed)
df = pd.read_csv('/content/data.csv', encoding='ISO-8859-1')
# Clean column names if needed
df.columns = [col.strip() for col in df.columns]

# Drop duplicate rows
df = df.drop_duplicates()

# Drop rows with missing essential fields
df = df.dropna(subset=['Description', 'Quantity', 'UnitPrice', 'Country'])

# Remove negative or zero quantities and prices if contextually sensible
df = df[(df['Quantity'] > 0) & (df['UnitPrice'] > 0)]
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce', infer_datetime_format=True)
df['Revenue'] = df['Quantity'] * df['UnitPrice']

# Descriptive statistics (univariate analysis)
print(df[['Quantity', 'UnitPrice', 'Revenue']].describe())

# Optional: Strip whitespace from string columns
df['Description'] = df['Description'].str.strip()
df['Country'] = df['Country'].str.strip()



           Quantity     UnitPrice       Revenue
count  72636.000000  72636.000000  72636.000000
mean      10.005603      4.381540     20.333259
std      277.853227     53.362048    300.502726
min        1.000000      0.070000      0.140000
25%        1.000000      1.250000      3.750000
50%        3.000000      2.510000      8.470000
75%        8.000000      4.210000     17.000000
max    74215.000000  13541.330000  77183.600000


  df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce', infer_datetime_format=True)


In [6]:
# 2. Categorical Univariate Analysis with Plotly

# Bar Chart: Orders by Country
country_counts = df['Country'].value_counts().reset_index()
country_counts.columns = ['Country', 'count'] # Rename columns explicitly
fig_bar = px.bar(country_counts,
                 x='Country', y='count',
                 labels={'Country':'Country', 'count':'Order Count'},
                 title='Order Count by Country')
fig_bar.show()


In [7]:
# Pie Chart: Orders by Country
country_counts_pie = df['Country'].value_counts().reset_index()
country_counts_pie.columns = ['Country', 'count'] # Rename columns explicitly
fig_pie = px.pie(country_counts_pie,
                 names='Country', values='count',
                 title='Orders Proportion by Country')
fig_pie.show()
# Bar Chart: Top 10 Product Descriptions
top_products = df['Description'].value_counts().nlargest(10).reset_index()
top_products.columns = ['Description', 'count'] # Rename columns explicitly
fig_prod_bar = px.bar(top_products, x='Description', y='count',
                      labels={'Description':'Product', 'count':'Order Count'},
                      title='Top 10 Products Sold')
fig_prod_bar.show()


In [8]:
# 3. Continuous Univariate Analysis with Plotly

# Histogram: Quantity Distribution
fig_hist = px.histogram(df, x='Quantity', nbins=20, title='Quantity Distribution')
fig_hist.show()

# Density Plot: UnitPrice Distribution
fig_density = go.Figure(go.Histogram(x=df['UnitPrice'], histnorm='probability density', nbinsx=20))
fig_density.update_layout(title='Unit Price Density Distribution',
                         xaxis_title='Unit Price',
                         yaxis_title='Density')
fig_density.show()

# Rug Plot: UnitPrice
fig_rug = px.scatter(df, x='UnitPrice', y=[0]*len(df), title='Rug Plot: UnitPrice')
fig_rug.update_traces(marker=dict(symbol='line-ns-open', size=10))
fig_rug.update_yaxes(visible=False)
fig_rug.show()


In [9]:
# Strip Plot: Quantity
fig_strip = px.strip(df, x='Quantity', title='Strip Plot: Quantity')
fig_strip.show()


In [None]:
# Swarm Plot substitute: Since plotly has no direct swarm plot, use strip/violin/box with jitter
import plotly.io as pio
pio.renderers.default = "colab"

fig_violin = px.violin(df, y='Quantity', x='Country',
                       box=True, points='all',
                       title='Quantity by Country (Violin Plot with All Points)')
fig_violin.show()

In [12]:
# Scatterplot: Quantity vs UnitPrice
fig_scatter = px.scatter(df, x='Quantity', y='UnitPrice', title='UnitPrice vs Quantity')
fig_scatter.show()
# Line Plot: Quantity over InvoiceNo (sorted)
df_sorted = df.sort_values('InvoiceNo')
fig_line = px.line(df_sorted, x='InvoiceNo', y='Quantity', title='Quantity over Invoice Number')
fig_line.show()

In [10]:
print(df.columns.tolist())

['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate', 'UnitPrice', 'CustomerID', 'Country', 'Revenue']


In [11]:
display(df.head())

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6.0,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8.0,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6.0,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
