In [None]:
import fireducks.pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw workbook; the path is resolved relative to the notebook's working directory.
df1 = pd.read_excel(io='Online_Retail.xlsx')

In [None]:
# Preview the first five rows to check column names and value formats.
df1.head(n=5)

In [None]:
# Print column dtypes and non-null counts for the raw frame.
df1.info()

In [None]:
# Convert object columns to text so that string comparisons used later in the
# notebook (e.g. StockCode == "22139") behave consistently on mixed-type columns.
#
# Missing values are kept missing: in pandas a plain .astype(str) turns NaN into
# the literal string "nan", which hides missing descriptions from isnull() and
# lets "nan" compete as a real value when counting descriptions per stock code.
object_columns = df1.select_dtypes(include=['object']).columns
for col in object_columns:
    is_missing = df1[col].isna()
    # mask() puts NaN back wherever the original value was missing.
    df1[col] = df1[col].astype(str).mask(is_missing)

# Verify the conversion
print("Updated data types:")
print(df1.dtypes)

In [None]:
# Summary statistics of the raw frame.
df1.describe()

In [None]:
# Number of missing values in each column.
df1.isna().sum()

In [None]:
# Rows whose Description is missing.
df1.loc[df1["Description"].isna()]

In [None]:
# Inspect every row recorded for one example stock code.
df1.loc[df1["StockCode"].eq("22139")]

In [None]:
# Most common Description value(s) among the rows of that example stock code.
df1.loc[df1["StockCode"].eq("22139"), "Description"].mode()

In [None]:
# Count how often each (StockCode, Description) pair occurs; value_counts returns
# the pairs ordered from most to least frequent.
most_freq = (
    df1.value_counts(subset=["StockCode", "Description"])
    .reset_index()
)

In [None]:
# Display the (StockCode, Description) frequency table.
most_freq

In [None]:
# Spot check: the first (i.e. most frequent) description listed for one stock code.
most_freq.loc[most_freq["StockCode"].eq("85123A")].iloc[:1]

In [None]:
# Keep only the first row per StockCode. Because the table is ordered by
# descending frequency, that row carries the most frequent description.
most_freq = most_freq.drop_duplicates(subset="StockCode", keep="first")
most_freq

In [None]:
# Rename the lookup columns, then attach the per-StockCode description to every
# transaction row (left join, so all rows of df1 are kept).
most_freq = most_freq.set_axis(["StockCode", "freq_Description", "count"], axis=1)
df2 = pd.merge(df1, most_freq, how="left", on="StockCode")
df2.head(n=5)

In [None]:
# Replace each row's Description with the description looked up for its StockCode.
df2 = df2.assign(Description=df2["freq_Description"])
df2.head(n=5)

In [None]:
# Missing values per column after the description replacement.
df2.isna().sum()

In [None]:
# Remove the helper columns brought in by the merge.
# Assigning the result (instead of inplace=True) together with errors="ignore"
# keeps this cell re-runnable: a second execution no longer raises KeyError
# because the columns are already gone.
df2 = df2.drop(columns=["freq_Description", "count"], errors="ignore")
df2.head()

In [None]:
# Summary statistics after the description clean-up.
df2.describe()

In [None]:
# Rows with a negative Quantity.
df2.loc[df2["Quantity"].lt(0)]

In [None]:
# Rows with a negative UnitPrice.
df2.loc[df2["UnitPrice"].lt(0)]

In [None]:
# Keep only rows where both the unit price and the quantity are strictly positive.
df3 = df2.loc[df2["UnitPrice"].gt(0) & df2["Quantity"].gt(0)]
df3.describe()

In [None]:
# Quantity value at the 99.99th percentile.
df3["Quantity"].quantile(q=0.9999)

In [None]:
# Rows with a Quantity above 1500.
# NOTE(review): the 1500 cut-off presumably comes from the quantile check above — confirm.
df3.loc[df3["Quantity"].gt(1500)]

## Feature Engineering

In [None]:
# Work on an independent copy so that adding columns does not touch df3.
df4 = df3.copy(deep=True)

In [None]:
# Line-item revenue: quantity multiplied by unit price.
df4["TotalSales"] = df4["Quantity"].mul(df4["UnitPrice"])
df4.head(n=3)

In [None]:
# Calendar month number (1-12) of each invoice.
df4['Month'] = df4["InvoiceDate"].dt.month
# Fixed seed so that the displayed sample is the same on every run of the notebook.
df4.sample(3, random_state=42)

## Visualizations

In [None]:
# Total revenue per calendar month.
# Group on the year-month label ("YYYY-MM") rather than the bare month number:
# a month number alone adds together the same month from different years.
# When all invoices fall within one year, the totals are identical to grouping
# by month number.
# NOTE(review): confirm whether InvoiceDate actually spans more than one year.
invoice_year_month = df4["InvoiceDate"].dt.strftime("%Y-%m")
monthly_sales = df4.groupby(invoice_year_month)["TotalSales"].sum()
# "YYYY-MM" labels sort chronologically, so the line runs left to right in time;
# rot=45 keeps the longer tick labels readable.
monthly_sales.plot(kind='line', title="Monthly Sales", marker="o", rot=45)
plt.xlabel("Month")
plt.ylabel("Total Sales")
plt.grid()
plt.show()

## Insights
Total sales start rising in August and peak in November. This is likely due to the holiday season at the end of the year.


#### Top 5 Countries

In [None]:
# Revenue per country, largest first, limited to the five biggest.
top_5_countries = (
    df4.groupby("Country")["TotalSales"]
    .sum()
    .sort_values(ascending=False)
    .iloc[:5]
)
top_5_countries

In [None]:
# Horizontal bar chart of the five highest-revenue countries.
ax = top_5_countries.plot.barh(title='Top 5 Countries by Total Sales')
ax.set_xlabel('Total Sales')
ax.set_ylabel('Country')
ax.grid(axis='x')
plt.show()

In [None]:
# Share of overall revenue contributed by each of the five largest countries.
country_wise_sales = df4.groupby('Country')['TotalSales'].sum()
total_sales = country_wise_sales.sum()

top_5_countries = country_wise_sales.sort_values(ascending=False).head(5)
percentages = top_5_countries.div(total_sales).mul(100)

fig, ax = plt.subplots(figsize=(10,6))
bars = ax.barh(top_5_countries.index, percentages)
ax.set_title("Top 5 Countries by Percentage Contribution to Total Sales")
ax.set_xlabel("Percentage Contribution to Total Sales")
ax.set_ylabel("Country")
ax.grid(axis='x')
# Write each percentage just to the right of its bar, vertically centred on it.
for bar, percentage in zip(bars, percentages):
    label_x = bar.get_width() + 0.5
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(label_x, label_y, f'{percentage:.1f}%', va='center')
plt.show()



In [None]:
# Revenue per product (StockCode) and a chart of the five best sellers.
product_wise_sales = df4.groupby('StockCode')['TotalSales'].sum()

top_5_products = product_wise_sales.sort_values(ascending=False).head(5)
top_5_products.plot(kind='barh', color='skyblue')
plt.title('Product-Wise Sales')
# On a horizontal bar chart the x-axis carries the sales values and the y-axis
# carries the stock codes, so the labels are assigned accordingly.
plt.xlabel('Total Sales')
plt.ylabel('Product Stock Code')
# Flip the y-axis so the best-selling product is drawn at the top.
plt.gca().invert_yaxis()
plt.show()

In [None]:
# Share of overall revenue contributed by each of the five best-selling products.
total_sales = product_wise_sales.sum()
percentages = top_5_products.div(total_sales).mul(100)

fig, ax = plt.subplots(figsize=(10,6))
bars = ax.barh(top_5_products.index, percentages)
ax.set_title('Top 5 Products by Percentage Contribution to Total Sales')
ax.set_xlabel('Percentage Contribution to Total Sales')
ax.set_ylabel('Product Stock Code')
ax.grid(axis='x')
# Write each percentage just to the right of its bar, vertically centred on it.
for bar, percentage in zip(bars, percentages):
    label_x = bar.get_width() + 0.02
    label_y = bar.get_y() + bar.get_height()/2
    ax.text(label_x, label_y, f'{percentage:.1f}%', va='center')
plt.show()

## RFM Analysis

In [None]:
# Latest invoice timestamp in the cleaned data.
df4['InvoiceDate'].max()

In [None]:
# Reference date for recency: one day after the latest invoice in the data.
last_invoice_ts = df4['InvoiceDate'].max()
current_dt = last_invoice_ts + pd.Timedelta(1, unit='D')
current_dt

In [None]:
# Build one row per customer with the three RFM measures:
#   Recency   - whole days between current_dt and the customer's latest invoice
#   Frequency - number of rows recorded for the customer
#   Monetary  - sum of TotalSales for the customer
# NOTE(review): 'count' counts rows; if one invoice spans several rows, this
# measures line items rather than distinct invoices — confirm this is intended.
rfm = df4.groupby('CustomerID').agg(
    Recency=('InvoiceDate', lambda dates: (current_dt - dates.max()).days),
    Frequency=('InvoiceNo', 'count'),
    Monetary=('TotalSales', 'sum'),
)
rfm.head(n=5)

In [None]:
# Score each measure by quartile on a 1-4 scale. Recency labels are reversed
# because a smaller recency (a more recent purchase) should earn a higher score.
rfm['R_segment'] = pd.qcut(rfm['Recency'], 4, labels=[4,3,2,1])
rfm['F_segment'] = pd.qcut(rfm['Frequency'], 4, labels=[1,2,3,4])
rfm['M_segment'] = pd.qcut(rfm['Monetary'], 4, labels=[1,2,3,4])
# qcut with labels yields Categorical columns; cast them to integers before
# adding so the row-wise sum does not depend on how categoricals are coerced.
segment_scores = rfm[['R_segment', 'F_segment', 'M_segment']].astype(int)
rfm['RFM_Score'] = segment_scores.sum(axis=1)
rfm

In [None]:
# Customers ordered from the highest to the lowest combined RFM score.
rfm.sort_values(by='RFM_Score', ascending=False)