In [None]:
!pip install pandas matplotlib seaborn
!pip install --upgrade pandas matplotlib seaborn
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the cleaned retail dataset and report its (rows, columns) shape.
df = pd.read_csv(filepath_or_buffer="data/new_clean/cleaned_online_retail.csv")
print(df.shape)
# Preview the first 20 rows. As a bare expression this is rendered only when
# it is the last statement of a notebook cell.
df.head(n=20)

# Summary statistics for every column (numeric and non-numeric alike).
df.describe(include='all')

# Count of missing values in each column.
print(df.isna().sum())

# Summary statistics for the Quantity column.
print(df['Quantity'].describe())

# Histogram with a KDE overlay of Quantity. Values above 100 are capped at
# 100 before plotting, so everything larger accumulates in the top bin.
plt.figure(figsize=(8,4))
ax = sns.histplot(data=df['Quantity'].clip(upper=100), bins=50, kde=True)
ax.set_title('Distribution of Quantity (clipped at 100)')
ax.set_xlabel('Quantity')
ax.set_ylabel('Frequency')
plt.show()


# Summary statistics for the Price column.
print(df['Price'].describe())

# Number of rows whose Price is exactly zero.
print(df[df['Price'] == 0].shape[0])

# Histogram with a KDE overlay of Price, restricted to the 0-100 window.
# The data is filtered *before* binning: if the histogram were computed on the
# whole column and only the axis were cropped, the 100 bins (and the KDE grid)
# would be spread over the full price range, so any large values in the column
# would leave just a few coarse bars visible in the zoomed view.
price_in_view = df.loc[df['Price'].between(0, 100), 'Price']

plt.figure(figsize=(8,4))
sns.histplot(price_in_view, bins=100, kde=True)
plt.xlim(0, 100)  # Focus on typical price range
plt.title('Distribution of Price (zoomed)')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Horizontal bar chart of the ten most frequent values in the Country column
# (one count per row of df).
plt.figure(figsize=(8,4))
top_countries = df['Country'].value_counts().head(10)
ax = sns.barplot(
    x=top_countries.to_numpy(),
    y=top_countries.index,
)
ax.set_title('Top 10 Countries by Transaction Count')
ax.set_xlabel('Number of Transactions')
ax.set_ylabel('Country')
plt.show()

# Horizontal bar chart of the ten most frequent values in the Description
# column (one count per row of df).
plt.figure(figsize=(8,4))
top_products = df['Description'].value_counts().head(10)
ax = sns.barplot(
    x=top_products.to_numpy(),
    y=top_products.index,
)
ax.set_title('Top 10 Products by Number of Sales')
ax.set_xlabel('Number of Transactions')
ax.set_ylabel('Product')
plt.show()

# Convert InvoiceDate to datetime so it can serve as a time index below.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

# Per-row revenue: quantity multiplied by unit price.
df['Revenue'] = df['Quantity'] * df['Price']

# Total revenue per calendar month (bins labelled by month end, 'ME').
monthly_revenue = (
    df.set_index('InvoiceDate')['Revenue']
    .resample('ME')
    .sum()
)

# Line chart of the monthly totals, drawn on the figure created here.
plt.figure(figsize=(12,4))
ax = monthly_revenue.plot()
ax.set_title('Monthly Revenue Trend')
ax.set_xlabel('Month')
ax.set_ylabel('Revenue')
plt.show()

# Annotated heatmap of the pairwise correlations between Quantity, Price and
# Revenue (the Revenue column must already exist on df).
plt.figure(figsize=(6,4))
corr = df.loc[:, ['Quantity', 'Price', 'Revenue']].corr()
sns.heatmap(
    data=corr,
    annot=True,
    cmap='coolwarm',
)
plt.title('Correlation Matrix')
plt.show()
