# In [None]:
# Ques_3.ipynb — Compare Data Completeness Over Time

import pandas as pd
import matplotlib.pyplot as plt

# Load the raw sales records.
df = pd.read_csv("sales_data.csv")

# Parse 'date'; values that cannot be parsed become NaT instead of raising.
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Rows without a usable date cannot be assigned to a month, so drop them.
df = df.dropna(subset=['date'])

# Year-month period used as the grouping key (kept on df for later use).
df['month'] = df['date'].dt.to_period('M')

# Percentage of missing cells per month.
# The helper 'month' column is derived above and is never null, so it is
# excluded explicitly: counting it would dilute every monthly rate, and
# whether groupby().apply() hands the grouping column to the callback depends
# on the pandas version. The vectorised form below gives the same answer on
# every version: per-month null fraction of each column, then the average
# across columns.
null_flags = df.drop(columns='month').isnull()
missing_by_month = null_flags.groupby(df['month']).mean().mean(axis=1) * 100

# Turn the month index into a regular column for plotting and export.
missing_by_month = missing_by_month.reset_index(name='Missing Rate (%)')

# Plot the monthly missing rate as a line chart.
plt.figure(figsize=(10, 5))
plt.plot(
    missing_by_month['month'].astype(str),  # Period values are plotted as text labels
    missing_by_month['Missing Rate (%)'],
    marker='o',
    color='red',
)
plt.xticks(rotation=45)
plt.title("Monthly Missing Data Rate in Sales Data")
plt.xlabel("Month")
plt.ylabel("Missing Rate (%)")
plt.grid(True)
plt.tight_layout()
plt.show()

# Optional: save the per-month missing rate table.
missing_by_month.to_csv("monthly_missing_data_rate.csv", index=False)