# In [None]:
# ===============================
# IMPORT REQUIRED LIBRARIES
# ===============================

import numpy as np                     # For numerical computations
import pandas as pd                    # For dataframe operations
import matplotlib.pyplot as plt        # For plotting graphs
from scipy import stats                # For statistical functions

# ===============================
# CREATE DATAFRAME
# ===============================

# Seed NumPy's global generator so the synthetic data set is identical on
# every run. The columns below are drawn one after another from that shared
# stream, so the order of the draws must not change.
np.random.seed(42)
n = 500  # total number of orders (rows)

order_columns = {}
order_columns["order_id"] = np.arange(1, n + 1)  # 1..n, unique per order
order_columns["warehouse"] = np.random.choice(
    ["North", "South", "East", "West"], size=n
)
order_columns["product_category"] = np.random.choice(
    ["Electronic", "Fashion", "Grocery"], size=n
)
order_columns["payment_type"] = np.random.choice(["COD", "Prepaid"], size=n)
# randint excludes its upper bound: days are 1-14 and order values 500-4999.
order_columns["delivery_days"] = np.random.randint(1, 15, size=n)
order_columns["order_value"] = np.random.randint(500, 5000, size=n)
order_columns["discount_percent"] = np.random.choice([5, 10, 15, 20, 25], size=n)
order_columns["rating"] = np.random.randint(1, 6, size=n)  # 1-5
# "Yes" is drawn with probability 0.2, "No" with probability 0.8.
order_columns["returned"] = np.random.choice(["Yes", "No"], size=n, p=[0.2, 0.8])

df = pd.DataFrame(order_columns)

# ===============================
# CONVERT RETURN COLUMN TO BINARY
# ===============================

# Encode the return flag numerically so it can be averaged and counted:
# "Yes" becomes 1 and "No" becomes 0.
return_codes = {"Yes": 1, "No": 0}
df["returned_binary"] = df["returned"].map(return_codes)

# ===============================
# FREQUENCY DISTRIBUTION
# ===============================

# Number of orders in each product category (value_counts lists the most
# frequent category first).
category_column = df["product_category"]
product_freq = category_column.value_counts()

# ===============================
# BAR CHART – ORDERS BY WAREHOUSE
# ===============================

# Count the orders handled by each warehouse and draw one bar per warehouse.
warehouse_counts = df["warehouse"].value_counts()
bar_ax = warehouse_counts.plot(kind="bar")
bar_ax.set_title("Orders by Warehouse")
bar_ax.set_xlabel("Warehouse")
bar_ax.set_ylabel("Frequency")
plt.show()

# ===============================
# HISTOGRAM – DELIVERY DAYS
# ===============================

# delivery_days contains whole numbers only. Ten equal-width bins across the
# data range do not line up with those integers, so some bins would collect
# two distinct day values and others only one, which shows up as artificial
# peaks. Use unit-wide bins centred on each integer day instead, so every bar
# is the count of exactly one day value.
day_bin_edges = np.arange(
    df["delivery_days"].min() - 0.5,
    df["delivery_days"].max() + 1.5,
)
plt.hist(df["delivery_days"], bins=day_bin_edges)
plt.title("Delivery Days Histogram")
plt.xlabel("Days")
plt.ylabel("Frequency")
plt.show()

# ===============================
# PIE CHART – PAYMENT TYPE
# ===============================

# Share of orders per payment type, labelled with one-decimal percentages.
payment_counts = df["payment_type"].value_counts()
pie_ax = payment_counts.plot.pie(autopct="%1.1f%%")
pie_ax.set_title("Payment Type Distribution")
pie_ax.set_ylabel("")  # hide the default y-axis label next to the pie
plt.show()

# ===============================
# LINE GRAPH – AVG DELIVERY DAYS VS DISCOUNT
# ===============================

# Mean delivery time for each discount level, drawn as a marked line with the
# discount level on the x-axis.
delivery_by_discount = df.groupby("discount_percent")["delivery_days"]
avg_dd = delivery_by_discount.mean()

line_ax = plt.gca()
line_ax.plot(avg_dd.index, avg_dd.values, marker="o")
line_ax.set_xlabel("Discount %")
line_ax.set_ylabel("Average Delivery Days")
line_ax.set_title("Avg Delivery Days vs Discount")
plt.show()

# ===============================
# BOXPLOT – ORDER VALUE BY PRODUCT CATEGORY
# ===============================

# One box of order values per product category.
df.boxplot(by="product_category", column="order_value")
# Blank out the figure-level super-title so only the title set below is shown.
plt.gcf().suptitle("")
plt.gca().set_title("Order Value by Category")
plt.show()

# ===============================
# FREQUENCY POLYGON
# ===============================

# Class frequencies of delivery time over ten equal-width classes, plotted at
# the class midpoints and joined by straight lines.
# NOTE(review): delivery_days looks to be whole numbers, so ten equal-width
# classes may not line up with the integer values - confirm the class count.
freq, bins = np.histogram(df["delivery_days"], bins=10)
midpoints = 0.5 * (bins[1:] + bins[:-1])  # centre of each class interval

polygon_ax = plt.gca()
polygon_ax.plot(midpoints, freq, marker="o")
polygon_ax.set_title("Frequency Polygon")
polygon_ax.set_xlabel("Delivery Days")
polygon_ax.set_ylabel("Frequency")
plt.show()

# ===============================
# OGIVE – CUMULATIVE FREQUENCY
# ===============================

# Running total of the class frequencies: cum_freq[i] is the number of orders
# that fall in classes 0..i, i.e. below the upper boundary bins[i + 1]
# (the last class also includes its upper edge).
cum_freq = np.cumsum(freq)

# A "less than" ogive therefore pairs each cumulative count with the UPPER
# boundary of its class, not the class midpoint; using midpoints shifts the
# whole curve left by half a class width. The curve starts at the lowest
# boundary with a cumulative count of zero.
ogive_x = bins
ogive_y = np.concatenate(([0], cum_freq))

plt.plot(ogive_x, ogive_y, marker="o")
plt.title("Ogive Curve")
plt.xlabel("Delivery Days")
plt.ylabel("Cumulative Frequency")
plt.show()

# ===============================
# MEASURES OF CENTRAL TENDENCY
# ===============================

# Mean, median and mode of delivery time. mode() returns every tied value in
# sorted order; the first entry is kept.
dd_series = df["delivery_days"]
mean_dd = dd_series.mean()
median_dd = dd_series.median()
mode_dd = dd_series.mode().iloc[0]

# ===============================
# WEIGHTED MEAN OF ORDER VALUE
# ===============================

# Average order value where each order is weighted by its discount percentage.
order_values_for_avg = df["order_value"]
discount_weights = df["discount_percent"]
weighted_mean = np.average(order_values_for_avg, weights=discount_weights)

# ===============================
# COMBINED MEAN (NORTH + SOUTH)
# ===============================

# Mean delivery time over all orders from the North and South warehouses
# pooled together.
in_north_or_south = df["warehouse"].isin(["North", "South"])
combined_mean = df.loc[in_north_or_south, "delivery_days"].mean()

# ===============================
# DISPERSION MEASURES
# ===============================

# Spread of delivery time: range, variance and standard deviation.
# pandas var()/std() use ddof=1 by default, i.e. the sample estimators.
dd_values = df["delivery_days"]
range_dd = dd_values.max() - dd_values.min()
variance_dd = dd_values.var()
std_dd = dd_values.std()

# ===============================
# INTERQUARTILE RANGE (IQR)
# ===============================

# Width of the middle 50% of delivery times.
Q1 = dd_values.quantile(q=0.25)
Q3 = dd_values.quantile(q=0.75)
IQR = Q3 - Q1

# ===============================
# COEFFICIENT OF VARIATION
# ===============================

# Standard deviation expressed as a percentage of the mean.
cv = std_dd / mean_dd * 100

# ===============================
# MOST CONSISTENT WAREHOUSE
# ===============================

# Warehouse whose delivery times have the smallest standard deviation.
# NOTE(review): consistency across groups is often compared with the
# coefficient of variation rather than the raw std - confirm which is intended.
std_by_warehouse = df.groupby("warehouse")["delivery_days"].std()
consistent_warehouse = std_by_warehouse.idxmin()

# ===============================
# SKEWNESS & KURTOSIS
# ===============================

# Shape of the order-value distribution. scipy's kurtosis() uses Fisher's
# definition by default, so a normal distribution scores 0.
order_values = df["order_value"]
skewness = stats.skew(order_values)
kurtosis = stats.kurtosis(order_values)

# ===============================
# HISTOGRAM + KDE
# ===============================

# Density-normalised histogram so it shares a scale with the KDE curve.
plt.hist(order_values, density=True, bins=30)

# Gaussian kernel density estimate, evaluated on 500 evenly spaced points
# between the smallest and largest order value.
kde = stats.gaussian_kde(order_values)
x = np.linspace(order_values.min(), order_values.max(), num=500)
plt.plot(x, kde.evaluate(x))
plt.title("Order Value Distribution with KDE")
plt.show()

# ===============================
# CORRELATION
# ===============================

# Linear (Pearson) correlation between discount and order value, and
# rank-based (Spearman) correlation between delivery time and rating.
discount_series = df["discount_percent"]
value_series = df["order_value"]
pearson_corr = discount_series.corr(value_series, method="pearson")

days_series = df["delivery_days"]
rating_series = df["rating"]
spearman_corr = days_series.corr(rating_series, method="spearman")

# ===============================
# SCATTER PLOT
# ===============================

# One point per order: discount on the x-axis, order value on the y-axis.
scatter_ax = plt.gca()
scatter_ax.scatter(discount_series, value_series)
scatter_ax.set_xlabel("Discount %")
scatter_ax.set_ylabel("Order Value")
scatter_ax.set_title("Discount vs Order Value")
plt.show()

# ===============================
# LINEAR REGRESSION
# ===============================

# Simple least-squares fit of order value (Y) on discount percentage (X).
X = df["discount_percent"]
Y = df["order_value"]

fit = stats.linregress(X, Y)
slope = fit.slope
intercept = fit.intercept
r = fit.rvalue    # correlation coefficient of the fit
p = fit.pvalue    # p-value reported by linregress for the slope
se = fit.stderr   # standard error of the estimated slope

# Order value predicted by the fitted line at a 20% discount.
predicted_20 = slope * 20 + intercept

# ===============================
# PROBABILITY
# ===============================

# Empirical probabilities taken from the data set.
# The mean of a 0/1 column is the fraction of ones, i.e. P(returned).
p_returned = df["returned_binary"].mean()

# Joint probability P(prepaid AND not returned): matching rows / all rows.
is_prepaid = df["payment_type"] == "Prepaid"
is_not_returned = df["returned_binary"] == 0
p_prepaid_not_returned = int((is_prepaid & is_not_returned).sum()) / len(df)

# ===============================
# BINOMIAL DISTRIBUTION
# ===============================

# Probability of exactly 3 returns among 8 orders when each order is returned
# independently with probability p_returned.
binomial_prob = stats.binom.pmf(3, n=8, p=p_returned)

# ===============================
# POISSON DISTRIBUTION
# ===============================

# Probability of exactly 6 events for a Poisson variable with mean 4.
poisson_mean_4 = stats.poisson(mu=4)
poisson_prob = poisson_mean_4.pmf(6)

# ===============================
# NORMAL DISTRIBUTION
# ===============================

# P(X > 7) for a normal variable with the delivery-time mean and standard
# deviation. The survival function sf() is 1 - cdf() computed directly, which
# avoids the rounding loss of subtracting a value close to 1 from 1.
prob_gt_7 = stats.norm.sf(7, loc=mean_dd, scale=std_dd)

# ===============================
# SAMPLING
# ===============================

# Simple random sample: 100 orders drawn without replacement from all rows.
srs = df.sample(100)

# Stratified sample: 25 orders drawn without replacement from each warehouse.
# GroupBy.sample replaces groupby(...).apply(lambda g: g.sample(25)); apply
# operating on the grouping column is deprecated since pandas 2.2, and without
# the grouping column the result would lose "warehouse". GroupBy.sample keeps
# every original column and the original row index.
stratified = df.groupby("warehouse").sample(n=25)

# ===============================
# CENTRAL LIMIT THEOREM
# ===============================

# Draw 1000 random samples of 50 orders each and record every sample's mean
# order value; the histogram shows how those sample means are distributed.
sample_means = []
for _draw in range(1000):
    one_sample = df["order_value"].sample(50)
    sample_means.append(one_sample.mean())

clt_ax = plt.gca()
clt_ax.hist(sample_means, bins=30)
clt_ax.set_title("Central Limit Theorem Demonstration")
plt.show()

# ===============================
# STANDARD ERROR
# ===============================

# Standard error of the mean: the delivery-time standard deviation divided by
# the square root of the number of rows in the data set.
row_count = df.shape[0]
standard_error = std_dd / np.sqrt(row_count)
