In [18]:
# ===========================================================
# PROBABILITY AND STATISTICS WITH PYTHON (DETAILED TUTORIAL)
# ===========================================================
# This tutorial covers:
# 1. Dependent and Independent Events
# 2. Conditional Probability with a Contingency Table
# 3. Bayes’s Theorem
# 4. Random Variables & Continuous Distributions
# 5. Central Limit Theorem (CLT)
#
# We will use two libraries:
# numpy  → for numbers, random experiments, and math
# pandas → for tables (like Excel data handling)
# ===========================================================


# ===========================================================
# PART 1: EVENT DEPENDENCE AND INDEPENDENCE
# ===========================================================
import numpy as np   # "numpy" helps us with numbers and random generation

# 🎲 THEORY:
# When rolling a fair dice, all six outcomes {1,2,3,4,5,6} are equally likely.

# - Event A: "Dice A shows a number greater than 3" → {4,5,6}
#   Total possible outcomes = 6.
#   Favorable outcomes for A = 3 (numbers 4, 5, 6).
#   So theoretically, P(A) = 3/6 = 0.5
# - Event B: "Dice B shows an even number" → {2,4,6}
#   So theoretically, P(B) = 3/6 = 0.5
# - Event A ∩ B: "Dice A > 3 AND Dice B is even"
#   Since dice are independent, probability = 0.5 * 0.5 = 0.25

# # np.random.randint(1, 7, 1000) → generate 1000 random integers between 1 and 6
A = np.random.randint(1, 7, 1000)   # Dice A rolls
B = np.random.randint(1, 7, 1000)   # Dice B rolls

# Compute probabilities using proportions
P_A = np.mean(A > 3)                         # Should be close to 0.5
P_B = np.mean(B % 2 == 0)                    # Should be close to 0.5
P_A_and_B = np.mean((A > 3) & (B % 2 == 0))  # Should be close to 0.25
P_A_given_B = P_A_and_B / P_B                # Should be close to 0.5

# Print results
print("P(A):", P_A)                # Expected ≈ 0.50
print("P(B):", P_B)                # Expected ≈ 0.50
print("P(A ∩ B):", P_A_and_B)      # Expected ≈ 0.25
print("P(A | B):", P_A_given_B)    # Expected ≈ 0.50

# Check independence: P(A ∩ B) ?= P(A) * P(B)
independent = np.isclose(P_A_and_B, P_A * P_B)
print("Are A and B independent?", independent)

# WHY OUTPUT LOOKS LIKE THIS:
# Since dice rolls are random but fair:
#   - About half the numbers on A will be >3
#   - About half the numbers on B will be even
#   - About a quarter of the rolls will satisfy BOTH
# Small deviations occur because we used 1000 rolls (simulation).



P(A): 0.531
P(B): 0.502
P(A ∩ B): 0.261
P(A | B): 0.5199203187250996
Are A and B independent? False


In [1]:
# ===========================================================
# PART 2: CONDITIONAL PROBABILITY WITH A CONTINGENCY TABLE
# ===========================================================
import pandas as pd   # pandas is for structured data

# THEORY:
# Suppose we record results of students in Math & English.
# A contingency table organizes joint outcomes.
#
# Example table:
#                       Passed Math   Failed Math
# Passed English             30            20
# Failed English             10            40
#
# - Total students who passed English = 30+20 = 50
# - Among them, 30 passed Math.
# So, P(Passed Math | Passed English) = 30/50 = 0.6

data = {
    'Passed Math': [30, 10],
    'Failed Math': [20, 40]
}
table = pd.DataFrame(data, index=['Passed English', 'Failed English'])
print(table)

# Calculate conditional probability
P_PM_PE = table.loc['Passed English', 'Passed Math'] / table.loc['Passed English'].sum()
print("P(Passed Math | Passed English):", P_PM_PE)   # Expected = 0.6

# WHY OUTPUT LOOKS LIKE THIS:
# Because out of 50 students who passed English,
# 30 also passed Math → probability = 30/50 = 0.6



                Passed Math  Failed Math
Passed English           30           20
Failed English           10           40
P(Passed Math | Passed English): 0.6


In [20]:
# ===========================================================
# PART 3: BAYES’S THEOREM EXAMPLE
# ===========================================================
# # Bayes' theorem reverses conditional probabilities.
#
# Example: Spam detection test.
# - 1% of emails are spam (P(Spam) = 0.01)
# - Test detects spam correctly 99% of the time
# - Test gives false alarm 5% of the time for non-spam
#
# Question: If a test is POSITIVE, what is P(Spam | Positive)?


# P(Spam∣Positive) = P(Positive∣Spam)⋅P(Spam)  /  P(Positive∣Spam)⋅P(Spam)+P(Positive∣Not Spam)⋅P(Not Spam)


# what this formula says 


#  P(Spam∣Positive) ---> The probability that an email is spam given that the test flagged it positive 
#                     (This is what we want to know — the posterior probability.)


# P(Positive∣Spam)⋅P(Spam) ----->    The chance the test is positive when it really is spam,
#                                       multiplied by the chance of spam overall."
#                                     (This is the “true positive contribution.”)


# P(Positive∣Spam)⋅P(Spam)+P(Positive∣Not Spam)⋅P(Not Spam)---->  All the ways the test can be positive
#                                                            True positives (it really was spam), plus
#                                                           False positives (it wasn’t spam, but the test said positive)."
#                                                       (This is the total probability of a positive test.)


# Posterior = (True Positives) ÷ (All Positives).

# That’s all Bayes’ theorem is doing:
#         Focus only on cases where the test said “positive.”
#          Ask: out of those, how many were actually spam?

# Numerator = the group you care about (Spam ∧ Positive).
# Denominator = all cases with Positive.
# So it’s really just:   
#                      Desired Positives / All Positives



P_spam = 0.01    # 1% of emails are spam (the base rate or prior).
P_not_spam = 0.99   #  99% are not spam. 
P_positive_given_spam = 0.99    # If an email is spam, the test flags it positive 99% of the time (high sensitivity / true positive rate).
P_positive_given_not_spam = 0.05   # If an email is not spam, the test still flags positive 5% of the time (false positive rate).


# Total probability of positive
P_positive = (P_positive_given_spam * P_spam) + (P_positive_given_not_spam * P_not_spam)

# Bayes' theorem calculation
P_spam_given_positive = (P_positive_given_spam * P_spam) / P_positive
print("P(Spam | Positive):", P_spam_given_positive)   # Expected ≈ 0.167 (16.7%)









# # Even though test is 99% accurate,
# Spam is very rare (1% of all emails).
# So many positives come from false alarms.
# That’s why final probability is only ~16.7%.



P(Spam | Positive): 0.16666666666666669


In [21]:
# ===========================================================
# PART 4: RANDOM VARIABLES & CONTINUOUS DISTRIBUTIONS
# ===========================================================
# # A "random variable" is an outcome of chance.
# Continuous distributions describe probabilities for ranges.
# Example: Normal Distribution (bell-shaped).
#
# If mean = 50, std dev = 10:
# - About 68% of values lie within [40, 60].

#We are simulating values from a Normal Distribution (bell curve):

#Normal Distribution = A common probability distribution shaped like a bell.
# Examples in real life: height of people, exam marks, measurement errors.

#Parameters:

#loc=50 → This is the mean (average) of the distribution.
#→ Most values will cluster around 50.

#scale=10 → This is the standard deviation (spread).
#→ Most values will fall within ±10 of the mean.
#→ So most values will be between 40 and 60.

#size=1000 → This tells Python to generate 1000 random values.

#So, this line is like saying:
#“Give me 1000 random exam marks for students, where the average is 50 and the spread is 10.”



## Give me 1000 random exam marks for students, where the average is 50 and the spread is 10
data = np.random.normal(loc=50, scale=10, size=1000)


#Step 3 — Compute Statistics of the Sample Means

mean = np.mean(data)
std_dev = np.std(data)
prob = np.mean((data > 40) & (data < 60))
  
print("Mean:", mean)                        # Expected ≈ 50
print("Standard Deviation:", std_dev)       # Expected ≈ 10
print("P(40 < X < 60):", prob)              # Expected ≈ 0.68

# # Normal distribution has known rules:
# - 68% of values within 1 std dev
# - 95% within 2 std dev
# Simulation matches theory (with small random differences).



Mean: 49.32637259961216
Standard Deviation: 9.951482751914973
P(40 < X < 60): 0.686


In [22]:
# ===========================================================
# PART 5: CENTRAL LIMIT THEOREM (CLT) SIMULATION
# ===========================================================
# # Central Limit Theorem (CLT):
# If we repeatedly take samples and calculate their averages,
# the distribution of averages tends to become normal,
# even if the original population is skewed.

# 

#Step 1 — Generate a Skewed Population

population = np.random.exponential(scale=2, size=10000)  # Skewed population Creates 10,000 random values 
                                                 # from an Exponential distribution with parameter scale=2.
#  First 10 values of population: [0.321 1.587 0.092 2.301 1.126 0.458 0.305 6.872 3.212 0.784]   
# # Print only the first 10 values (to avoid flooding the screen)
# print("First 10 values of population:", population[:10])
# Print summary statistics
#print("Population Mean (expected ≈ 2):", np.mean(population))
# print("Population Standard Deviation (expected ≈ 2):", np.std(population))

# Step 2 — Draw Samples and Compute Sample Means

#np.random.choice(population, 30)
#   → Randomly picks 30 values (a sample) from the population of 10,000.
#np.mean(...)
#→ Calculates the average of those 30 values (the sample mean).
# [ ... for _ in range(1000)]
#     → Repeat this process 1000 times.

# So sample_means is a list of 1000 numbers, where each number is the average of a different sample of size 30.


sample_means = [np.mean(np.random.choice(population, 30)) for _ in range(1000)]
mean_sample_means = np.mean(sample_means)
std_sample_means = np.std(sample_means)


print("Mean of Sample Means:", mean_sample_means)              # Expected ≈ 2.0
# We start with a population of 10,000 numbers generated from an Exponential distribution with scale = 2.
# Why Should the Sample Means Center Around 2?
# Here’s the key idea from probability theory:
# If you draw a random sample from a population, the expected value of the sample mean is equal to the true population mean.
#Population distribution = Exponential with scale = 2.
# True mean = 2.
# So, on average, the sample mean should also = 2.
#  Even though individual samples might be a little above or below 2, when you average across 1000 sample means, 
# it settles very close to 2.
#Imagine population = [2, 2, 2, 2, 2] (mean = 2).
#If you pick any sample and take its mean, you always get 2.
#The mean of all sample means = 2.
#Now with a real exponential distribution:
#Some samples give means like 1.8, 2.3, 2.1, 1.9, etc.
#If you take many samples and average them, the highs and lows cancel out.
#Final result → Very close to the true mean (2).


print("Standard Deviation of Sample Means:", std_sample_means) # Smaller than population spread

# - The exponential population is skewed (not normal).
# - But the means of samples (size 30) form a bell-shaped curve.
# - This is the CLT in action!

Mean of Sample Means: 1.9995259998201336
Standard Deviation of Sample Means: 0.3642446677470784
