# Probability Concepts in Real-World Applications
This notebook explores probability concepts applied to large datasets.

## 1. Set Theory: Feature Selection
Using set operations to analyze overlapping features in datasets.

In [None]:

import pandas as pd

# Read the Titanic passenger table straight from the datasciencedojo/datasets
# GitHub repository; the URL is named so the data source is easy to spot.
TITANIC_CSV_URL = "https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv"
df = pd.read_csv(TITANIC_CSV_URL)

# Define two feature sets (set literals: duplicates collapse automatically and
# overlap can be computed with set algebra).
features_A = {"Pclass", "Sex", "Age", "Fare"}
features_B = {"Sex", "Age", "Embarked", "Fare"}

# Set operations
features_union = features_A | features_B    # features appearing in either set
features_common = features_A & features_B   # features appearing in both sets

# A set of strings iterates in an order that can change between kernel
# restarts (string hashing is randomized per process), so sort before printing
# to keep the cell output stable across runs.
print("Union:", sorted(features_union))
print("Intersection:", sorted(features_common))


## 2. Conditional Probability: Spam Detection
Using a Naïve Bayes classifier, which applies Bayes' theorem with a feature-independence assumption, to estimate the probability that an SMS message is spam.

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn import datasets

# Load spam dataset (SMS spam collection): tab-separated, no header row,
# so the two column names are supplied explicitly.
SMS_SPAM_URL = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms-spam-collection.tsv"
df = pd.read_csv(SMS_SPAM_URL, sep='\t', names=['label', 'message'])

# Convert labels to binary (ham -> 0, spam -> 1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})
# .map() silently turns any label outside the dict into NaN; fail here with a
# clear message instead of deep inside model.fit().
assert df['label'].notna().all(), "unexpected label value in SMS dataset"
y = df['label']

# Train/Test Split on the raw text, BEFORE vectorizing, so the held-out
# messages cannot influence the vocabulary learned below.
messages_train, messages_test, y_train, y_test = train_test_split(
    df['message'], y, test_size=0.2, random_state=42
)

# Vectorize text data: learn the vocabulary from the training messages only,
# then reuse that same vocabulary to encode the test messages.
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(messages_train)
X_test = vectorizer.transform(messages_test)

# Train Naïve Bayes Model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict probabilities for the first 10 test messages.
# Each row is [P(ham), P(spam)], following the order of model.classes_ ([0, 1]).
probs = model.predict_proba(X_test[:10])
print("Predicted spam probabilities:\n", probs)


## 3. Probability Mass Function & CDF: Customer Purchases
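Analyzing the probability distribution of purchases per customer with an empirical probability mass function (PMF) and cumulative distribution function (CDF).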

In [None]:

import numpy as np
import matplotlib.pyplot as plt

# Simulated dataset: Number of purchases per customer.
# A seeded Generator makes the sample (and therefore both plots) reproducible
# under Restart Kernel -> Run All.
PURCHASE_SEED = 42
purchase_rng = np.random.default_rng(PURCHASE_SEED)
purchases = purchase_rng.poisson(3, 1000)  # Poisson with mean 3, 1000 customers

# Compute PMF: relative frequency of each distinct observed purchase count
values, counts = np.unique(purchases, return_counts=True)
pmf = counts / counts.sum()

# Compute CDF: running total of the PMF (np.unique returns values sorted)
cdf = np.cumsum(pmf)

# Plot PMF & CDF side by side on explicit axes
fig, (ax_pmf, ax_cdf) = plt.subplots(1, 2, figsize=(12, 5))

ax_pmf.bar(values, pmf, alpha=0.6, color='b', label="PMF")
ax_pmf.set(xlabel="Purchases", ylabel="Probability", title="PMF")
ax_pmf.legend()  # the label= above is only shown if a legend is drawn

ax_cdf.plot(values, cdf, marker="o", linestyle="--", color="r", label="CDF")
ax_cdf.set(xlabel="Purchases", ylabel="Cumulative Probability", title="CDF")
ax_cdf.legend()

plt.show()


## 4. Expected Value: Stock Price Forecasting
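Using the expected value, $E[X] = \sum_i p_i x_i$, to estimate the average change in a stock's price from a set of possible changes and their probabilities.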

In [None]:

# Possible stock price changes, paired position-by-position with the
# probability of each change occurring.
price_changes = [-5, 10, -2, 8, 12, -7, 3]
probabilities = [0.1, 0.2, 0.15, 0.25, 0.1, 0.1, 0.1]

# Expected value: weight every change by its probability, then add them up.
weighted_changes = [
    prob * change for prob, change in zip(probabilities, price_changes)
]
expected_change = sum(weighted_changes)
print("Expected Stock Price Change:", expected_change)


## 5. Variance & Standard Deviation: Risk Assessment
Measuring financial risk using variance.

In [None]:

# Simulated stock returns: 1000 draws from a normal distribution with
# mean 0.05 and standard deviation 0.2. The Generator is seeded so the
# printed figures are reproducible under Restart Kernel -> Run All.
RETURNS_SEED = 42
returns_rng = np.random.default_rng(RETURNS_SEED)
returns = returns_rng.normal(0.05, 0.2, 1000)

# `returns` is a sample, so use the sample estimators (ddof=1, i.e. divide by
# n - 1); NumPy's default ddof=0 is the population formula.
variance = np.var(returns, ddof=1)
std_dev = np.std(returns, ddof=1)
print(f"Variance: {variance:.4f}, Standard Deviation: {std_dev:.4f}")


## 6. Bayesian Inference: Disease Diagnosis
Using Bayes' Theorem to estimate disease probability.

In [None]:

# Inputs to Bayes' theorem
P_disease = 0.01                    # prior: P(Disease)
P_positive_given_disease = 0.95     # true positive rate: P(Positive | Disease)
P_positive_given_no_disease = 0.05  # false positive rate: P(Positive | No Disease)

# Split P(Positive) into its two sources (law of total probability):
# positives from people with the disease and positives from people without it.
true_positive_mass = P_positive_given_disease * P_disease
false_positive_mass = P_positive_given_no_disease * (1 - P_disease)
P_positive = true_positive_mass + false_positive_mass

# Posterior: the share of all positive tests that come from diseased people.
P_disease_given_positive = true_positive_mass / P_positive

print(f"Probability of having disease given a positive test: {P_disease_given_positive:.4f}")
