# The `Bayes Theorem` 

In [1]:
import pandas as pd
import numpy as np

The Bayes Theorem formula is the following:

$$ \mathbb{P}(A | Data) =  \mathbb{P}(A) \times \frac{\mathbb{P}(Data | A) }{\mathbb{P}(Data)}$$

This Notebook aims to put the Bayes Theorem into practice by answering the following hypothetical scenario: 'Should we play sport outside expecting some weather conditions ?'

Created a dummy dataset with `weather conditions` (Rain, Sunny, Overcast) and `play` (Yes, No) suggesting whether a sport was played based on the weather conditions.

In [2]:
weather_data_example = ['Sunny', 'Overcast', 'Rainy', 'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Sunny',
'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy']

play_data_example = ['No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No', 'No', 'Yes', 'Yes', 'No', 'Yes', 'Yes', 'No']

data = {'weather': weather_data_example, 'play': play_data_example}

df = pd.DataFrame(data = data)
df

Unnamed: 0,weather,play
0,Sunny,No
1,Overcast,Yes
2,Rainy,Yes
3,Sunny,Yes
4,Sunny,Yes
5,Overcast,Yes
6,Rainy,No
7,Rainy,No
8,Sunny,Yes
9,Rainy,Yes


Calculating the theoretical probability of a game being played:

In [None]:
def prior_probability(event_name: str, observations: list) -> float:
    return sum([element == event_name for element in observations])/len(observations)

In [55]:
prior_probability('Yes', play_data_example)

0.6428571428571429

In [56]:
prior_probability('No', play_data_example)

0.35714285714285715

Calculating the theoretical probability of the weather being Sunny, Overcast or Rainy knowing that a game was either played or played 

In [6]:
def likelihood(weather, played, weather_data, play_data):
    count_intersection = sum([ x == weather and y == played for x,y in zip(weather_data, play_data)])
    count_known_data = sum([y == played for y in play_data])
    return count_intersection / count_known_data


In [57]:
likelihood("Rainy", "No", weather_data_example, play_data_example)

0.6

Computing the posterior probability as: 

$$\large \text{posterior probability} = \text{prior probability} \times \text{likelihood} \times \beta $$ 

where $ \large \beta = \frac{1}{P(weather)} $ is a normalisation factor.
 



In [58]:
def posterior_probability(played, weather, weather_data, play_data):
    p_played = prior_probability(played, play_data)
    p_weather = prior_probability(weather, weather_data)
    p_likelihood = likelihood(weather, played, weather_data, play_data)
    return p_played * p_likelihood / p_weather

Testing the code

In [59]:
print(posterior_probability("Yes", "Sunny", weather_data_example, play_data_example))
print(posterior_probability("No", "Sunny", weather_data_example, play_data_example))
print(posterior_probability("Yes", "Overcast", weather_data_example, play_data_example))
print(posterior_probability("No", "Overcast", weather_data_example, play_data_example))
print(posterior_probability("Yes", "Rainy", weather_data_example, play_data_example))
print(posterior_probability("No", "Sunny", weather_data_example, play_data_example))

0.6
0.4
1.0
0.0
0.39999999999999997
0.4


Matches are more likely to be played than not if the weather is sunny

In [60]:
posterior_probability("Yes", "Sunny", weather_data_example, play_data_example)

0.6

Best guess (probability) that the game will be canceled if you know for sure that it will be raining during the next game

In [61]:
posterior_probability("No", "Rainy", weather_data_example, play_data_example)

0.6

In [62]:
def detailed_analysis(weather_data_example, play_data_example):
        """
        Provide a detailed probabilistic analysis of play decisions
        """
        print("Detailed Probabilistic Sports Weather Analysis\n")

        for weather in set(weather_data_example):
            p_play = posterior_probability('Yes', weather, weather_data_example, play_data_example)
            p_no_play = posterior_probability('No', weather, weather_data_example, play_data_example)

            print(f"Weather Condition: {weather}")
            print(f"Probability of Playing: {p_play:.2%}")
            print(f"Probability of Not Playing: {p_no_play:.2%}")
            print("Recommendation: " +
                  ("Consider Playing" if p_play > p_no_play else "Likely to Cancel") + "\n")


In [63]:
detailed_analysis(weather_data_example, play_data_example)

Detailed Probabilistic Sports Weather Analysis

Weather Condition: Rainy
Probability of Playing: 40.00%
Probability of Not Playing: 60.00%
Recommendation: Likely to Cancel

Weather Condition: Overcast
Probability of Playing: 100.00%
Probability of Not Playing: 0.00%
Recommendation: Consider Playing

Weather Condition: Sunny
Probability of Playing: 60.00%
Probability of Not Playing: 40.00%
Recommendation: Consider Playing



In [66]:
from typing import List, Dict, Tuple, Union
import scipy.stats as stats

def calculate_bayesian_probabilities(weather_data: List[str],
                                     play_data: List[str]) -> Dict[str, Dict[str, float]]:
    """
    Advanced Bayesian probability calculation with more nuanced statistical analysis

    Args:
        weather_data (List[str]): List of weather conditions
        play_data (List[str]): Corresponding play/no play decisions

    Returns:
        Dict of probabilities and statistical metrics
    """
    # Convert to pandas DataFrame for advanced analysis
    df = pd.DataFrame({'weather': weather_data, 'play': play_data})

    # Contingency table
    contingency = pd.crosstab(df['weather'], df['play'], normalize='index')

    # Chi-square test of independence
    chi2, p_value = stats.chi2_contingency(pd.crosstab(df['weather'], df['play']))[:2]

    # Detailed probability calculations
    probabilities = {}
    for weather in df['weather'].unique():
        weather_subset = df[df['weather'] == weather]

        # Calculate probabilities
        total_games = len(weather_subset)
        play_count = len(weather_subset[weather_subset['play'] == 'Yes'])

        probabilities[weather] = {
            'play_probability': play_count / total_games,
            'no_play_probability': 1 - (play_count / total_games),
            'sample_proportion': len(weather_subset) / len(df),
            'play_count': play_count,
            'total_games': total_games
        }

    # Statistical significance indicator
    probabilities['statistical_test'] = {
        'chi2_statistic': chi2,
        'p_value': p_value,
        'is_significant': p_value < 0.05  # Standard significance level
    }

    return probabilities


In [70]:
calculate_bayesian_probabilities(weather_data_example, play_data_example)

{'Sunny': {'play_probability': 0.6,
  'no_play_probability': 0.4,
  'sample_proportion': 0.35714285714285715,
  'play_count': 3,
  'total_games': 5},
 'Overcast': {'play_probability': 1.0,
  'no_play_probability': 0.0,
  'sample_proportion': 0.2857142857142857,
  'play_count': 4,
  'total_games': 4},
 'Rainy': {'play_probability': 0.4,
  'no_play_probability': 0.6,
  'sample_proportion': 0.35714285714285715,
  'play_count': 2,
  'total_games': 5},
 'statistical_test': {'chi2_statistic': 3.5466666666666664,
  'p_value': 0.16976615743981122,
  'is_significant': False}}

In [None]:
def bootstrap_confidence_interval(weather_data: List[str],
                                   play_data: List[str],
                                   n_bootstraps: int = 10000) -> Dict[str, Dict[str, float]]:
    """
    Bootstrap confidence interval estimation for play probabilities

    Args:
        weather_data (List[str]): List of weather conditions
        play_data (List[str]): Corresponding play/no play decisions
        n_bootstraps (int): Number of bootstrap resamples

    Returns:
        Dictionary of confidence intervals for each weather condition
    """
    df = pd.DataFrame({'weather': weather_data, 'play': play_data})

    bootstrap_results = {}

    for weather in df['weather'].unique():
        weather_subset = df[df['weather'] == weather]
        play_samples = (weather_subset['play'] == 'Yes').values

        # Bootstrap resampling
        bootstrap_play_probs = np.zeros(n_bootstraps)
        for i in range(n_bootstraps):
            # Sample with replacement
            bootstrap_sample = np.random.choice(play_samples, size=len(play_samples), replace=True)
            bootstrap_play_probs[i] = np.mean(bootstrap_sample)

        # Calculate confidence intervals
        bootstrap_results[weather] = {
            'mean_play_probability': np.mean(bootstrap_play_probs),
            'confidence_interval_95': (
                np.percentile(bootstrap_play_probs, 2.5),
                np.percentile(bootstrap_play_probs, 97.5)
            ),
            'standard_error': np.std(bootstrap_play_probs)
        }

    return bootstrap_results


In [67]:
bootstrap_confidence_interval(weather_data_example, play_data_example, n_bootstraps = 10000)

{'Sunny': {'mean_play_probability': 0.601,
  'confidence_interval_95': (0.2, 1.0),
  'standard_error': 0.22110404790505306},
 'Overcast': {'mean_play_probability': 1.0,
  'confidence_interval_95': (1.0, 1.0),
  'standard_error': 0.0},
 'Rainy': {'mean_play_probability': 0.4019400000000001,
  'confidence_interval_95': (0.0, 0.8),
  'standard_error': 0.22050903927050244}}

In [68]:
def bayesian_decision_analysis(weather_data: List[str],
                                play_data: List[str],
                                risk_tolerance: float = 0.5) -> Dict[str, Dict[str, Union[str, float]]]:
    """
    Advanced Bayesian decision analysis with risk assessment

    Args:
        weather_data (List[str]): List of weather conditions
        play_data (List[str]): Corresponding play/no play decisions
        risk_tolerance (float): Threshold for decision-making

    Returns:
        Detailed decision analysis for each weather condition
    """
    # Get Bayesian probabilities
    probabilities = calculate_bayesian_probabilities(weather_data, play_data)

    # Get bootstrap confidence intervals
    bootstrap_intervals = bootstrap_confidence_interval(weather_data, play_data)

    # Decision analysis
    decision_analysis = {}
    for weather in set(weather_data):
        prob_stats = probabilities[weather]
        bootstrap_stats = bootstrap_intervals[weather]

        # Advanced decision logic
        decision_score = (
            prob_stats['play_probability'] * 0.7 +
            (1 - abs(bootstrap_stats['confidence_interval_95'][0] -
                     bootstrap_stats['confidence_interval_95'][1])) * 0.3
        )

        # Decision recommendation
        recommendation = (
            "Highly Recommended" if decision_score > 0.75 else
            "Recommended" if decision_score > risk_tolerance else
            "Not Recommended"
        )

        decision_analysis[weather] = {
            **prob_stats,
            **bootstrap_stats,
            'decision_score': decision_score,
            'recommendation': recommendation,
            'confidence_interval': bootstrap_stats['confidence_interval_95']
        }

    # Include overall statistical test results
    decision_analysis['statistical_significance'] = probabilities['statistical_test']

    return decision_analysis


In [69]:
decision_analysis = bayesian_decision_analysis(weather_data_example, play_data_example)


In [53]:
decision_analysis

{'Rainy': {'play_probability': 0.4,
  'no_play_probability': 0.6,
  'sample_proportion': 0.35714285714285715,
  'play_count': 2,
  'total_games': 5,
  'mean_play_probability': 0.39846000000000004,
  'confidence_interval_95': (0.0, 0.8),
  'standard_error': 0.21781099237641796,
  'decision_score': 0.33999999999999997,
  'recommendation': 'Not Recommended',
  'confidence_interval': (0.0, 0.8)},
 'Overcast': {'play_probability': 1.0,
  'no_play_probability': 0.0,
  'sample_proportion': 0.2857142857142857,
  'play_count': 4,
  'total_games': 4,
  'mean_play_probability': 1.0,
  'confidence_interval_95': (1.0, 1.0),
  'standard_error': 0.0,
  'decision_score': 1.0,
  'recommendation': 'Highly Recommended',
  'confidence_interval': (1.0, 1.0)},
 'Sunny': {'play_probability': 0.6,
  'no_play_probability': 0.4,
  'sample_proportion': 0.35714285714285715,
  'play_count': 3,
  'total_games': 5,
  'mean_play_probability': 0.5995199999999999,
  'confidence_interval_95': (0.2, 1.0),
  'standard_err