In [1]:
# Answer 3

class DiscreteRandomVariable:
    def __init__(self, values, probabilities):
        """
        Initializes a discrete random variable.

        :param values: A list of possible values the random variable can take.
        :param probabilities: A list of probabilities corresponding to each value.
                              The probabilities must sum to 1.
        """
        if len(values) != len(probabilities):
            raise ValueError("The number of values must match the number of probabilities.")
        if not (0 <= min(probabilities) <= 1):
            raise ValueError("Probabilities must be between 0 and 1.")
        if abs(sum(probabilities) - 1) > 1e-6:
            raise ValueError("Probabilities must sum to 1.")

        self.values = values
        self.probabilities = probabilities

    def expected_value(self):
        """
        Calculates the expected value (mean) of the random variable.

        :return: The expected value.
        """
        return sum(v * p for v, p in zip(self.values, self.probabilities))

    def variance(self):
        """
        Calculates the variance of the random variable.

        :return: The variance.
        """
        mean = self.expected_value()
        return sum(p * (v - mean) ** 2 for v, p in zip(self.values, self.probabilities))


# Example usage:
values = [1, 2, 3, 4, 5, 6]
probabilities = [1/6] * 6  # Uniform distribution for a fair six-sided die
die = DiscreteRandomVariable(values, probabilities)

print(f"Expected Value: {die.expected_value()}")
print(f"Variance: {die.variance()}")


Expected Value: 3.5
Variance: 2.9166666666666665


In [2]:
# Answer 4

import random
import numpy as np

def simulate_die_rolls(num_rolls):
    """
    Simulates rolling a fair six-sided die multiple times.

    :param num_rolls: The number of times to roll the die.
    :return: The mean (expected value) and variance of the outcomes.
    """
    # Simulate rolling the die num_rolls times
    rolls = [random.randint(1, 6) for _ in range(num_rolls)]

    # Calculate the mean (expected value) of the rolls
    mean = np.mean(rolls)

    # Calculate the variance of the rolls
    variance = np.var(rolls)

    return mean, variance

# Example usage: Simulate rolling the die 10,000 times
num_rolls = 10000
mean, variance = simulate_die_rolls(num_rolls)

print


<function print>

In [4]:
# Answer 5

import numpy as np
from scipy.stats import binom, poisson

def generate_samples(distribution, params, size=1000):
    """
    Generates random samples from a given probability distribution and calculates their mean and variance.

    :param distribution: The name of the distribution ('binomial' or 'poisson').
    :param params: Parameters for the distribution.
                   - For 'binomial': (n, p) where n is the number of trials and p is the probability of success.
                   - For 'poisson': (mu,) where mu is the expected number of events (lambda).
    :param size: The number of samples to generate (default is 1000).
    :return: A tuple containing the generated samples, mean, and variance of the samples.
    """
    if distribution == 'binomial':
        n, p = params
        samples = binom.rvs(n, p, size=size)
    elif distribution == 'poisson':
        mu = params[0]
        samples = poisson.rvs(mu, size=size)
    else:
        raise ValueError("Unsupported distribution. Choose 'binomial' or 'poisson'.")

    mean = np.mean(samples)
    variance = np.var(samples)
    return samples, mean, variance

# Example usage: Generate samples from a binomial distribution
samples, mean, variance = generate_samples('binomial', (10, 0.5), size=1000)
print(f"Binomial Distribution - Mean: {mean}, Variance: {variance}")

# Example usage: Generate samples from a Poisson distribution
#sample


Binomial Distribution - Mean: 5.056, Variance: 2.5728639999999996


In [5]:
# Answer 6

import numpy as np
from scipy.stats import norm

def generate_normal_samples(mean=0, std=1, size=1000):
    """
    Generates random samples from a Gaussian (normal) distribution and computes the mean, variance, and standard deviation.

    :param mean: The mean (expected value) of the normal distribution.
    :param std: The standard deviation of the normal distribution.
    :param size: The number of samples to generate (default is 1000).
    :return: A tuple containing the samples, calculated mean, variance, and standard deviation.
    """
    # Generate samples from the normal distribution
    samples = norm.rvs(loc=mean, scale=std, size=size)
    
    # Calculate mean, variance, and standard deviation of the samples
    sample_mean = np.mean(samples)
    sample_variance = np.var(samples)
    sample_std = np.std(samples)
    
    return samples, sample_mean, sample_variance, sample_std

# Example usage: Generate samples from a normal distribution
samples, sample_mean, sample_variance, sample_std = generate_normal_samples(mean=0, std=1, size=1000)

print(f"Normal Distribution - Mean: {sample_mean}")
print(f"Normal Distribution - Variance: {sample_variance}")
print(f"Normal Distribution - Standard Deviation: {sample_std}")


Normal Distribution - Mean: 0.012617270345898449
Normal Distribution - Variance: 1.0257600090483807
Normal Distribution - Standard Deviation: 1.0127981087306497


In [9]:
# Answer 8

from scipy.stats import norm
import numpy as np

def calculate_normal_pdf(x, mean=0, std=1):
    """
    Calculates the probability density function (PDF) of a normal distribution 
    at a given point x.

    :param x: The point at which to evaluate the PDF.
    :param mean: The mean of the normal distribution (default is 0).
    :param std: The standard deviation of the normal distribution (default is 1).
    :return: The value of the PDF at point x.
    """
    # Calculate the PDF of the normal distribution at x
    pdf_value = norm.pdf(x, loc=mean, scale=std)
    return pdf_value

# Example usage
x = 0  # Point at which to calculate the PDF
mean = 0  # Mean of the distribution
std = 1  # Standard deviation of the distribution

pdf_value = calculate_normal_pdf(x, mean, std)
print(f"PDF value at x = {x} for N({mean}, {std}^2): {pdf_value}")


PDF value at x = 0 for N(0, 1^2): 0.3989422804014327


In [10]:
# Answer 9

from scipy.stats import expon

def calculate_exponential_cdf(x, rate):
    """
    Calculates the cumulative distribution function (CDF) of an exponential distribution at a given point x.

    :param x: The point at which to evaluate the CDF.
    :param rate: The rate parameter (lambda) of the exponential distribution (lambda > 0).
    :return: The value of the CDF at point x.
    """
    # Calculate the scale parameter from the rate (scale = 1/rate)
    scale = 1 / rate
    
    # Calculate the CDF of the exponential distribution at x
    cdf_value = expon.cdf(x, scale=scale)
    return cdf_value

# Example usage
x = 2  # Point at which to calculate the CDF
rate = 0.5  # Rate parameter (lambda) of the distribution

cdf_value = calculate_exponential_cdf(x, rate)
print(f"CDF value at x = {x} for an exponential distribution with rate = {rate}: {cdf_value}")


CDF value at x = 2 for an exponential distribution with rate = 0.5: 0.6321205588285577


In [11]:
# Answer 10

from scipy.stats import poisson

def calculate_poisson_pmf(k, mu):
    """
    Calculates the probability mass function (PMF) of a Poisson distribution at a given value k.

    :param k: The number of occurrences (an integer) at which to evaluate the PMF.
    :param mu: The average rate (lambda) of occurrences (must be non-negative).
    :return: The value of the PMF at k.
    """
    # Calculate the PMF of the Poisson distribution at k
    pmf_value = poisson.pmf(k, mu)
    return pmf_value

# Example usage
k = 3  # Number of occurrences
mu = 2.5  # Average rate (lambda) of the Poisson distribution

pmf_value = calculate_poisson_pmf(k, mu)
print(f"PMF value at k = {k} for a Poisson distribution with mu = {mu}: {pmf_value}")


PMF value at k = 3 for a Poisson distribution with mu = 2.5: 0.21376301724973648


In [12]:
# Answer 11

import numpy as np
from statsmodels.stats.proportion import proportions_ztest

# 50 purchases out of 1000 visitors for the old layout
old_layout = np.array([1] * 50 + [0] * 950)

# 70 purchases out of 1000 visitors for the new layout
new_layout = np.array([1] * 70 + [0] * 930)

# Count the number of successes (purchases) and total observations for each group
successes = [np.sum(old_layout), np.sum(new_layout)]
observations = [len(old_layout), len(new_layout)]

# Perform the two-proportion Z-test
stat, p_value = proportions_ztest(successes, observations, alternative='larger')

# Print the results
print(f"Z-statistic: {stat:.4f}")
print(f"P-value: {p_value:.4f}")

# Interpret the result
alpha = 0.05
if p_value < alpha:
    print("Reject the null hypothesis: The new layout has a significantly higher conversion rate.")
else:
    print("Fail to reject the null hypothesis: No significant difference in conversion rates.")


Z-statistic: -1.8831
P-value: 0.9702
Fail to reject the null hypothesis: No significant difference in conversion rates.


In [13]:
# Answer 12

import numpy as np
from scipy.stats import norm

# Sample data
before_program = np.array([75, 80, 85, 70, 90, 78, 92, 88, 82, 87])
after_program = np.array([80, 85, 90, 80, 92, 80, 95, 90, 85, 88])

# Calculate the differences
differences = after_program - before_program

# Mean and standard deviation of the differences
mean_diff = np.mean(differences)
std_diff = np.std(differences, ddof=1)  # sample standard deviation

# Sample size
n = len(differences)

# Calculate the z-score
z_score = mean_diff / (std_diff / np.sqrt(n))

# Determine the p-value (one-tailed test)
p_value = 1 - norm.cdf(z_score)

mean_diff, std_diff, z_score, p_value


(3.8, 2.616188916046478, 4.593190894944668, 2.182597052646784e-06)

In [14]:
# Answer 13

import numpy as np
from scipy import stats

# Blood pressure measurements before and after administering the drug
before_drug = np.array([145, 150, 140, 135, 155, 160, 152, 148, 130, 138])
after_drug = np.array([130, 140, 132, 128, 145, 148, 138, 136, 125, 130])

# Perform a paired t-test
t_statistic, p_value = stats.ttest_rel(before_drug, after_drug)

# Output results
print("T-statistic:", t_statistic)
print("P-value:", p_value)

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print("The drug is effective in reducing blood pressure (reject H0).")
else:
    print("The drug is not effective in reducing blood pressure (fail to reject H0).")


T-statistic: 10.04987562112089
P-value: 3.433031432681152e-06
The drug is effective in reducing blood pressure (reject H0).


In [15]:
# Answer 14

import numpy as np
from scipy import stats

# Response times recorded from the sample
response_times = np.array([4.3, 3.8, 5.1, 4.9, 4.7, 4.2, 5.2, 4.5, 4.6, 4.4])

# Parameters for the z-test
population_mean = 5  # Claim made by customer service
sample_mean = np.mean(response_times)
sample_std = np.std(response_times, ddof=1)  # Sample standard deviation
sample_size = len(response_times)

# Perform a z-test
z_statistic = (sample_mean - population_mean) / (sample_std / np.sqrt(sample_size))
p_value = stats.norm.cdf(z_statistic)  # One-tailed test for less than

# Output results
print("Z-statistic:", z_statistic)
print("P-value:", p_value)

# Interpret the results
alpha = 0.05  # Significance level
if p_value < alpha:
    print("The claim that the average response time is less than 5 minutes is supported (reject H0).")
else:
    print("The claim that the average response time is less than 5 minutes is not supported (fail to reject H0).")


Z-statistic: -3.184457226042963
P-value: 0.0007251287113068958
The claim that the average response time is less than 5 minutes is supported (reject H0).


In [16]:
# Answer 15

import numpy as np
from scipy import stats

def ab_test_analysis(layout_a_clicks, layout_b_clicks):
    # Convert lists to numpy arrays
    layout_a_clicks = np.array(layout_a_clicks)
    layout_b_clicks = np.array(layout_b_clicks)
    
    # Calculate sample means
    mean_a = np.mean(layout_a_clicks)
    mean_b = np.mean(layout_b_clicks)
    
    # Calculate sample standard deviations
    std_a = np.std(layout_a_clicks, ddof=1)
    std_b = np.std(layout_b_clicks, ddof=1)
    
    # Calculate sample sizes
    n_a = len(layout_a_clicks)
    n_b = len(layout_b_clicks)
    
    # Calculate the t-statistic
    pooled_std = np.sqrt(((std_a**2 / n_a) + (std_b**2 / n_b)))
    t_statistic = (mean_a - mean_b) / pooled_std
    
    # Calculate degrees of freedom
    df = min(n_a - 1, n_b - 1)
    
    # Calculate the p-value
    p_value = 2 * stats.t.cdf(-abs(t_statistic), df)  # Two-tailed test
    
    return t_statistic, df, p_value

# Data
layout_a_clicks = [28, 32, 33, 29, 31, 34, 30, 35, 36, 37]
layout_b_clicks = [40, 41, 38, 42, 39, 44, 43, 41, 45, 47]

# Perform A/B test analysis
t_statistic, df, p_value = ab_test_analysis(layout_a_clicks, layout_b_clicks)

# Output results
print("T-statistic:", t_statistic)
print("Degrees of Freedom:", df)
print("P-value:", p_value)

# Interpretation
alpha = 0.05  # Significance level
if p_value < alpha:
    print("There is a significant difference between the click-through rates of the two layouts (reject H0).")
else:
    print("There is no significant difference between the click-through rates of the two layouts (fail to reject H0).")


T-statistic: -7.298102156175071
Degrees of Freedom: 9
P-value: 4.5739084843915195e-05
There is a significant difference between the click-through rates of the two layouts (reject H0).


In [17]:
# Answer 16

import numpy as np
from scipy import stats

def analyze_drug_effect(existing_drug_levels, new_drug_levels):
    # Convert lists to numpy arrays
    existing_drug_levels = np.array(existing_drug_levels)
    new_drug_levels = np.array(new_drug_levels)
    
    # Calculate sample means
    mean_existing = np.mean(existing_drug_levels)
    mean_new = np.mean(new_drug_levels)
    
    # Calculate sample standard deviations
    std_existing = np.std(existing_drug_levels, ddof=1)
    std_new = np.std(new_drug_levels, ddof=1)
    
    # Calculate sample sizes
    n_existing = len(existing_drug_levels)
    n_new = len(new_drug_levels)
    
    # Calculate the t-statistic
    pooled_std = np.sqrt(((std_existing**2 / n_existing) + (std_new**2 / n_new)))
    t_statistic = (mean_existing - mean_new) / pooled_std
    
    # Calculate degrees of freedom
    df = min(n_existing - 1, n_new - 1)
    
    # Calculate the p-value
    p_value = 2 * stats.t.cdf(-abs(t_statistic), df)  # Two-tailed test
    
    return t_statistic, df, p_value

# Data
existing_drug_levels = [180, 182, 175, 185, 178, 176, 172, 184, 179, 183]
new_drug_levels = [170, 172, 165, 168, 175, 173, 170, 178, 172, 176]

# Perform analysis
t_statistic, df, p_value = analyze_drug_effect(existing_drug_levels, new_drug_levels)

# Output results
print("T-statistic:", t_statistic)
print("Degrees of Freedom:", df)
print("P-value:", p_value)

# Interpretation
alpha = 0.05  # Significance level
if p_value < alpha:
    print("The new drug is significantly more effective than the existing drug in reducing cholesterol levels (reject H0).")
else:
    print("There is no significant difference in effectiveness between the new drug and the existing drug (fail to reject H0).")


T-statistic: 4.140480986208661
Degrees of Freedom: 9
P-value: 0.002519832704729902
The new drug is significantly more effective than the existing drug in reducing cholesterol levels (reject H0).


In [18]:
# Answer 17

import numpy as np
from scipy import stats

def analyze_intervention(pre_scores, post_scores):
    # Ensure the inputs are numpy arrays for easier manipulation
    pre_scores = np.array(pre_scores)
    post_scores = np.array(post_scores)
    
    # Check if the two arrays have the same length
    if len(pre_scores) != len(post_scores):
        raise ValueError("The length of pre-intervention and post-intervention scores must be the same.")
    
    # Calculate the paired t-test
    t_statistic, p_value = stats.ttest_rel(post_scores, pre_scores)
    
    return t_statistic, p_value

# Test data
pre_intervention_scores = [80, 85, 90, 75, 88, 82, 92, 78, 85, 87]
post_intervention_scores = [90, 92, 88, 92, 95, 91, 96, 93, 89, 93]

# Perform the analysis
t_stat, p_val = analyze_intervention(pre_intervention_scores, post_intervention_scores)

print(f"T-statistic: {t_stat:.4f}")
print(f"P-value: {p_val:.4f}")

# Interpretation
alpha = 0.05
if p_val < alpha:
    print("The intervention had a significant impact on the test scores.")
else:
    print("The intervention did not have a significant impact on the test scores.")


T-statistic: 4.4284
P-value: 0.0017
The intervention had a significant impact on the test scores.


In [19]:
# Answer 18

import numpy as np
from scipy import stats

# Generate synthetic salary data for male and female employees
np.random.seed(0)  # For reproducibility
male_salaries = np.random.normal(loc=50000, scale=10000, size=20)
female_salaries = np.random.normal(loc=55000, scale=9000, size=20)

# Calculate means and standard deviations
mean_male = np.mean(male_salaries)
mean_female = np.mean(female_salaries)
std_male = np.std(male_salaries, ddof=1)
std_female = np.std(female_salaries, ddof=1)

# Perform an independent two-sample t-test
t_statistic, p_value = stats.ttest_ind(male_salaries, female_salaries)

# Print results
print(f"Mean salary for males: ${mean_male:.2f}")
print(f"Mean salary for females: ${mean_female:.2f}")
print(f"Standard deviation for males: ${std_male:.2f}")
print(f"Standard deviation for females: ${std_female:.2f}")
print(f"T-statistic: {t_statistic:.2f}")
print(f"P-value: {p_value:.4f}")

# Determine if the difference is statistically significant
alpha = 0.05
if p_value < alpha:
    print("There is a statistically significant difference in average salaries.")
else:
    print("There is no statistically significant difference in average salaries.")


Mean salary for males: $55693.35
Mean salary for females: $55501.75
Standard deviation for males: $8722.69
Standard deviation for females: $10968.10
T-statistic: 0.06
P-value: 0.9516
There is no statistically significant difference in average salaries.


In [20]:
# Answer 19

import numpy as np
from scipy import stats

def analyze_quality(version1_scores, version2_scores, alpha=0.05):
    # Convert lists to numpy arrays for convenience
    version1_scores = np.array(version1_scores)
    version2_scores = np.array(version2_scores)
    
    # Calculate the means and standard deviations
    mean1 = np.mean(version1_scores)
    mean2 = np.mean(version2_scores)
    std1 = np.std(version1_scores, ddof=1)
    std2 = np.std(version2_scores, ddof=1)
    
    # Calculate the t-statistic and p-value
    t_statistic, p_value = stats.ttest_ind(version1_scores, version2_scores, equal_var=False)
    
    # Determine if the result is statistically significant
    significant = p_value < alpha
    
    # Print results
    print(f"Version 1: Mean = {mean1:.2f}, Std Dev = {std1:.2f}")
    print(f"Version 2: Mean = {mean2:.2f}, Std Dev = {std2:.2f}")
    print(f"T-Statistic = {t_statistic:.2f}")
    print(f"P-Value = {p_value:.4f}")
    print(f"Significant Difference: {'Yes' if significant else 'No'}")
    
    return t_statistic, p_value, significant

# Example data
version1_scores = [85, 88, 82, 89, 87, 84, 90, 88, 85, 86, 91, 83, 87, 84, 89, 86, 84, 88, 85, 86, 89, 90, 87, 88, 85]
version2_scores = [80, 78, 83, 81, 79, 82, 76, 80, 78, 81, 77, 82, 80, 79, 82, 79, 80, 81, 79, 82, 79, 78, 80, 81, 82]

# Analyze quality scores
analyze_quality(version1_scores, version2_scores)


Version 1: Mean = 86.64, Std Dev = 2.36
Version 2: Mean = 79.96, Std Dev = 1.77
T-Statistic = 11.33
P-Value = 0.0000
Significant Difference: Yes


(11.325830417646698, 1.078754084378157e-14, True)

In [21]:
# Answer 20

import numpy as np
from scipy import stats

# Data
branch_a_scores = [4, 5, 3, 4, 5, 4, 5, 3, 4, 4, 5, 4, 4, 3, 4, 5, 5, 4, 3, 4, 5, 4, 3, 5, 4, 4, 5, 3, 4, 5, 4]
branch_b_scores = [3, 4, 2, 3, 4, 3, 4, 2, 3, 3, 4, 3, 3, 2, 3, 4, 4, 3, 2, 3, 4, 3, 2, 4, 3, 3, 4, 2, 3, 4, 3]

# Convert lists to numpy arrays
a = np.array(branch_a_scores)
b = np.array(branch_b_scores)

# Calculate means and standard deviations
mean_a = np.mean(a)
mean_b = np.mean(b)
std_a = np.std(a, ddof=1)
std_b = np.std(b, ddof=1)

# Perform the t-test
t_stat, p_value = stats.ttest_ind(a, b)

# Output results
print(f"Branch A - Mean: {mean_a}, Std Dev: {std_a}")
print(f"Branch B - Mean: {mean_b}, Std Dev: {std_b}")
print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")

# Determine if the result is statistically significant
alpha = 0.05
if p_value < alpha:
    print("There is a statistically significant difference between the two branches.")
else:
    print("There is no statistically significant difference between the two branches.")


Branch A - Mean: 4.129032258064516, Std Dev: 0.7184212081070998
Branch B - Mean: 3.129032258064516, Std Dev: 0.7184212081070998
T-statistic: 5.480077554195743
P-value: 8.895290509945655e-07
There is a statistically significant difference between the two branches.


In [22]:
# Answer 21

import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency

# Set seed for reproducibility
np.random.seed(0)

# Generate data
sample_size = 500
age_groups = np.random.choice(['18-30', '31-50', '51+'], size=sample_size)
voter_preferences = np.random.choice(['Candidate A', 'Candidate B'], size=sample_size)

# Create a DataFrame
data = pd.DataFrame({'Age Group': age_groups, 'Voter Preference': voter_preferences})

# Create a contingency table
contingency_table = pd.crosstab(data['Age Group'], data['Voter Preference'])

# Perform the Chi-Square test
chi2, p, dof, ex = chi2_contingency(contingency_table)

# Display results
print("Contingency Table:")
print(contingency_table)
print("\nChi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2}")
print(f"P-value: {p}")
print(f"Degrees of Freedom: {dof}")
print(f"Expected Frequencies Table:\n{ex}")

# Interpret the result
alpha = 0.05
if p < alpha:
    print("\nThere is a significant association between age groups and voter preferences.")
else:
    print("\nThere is no significant association between age groups and voter preferences.")


Contingency Table:
Voter Preference  Candidate A  Candidate B
Age Group                                 
18-30                      95           87
31-50                      87           82
51+                        84           65

Chi-Square Test Results:
Chi-Square Statistic: 0.8779923945254768
P-value: 0.6446832311860852
Degrees of Freedom: 2
Expected Frequencies Table:
[[96.824 85.176]
 [89.908 79.092]
 [79.268 69.732]]

There is no significant association between age groups and voter preferences.


In [23]:
# Answer 22

import numpy as np
from scipy.stats import chi2_contingency

# Sample data: Product satisfaction levels (rows) vs. Customer regions (columns)
data = np.array([[50, 30, 40, 20], [30, 40, 30, 50], [20, 30, 40, 30]])

# Perform Chi-Square Test
chi2_stat, p_value, dof, expected = chi2_contingency(data)

print("Chi-Square Statistic:", chi2_stat)
print("Degrees of Freedom:", dof)
print("P-Value:", p_value)
print("Expected Frequencies:\n", expected)


Chi-Square Statistic: 27.777056277056275
Degrees of Freedom: 6
P-Value: 0.00010349448486004387
Expected Frequencies:
 [[34.14634146 34.14634146 37.56097561 34.14634146]
 [36.58536585 36.58536585 40.24390244 36.58536585]
 [29.26829268 29.26829268 32.19512195 29.26829268]]


In [None]:
# Answer 23