# Plotting Histograms

In [None]:
# Import plotting modules
import matplotlib.pyplot as plt
import seaborn as sns

# Apply Seaborn's default styling to subsequent Matplotlib figures
sns.set()

# Histogram of the patient lengths (no bins argument, so Matplotlib's default is used)
plt.hist(patient_lengths)

# Render the figure
plt.show()

# Axis Labelling

In [None]:
# Histogram of the patient lengths, this time with both axes labelled
plt.hist(patient_lengths)
plt.ylabel('count')
plt.xlabel('patient length (cm)')

# Render the figure
plt.show()

# Adjusting the Bins

In [None]:
# Import numpy
import numpy as np

# Square-root rule: use sqrt(N) bins for N data points.
# int() truncates the square root to a whole number of bins.
n_data = len(patient_lengths)
n_bins = int(np.sqrt(n_data))

# Histogram with the computed number of bins, axes labelled
plt.hist(patient_lengths, bins=n_bins)
plt.xlabel('patient length (cm)')
plt.ylabel('count')

# Render the figure
plt.show()

# Bee Swarm Plot

In [None]:
# Bee swarm plot built from the 'diagnosis' and 'patient length (cm)'
# columns of df, using Seaborn's default settings
sns.swarmplot(data=df, x='diagnosis', y='patient length (cm)')

# Axis labels
plt.ylabel('patient length')
plt.xlabel('diagnosis')

# Render the figure
plt.show()

# Computing the ECDF

In [None]:
def ecdf(data):
    """Return the empirical CDF of a one-dimensional set of measurements.

    Returns a pair (x, y): x holds the measurements in ascending order and
    y holds the matching cumulative fractions 1/n, 2/n, ..., 1, where n is
    the number of measurements.
    """
    # Sorted measurements form the horizontal axis
    sorted_values = np.sort(data)

    # Evenly spaced steps from 1/n up to 1 form the vertical axis
    n_points = len(data)
    cumulative_fraction = np.arange(1, n_points + 1) / n_points

    return sorted_values, cumulative_fraction

# Plotting the ECDF

In [None]:
# ECDF of the patient lengths: sorted values in x_vers, cumulative fractions in y_vers
x_vers, y_vers = ecdf(patient_lengths)

# Draw the ECDF as unconnected points
plt.plot(x_vers, y_vers, linestyle='none', marker='.')

# Axis labels
plt.xlabel('Patient Length')
plt.ylabel('ECDF')

# Add a small margin around the data
plt.margins(0.02)

# Render the figure
plt.show()

# Plotting for Multiple Variables

In [None]:
# Compute one ECDF per diagnosis group
x_pneum, y_pneum = ecdf(pneumonia_patient_length)
x_MI, y_MI = ecdf(MI_patient_length)
x_GIB, y_GIB = ecdf(GIBleed_patient_length)

# Plot all three ECDFs on the same axes as unconnected points.
# The plotting order must match the order of the legend labels below.
_ = plt.plot(x_pneum, y_pneum, marker='.', linestyle='none')
_ = plt.plot(x_MI, y_MI, marker='.', linestyle='none')
_ = plt.plot(x_GIB, y_GIB, marker='.', linestyle='none')

# Make nice margins
plt.margins(0.02)

# Annotate the plot
plt.legend(('pneumonia', 'MI', 'GIBleed'), loc='lower right')
_ = plt.xlabel('patient length (cm)')
_ = plt.ylabel('ECDF')

# Display the plot
plt.show()

# Mean

In [None]:
# Arithmetic mean of the patient lengths
mean_length_pts = np.mean(patient_lengths)

# Report the mean followed by its unit
print('All pts:', mean_length_pts, 'cm')

# Percentiles

In [None]:
# Percentiles to evaluate: 2.5th, 25th, 50th, 75th and 97.5th
percentiles = np.array([2.5, 25, 50, 75, 97.5])

# Value of patient_lengths at each of those percentiles
ptiles_pts = np.percentile(patient_lengths, percentiles)

# Show the percentile values
print(ptiles_pts)

# Percentiles and ECDF

In [None]:
# Plot the ECDF of the pneumonia patient lengths
_ = plt.plot(x_pneum, y_pneum, '.')
plt.margins(0.02)
_ = plt.xlabel('pneumonia patients length (cm)')
_ = plt.ylabel('ECDF')

# The percentiles must come from the same sample as the ECDF being plotted,
# otherwise the markers will not line up with the curve.
ptiles_pneum = np.percentile(pneumonia_patient_length, percentiles)

# Overlay percentiles as red diamonds; percentiles/100 converts percent to
# the 0-1 scale of the ECDF axis.
_ = plt.plot(ptiles_pneum, percentiles/100, marker='D', color='red', linestyle='none')

# Show the plot
plt.show()

# Boxplot

In [None]:
# Box plot built from the 'diagnosis' and 'patient length (cm)' columns
# of df, using Seaborn's default settings
sns.boxplot(data=df, x='diagnosis', y='patient length (cm)')

# Axis labels
plt.ylabel('patient length(cm)')
plt.xlabel('diagnosis')

# Render the figure
plt.show()

# Variance

In [None]:
# Variance as computed directly by NumPy
variance_np = np.var(patient_lengths)

# Variance computed by hand: the mean of the squared deviations from the mean
differences = patient_lengths - np.mean(patient_lengths)
diff_sq = differences ** 2
variance_explicit = np.mean(diff_sq)

# Print the hand-computed value first, then NumPy's, for comparison
print(variance_explicit, variance_np)

# They will be the same!

# Standard Deviation

In [None]:
import math

# Variance of the patient lengths, shared by both variants below
variance = np.var(patient_lengths)

# Variant 1: square root via the standard-library math module,
# followed by NumPy's standard deviation for comparison
print(math.sqrt(variance))
print(np.std(patient_lengths))

# Variant 2: square root via NumPy, again followed by np.std
print(np.sqrt(variance))
print(np.std(patient_lengths))

# Answers will all be the same! Isn't that fantastic ;P

# Scatter Plot

In [None]:
# Scatter plot of width against length for the pneumonia patients,
# drawn as unconnected points
plt.plot(pneumonia_patient_length, pneumonia_patient_width, linestyle='none', marker='.')

# Axis labels
plt.xlabel('length')
plt.ylabel('width')

# Add a small margin around the data
plt.margins(0.02)

# Render the figure
plt.show()

# Covariance Matrices

In [None]:
# Covariance matrix of patient length and patient width
covariance_matrix = np.cov(patient_lengths, patient_widths)
print(covariance_matrix)

# Entry [0, 1] is the covariance between length and width
patient_cov = covariance_matrix[0, 1]
print(patient_cov)


# Pearson Correlation coefficients

In [None]:
def pearson_r(x, y):
    """Return the Pearson correlation coefficient of the arrays x and y."""
    # np.corrcoef yields the full correlation matrix; entry [0, 1] is the
    # correlation between x and y
    return np.corrcoef(x, y)[0, 1]

# Pearson correlation between length and width for the pneumonia patients
pneumo = pearson_r(pneumonia_patient_length, pneumonia_patient_width)

# Show the coefficient
print(pneumo)


# Testing, Testing... Generating Random Numbers

In [None]:
# Fix the seed so the draws are reproducible
np.random.seed(42)

# Draw 100000 random numbers one at a time, in order, and collect them in an array
random_numbers = np.array([np.random.random() for _ in range(100000)])

# Histogram of the draws
plt.hist(random_numbers)

# Render the figure
plt.show()

# Or more efficiently you could use np.random.random(size = 100000) but it's clearer what is happening this way.

# Bernoulli Trial

In [None]:
def perform_bernoulli_trial(n, p):
    """Run n Bernoulli trials, each succeeding with probability p.

    Returns the number of successful trials as an int.
    """
    # Each trial draws one random number from np.random.random(); the trial
    # counts as a success when that draw is below p
    return sum(1 for _ in range(n) if np.random.random() < p)

# The probability of a failure is 1 - p, which is likewise a value between 0 and 1.

# Probabilistic Logic

In [None]:
"""
Let's say you treated 100 patients and the probability of death is 0.05. To test this probabilistically, we can use
Bernoulli trials as a simulation to check that this result actually holds.
"""

# Seed random number generator so the simulation is reproducible
np.random.seed(42)

# Initialize the number of deaths: n_deaths (one entry per simulated cohort)
n_deaths = np.empty(1000)

# Simulate 1000 cohorts; each call takes the number of patients in the
# cohort (100) and the probability of death (0.05)
for i in range(1000):
    n_deaths[i] = perform_bernoulli_trial(100, 0.05)

# Plot the histogram with the default number of bins; axes are labelled.
# density=True normalizes the histogram; it replaces the `normed` keyword,
# which has been removed from Matplotlib.
_ = plt.hist(n_deaths, density=True)
_ = plt.xlabel('number of dead out of 100 patients')
_ = plt.ylabel('probability')

# Show the plot
plt.show()

# The result is a histogram showing the spread in the simulated number of deaths, like the variation you would see in real life.

# Statistical Inference - when ECDF meets probability ;)

In [None]:
# Compute ECDF of the simulated death counts: x, y
x, y = ecdf(n_deaths)

# Plot the CDF with labeled axes (CDF stands for cumulative distribution function)
_ = plt.plot(x, y, marker='.', linestyle='none')
_ = plt.xlabel('number of deaths out of 100')
_ = plt.ylabel('CDF')

# Show the plot
plt.show()

# Count the 100-patient simulations with 10 or more deaths: n_lose_patient
n_lose_patient = np.sum(n_deaths >= 10)

# Compute and print the fraction of simulations with 10 or more deaths
print('Probability of losing patient =', n_lose_patient / len(n_deaths))

# Discrete Uniform Probability Mass Function - ie. 'Rolling a Dice'

A probability distribution is simply a mathematical description of outcomes. For example, the number r of successes in n Bernoulli trials, each with success probability p, is binomially distributed.

# Sampling From a Binomial Distribution

In [None]:
# Draw 10,000 samples from a binomial distribution with n=100 trials and
# success probability 0.05: n_defaults
n_defaults = np.random.binomial(n=100, p=0.05, size=10000)

# ECDF of the samples: x, y
x, y = ecdf(n_defaults)

# Draw the ECDF as unconnected points, with axis labels
plt.plot(x, y, linestyle='none', marker='.')
plt.ylabel('CDF')
plt.xlabel('number of deaths in 100 patient encounters')

# Render the figure
plt.show()

# Sampling directly from the binomial distribution is much quicker than looping over individual Bernoulli trials as above

# Plotting the Binomial PMF (prob mass function)

In [None]:
# Compute bin edges: bins
# The edges sit at -0.5, 0.5, 1.5, ... so each integer count gets its own
# bin centred on it. They are derived from n_defaults, the data actually
# being plotted, so the largest sampled value is always covered.
bins = np.arange(0, np.max(n_defaults) + 1.5) - 0.5

# Generate histogram; density=True normalizes it and replaces the `normed`
# keyword, which has been removed from Matplotlib
_ = plt.hist(n_defaults, density=True, bins=bins)

# Set margins
plt.margins(0.02)

# Label axes
_ = plt.xlabel('number of deaths out of 100 patient encounters')
_ = plt.ylabel('PMF')

# Show the plot
plt.show()

# Poisson Process (Buses in Poissonville, Births, Meteors, Aviation Disasters, Website Hits - all independent processes)

In [None]:
# 10,000 draws from a Poisson distribution with mean 10: samples_poisson
samples_poisson = np.random.poisson(10, size=10000)

# Report the sample mean and standard deviation
print('Poisson:     ', np.mean(samples_poisson), np.std(samples_poisson))

# Binomial parameters to compare against; every (n, p) pair has n * p = 10,
# matching the Poisson mean used above
n = [20, 100, 1000]
p = [0.5, 0.1, 0.01]

# For each pair, draw 10,000 binomial samples and report mean and standard deviation
for n_trials, p_success in zip(n, p):
    samples_binomial = np.random.binomial(n_trials, p_success, size=10000)
    print('n =', n_trials, 'Binom:', np.mean(samples_binomial), np.std(samples_binomial))

In [None]:
# Draw 10,000 samples out of a Poisson distribution with mean 251/115: n_nodisease
# NOTE(review): 251/115 is presumably total events divided by number of
# intervals -- confirm against the data source.
n_nodisease = np.random.poisson(251/115, size=10000)

# Compute number of samples that are seven or greater: n_large
n_large = np.sum(n_nodisease >= 7)

# Compute probability of getting seven or more: p_large
# Dividing by the actual sample count keeps this correct if the size changes.
p_large = n_large / len(n_nodisease)

# Print the result
print('Probability of seven or more no-disease:', p_large)