# Table of Contents
<ul>
<li><a href="#Measures_of_center">Measures of center</a></li>
<li><a href="#Measures_of_spread">Measures_of_spread</a></li>
<li><a href="#What_are_the_chances">What_are_the_chances</a></li>
<li><a href="#Discrete_distributions">Discrete_distributions</a></li>
<li><a href="#Continuous_distributions">Continuous_distributions</a></li>
<li><a href="#The_binomial_distribution">The_binomial_distribution</a></li>
<li><a href="#The_normal_distribution">The_normal_distribution</a></li>
<li><a href="#The_central_limit_theorem">The_central_limit_theorem</a></li>
<li><a href="#The_Poisson_distribution">The_Poisson_distribution</a></li>
<li><a href="#More_probability_distributions">More_probability_distributions</a></li>
<li><a href="#Correlation">Correlation</a></li>
<li><a href="#Correlation_caveats">Correlation_caveats</a></li>
</ul>

<a id='Measures_of_center'></a>
### Measures of center

In [None]:
# Import numpy with alias np
import numpy as np

# Keep only the rows whose country is 'USA': usa_consumption
is_usa = food_consumption['country'] == 'USA'
usa_consumption = food_consumption.loc[is_usa]

# Report the mean, then the median, of consumption within the USA rows
usa_values = usa_consumption['consumption']
print(usa_values.mean())
print(usa_values.median())

In [None]:
# Import matplotlib.pyplot with alias plt
import matplotlib.pyplot as plt

# Select the rows where the food category is rice
is_rice = food_consumption['food_category'] == 'rice'
rice_consumption = food_consumption.loc[is_rice]

# Draw the distribution of co2_emission for rice and render the figure
plt.hist(rice_consumption['co2_emission'])
plt.show()

In [None]:
# Subset for food_category equals rice
rice_consumption = food_consumption[food_consumption['food_category'] == 'rice']

# Calculate mean and median of co2_emission with .agg().
# The reductions are named by string instead of passing np.mean / np.median:
# recent pandas releases emit a FutureWarning when NumPy callables are handed
# to .agg(), and the string names select the same pandas reductions while
# producing the same 'mean' / 'median' row labels.
print(rice_consumption['co2_emission'].agg(['mean', 'median']))

<a id='Measures_of_spread'></a>
### Measures_of_spread

In [None]:
# Print variance and sd of co2_emission for each food_category.
# 'var' and 'std' are given as strings rather than np.var / np.std: pandas
# currently maps those NumPy callables onto its own sample (ddof=1) methods but
# warns that a future version will call them directly, which would change the
# numbers. The string names pin the pandas behaviour.
print(food_consumption.groupby('food_category')['co2_emission'].agg(['var', 'std']))

# Create a histogram of co2_emission for 'beef' and then for 'eggs'.
# A single .loc selection replaces the chained [mask][column] lookup, and each
# figure is titled so the two plots can be told apart.
for category in ['beef', 'eggs']:
    in_category = food_consumption['food_category'] == category
    food_consumption.loc[in_category, 'co2_emission'].hist()
    plt.title(f'co2_emission: {category}')
    plt.show()

In [None]:
# Quartiles of co2_emission: five evenly spaced probabilities from 0 to 1
quartile_probs = np.linspace(0, 1, 5)
print(np.quantile(food_consumption['co2_emission'], quartile_probs))

In [None]:
# Quintiles of co2_emission: six evenly spaced probabilities from 0 to 1
quintile_probs = np.linspace(0, 1, num=6)
print(np.quantile(food_consumption['co2_emission'], quintile_probs))

In [None]:
# Deciles of co2_emission: eleven evenly spaced probabilities from 0 to 1
decile_probs = np.linspace(0, 1, num=11)
print(np.quantile(food_consumption['co2_emission'], decile_probs))

![image.png](attachment:a6422ef3-900f-4b54-8252-354af4c09759.png)

In [None]:
# Sum co2_emission within each country: emissions_by_country
emissions_by_country = food_consumption.groupby('country')['co2_emission'].sum()

# First and third quartiles from one np.quantile call, then their gap (IQR)
q1, q3 = np.quantile(emissions_by_country, [0.25, 0.75])
iqr = q3 - q1

# Outlier cutoffs sit 1.5 IQRs below q1 and above q3
lower = q1 - 1.5 * iqr
upper = q3 + 1.5 * iqr

# Keep the countries whose totals fall outside either cutoff
too_low = emissions_by_country < lower
too_high = emissions_by_country > upper
outliers = emissions_by_country[too_low | too_high]
print(outliers)

<a id='What_are_the_chances'></a>
# What_are_the_chances

![image.png](attachment:bf4c1c92-e645-440a-923b-9a8fa9213f00.png)

In [None]:
# Tally how many deals involve each product
counts = amir_deals['product'].value_counts()

# Divide each tally by the total number of counted deals to get probabilities
probs = counts.div(counts.sum())
print(probs)

![image.png](attachment:16e9bf4b-9c1a-40ca-b19a-c94a22e22a89.png)

![image.png](attachment:be786f55-5164-4fe0-858a-3e540a9a4077.png)

In [None]:
# Seed NumPy's global random generator with 24
np.random.seed(24)

# Draw 5 deals, explicitly without replacement
sample_without_replacement = amir_deals.sample(n=5, replace=False)
print(sample_without_replacement)

In [None]:
# Draw 5 deals, allowing the same deal to be picked more than once
sample_with_replacement = amir_deals.sample(n=5, replace=True)
print(sample_with_replacement)

![image.png](attachment:5fedbeb2-4db5-401a-93fa-14abd4dcf325.png)
Spectacular sampling! It's important to consider how you'll take a sample since there's no one-size-fits-all way to sample, and this can have an effect on your results.



<a id='Discrete_distributions'></a>
# Discrete_distributions

![image.png](attachment:0cecc1ed-acc8-4200-9714-6aff35f85ec8.png)

In [None]:
# Histogram of group_size with bin edges at 2, 3, 4, 5 and 6, then show it
bin_edges = list(range(2, 7))
restaurant_groups['group_size'].hist(bins=bin_edges)
plt.show()

In [None]:
# Probability distribution: count of each group size over the number of rows
size_counts = restaurant_groups['group_size'].value_counts()
size_dist = size_counts / len(restaurant_groups)
# Move the group sizes out of the index and label the columns group_size / prob
size_dist = size_dist.rename_axis('group_size').reset_index(name='prob')

# Expected value: each group size weighted by its probability, summed
expected_value = (size_dist['group_size'] * size_dist['prob']).sum()

# Rows of the distribution for groups of 4 or more
groups_4_or_more = size_dist.loc[size_dist['group_size'] >= 4]

# Total probability of those rows
prob_4_or_more = groups_4_or_more['prob'].sum()
print(prob_4_or_more)

Dexterous distribution utilization! You'll continue to build upon these skills since many statistical tests and methods use probability distributions as their foundation.



<a id='Continuous_distributions'></a>
# Continuous_distributions

![image.png](attachment:2b6d1f7c-d43d-475f-95e2-03409db8fdf7.png)

In [None]:
# Min and max wait times for back-up that happens every 30 min
min_time = 0
max_time = 30

# Import uniform from scipy.stats
from scipy.stats import uniform

# Calculate probability of waiting less than 5 mins.
# scipy's uniform is parameterised by loc (the lower bound) and scale (the
# width of the interval), so the width max_time - min_time is passed instead of
# max_time itself; the two only coincide while min_time is 0.
prob_less_than_5 = uniform.cdf(5, loc=min_time, scale=max_time - min_time)
print(prob_less_than_5)

In [None]:
# Calculate probability of waiting more than 5 mins (complement of the cdf).
# scale is the interval width (max_time - min_time), not the upper bound, so
# the result stays correct if min_time is ever changed from 0.
prob_greater_than_5 = 1 - uniform.cdf(5, loc=min_time, scale=max_time - min_time)
print(prob_greater_than_5)

In [None]:
# Calculate probability of waiting 10-20 mins as cdf(20) - cdf(10).
# scale is the interval width (max_time - min_time), not the upper bound, so
# the result stays correct if min_time is ever changed from 0.
wait_width = max_time - min_time
prob_between_10_and_20 = (
    uniform.cdf(20, loc=min_time, scale=wait_width)
    - uniform.cdf(10, loc=min_time, scale=wait_width)
)
print(prob_between_10_and_20)

![image.png](attachment:5a576559-1b9a-4104-9ebf-e485e3fda087.png)

In [None]:
# Seed NumPy's global random generator with 334
np.random.seed(334)

# Bring uniform into scope from scipy.stats
from scipy.stats import uniform

# Simulate 1000 wait times: uniform starting at 0 (loc) with a width of 30 (scale)
wait_times = uniform.rvs(loc=0, scale=30, size=1000)

# Plot the simulated times as a histogram and render the figure
plt.hist(wait_times)
plt.show()

<a id='The_binomial_distribution'></a>
# The_binomial_distribution

![image.png](attachment:d067a148-80a3-45e9-a4e3-9dd4ec791822.png)

In [None]:
# Bring binom into scope from scipy.stats
from scipy.stats import binom

# Seed NumPy's global random generator with 10
np.random.seed(10)

# One draw from a binomial with a single trial and a 0.3 success probability
single_deal = binom.rvs(n=1, p=0.3, size=1)
print(single_deal)

In [None]:
# One draw from a binomial with 3 trials (deals) and a 0.3 success probability
week_of_deals = binom.rvs(n=3, p=0.3, size=1)
print(week_of_deals)

In [None]:
# Bring binom into scope from scipy.stats
from scipy.stats import binom

# Seed NumPy's global random generator with 10
np.random.seed(10)

# 52 draws (weeks), each a binomial with 3 trials and a 0.3 success probability
deals = binom.rvs(n=3, p=0.3, size=52)

# Average number of deals won per simulated week
print(np.mean(deals))

![image.png](attachment:43762311-74bc-4f6f-b884-cd93a17b6f00.png)

In [None]:
# P(exactly 3 successes) for a binomial with 3 trials and p = 0.3
prob_3 = binom.pmf(k=3, n=3, p=0.3)

print(prob_3)

In [None]:
# P(at most 1 success) for a binomial with 3 trials and p = 0.3
prob_less_than_or_equal_1 = binom.cdf(k=1, n=3, p=0.3)

print(prob_less_than_or_equal_1)

In [None]:
# P(more than 1 success) is the complement of P(at most 1 success)
prob_at_most_1 = binom.cdf(k=1, n=3, p=0.3)
prob_greater_than_1 = 1 - prob_at_most_1

print(prob_greater_than_1)

![image.png](attachment:e4ad4f36-dd6f-49dd-b8cd-86e4f57dbcc3.png)

In [None]:
# Expected number won = number of deals times the win rate
deals_per_week = 3

# Win rates of 30%, 25% and 35% respectively
won_30pct = deals_per_week * 0.3
won_25pct = deals_per_week * 0.25
won_35pct = deals_per_week * 0.35

# Print the three expectations in the order 30%, 25%, 35%
for expected_wins in (won_30pct, won_25pct, won_35pct):
    print(expected_wins)

<a id='The_normal_distribution'></a>
# The_normal_distribution

![image.png](attachment:de7ad8f4-5a2c-4e2c-8af5-ed3205624a54.png)

In [None]:
# Plot the amount column as a 10-bin histogram and render the figure
deal_amounts = amir_deals['amount']
deal_amounts.hist(bins=10)
plt.show()


![image.png](attachment:969e447f-6360-47de-924a-8c2c96434d13.png)

In [None]:
# Import norm from scipy.stats (no earlier cell in the notebook brings it into
# scope, so this cell would otherwise fail on a fresh kernel)
from scipy.stats import norm

# Probability of deal < 7500 under a normal with mean 5000 and sd 2000
prob_less_7500 = norm.cdf(7500, 5000, 2000)

print(prob_less_7500)

In [None]:
# P(deal > 1000) is the complement of the cdf at 1000 (mean 5000, sd 2000)
prob_at_most_1000 = norm.cdf(1000, loc=5000, scale=2000)
prob_over_1000 = 1 - prob_at_most_1000

print(prob_over_1000)

In [None]:
# P(3000 < deal < 7000): cdf at the upper bound minus cdf at the lower bound
cdf_at_7000 = norm.cdf(7000, loc=5000, scale=2000)
cdf_at_3000 = norm.cdf(3000, loc=5000, scale=2000)
prob_3000_to_7000 = cdf_at_7000 - cdf_at_3000

print(prob_3000_to_7000)

In [None]:
# Amount below which 25% of deals fall: inverse cdf at 0.25 (mean 5000, sd 2000)
pct_25 = norm.ppf(0.25, loc=5000, scale=2000)

print(pct_25)

Nifty normal distribution usage! You know that you can count on Amir 75% (1-0.25) of the time to make a sale worth at least $3651.02. This information could be useful in making company-wide sales projections.



![image.png](attachment:dc04b4f1-fe97-4c28-966b-49a72356a1b6.png)

In [None]:
# Import norm from scipy.stats; repeated here, as the notebook does for its
# other distributions, so the cell does not depend on an earlier import
from scipy.stats import norm

# Calculate new average amount: the current mean of 5000 increased by 20%
new_mean = 5000 + 5000 * 0.2

# Calculate new standard deviation: the current sd of 2000 increased by 30%
new_sd = 2000 + 2000 * 0.3

# Simulate 36 new sales from a normal with the new mean and sd
new_sales = norm.rvs(new_mean, new_sd, size=36)

# Create histogram and show
plt.hist(new_sales)
plt.show()

Successful simulating! Although the average sale amount went up, the variation also increased, so it's not straightforward to decide whether these sales are better than his current ones. In the next exercise, you'll explore the effects of higher variation.



![image.png](attachment:b02cb54d-a3af-479d-b504-f8c0a3cc76fa.png)


```IPython Shell
Slides
In [1]:
1 - norm.cdf(1000, 5000, 2000)
Out[1]:
0.9772498680518208
In [2]:
1 - norm.cdf(1000, 6000, 2600)
Out[2]:
0.9727648049862613
```

Great work! In the current market, Amir makes sales over $1000 about 97.7% of the time, and about 97.3% of the time in the predicted market, so there's not much of a difference. However, his average sale amount is higher in the predicted market, so your company may want to consider other metrics as well.



<a id='The_central_limit_theorem'></a>
# The_central_limit_theorem

![image.png](attachment:a5a68f5c-1847-4908-b89f-051dccc51953.png)

In [None]:
# Plot the num_users column as a histogram and render the figure
num_users = amir_deals['num_users']
num_users.hist()
plt.show()

In [None]:
# Import pandas with alias pd (pd.Series is used below but no earlier cell in
# the notebook imports pandas)
import pandas as pd

# Set seed to 104
np.random.seed(104)

# Take 100 samples of 20 num_users (with replacement) and record each mean
sample_means = [
    amir_deals['num_users'].sample(20, replace=True).mean()
    for _ in range(100)
]

# Convert to Series and plot histogram
sample_means_series = pd.Series(sample_means)
sample_means_series.hist()
# Show plot
plt.show()

![image.png](attachment:4e8019d7-99e9-488a-a2dc-3e2eb45def53.png)

In [None]:
# Seed NumPy's global random generator with 321
np.random.seed(321)

# 30 means, each from a with-replacement sample of 20 num_users in all_deals
all_num_users = all_deals['num_users']
sample_means = [
    all_num_users.sample(n=20, replace=True).mean()
    for _ in range(30)
]

# Mean of the 30 sample means
print(np.mean(sample_means))

# Mean of num_users in amir_deals, printed for comparison
print(amir_deals['num_users'].mean())

<a id='The_Poisson_distribution'></a>
# The_Poisson_distribution

![image.png](attachment:f6b924b1-45c8-4cc1-a360-130ce3a2ebf5.png)

In [None]:
# Bring poisson into scope from scipy.stats
from scipy.stats import poisson

# P(exactly 5 events) for a Poisson distribution with mean 4
prob_5 = poisson.pmf(k=5, mu=4)

print(prob_5)

In [None]:
# Bring poisson into scope from scipy.stats
from scipy.stats import poisson

# P(exactly 5 events) for a Poisson distribution with mean 5.5
prob_coworker = poisson.pmf(k=5, mu=5.5)

print(prob_coworker)

In [None]:
# Bring poisson into scope from scipy.stats
from scipy.stats import poisson

# P(at most 2 events) for a Poisson distribution with mean 4
prob_2_or_less = poisson.cdf(k=2, mu=4)

print(prob_2_or_less)

In [None]:
# Bring poisson into scope from scipy.stats
from scipy.stats import poisson

# P(more than 10 events) is the complement of P(at most 10 events), mean 4
prob_at_most_10 = poisson.cdf(k=10, mu=4)
prob_over_10 = 1 - prob_at_most_10

print(prob_over_10)

<a id='More_probability_distributions'></a>
# More_probability_distributions

![image.png](attachment:05a954af-b0c5-490b-8562-f08b80fd9efd.png)

In [None]:
# Bring expon into scope from scipy.stats
from scipy.stats import expon

# Exponential distribution frozen at scale=2.5; print its cdf at 1 hour
response_time = expon(scale=2.5)
print(response_time.cdf(1))

In [None]:
# P(response takes > 4 hours): complement of the cdf at 4 (scale=2.5)
prob_at_most_4_hours = expon.cdf(4, scale=2.5)
print(1 - prob_at_most_4_hours)

In [None]:
# P(response takes 3-4 hours): cdf at 4 minus cdf at 3 (scale=2.5)
cdf_at_4_hours = expon.cdf(4, scale=2.5)
cdf_at_3_hours = expon.cdf(3, scale=2.5)
print(cdf_at_4_hours - cdf_at_3_hours)

<a id='Correlation'></a>
# Correlation

![image.png](attachment:ece2599d-12f8-4dab-93e7-ac48acbde606.png)

In [None]:
# NOTE(review): no visible cell imports `sns` (presumably seaborn) — confirm it
# is imported or preloaded before this cell runs.
# Scatter happiness_score (y axis) against life_exp (x axis)
sns.scatterplot(data=world_happiness, x='life_exp', y='happiness_score')

# Render the figure
plt.show()

In [None]:
# Scatter happiness_score against life_exp with a fitted line; ci=None is
# passed so no confidence interval is requested
sns.lmplot(data=world_happiness, x='life_exp', y='happiness_score', ci=None)

# Render the figure
plt.show()

In [None]:
# Scatter happiness_score against life_exp with a fitted line (ci=None)
sns.lmplot(data=world_happiness, x='life_exp', y='happiness_score', ci=None)

# Render the figure
plt.show()

# Correlation of life_exp with happiness_score
life_exp = world_happiness['life_exp']
happiness_score = world_happiness['happiness_score']
cor = life_exp.corr(happiness_score)

print(cor)

<a id='Correlation_caveats'></a>
# Correlation_caveats

![image.png](attachment:9a2e4470-be00-49c0-94ad-a8285d77d508.png)

In [None]:
# Scatter life_exp (y axis) against gdp_per_cap (x axis)
sns.scatterplot(data=world_happiness, x='gdp_per_cap', y='life_exp')

# Render the figure
plt.show()

# Correlation of gdp_per_cap with life_exp
gdp_per_cap = world_happiness['gdp_per_cap']
cor = gdp_per_cap.corr(world_happiness['life_exp'])

print(cor)

![image.png](attachment:bc7d7b5c-c139-4cd5-adc3-b2b8add09a3f.png)

![image.png](attachment:7b19b15d-1877-40fc-af57-74b3c3b8c6ef.png)

In [None]:
# Scatter happiness_score (y axis) against gdp_per_cap (x axis) and show it
sns.scatterplot(data=world_happiness, x='gdp_per_cap', y='happiness_score')
plt.show()

# Correlation of gdp_per_cap with happiness_score
gdp_per_cap = world_happiness['gdp_per_cap']
cor = gdp_per_cap.corr(world_happiness['happiness_score'])
print(cor)

In [None]:
# Add a log_gdp_per_cap column holding the natural log of gdp_per_cap
gdp_per_cap = world_happiness['gdp_per_cap']
world_happiness['log_gdp_per_cap'] = np.log(gdp_per_cap)

# Scatter happiness_score (y axis) against log_gdp_per_cap (x axis) and show it
sns.scatterplot(data=world_happiness, x='log_gdp_per_cap', y='happiness_score')
plt.show()

# Correlation of log_gdp_per_cap with happiness_score
log_gdp_per_cap = world_happiness['log_gdp_per_cap']
cor = log_gdp_per_cap.corr(world_happiness['happiness_score'])
print(cor)

![image.png](attachment:d8e3eabc-ab32-414b-a41a-8e40d0bf64c4.png)

In [None]:
# Scatter happiness_score (y axis) against grams_sugar_per_day (x axis) and show it
sns.scatterplot(data=world_happiness, x='grams_sugar_per_day', y='happiness_score')
plt.show()

# Correlation of grams_sugar_per_day with happiness_score
grams_sugar_per_day = world_happiness['grams_sugar_per_day']
cor = grams_sugar_per_day.corr(world_happiness['happiness_score'])
print(cor)

![image.png](attachment:78c16113-c89a-4353-b5c2-aada5f560545.png)