# Simulation Exercises

In [1]:
import numpy as np
import pandas as pd
import random

### 1) How likely is it that you roll doubles when rolling two dice?

In [2]:
# set number of trials and events per trial

nrows = 100_000 # number of trials

ncols = 2       # number of random events in each trial

In [16]:
np.random.choice([1, 2, 3, 4, 5, 6],4)

array([5, 3, 4, 1])

In [17]:
np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols)

array([5, 2, 6, ..., 6, 5, 1])

In [18]:
rolls = np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols).reshape(nrows, ncols)
rolls

array([[4, 3],
       [5, 1],
       [5, 2],
       ...,
       [4, 1],
       [4, 5],
       [4, 5]])

In [19]:
# convert to data frame
df = pd.DataFrame(rolls)
df

Unnamed: 0,0,1
0,4,3
1,5,1
2,5,2
3,1,4
4,4,2
...,...,...
99995,2,2
99996,4,5
99997,4,1
99998,4,5


In [20]:
# add add calculation column 
df['doubles'] = df[0] == df[1]
df.head(10)

Unnamed: 0,0,1,doubles
0,4,3,False
1,5,1,False
2,5,2,False
3,1,4,False
4,4,2,False
5,3,4,False
6,1,2,False
7,1,4,False
8,6,3,False
9,5,6,False


In [21]:
# get odds of doubles
df.doubles.mean()

0.16651

In [22]:
odds_of_doubles = int(round(df.doubles.mean(),2) * 100)

print(f'The odds of rolling doubles on two fair 6 sided dice are {odds_of_doubles}%')

The odds of rolling doubles on two fair 6 sided dice are 17%


In [23]:
# All together

nrows = 100_000 # set nrows to number of simulations
ncols = 2       # set ncolumns to number of dice rolled in each simulation

# create data frame with each row representing a simulation and each column representing a dice roll in the simulation
rolls = np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols).reshape(nrows, ncols) 

df = pd.DataFrame(rolls)

# add double columnn
df['doubles'] = df[0] == df[1]

# get mean of doubles column
# format and print results
odds_of_doubles = int(round(df.doubles.mean(),2) * 100)

print(f'The odds of rolling doubles on two fair 6 sided dice are {odds_of_doubles}%')

The odds of rolling doubles on two fair 6 sided dice are 17%


### 2) If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [24]:
nrows = 100_000 # set nrows to number of simulations
ncols = 8       # set ncols to number of random events in each simulation

In [25]:
# create array
rolls = np.random.choice([True, False], nrows * ncols).reshape(nrows, ncols)
rolls

array([[ True,  True,  True, ..., False,  True,  True],
       [ True,  True, False, ..., False,  True, False],
       [ True, False, False, ..., False,  True, False],
       ...,
       [ True,  True, False, ..., False,  True,  True],
       [ True,  True,  True, ..., False,  True,  True],
       [False, False,  True, ..., False,  True,  True]])

In [26]:
# convert to data frame
df = pd.DataFrame(rolls)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,True,True,True,True,True,False,True,True
1,True,True,False,False,False,False,True,False
2,True,False,False,True,True,False,True,False
3,False,True,True,False,True,True,False,False
4,False,False,True,True,True,True,True,True


In [27]:
# add calculation columnns
df['number_of_heads'] = df.sum(axis=1)
df['exactly_three_heads'] = df.number_of_heads == 3
df['three_or_more_heads'] = df.number_of_heads >= 3
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,number_of_heads,exactly_three_heads,three_or_more_heads
0,True,True,True,True,True,False,True,True,7,False,True
1,True,True,False,False,False,False,True,False,3,True,True
2,True,False,False,True,True,False,True,False,4,False,True
3,False,True,True,False,True,True,False,False,4,False,True
4,False,False,True,True,True,True,True,True,6,False,True


In [None]:
# get mean of relevant columns

In [28]:
df.exactly_three_heads.mean()

0.21908

In [29]:
df.three_or_more_heads.mean()

0.85634

In [30]:
# get mean of calculation columns
# format and print results
odds_of_three_heads = int(round(df.exactly_three_heads.mean(),2) * 100)
odds_of_three_or_more_heads = int(round(df.three_or_more_heads.mean(),2) * 100)

print(f'The odds of getting exactly three heads when flipping five coins is {odds_of_three_heads}%')
print(f'The odds of gettng three or more heads in the same five flips is {odds_of_three_or_more_heads}%')

The odds of getting exactly three heads when flipping five coins is 22%
The odds of gettng three or more heads in the same five flips is 86%


In [33]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 8       # set columns to number of random events in each simulation

# create array
rolls = np.random.choice([True, False], nrows * ncols).reshape(nrows, ncols)

# create dataframe
df = pd.DataFrame(rolls)

# add calculation columnns
df['number_of_heads'] = df.sum(axis=1)
df['exactly_three_heads'] = df.number_of_heads == 3
df['three_or_more_heads'] = df.number_of_heads >= 3

# get mean of calculation True/False columns
# format and print results
odds_of_three_heads = int(round(df.exactly_three_heads.mean(),2) * 100)
odds_of_three_or_more_heads = int(round(df.three_or_more_heads.mean(),2) * 100)

print(f'The odds of getting exactly three heads when flipping five coins is {odds_of_three_heads}%')
print(f'The odds of gettng three or more heads in the same five flips is {odds_of_three_or_more_heads}%')

The odds of getting exactly three heads when flipping five coins is 22%
The odds of gettng three or more heads in the same five flips is 85%


### 3) There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [34]:
nrows = n_simulations = 100_000 # set rows to number of simulations
ncols = n_dice_rolled = 2       # set columns to number of bilboards in each simulation

In [35]:
# create data frame with each row representing a simulation and each column representing a bilboard in the simulation
rolls = np.random.choice(["DS","WEB","WEB","WEB"], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)
df.head()

Unnamed: 0,0,1
0,WEB,WEB
1,WEB,DS
2,WEB,DS
3,WEB,DS
4,DS,WEB


In [39]:
# add calculation columnn
df['both_ds'] = (df[0] == "DS") & (df[1] == "DS")
df

Unnamed: 0,0,1,both_ds
0,WEB,WEB,False
1,WEB,DS,False
2,WEB,DS,False
3,WEB,DS,False
4,DS,WEB,False
...,...,...,...
99995,DS,DS,True
99996,WEB,WEB,False
99997,WEB,WEB,False
99998,DS,WEB,False


In [40]:
df.both_ds.mean()

0.06317

In [41]:
# get mean of calculation True/False columns
# format and print results
both_ds = int(round(df.both_ds.mean(),2) * 100)


print(f'The odds of both billboards being Data Science are {both_ds}%')

The odds of both billboards being Data Science are 6%


In [43]:
# All together
nrows = n_simulations = 100_000 # set rows to number of simulations
ncols = n_dice_rolled = 2       # set columns to number of bilboards in each simulation

# create data frame with each row representing a simulation and each column representing a bilboard in the simulation
rolls = np.random.choice(["DS","WEB","WEB","WEB"], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)

# add calculation columnn
df['both_ds'] = (df[0] == "DS") & (df[1] == "DS")

# get mean of calculation True/False columns
# format and print results
both_ds = int(round(df.both_ds.mean(),2) * 100)

print(f'The odds of both billboards being Data Science are {both_ds}%')

The odds of both billboards being Data Science are 6%


### 4) Codeup students buy, on average, 3 poptart packages (+- 1.5) a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?

In [44]:
nrows = n_simulations = 100_000 # set rows to number of simulations
ncols = n_dice_rolled = 5       # set columns to number of days in each simulation

In [46]:
# create array with each row representing a simulation and each column representing a day in the simulation
rolls = np.random.uniform(1.5, 4.5, nrows * ncols).astype(int).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)
df.head()

Unnamed: 0,0,1,2,3,4
0,1,3,2,2,1
1,1,3,2,2,2
2,1,3,1,3,3
3,3,2,1,3,1
4,2,2,4,1,1


In [47]:
# add calculation columnns
df['total_pop_tarts_bought'] = df.sum(axis=1)
df['pop_tarts_remain'] = df['total_pop_tarts_bought'] < 17
df.head()

Unnamed: 0,0,1,2,3,4,total_pop_tarts_bought,pop_tarts_remain
0,1,3,2,2,1,9,True
1,1,3,2,2,2,10,True
2,1,3,1,3,3,11,True
3,3,2,1,3,1,10,True
4,2,2,4,1,1,10,True


In [48]:
df['pop_tarts_remain'].mean()

0.97053

In [49]:
# get mean of calculation True/False columns
# format and print results
tarts = int(round(df['pop_tarts_remain'].mean(),2) * 100)


print(f'The odds of being able to by a PopTart in the afternoon on friday are {tarts}%')

The odds of being able to by a PopTart in the afternoon on friday are 97%


In [50]:
# All together
nrows = n_simulations = 100_000 # set rows to number of simulations
ncols = n_dice_rolled = 5       # set columns to number of days in each simulation

# create array with each row representing a simulation and each column representing a day in the simulation
rolls = np.random.uniform(1.5, 4.5, nrows * ncols).astype(int).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)

# add calculation columnns
df['total_pop_tarts'] = df.sum(axis=1)
df['pop_tarts_remain'] = df['total_pop_tarts'] < 17

# get mean of calculation True/False columns
# format and print results
tarts = int(round(df['pop_tarts_remain'].mean(),2) * 100)

print(f'The odds of being able to by a PopTart in the afternoon on friday are {tarts}%')

The odds of being able to by a PopTart in the afternoon on friday are 97%


### 5) Men have an average height of 178 cm and standard deviation of 8cm. Women have a mean of 170, sd = 6cm. If a man and woman are chosen at random, P(woman taller than man)?

In [51]:
# create arrays of veriables
male_hights = np.random.normal(178, 8, 100_000)
female_hights = np.random.normal(170, 6, 100_000)

In [52]:
male_hights

array([193.58735533, 184.46893071, 160.97155547, ..., 167.74911424,
       185.91186578, 179.54740027])

In [53]:
female_hights

array([175.89484191, 173.64983057, 170.43718969, ..., 171.48257548,
       166.8124565 , 170.24463837])

In [54]:
# convert array to data frame
df = pd.DataFrame({"male_hights" : male_hights,
                   "female_hights" : female_hights})

df.head()

Unnamed: 0,male_hights,female_hights
0,193.587355,175.894842
1,184.468931,173.649831
2,160.971555,170.43719
3,174.606467,172.701569
4,193.457313,168.896346


In [55]:
# add calculation column
df['female_is_taller'] = df.female_hights > df.male_hights
df.head()

Unnamed: 0,male_hights,female_hights,female_is_taller
0,193.587355,175.894842,False
1,184.468931,173.649831,False
2,160.971555,170.43719,True
3,174.606467,172.701569,False
4,193.457313,168.896346,False


In [56]:
# get mean of female_is_taller
df['female_is_taller'].mean()

0.21251

In [57]:
# All together
# create arrays of veriables
male_hights = np.random.normal(178, 8, 100_000)
female_hights = np.random.normal(170, 6, 100_000)

# convert array to data frame
df = pd.DataFrame({"male_hights" : male_hights,
                   "female_hights" : female_hights})

# add calculated column
df['female_is_taller'] = df.female_hights > df.male_hights

# get mean of female_is_taller
df['female_is_taller'].mean()

# get mean of calculation True/False columns
# format and print results
taller_female = int(round(df['female_is_taller'].mean(),2) * 100)

print(f"The probability of a woman, chosen at random, being taller than a man, chosen at random is {taller_female}%")

The probability of a woman, chosen at random, being taller than a man, chosen at random is 21%


### 6) When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails.

### a) What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

In [59]:
nrows = 100_000 # set rows to number of simulations
ncols = 50      # set columns to number of days in each simulation

In [60]:
# create array with each row representing a simulation and each column representing a student in the simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)
installs

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [61]:
# convert to data frame
df = pd.DataFrame(installs)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [62]:
# add calculation columnns
df['has_no_problem'] = df.sum(axis=1) == 0
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,has_no_problem
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [63]:
df['has_no_problem'].mean()

0.81862

In [85]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 50      # set columns to number of students in each simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array with each row representing a simulation and each column representing a student in the simulation
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(installs)

# add calculation columnns
df['has_no_problem'] = df.sum(axis=1) == 0

# get mean of calculation True/False columns
# format and print results
problems = int(round(df['has_no_problem'].mean(),2) * 100)

print(f'The odds of 50 students dowloading Anaconda without a problem are {problems}%')

The odds of 50 students dowloading Anaconda without a problem are 82%


### b) What is the probability that we observe an installation issue within the first 150 students that download anaconda?

In [86]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 150      # set columns to number of students in each simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array with each row representing a simulation and each column representing a student in the simulation
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(installs)

# add calculation columnns
df['has_a_problem'] = df.sum(axis=1) > 0

# get mean of calculation True/False columns
# format and print results
problems = int(round(df['has_a_problem'].mean(),2) * 100)

print(f'The odds of a problem occuring in the first 150 instllations is {problems}%')

The odds of a problem occuring in the first 150 instllations is 45%


### 5) How likely is it that 450 students all download anaconda without an issue?

In [87]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 450      # set columns to number of students in each simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array with each row representing a simulation and each column representing a student in the simulation
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(installs)

# add calculation columnns
df['has_no_problem'] = df.sum(axis=1) == 0

# get mean of calculation True/False columns
# format and print results
problems = int(round(df['has_no_problem'].mean(),2) * 100)

print(f'The odds of 450 instllations occuring without an issue is {problems}%')

The odds of 450 instllations occuring without an issue is 16%


### 6) There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [88]:
nrows = 100_000 # set rows to number of simulations
ncols = 3      # set columns to number of days in each simulation

In [89]:
# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([True, False], nrows * ncols, p=[.7,.3]).reshape(nrows, ncols)
sightings

array([[ True,  True,  True],
       [False,  True, False],
       [ True, False,  True],
       ...,
       [False,  True,  True],
       [ True,  True, False],
       [ True, False,  True]])

In [90]:
# convert to data frame
df = pd.DataFrame(sightings)
df.head()

Unnamed: 0,0,1,2
0,True,True,True
1,False,True,False
2,True,False,True
3,True,False,True
4,True,False,True


In [91]:
# add calculation columnns
df['times_sighted'] = df.sum(axis=1)
df['not_sighted_three_days'] = df.times_sighted == 0
df.head()

Unnamed: 0,0,1,2,times_sighted,not_sighted_three_days
0,True,True,True,3,False
1,False,True,False,1,False
2,True,False,True,2,False
3,True,False,True,2,False
4,True,False,True,2,False


In [92]:
df['not_sighted_three_days'].mean()

0.02739

In [93]:
# get mean of calculation True/False columns
# format and print results
trucks = int(round(df['not_sighted_three_days'].mean(),2) * 100)

print(f'The odds of not seeing a food truck in Travis Park for three days is {trucks}%')

The odds of not seeing a food truck in Travis Park for three days is 3%


### 7) How likely is it that a food truck will show up sometime this week?

In [99]:
# Remaining week days
nrows = 100_000 # set rows to number of simulations
ncols = 2      # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([True, False], nrows * ncols, p=[.7,.3]).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['not_sighted_two_days'] = df.sum(axis=1) == 0

# get mean of calculation True/False columns
# format and print results
trucks = int(round(df['not_sighted_two_days'].mean(),4) * 100)

print(f'The odds of not seeing a food truck in Travis Park for {ncols} days is {trucks}%')

The odds of not seeing a food truck in Travis Park for 2 days is 9%


### 8) If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [100]:
nrows = 100_000 # set rows to number of simulations
ncols = 23       # set columns to number of people in each simulation

In [104]:
# create array with each row representing a simulation and each column representing the birthdate of a person in the simulation
birth_days = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)
birth_days

array([[273, 152, 333, ...,  50,  71, 322],
       [119, 137,  37, ..., 279,  70,  67],
       [359, 128, 319, ..., 227, 170,  61],
       ...,
       [245, 135, 157, ..., 298,  14, 115],
       [ 75,  78, 310, ..., 255, 149, 304],
       [248, 197,  31, ..., 333,   0, 210]])

In [105]:
# convert to data frame
df = pd.DataFrame(birth_days)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,273,152,333,320,49,238,230,225,146,249,...,268,209,88,90,149,193,288,50,71,322
1,119,137,37,333,331,63,149,258,47,81,...,88,264,197,331,130,46,212,279,70,67
2,359,128,319,171,267,77,16,218,249,6,...,71,47,100,228,264,348,74,227,170,61
3,204,364,175,260,190,206,226,148,26,231,...,225,20,153,57,154,340,171,161,270,298
4,136,83,151,68,89,353,110,239,115,71,...,9,44,343,32,87,197,115,267,283,57


In [106]:
# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,at_least_two_matching_birthdays,exactly_two_matching_birthdays
0,273,152,333,320,49,238,230,225,146,249,...,88,90,149,193,288,50,71,322,False,False
1,119,137,37,333,331,63,149,258,47,81,...,197,331,130,46,212,279,70,67,True,False
2,359,128,319,171,267,77,16,218,249,6,...,100,228,264,348,74,227,170,61,True,False
3,204,364,175,260,190,206,226,148,26,231,...,153,57,154,340,171,161,270,298,False,False
4,136,83,151,68,89,353,110,239,115,71,...,343,32,87,197,115,267,283,57,True,False


In [107]:
# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly two people that share a birthday are {exactly_two_matching_birthdays}%')

The odds of a room of 23 people having at least two people that share a birthday are 51%
The odds of a room of 23 people having exactly two people that share a birthday are 13%


In [108]:
nrows = 100_000 # set rows to number of simulations
ncols = 23       # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1

# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly least two people that share a birthday are {exactly_two_matching_birthdays}%')

The odds of a room of 23 people having at least two people that share a birthday are 50%
The odds of a room of 23 people having exactly least two people that share a birthday are 13%


### 20 people

In [109]:
nrows = 100_000 # set rows to number of simulations
ncols = 20       # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1

# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly least two people that share a birthday are {exactly_two_matching_birthdays}%')

The odds of a room of 20 people having at least two people that share a birthday are 41%
The odds of a room of 20 people having exactly least two people that share a birthday are 9%


### 40 people

In [110]:
nrows = 100_000 # set rows to number of simulations
ncols = 40       # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1

# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly two people that share a birthday are {exactly_two_matching_birthdays}%')

The odds of a room of 40 people having at least two people that share a birthday are 89%
The odds of a room of 40 people having exactly two people that share a birthday are 28%
