# Simulation Exercises

In [1]:
import numpy as np
import pandas as pd
import random

### 1) How likely is it that you roll doubles when rolling two dice?

In [2]:
# set number of trials and events per trial

nrows = 100_000 # number of trials

ncols = 2       # number of random events in each trial

In [3]:
np.random.choice([1, 2, 3, 4, 5, 6],4)

array([6, 4, 5, 4])

In [4]:
np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols)

array([6, 1, 3, ..., 5, 2, 3])

In [5]:
rolls = np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols).reshape(nrows, ncols)
rolls

array([[1, 3],
       [1, 3],
       [1, 2],
       ...,
       [3, 6],
       [1, 4],
       [1, 4]])

In [6]:
# convert to data frame
df = pd.DataFrame(rolls)
df

Unnamed: 0,0,1
0,1,3
1,1,3
2,1,2
3,3,2
4,2,4
...,...,...
99995,5,1
99996,2,2
99997,3,6
99998,1,4


In [7]:
# add add calculation column 
df['doubles'] = df[0] == df[1]
df.head(10)

Unnamed: 0,0,1,doubles
0,1,3,False
1,1,3,False
2,1,2,False
3,3,2,False
4,2,4,False
5,1,1,True
6,1,6,False
7,5,1,False
8,6,2,False
9,1,3,False


In [8]:
# get odds of doubles
df.doubles.mean()

0.16792

In [9]:
odds_of_doubles = int(round(df.doubles.mean(),2) * 100)

print(f'The odds of rolling doubles on two fair 6 sided dice are {odds_of_doubles}%')

The odds of rolling doubles on two fair 6 sided dice are 17%


In [10]:
# All together

nrows = 100_000 # set nrows to number of simulations
ncols = 2       # set ncolumns to number of dice rolled in each simulation

# create data frame with each row representing a simulation and each column representing a dice roll in the simulation
rolls = np.random.choice([1, 2, 3, 4, 5, 6], nrows * ncols).reshape(nrows, ncols) 

df = pd.DataFrame(rolls)

# add double columnn
df['doubles'] = df[0] == df[1]

# get mean of doubles column
# format and print results
odds_of_doubles = int(round(df.doubles.mean(),2) * 100)

print(f'The odds of rolling doubles on two fair 6 sided dice are {odds_of_doubles}%')

The odds of rolling doubles on two fair 6 sided dice are 17%


### 2) If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [12]:
nrows = 100_000 # set nrows to number of simulations
ncols = 8       # set ncols to number of random events in each simulation

In [13]:
# create array
rolls = np.random.choice([True, False], nrows * ncols).reshape(nrows, ncols)
rolls

array([[False,  True, False, ...,  True, False, False],
       [ True, False, False, ...,  True,  True,  True],
       [False,  True, False, ..., False, False,  True],
       ...,
       [ True, False, False, ...,  True, False, False],
       [ True, False,  True, ..., False, False,  True],
       [False, False, False, ..., False,  True,  True]])

In [14]:
# convert to data frame
df = pd.DataFrame(rolls)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,False,True,False,False,True,True,False,False
1,True,False,False,True,False,True,True,True
2,False,True,False,False,True,False,False,True
3,True,True,True,False,True,True,True,True
4,False,False,True,True,True,True,False,False


In [15]:
# add calculation columnns
df['number_of_heads'] = df.sum(axis=1)
df['exactly_three_heads'] = df.number_of_heads == 3
df['three_or_more_heads'] = df.number_of_heads >= 3
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,number_of_heads,exactly_three_heads,three_or_more_heads
0,False,True,False,False,True,True,False,False,3,True,True
1,True,False,False,True,False,True,True,True,5,False,True
2,False,True,False,False,True,False,False,True,3,True,True
3,True,True,True,False,True,True,True,True,7,False,True
4,False,False,True,True,True,True,False,False,4,False,True


In [16]:
# get mean of relevant columns

In [17]:
df.exactly_three_heads.mean()

0.21864

In [18]:
df.three_or_more_heads.mean()

0.85512

In [19]:
# get mean of calculation columns
# format and print results
odds_of_three_heads = int(round(df.exactly_three_heads.mean(),2) * 100)
odds_of_three_or_more_heads = int(round(df.three_or_more_heads.mean(),2) * 100)

print(f'The odds of getting exactly three heads when flipping five coins is {odds_of_three_heads}%')
print(f'The odds of gettng three or more heads in the same five flips is {odds_of_three_or_more_heads}%')

The odds of getting exactly three heads when flipping five coins is 22%
The odds of gettng three or more heads in the same five flips is 86%


In [20]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 8       # set columns to number of random events in each simulation

# create array
rolls = np.random.choice([True, False], nrows * ncols).reshape(nrows, ncols)

# create dataframe
df = pd.DataFrame(rolls)

# add calculation columnns
df['number_of_heads'] = df.sum(axis=1)
df['exactly_three_heads'] = df.number_of_heads == 3
df['three_or_more_heads'] = df.number_of_heads >= 3

# get mean of calculation True/False columns
# format and print results
odds_of_three_heads = int(round(df.exactly_three_heads.mean(),2) * 100)
odds_of_three_or_more_heads = int(round(df.three_or_more_heads.mean(),2) * 100)

print(f'The odds of getting exactly three heads when flipping five coins is {odds_of_three_heads}%')
print(f'The odds of gettng three or more heads in the same five flips is {odds_of_three_or_more_heads}%')

The odds of getting exactly three heads when flipping five coins is 22%
The odds of gettng three or more heads in the same five flips is 86%


### 3) There are approximitely 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumni to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [22]:
nrows = 100_000 # set rows to number of simulations
ncols = 2       # set columns to number of bilboards in each simulation

In [23]:
# create data frame with each row representing a simulation and each column representing a bilboard in the simulation
rolls = np.random.choice(["DS","WEB","WEB","WEB"], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)
df.head()

Unnamed: 0,0,1
0,WEB,DS
1,WEB,WEB
2,WEB,WEB
3,WEB,WEB
4,DS,WEB


In [24]:
# add calculation columnn
df['both_ds'] = (df[0] == "DS") & (df[1] == "DS")
df

Unnamed: 0,0,1,both_ds
0,WEB,DS,False
1,WEB,WEB,False
2,WEB,WEB,False
3,WEB,WEB,False
4,DS,WEB,False
...,...,...,...
99995,WEB,WEB,False
99996,WEB,DS,False
99997,WEB,DS,False
99998,WEB,WEB,False


In [25]:
df.both_ds.mean()

0.06297

In [26]:
# get mean of calculation True/False columns
# format and print results
both_ds = int(round(df.both_ds.mean(),2) * 100)


print(f'The odds of both billboards being Data Science are {both_ds}%')

The odds of both billboards being Data Science are 6%


In [27]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 2       # set columns to number of bilboards in each simulation

# create data frame with each row representing a simulation and each column representing a bilboard in the simulation
rolls = np.random.choice(["DS","WEB","WEB","WEB"], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)

# add calculation columnn
df['both_ds'] = (df[0] == "DS") & (df[1] == "DS")

# get mean of calculation True/False columns
# format and print results
both_ds = int(round(df.both_ds.mean(),2) * 100)

print(f'The odds of both billboards being Data Science are {both_ds}%')

The odds of both billboards being Data Science are 6%


### 4) Codeup students buy, on average, 3 poptart packages (+- 1.5) a day from the snack vending machine. If on monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon?

In [29]:
nrows = 100_000 # set rows to number of simulations
ncols = 5       # set columns to number of days in each simulation

In [30]:
# create array with each row representing a simulation and each column representing a day in the simulation
rolls = np.random.normal(3, 1.5, nrows * ncols).astype(int).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)
df.head()

Unnamed: 0,0,1,2,3,4
0,5,3,2,1,3
1,0,1,2,1,4
2,3,5,1,4,4
3,1,3,3,5,2
4,2,2,3,2,0


In [31]:
# add calculation columnns
df['total_pop_tarts_bought'] = df.sum(axis=1)
df['pop_tarts_remain'] = df['total_pop_tarts_bought'] < 17
df.head()

Unnamed: 0,0,1,2,3,4,total_pop_tarts_bought,pop_tarts_remain
0,5,3,2,1,3,14,True
1,0,1,2,1,4,8,True
2,3,5,1,4,4,17,False
3,1,3,3,5,2,14,True
4,2,2,3,2,0,9,True


In [32]:
df['pop_tarts_remain'].mean()

0.87851

In [33]:
# get mean of calculation True/False columns
# format and print results
tarts = int(round(df['pop_tarts_remain'].mean(),2) * 100)


print(f'The odds of being able to by a PopTart in the afternoon on friday are {tarts}%')

The odds of being able to by a PopTart in the afternoon on friday are 88%


In [34]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 5       # set columns to number of days in each simulation

# create array with each row representing a simulation and each column representing a day in the simulation
rolls = np.random.normal(3, 1.5, nrows * ncols).astype(int).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(rolls)

# add calculation columnns
df['total_pop_tarts'] = df.sum(axis=1)
df['pop_tarts_remain'] = df['total_pop_tarts'] < 17

# get mean of calculation True/False columns
# format and print results
tarts = int(round(df['pop_tarts_remain'].mean(),2) * 100)

print(f'The odds of being able to by a PopTart in the afternoon on friday are {tarts}%')

The odds of being able to by a PopTart in the afternoon on friday are 88%


### 5) Men have an average height of 178 cm and standard deviation of 8cm. Women have a mean of 170, sd = 6cm. If a man and woman are chosen at random, P(woman taller than man)?

In [35]:
# create arrays of veriables
male_hights = np.random.normal(178, 8, 100_000)
female_hights = np.random.normal(170, 6, 100_000)

In [36]:
male_hights

array([178.43516344, 191.83858352, 174.60905105, ..., 193.54263372,
       181.15170989, 170.77968098])

In [37]:
female_hights

array([175.88336765, 180.77538162, 161.6464049 , ..., 173.95384645,
       163.96644415, 169.14894175])

In [38]:
# convert array to data frame
df = pd.DataFrame({"male_hights" : male_hights,
                   "female_hights" : female_hights})

df.head()

Unnamed: 0,male_hights,female_hights
0,178.435163,175.883368
1,191.838584,180.775382
2,174.609051,161.646405
3,169.76328,171.933599
4,176.682855,167.700606


In [39]:
# add calculation column
df['female_is_taller'] = df.female_hights > df.male_hights
df.head()

Unnamed: 0,male_hights,female_hights,female_is_taller
0,178.435163,175.883368,False
1,191.838584,180.775382,False
2,174.609051,161.646405,False
3,169.76328,171.933599,True
4,176.682855,167.700606,False


In [40]:
# get mean of female_is_taller
df['female_is_taller'].mean()

0.21135

In [41]:
# All together
# create arrays of veriables
male_hights = np.random.normal(178, 8, 100_000)
female_hights = np.random.normal(170, 6, 100_000)

# convert array to data frame
df = pd.DataFrame({"male_hights" : male_hights,
                   "female_hights" : female_hights})

# add calculated column
df['female_is_taller'] = df.female_hights > df.male_hights

# get mean of female_is_taller
df['female_is_taller'].mean()

# get mean of calculation True/False columns
# format and print results
taller_female = int(round(df['female_is_taller'].mean(),2) * 100)

print(f"The probability of a woman, chosen at random, being taller than a man, chosen at random is {taller_female}%")

The probability of a woman, chosen at random, being taller than a man, chosen at random is 21%


### 6) When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails.

### a) What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

In [42]:
nrows = 100_000 # set rows to number of simulations
ncols = 50      # set columns to number of days in each simulation

In [43]:
# create array with each row representing a simulation and each column representing a student in the simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)
installs

array([[False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False]])

In [44]:
# convert to data frame
df = pd.DataFrame(installs)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [45]:
# add calculation columnns
df['has_no_problem'] = df.sum(axis=1) == 0
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,41,42,43,44,45,46,47,48,49,has_no_problem
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
2,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True


In [46]:
df['has_no_problem'].mean()

0.81609

In [49]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 100      # set columns to number of students in each simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array with each row representing a simulation and each column representing a student in the simulation
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(installs)

# add calculation columnns
df['has_no_problem'] = df.sum(axis=1) == 0

# get mean of calculation True/False columns
# format and print results
problems = int(round(df['has_no_problem'].mean(),2) * 100)

print(f'The odds of 100 students dowloading Anaconda without a problem are {problems}%')

The odds of 100 students dowloading Anaconda without a problem are 67%


### b) What is the probability that we observe an installation issue within the first 150 students that download anaconda?

In [50]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 150      # set columns to number of students in each simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array with each row representing a simulation and each column representing a student in the simulation
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(installs)

# add calculation columnns
df['has_a_problem'] = df.sum(axis=1) > 0

# get mean of calculation True/False columns
# format and print results
problems = int(round(df['has_a_problem'].mean(),2) * 100)

print(f'The odds of a problem occuring in the first 150 instllations is {problems}%')

The odds of a problem occuring in the first 150 instllations is 45%


### 5) How likely is it that 450 students all download anaconda without an issue?

In [51]:
# All together
nrows = 100_000 # set rows to number of simulations
ncols = 450      # set columns to number of students in each simulation

# list with 249 falses and one true
issue_range = [False for r in range(1,250)]
issue_range.append(True)

# create array with each row representing a simulation and each column representing a student in the simulation
installs = np.random.choice(issue_range, nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(installs)

# add calculation columnns
df['has_no_problem'] = df.sum(axis=1) == 0

# get mean of calculation True/False columns
# format and print results
problems = int(round(df['has_no_problem'].mean(),2) * 100)

print(f'The odds of 450 instllations occuring without an issue is {problems}%')

The odds of 450 instllations occuring without an issue is 17%


### 6) There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [52]:
nrows = 100_000 # set rows to number of simulations
ncols = 3      # set columns to number of days in each simulation

In [53]:
# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([True, False], nrows * ncols, p=[.7,.3]).reshape(nrows, ncols)
sightings

array([[ True,  True,  True],
       [ True, False,  True],
       [ True,  True, False],
       ...,
       [ True,  True,  True],
       [False,  True,  True],
       [ True, False, False]])

In [54]:
# convert to data frame
df = pd.DataFrame(sightings)
df.head()

Unnamed: 0,0,1,2
0,True,True,True
1,True,False,True
2,True,True,False
3,True,False,True
4,True,True,False


In [55]:
# add calculation columnns
df['times_sighted'] = df.sum(axis=1)
df['not_sighted_three_days'] = df.times_sighted == 0
df.head()

Unnamed: 0,0,1,2,times_sighted,not_sighted_three_days
0,True,True,True,3,False
1,True,False,True,2,False
2,True,True,False,2,False
3,True,False,True,2,False
4,True,True,False,2,False


In [56]:
df['not_sighted_three_days'].mean()

0.02714

In [57]:
# get mean of calculation True/False columns
# format and print results
trucks = int(round(df['not_sighted_three_days'].mean(),2) * 100)

print(f'The odds of not seeing a food truck in Travis Park for three days is {trucks}%')

The odds of not seeing a food truck in Travis Park for three days is 3%


### 7) How likely is it that a food truck will show up sometime this week?

In [58]:
# Remaining week days
nrows = 100_000 # set rows to number of simulations
ncols = 2      # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([True, False], nrows * ncols, p=[.7,.3]).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['not_sighted_two_days'] = df.sum(axis=1) == 0

# get mean of calculation True/False columns
# format and print results
trucks = int(round(df['not_sighted_two_days'].mean(),4) * 100)

print(f'The odds of not seeing a food truck in Travis Park for {ncols} days is {trucks}%')

The odds of not seeing a food truck in Travis Park for 2 days is 8%


### 8) If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [60]:
nrows = 100_000 # set rows to number of simulations
ncols = 23       # set columns to number of people in each simulation

In [61]:
# create array with each row representing a simulation and each column representing the birthdate of a person in the simulation
birth_days = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)
birth_days

array([[302,  49,  95, ...,  25, 231, 188],
       [122, 243, 252, ..., 217,  34, 173],
       [105, 339, 235, ..., 273, 107, 344],
       ...,
       [360,  15, 277, ..., 237, 211, 168],
       [187, 217, 212, ..., 333,  62, 271],
       [ 58,  19, 155, ...,  11,  31, 324]])

In [62]:
# convert to data frame
df = pd.DataFrame(birth_days)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,302,49,95,4,324,28,67,73,263,134,...,172,13,363,116,300,52,16,25,231,188
1,122,243,252,248,63,269,171,189,130,167,...,311,231,38,228,329,45,32,217,34,173
2,105,339,235,14,353,104,308,218,337,268,...,243,353,194,220,333,74,338,273,107,344
3,58,132,76,135,208,165,208,4,13,232,...,110,157,324,150,274,259,288,103,85,257
4,132,31,259,291,29,254,242,195,233,214,...,64,242,192,325,153,110,84,346,308,7


In [63]:
# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15,16,17,18,19,20,21,22,at_least_two_matching_birthdays,exactly_two_matching_birthdays
0,302,49,95,4,324,28,67,73,263,134,...,363,116,300,52,16,25,231,188,False,False
1,122,243,252,248,63,269,171,189,130,167,...,38,228,329,45,32,217,34,173,False,False
2,105,339,235,14,353,104,308,218,337,268,...,194,220,333,74,338,273,107,344,True,False
3,58,132,76,135,208,165,208,4,13,232,...,324,150,274,259,288,103,85,257,True,False
4,132,31,259,291,29,254,242,195,233,214,...,192,325,153,110,84,346,308,7,True,False


In [64]:
# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly two people that share a birthday are {exactly_two_matching_birthdays}%')

The odds of a room of 23 people having at least two people that share a birthday are 51%
The odds of a room of 23 people having exactly two people that share a birthday are 13%


In [None]:
nrows = 100_000 # set rows to number of simulations
ncols = 23       # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1

# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly least two people that share a birthday are {exactly_two_matching_birthdays}%')

### 20 people

In [65]:
nrows = 100_000 # set rows to number of simulations
ncols = 20       # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1

# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly least two people that share a birthday are {exactly_two_matching_birthdays}%')

The odds of a room of 20 people having at least two people that share a birthday are 41%
The odds of a room of 20 people having exactly least two people that share a birthday are 9%


### 40 people

In [66]:
nrows = 100_000 # set rows to number of simulations
ncols = 40       # set columns to number of students in each simulation

# create array with each row representing a simulation and each column representing a student in the simulation
sightings = np.random.choice([r for r in range(366)], nrows * ncols).reshape(nrows, ncols)

# convert to data frame
df = pd.DataFrame(sightings)

# add calculation columnns
df['at_least_two_matching_birthdays'] = df.nunique(axis=1) < ncols
df['exactly_two_matching_birthdays'] = df.nunique(axis=1) == ncols-1

# get mean of calculation True/False columns
# format and print results
at_least_two_matching_birthdays = int(round(df['at_least_two_matching_birthdays'].mean(),2) * 100)
exactly_two_matching_birthdays = int(round(df['exactly_two_matching_birthdays'].mean(),2) * 100)

print(f'The odds of a room of {ncols} people having at least two people that share a birthday are {at_least_two_matching_birthdays}%')
print(f'The odds of a room of {ncols} people having exactly two people that share a birthday are {exactly_two_matching_birthdays}%')

The odds of a room of 40 people having at least two people that share a birthday are 89%
The odds of a room of 40 people having exactly two people that share a birthday are 28%
