Do your work for this exercise in either a python file named `simulation.py` or a jupyter notebook named `simulation.ipynb`.

In [800]:
# Import libraries
import numpy as np
import pandas as pd

# Global simulation settings used by the exercises below
n_trials = 10 ** 5    # number of simulated repetitions per experiment
np.random.seed(123)   # seed NumPy's global RNG so reruns reproduce the same draws

1. How likely is it that you roll doubles when rolling two dice?

In [803]:
# Exercise-specific inputs: the faces of a six-sided die and how many dice we roll
n_dice = 2
outcomes = [*range(1, 7)]

In [805]:
# One row per trial, one column per die
rolls = np.random.choice(outcomes, size=(n_trials, n_dice))
rolls[:10]

array([[6, 3],
       [5, 3],
       [2, 4],
       [3, 4],
       [2, 2],
       [1, 2],
       [2, 1],
       [1, 2],
       [4, 6],
       [5, 1]])

In [807]:
# A trial is a double when both dice show the same face;
# the mean of that boolean mask is the share of trials that are doubles.
(rolls[:, 0] == rolls[:, 1]).mean()

0.16497

2. If you flip 8 coins, what is the probability of getting exactly 3 heads? What is the probability of getting more than 3 heads?

In [810]:
# Switch the outcome space to coin faces and flip 8 coins per trial
n_coins = 8
outcomes = ['H', 'T']

# One row per trial, one column per coin
flips = np.random.choice(outcomes, size=(n_trials, n_coins))
flips[:10]

array([['T', 'T', 'T', 'T', 'H', 'T', 'T', 'H'],
       ['H', 'H', 'H', 'H', 'T', 'H', 'T', 'H'],
       ['H', 'H', 'T', 'T', 'T', 'T', 'H', 'H'],
       ['H', 'T', 'T', 'T', 'H', 'T', 'H', 'H'],
       ['T', 'T', 'H', 'T', 'H', 'H', 'T', 'H'],
       ['T', 'H', 'T', 'H', 'T', 'H', 'T', 'T'],
       ['H', 'H', 'T', 'H', 'T', 'T', 'T', 'H'],
       ['H', 'H', 'H', 'T', 'H', 'T', 'T', 'T'],
       ['T', 'H', 'H', 'T', 'H', 'H', 'T', 'T'],
       ['T', 'H', 'H', 'T', 'H', 'T', 'T', 'T']], dtype='<U1')

In [812]:
# Probability of exactly 3 heads

# (flips == 'H') marks each head; summing along a row counts heads in that trial.
# Comparing the count to 3 gives one boolean per trial, and the mean of those
# booleans is the share of trials with exactly 3 heads.
(np.sum(flips == 'H', axis=1) == 3).mean()

0.22128

In [814]:
# More than 3 heads is the same as at least 4, since head counts are whole numbers
((flips == 'H').sum(axis=1) >= 4).mean()

0.63372

3. There are approximately 3 web development cohorts for every 1 data science cohort at Codeup. Assuming that Codeup randomly selects an alumnus to put on a billboard, what are the odds that the two billboards I drive past both have data science students on them?

In [817]:
# 3 web dev cohorts to 1 data science cohort => 1 in 4 cohorts is data science
# Each billboard is treated as an independent pick with a 1/4 chance of showing DS
# So both billboards showing DS should come out to 1/4 * 1/4 = 1/16
1/16

0.0625

In [819]:
# Encode the 3:1 ratio directly in the outcome pool: one 'DS' entry per three 'WD'
n_boards = 2
outcomes = ['DS', 'WD', 'WD', 'WD']

# One row per trial, one column per billboard
billboards = np.random.choice(outcomes, size=(n_trials, n_boards))
billboards[:10]

array([['WD', 'WD'],
       ['WD', 'WD'],
       ['DS', 'WD'],
       ['WD', 'DS'],
       ['WD', 'DS'],
       ['WD', 'WD'],
       ['WD', 'DS'],
       ['WD', 'DS'],
       ['WD', 'WD'],
       ['WD', 'WD']], dtype='<U2')

In [821]:
# Mark every billboard that shows 'DS' (Data Science)
# A trial counts only when every billboard in the row is 'DS'
# (with two billboards per row this is the same as the row sum equalling 2)
# Count those trials and divide by the number of trials
(billboards == 'DS').all(axis=1).sum() / billboards.shape[0]

0.06396

In [823]:
# Same calculation, letting .mean() do the count-and-divide in one step
(billboards == 'DS').all(axis=1).mean()

0.06396

In [825]:
# And the simulated 0.06396 is approximately the 1/16 = 0.0625 we expected to see

4. Codeup students buy, on average, 3 poptart packages with a standard deviation of 1.5 a day from the snack vending machine. If on Monday the machine is restocked with 17 poptart packages, how likely is it that I will be able to buy some poptarts on Friday afternoon? (Remember, if you have mean and standard deviation, use the `np.random.normal`) *You'll need to make a judgement call on how to handle some of your values*

In [828]:
# Draw daily poptart purchases: mean 3, standard deviation 1.5,
# one row per trial and five columns (one per day, Monday through Friday).
# Judgement call: the draws are left continuous and unclipped, so fractional
# and occasionally negative daily values are possible.
daily_poptarts = np.random.normal(3.0, 1.5, (n_trials, 5))
daily_poptarts[:10]

array([[1.68798591, 2.7698127 , 3.30310137, 3.45726925, 0.55099958],
       [2.95339273, 1.30502667, 5.53459412, 4.82111324, 3.64236023],
       [4.29160809, 1.96440374, 1.6045665 , 4.05790293, 2.53907919],
       [1.52474342, 5.63936177, 4.04014284, 5.34792325, 3.85892812],
       [1.8058607 , 2.67899881, 2.06632554, 2.12347509, 4.29128652],
       [4.87627538, 6.69561876, 1.07489227, 3.12003442, 0.63729615],
       [6.21811961, 4.19961101, 1.38112108, 4.94211217, 2.83728238],
       [3.54715382, 4.90212404, 0.84800817, 2.84666182, 3.31683785],
       [2.66662526, 4.270446  , 4.61220303, 2.62988572, 3.24501702],
       [5.55606049, 2.9311997 , 1.13641218, 2.5483197 , 2.87501629]])

In [830]:
# Assumption: the week's purchases are just the row totals, so
# 17 (the Monday restock) minus each row total approximates the packages
# still in the machine on Friday afternoon/evening
end_consumption = 17 - np.sum(daily_poptarts, axis=1)
end_consumption[:10]

array([ 5.23083119, -1.25648699,  2.54243955, -3.4110994 ,  4.03405334,
        0.59588302, -2.57824625,  1.5392143 , -0.42417704,  1.95299163])

In [832]:
# `end_consumption` holds the packages left in the machine for each trial.
# Can't buy less than 1 package, so a trial succeeds when at least one whole
# package remains; exactly 1 left still counts, hence >= rather than >.
# The mean of those booleans is the simulated probability.
(end_consumption >= 1).mean()

0.61516

In [834]:
# Whole poptart simulation in one cell:
# draw five days of purchases per trial, subtract the weekly total from the
# 17 stocked packages, then take the share of trials with at least one
# package left (>= 1, so exactly one remaining package counts as a success).
daily_poptarts = np.random.normal(loc=3.0,scale=1.5,size=(n_trials,5))
end_consumption = 17 - daily_poptarts.sum(axis=1)
(end_consumption >= 1).mean().round(2)

# End value tends to be about 0.62, rounded

0.62

5. Compare Heights

- Men have an average height of 178 cm and standard deviation of 8cm.

In [838]:
# Male height distribution parameters (cm)
m_sd = 8
m_ht = 178

# Simulate one male height per trial
m_heights = np.random.normal(loc=m_ht, scale=m_sd, size=n_trials)
m_heights[:10]

array([175.06731455, 180.16580625, 184.72608636, 179.99107353,
       173.11957806, 183.59234394, 183.99629043, 182.59218779,
       161.61620243, 187.14400529])

- Women have an average height of 170 cm and a standard deviation of 6 cm.

In [841]:
# Female height distribution parameters (cm)
f_sd = 6
f_ht = 170

# Simulate one female height per trial
f_heights = np.random.normal(loc=f_ht, scale=f_sd, size=n_trials)
f_heights[:10]

array([166.15346864, 166.03998357, 165.4708793 , 170.00408182,
       177.51055964, 172.31119058, 177.30114656, 180.99675224,
       169.07419928, 165.68364652])

- Since you have means and standard deviations, you can use `np.random.normal` to generate observations.

In [844]:
# Pair a randomly drawn man with a randomly drawn woman for every trial.
# A DataFrame is optional here, but named columns keep the comparison readable.
# np.random.choice samples with replacement by default.

heights = pd.DataFrame(
    dict(
        male_height=np.random.choice(m_heights, size=n_trials),
        female_height=np.random.choice(f_heights, size=n_trials),
    )
)

heights.describe().round(0)

Unnamed: 0,male_height,female_height
count,100000.0,100000.0
mean,178.0,170.0
std,8.0,6.0
min,147.0,145.0
25%,173.0,166.0
50%,178.0,170.0
75%,183.0,174.0
max,208.0,196.0


- If a man and woman are chosen at random, what is the likelihood the woman is taller than the man?

In [847]:
# Share of simulated pairs in which the woman is taller than the man
heights['female_height'].gt(heights['male_height']).mean()

0.21457

6. When installing anaconda on a student's computer, there's a 1 in 250 chance that the download is corrupted and the installation fails. What are the odds that after having 50 students download anaconda, no one has an installation issue? 100 students?

In [850]:
# Analytic check before simulating:
# p is the chance that one download is corrupted (1 in 250),
# n is the number of students downloading.
# All n downloads succeed with probability (1 - p) ** n.
p, n = 1 / 250, 50
pow(1 - p, n)

# Probability that none of the 50 students has an installation issue:

0.8184024506760997

In [852]:
# Simulation: one uniform draw in [0, 1) per student, n students per trial
downloads = np.random.random(size=(n_trials, n))
downloads[:2]

array([[0.12310616, 0.52675227, 0.20126506, 0.18043498, 0.80104613,
        0.62691938, 0.27662418, 0.81050405, 0.60229298, 0.05055323,
        0.81671477, 0.94373607, 0.72268724, 0.15622368, 0.34682255,
        0.31358997, 0.47121437, 0.28991572, 0.73319046, 0.86925656,
        0.1681269 , 0.42222747, 0.45120135, 0.00439776, 0.84075202,
        0.29748951, 0.03844183, 0.00567731, 0.15562885, 0.31130481,
        0.06799421, 0.72630896, 0.80293094, 0.23245541, 0.5135685 ,
        0.64329465, 0.5002918 , 0.27122958, 0.58331618, 0.75355195,
        0.46456472, 0.83569144, 0.22662404, 0.14728746, 0.77547688,
        0.9003262 , 0.46647358, 0.17055439, 0.7239486 , 0.48968672],
       [0.86588007, 0.98744363, 0.60769011, 0.73389239, 0.17195023,
        0.36624114, 0.22410136, 0.04640949, 0.02370488, 0.31819625,
        0.9208575 , 0.44227546, 0.71155077, 0.59375613, 0.45129411,
        0.20459393, 0.84094231, 0.45397945, 0.41321107, 0.12560271,
        0.82872283, 0.02012217, 0.18634058, 0.0

In [854]:
# A download succeeds when its draw lands above the failure probability p
download_success = p < downloads
download_success[:2]

array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True]])

In [856]:
# A trial counts when every student in the row succeeded
# (equivalent to the row's success count equalling n, the number of columns)
download_success.all(axis=1).mean()

0.8192

In [858]:
# Scale up to 100 students; p is unchanged from the earlier cell
n = 100
pow(1 - p, n)

# Probability that none of the 100 students has an issue:

0.6697825712726458

In [860]:
# Repeat the simulation with the current n (100 students per trial)
downloads = np.random.random(size=(n_trials, n))
download_success = p < downloads
download_success.all(axis=1).mean()

0.66692

    What is the probability that we observe an installation issue within the first 150 students that download anaconda?

In [863]:
# 150 students this time, and the question is flipped:
# we want at least one failure instead of zero failures
n = 150

# "At least one failure" is the complement of "no failures at all"
1 - pow(1 - p, n)

# Expected answer:

0.4518483022503271

In [865]:
# A draw at or below p marks a failed download;
# a trial counts when any student in the row failed
downloads = np.random.random(size=(n_trials, n))
download_fail = downloads <= p
download_fail.any(axis=1).mean()

0.45283

    How likely is it that 450 students all download anaconda without an issue?

In [868]:
# 450 students, all succeeding
n = 450

pow(1 - p, n)

# Expected answer:

0.1647032961586129

In [870]:
# Same simulation with the current n (450 students per trial):
# a trial counts when every download in the row succeeded
downloads = np.random.random(size=(n_trials, n))
download_success = p < downloads
download_success.all(axis=1).mean()

0.16506

7. There's a 70% chance on any given day that there will be at least one food truck at Travis Park. However, you haven't seen a food truck there in 3 days. How unlikely is this?

In [938]:
# p is the daily chance of at least one truck, n the number of days observed
p, n = 7 / 10, 3

pow(1 - p, n)
# Probability of no food truck appearing on any of the 3 days:

0.027000000000000014

In [875]:
# Disabled alternative: simulate truck days by thresholding uniform draws
# against p instead of sampling labelled outcomes. Kept for reference only;
# nothing in this cell executes.
# prob_of_truck=np.random.random((n_trials,n))
# prob_of_truck[:10]
# truck_found=prob_of_truck < p
# truck_found[:10]
# (truck_found.sum(axis=1) == 0).mean()

In [877]:
# The two outcomes are not equally likely, so pass explicit weights:
# 'Truck' with probability 0.7 and 'No Truck' with probability 0.3
prob_of_truck = np.random.choice(
    ['Truck', 'No Truck'],
    size=(n_trials, n),
    p=[0.7, 0.3],
)
prob_of_truck[:10]

array([['Truck', 'No Truck', 'Truck'],
       ['Truck', 'No Truck', 'No Truck'],
       ['Truck', 'Truck', 'Truck'],
       ['Truck', 'No Truck', 'Truck'],
       ['Truck', 'Truck', 'No Truck'],
       ['No Truck', 'No Truck', 'Truck'],
       ['Truck', 'Truck', 'No Truck'],
       ['Truck', 'Truck', 'No Truck'],
       ['Truck', 'Truck', 'Truck'],
       ['Truck', 'Truck', 'Truck']], dtype='<U8')

In [879]:
# Boolean mask: True wherever a truck showed up that day
truck_found = (prob_of_truck == 'Truck')
truck_found[:10]

array([[ True, False,  True],
       [ True, False, False],
       [ True,  True,  True],
       [ True, False,  True],
       [ True,  True, False],
       [False, False,  True],
       [ True,  True, False],
       [ True,  True, False],
       [ True,  True,  True],
       [ True,  True,  True]])

In [881]:
# Share of trials in which no day in the row had a truck
(~truck_found.any(axis=1)).mean()

0.02792

    How likely is it that a food truck will show up sometime this week?

In [942]:
# Simulate a full 7-day week
n = 7

# Uniform draws in [0, 1): a draw below p marks a day with a truck
prob_of_truck = np.random.random(size=(n_trials, n))
truck_found = prob_of_truck < p
# Share of simulated weeks with a truck on at least one day
truck_found.any(axis=1).mean()

0.99985

8. If 23 people are in the same room, what are the odds that two of them share a birthday? What if it's 20 people? 40?

In [887]:
# Birthday problem inputs: days of the year numbered 1..365 (no leap day),
# n people in the room, and p as the chance of any one specific day
birthdays = [*range(1, 366)]
n = 23
p = 1 / 365

In [889]:
# One row per simulated room, one column per person
birthday_df = pd.DataFrame(np.random.choice(birthdays, size=(n_trials, n)))
birthday_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13,14,15,16,17,18,19,20,21,22
0,219,212,157,281,6,239,150,338,172,5,...,206,261,230,132,136,177,140,265,362,32
1,126,154,308,91,27,204,6,305,196,238,...,313,94,237,12,89,102,285,353,49,250
2,185,68,245,275,184,53,12,87,132,3,...,118,216,187,13,305,336,314,156,216,300
3,263,302,219,15,113,115,229,54,36,179,...,162,79,302,22,131,37,347,9,360,200
4,255,337,100,122,321,313,24,350,148,267,...,239,271,154,343,277,287,109,18,186,42


In [891]:
# Exactly two people share a birthday and nobody else does:
# the room then has n - 1 distinct birthdays
birthday_df.nunique(axis=1).eq(n - 1).mean()

0.36393

In [892]:
# At least two people share a birthday: fewer than n distinct birthdays in the room
birthday_df.nunique(axis=1).lt(n).mean()

0.50665

In [893]:
# Same simulation for a room of 20 people
n = 20
birthday_df = pd.DataFrame(np.random.choice(birthdays, size=(n_trials, n)))
birthday_df.nunique(axis=1).lt(n).mean()

0.41281

In [894]:
# Same simulation for a room of 40 people
n = 40
birthday_df = pd.DataFrame(np.random.choice(birthdays, size=(n_trials, n)))
birthday_df.nunique(axis=1).lt(n).mean()

0.89259