# Simple Random Sampling

In [25]:
import pandas as pd
import numpy as np

In [2]:
df_2015 = pd.read_csv('./data_pd/marathon_results_2015.csv')
df_2016 = pd.read_csv('./data_pd/marathon_results_2016.csv')
df_2017 = pd.read_csv('./data_pd/marathon_results_2017.csv')
df = pd.concat([df_2015,df_2016,df_2017])
df.head(5)

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8
0,0.0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,1:32:00,1:47:59,2:02:39,0:04:56,-,2:09:17,1,1,1,
1,1.0,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,,,...,1:31:59,1:47:59,2:02:42,0:04:58,-,2:09:48,2,2,2,
2,2.0,8,"Chebet, Wilson",29,M,Marakwet,,KEN,,,...,1:32:00,1:47:59,2:03:01,0:04:59,-,2:10:22,3,3,3,
3,3.0,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,,,...,1:32:00,1:48:03,2:03:47,0:05:00,-,2:10:47,4,4,4,
4,4.0,10,"Korir, Wesley",32,M,Kitale,,KEN,,,...,1:32:00,1:47:59,2:03:27,0:05:00,-,2:10:49,5,5,5,


In [3]:
# Get the total number of rows in the original dataframe
original_row_count = len(df)


In [4]:
# perform Simple random sampling (select 1/10th of the records)
sample_df = df.sample(frac=0.1, random_state=42)
sample_df

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8
24038,24038.0,31350,"Labonte, Laura R.",34,F,Charlestown,MA,USA,,,...,3:22:19,3:56:22,4:29:25,0:10:49,-,4:43:30,24039,10561,5235,
20218,20218.0,21352,"Vargas, Mario",64,M,Shaker Heights,OH,USA,,,...,2:52:26,3:24:14,3:54:58,0:09:28,-,4:08:04,20219,11836,568,
23755,23755.0,27235,"Wilkins, Stephen D.",53,M,Dorchester,MA,USA,,,...,3:14:37,3:49:59,4:24:08,0:10:40,-,4:39:30,23756,13348,1896,
1447,1447.0,4337,"Johnson, David T",32,M,Brandon,FL,USA,,,...,2:00:46,2:21:31,2:43:11,0:06:39,-,2:54:17,1448,1376,1108,
25121,,26089,"Bashor, Mari W",67,F,Sacramento,CA,USA,,,...,3:35:49,4:17:13,4:56:18,0:11:58,5:13:34,5:13:34,25121,11285,87,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23322,23322.0,11427,"Silwal, Suman",45,M,Pelham,AL,USA,,,...,2:47:50,3:31:03,4:16:56,0:10:29,-,4:34:32,23323,13155,2301,
5102,,10536,"Baganz, Paul",62,M,Fond Du Lac,WI,USA,,,...,2:21:55,2:46:20,3:10:05,0:07:40,3:20:41,3:20:41,5103,4349,25,
12821,12821.0,12730,"Strutz, Nicole",22,F,Two Rivers,WI,USA,,,...,2:37:04,3:09:17,3:39:22,0:08:48,-,3:50:20,12823,4327,2843,
13241,,21185,"Greengrass, Suzie",45,F,Cary,NC,USA,,,...,2:39:19,3:08:02,3:35:21,0:08:42,3:47:50,3:47:50,13241,4490,482,


In [5]:
# Get the total number of rows in the sampled dataframe
sample_row_count = len(sample_df)
df.head()
sample_df.head()


Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8
24038,24038.0,31350,"Labonte, Laura R.",34,F,Charlestown,MA,USA,,,...,3:22:19,3:56:22,4:29:25,0:10:49,-,4:43:30,24039,10561,5235,
20218,20218.0,21352,"Vargas, Mario",64,M,Shaker Heights,OH,USA,,,...,2:52:26,3:24:14,3:54:58,0:09:28,-,4:08:04,20219,11836,568,
23755,23755.0,27235,"Wilkins, Stephen D.",53,M,Dorchester,MA,USA,,,...,3:14:37,3:49:59,4:24:08,0:10:40,-,4:39:30,23756,13348,1896,
1447,1447.0,4337,"Johnson, David T",32,M,Brandon,FL,USA,,,...,2:00:46,2:21:31,2:43:11,0:06:39,-,2:54:17,1448,1376,1108,
25121,,26089,"Bashor, Mari W",67,F,Sacramento,CA,USA,,,...,3:35:49,4:17:13,4:56:18,0:11:58,5:13:34,5:13:34,25121,11285,87,


In [6]:
print(f"Number of rows in the original dataframe: {original_row_count}")
print(f"Number of rows in the sampled dataframe: {sample_row_count}")

Number of rows in the original dataframe: 79638
Number of rows in the sampled dataframe: 7964


# Systematic Random Sampling

In [7]:
# perform systematic sampling
# ::7 means from the start to end, pick every 7th record
# , : means pick all the columns
sample_df = df.iloc[::7, :]

In [8]:
sample_row_count = len(sample_df)

In [9]:
# Print first few rows of the original and sampled dataframes
print("First few rows of the original dataframes: ")
df.head()
print("\n First few rows of the sampled dataframe: ")
sample_df.head()

First few rows of the original dataframes: 

 First few rows of the sampled dataframe: 


Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8
0,0.0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,1:32:00,1:47:59,2:02:39,0:04:56,-,2:09:17,1,1,1,
7,7.0,1,"Keflezighi, Meb",39,M,San Diego,CA,USA,,,...,1:31:59,1:47:59,2:04:58,0:05:04,-,2:12:42,8,8,8,
14,14.0,76,"Goffi, Danilo",42,M,Parabiago - Milan,,ITA,,,...,1:38:02,1:54:55,2:11:25,0:05:18,-,2:18:44,15,15,1,
21,21.0,32,"Zywicki, Benjamin P.",26,M,Louisville,CO,USA,,,...,1:38:46,1:56:05,2:13:25,0:05:24,-,2:21:10,22,22,21,
28,28.0,106,"Glaz, Daniel",32,M,Chicago,IL,USA,,,...,1:42:19,1:59:50,2:17:03,0:05:32,-,2:24:44,29,29,27,


In [10]:
print(f"Number of rows in the original dataframe: {original_row_count}")
print(f"Number of rows in the sampled dataframe: {sample_row_count}")

Number of rows in the original dataframe: 79638
Number of rows in the sampled dataframe: 11377


# Stratified Random Sampling

In [None]:
# Find total number of male and female atheletes
total_males = len(df[df["M/F"] == "M"])
total_females = len(df[df["M/F"] == "F"])
total_males, total_females

(43482, 36156)

In [16]:
# Calculate the proportionate number of male and female atheletes for the sample
sample_size = int(0.1*original_row_count)
sample_males = int(0.1*total_males)
sample_females = int(0.1*total_females)

In [17]:
# Generate proportionate sample of male and female atheletes
male_sample = df[df['M/F'] == 'M'].sample(n=sample_males, random_state=42)
female_sample = df[df['M/F'] == 'F'].sample(n=sample_females, random_state=42)

In [18]:
# Combine male and female sample into a single sample dataframe
sample_df = pd.concat([male_sample, female_sample])

In [21]:
# GEt the total number of rows in sampled dataframe
sample_row_count = len(sample_df)

In [22]:
print(f"Number of rows in the original dataframe: {original_row_count}")
print(f"Number of rows in the sampled dataframe: {sample_row_count}")

Number of rows in the original dataframe: 79638
Number of rows in the sampled dataframe: 7963


In [24]:
# Print the number of male and female atheletes in the sapmled daatframe
print(f"Number of male atheletes in the sample dataframe: {len(sample_df[sample_df['M/F'] == 'M'])}")
print(f"Number of female atheletes in the sample dataframe: {len(sample_df[sample_df['M/F'] == 'F'])}")

Number of male atheletes in the sample dataframe: 4348
Number of female atheletes in the sample dataframe: 3615


# Cluster Sampling

In [32]:
# Find the unique countries the atheletes are from
unique_countries = df['Country'].unique()
num_countries = len(unique_countries)
unique_countries, num_countries

(array(['ETH', 'KEN', 'USA', 'UKR', 'RSA', 'ITA', 'RUS', 'JPN', 'CAN',
        'BEL', 'NZL', 'BLR', 'AUS', 'GBR', 'CRO', 'ECU', 'GER', 'ESP',
        'SWE', 'BRA', 'HKG', 'MEX', 'DEN', 'MAS', 'IRL', 'ISL', 'CHI',
        'GUA', 'FIN', 'SVK', 'COL', 'SUI', 'CHN', 'AUT', 'NED', 'FRA',
        'CRC', 'CYP', 'POL', 'NOR', 'KOR', 'POR', 'TPE', 'PER', 'SIN',
        'PAN', 'VEN', 'LUX', 'CZE', 'VIE', 'MAR', 'BER', 'ROU', 'IND',
        'LIE', 'ARG', 'ESA', 'DOM', 'ISR', 'GRE', 'SLO', 'LTU', 'URU',
        'CAY', 'EST', 'JAM', 'UAE', 'VGB', 'TUR', 'LAT', 'AND', 'OMA',
        'BAH', 'TRI', 'INA', 'AHO', 'PHI', 'UGA', 'QAT', 'ZIM', 'TWN',
        'HON', 'FLK', 'HUN', 'JOR', 'ALB', 'SRB', 'PAK', 'BRN', 'BDI',
        'EGY', 'THA', 'MLT', 'KSA', 'SMR', 'BUL', 'ALG', 'NCA', 'BAR',
        'GRN', 'PAR', 'TCA', 'KUW', 'MGL', 'NGR'], dtype=object),
 105)

In [33]:
# Randomly select half of the countries
np.random.seed(42)
select_countries = np.random.choice(unique_countries, size = num_countries//2, replace=False)
select_countries

array(['COL', 'JAM', 'EST', 'IND', 'PAN', 'SMR', 'NGR', 'LUX', 'NZL',
       'ETH', 'SWE', 'SUI', 'BDI', 'ALG', 'UGA', 'RSA', 'TWN', 'AUT',
       'AUS', 'CHI', 'GRN', 'ARG', 'DEN', 'PHI', 'SIN', 'BAH', 'ECU',
       'TPE', 'KOR', 'BEL', 'ALB', 'BLR', 'MGL', 'QAT', 'FIN', 'ZIM',
       'ITA', 'URU', 'ESA', 'NOR', 'FRA', 'GER', 'UAE', 'NED', 'JPN',
       'PER', 'TUR', 'LAT', 'GUA', 'BRA', 'JOR', 'ISL'], dtype=object)

In [28]:
# Create a new datadrame containing all the records of the selected counties
sample_df = df[df['Country'].isin(select_countries)]

In [31]:
# Get the total number of rows in the smpled dataframe
sample_row_count = len(sample_df)
sample_row_count

5331

In [30]:
# Print the first few rows of original sampled dataframe
print("Original dataframe : ")
print(df.head())
print("Sample dataframe : ")
print(sample_df.head())

Original dataframe : 
   Unnamed: 0 Bib                   Name  Age M/F         City State Country  \
0         0.0   3         Desisa, Lelisa   25   M         Ambo   NaN     ETH   
1         1.0   4  Tsegay, Yemane Adhane   30   M  Addis Ababa   NaN     ETH   
2         2.0   8         Chebet, Wilson   29   M     Marakwet   NaN     KEN   
3         3.0  11       Kipyego, Bernard   28   M      Eldoret   NaN     KEN   
4         4.0  10          Korir, Wesley   32   M       Kitale   NaN     KEN   

  Citizen Unnamed: 9  ...      30K      35K      40K     Pace Proj Time  \
0     NaN        NaN  ...  1:32:00  1:47:59  2:02:39  0:04:56         -   
1     NaN        NaN  ...  1:31:59  1:47:59  2:02:42  0:04:58         -   
2     NaN        NaN  ...  1:32:00  1:47:59  2:03:01  0:04:59         -   
3     NaN        NaN  ...  1:32:00  1:48:03  2:03:47  0:05:00         -   
4     NaN        NaN  ...  1:32:00  1:47:59  2:03:27  0:05:00         -   

  Official Time Overall Gender Division Unname

In [34]:
print(f"Number of rows in the original dataframe: {original_row_count}")
print(f"Number of rows in the sampled dataframe: {sample_row_count}")

Number of rows in the original dataframe: 79638
Number of rows in the sampled dataframe: 5331


# Assignment
    - Which x_bar is closest to meu(u) {based on the 4 sampling techniques}

In [35]:
df[['Hours','Minutes','Seconds']]  = df['Official Time'].str.split(':', expand=True)
df['Overall_duration'] = df['Hours'].astype(int)*3600 + df['Minutes'].astype(int)*60 + df['Seconds'].astype(int)
df.head()


Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8,Hours,Minutes,Seconds,Overall_duration
0,0.0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,-,2:09:17,1,1,1,,2,9,17,7757
1,1.0,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,,,...,-,2:09:48,2,2,2,,2,9,48,7788
2,2.0,8,"Chebet, Wilson",29,M,Marakwet,,KEN,,,...,-,2:10:22,3,3,3,,2,10,22,7822
3,3.0,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,,,...,-,2:10:47,4,4,4,,2,10,47,7847
4,4.0,10,"Korir, Wesley",32,M,Kitale,,KEN,,,...,-,2:10:49,5,5,5,,2,10,49,7849


In [40]:
results = []

In [41]:
meu = df['Overall_duration'].mean()
results.append({"meu": meu})
print(f"Value of meu : {meu}")

Value of meu : 13989.929166980588


In [42]:
sample_df_simple = df.sample(frac=0.1, random_state=42)
sample_df_simple
xbar_simple = sample_df_simple['Overall_duration'].mean()
results.append({"xbar_simple": xbar_simple})
print(f"Value of xbar from simple random sampling : {xbar_simple}")


Value of xbar from simple random sampling : 13955.71320944249


In [44]:
sample_df_systematic = df.iloc[::7, :]
xbar_systematic = sample_df_systematic['Overall_duration'].mean()
results.append({"xbar_systematic": xbar_systematic})
print(f"Value of xbar from systematic sampling : {xbar_systematic}")

Value of xbar from systematic sampling : 13988.119715214907


In [46]:
male_sample = df[df['M/F'] == 'M'].sample(n=sample_males, random_state=42)
female_sample = df[df['M/F'] == 'F'].sample(n=sample_females, random_state=42)
sample_df_stratified = pd.concat([male_sample, female_sample])
xbar_stratified = sample_df_stratified['Overall_duration'].mean()
results.append({"xbar_stratified": xbar_stratified})
print(f"Value of xbar from strafied sampling : {xbar_stratified}")


Value of xbar from strafied sampling : 14009.489137259827


In [47]:
sample_df_cluster = df[df['Country'].isin(select_countries)]
xbar_cluster = sample_df_cluster['Overall_duration'].mean()
results.append({"xbar_cluster": xbar_cluster})
print(f"Value of xbar from cluster sampling : {xbar_cluster}")

Value of xbar from cluster sampling : 13891.37385105984


In [48]:
results

[{'meu': 13989.929166980588},
 {'xbar_simple': 13955.71320944249},
 {'xbar_systematic': 13988.119715214907},
 {'xbar_stratified': 14009.489137259827},
 {'xbar_cluster': 13891.37385105984}]

In [None]:
x = results.sort(key= lambda x: x.items())

In [None]:
dic = {'meu': 13989.929166980588,
 'xbar_simple': 13955.71320944249,
 'xbar_systematic': 13988.119715214907,
 'xbar_stratified': 14009.489137259827,
 'xbar_cluster': 13891.37385105984}
closest = 10000000000000
s = ""
for key, value in dic.items():
    if key == 'meu':
        pass
    else:
        if abs(value-dic['meu']) < closest:
            closest = abs(value-dic['meu'])
            s = key

s, closest  

('xbar_systematic', 1.809451765680933)

In [79]:
ls = list(dic.items())
ls

[('meu', 13989.929166980588),
 ('xbar_simple', 13955.71320944249),
 ('xbar_systematic', 13988.119715214907),
 ('xbar_stratified', 14009.489137259827),
 ('xbar_cluster', 13891.37385105984)]

In [83]:
ls = sorted(ls, key = lambda item: (abs(item[1] -dic['meu'])))
ls

[('meu', 13989.929166980588),
 ('xbar_systematic', 13988.119715214907),
 ('xbar_stratified', 14009.489137259827),
 ('xbar_simple', 13955.71320944249),
 ('xbar_cluster', 13891.37385105984)]

# Conditional Probability Implementation
    - For the marathon dataset ... Find the conditional Probability of Finish within 3 hours given that the pace < 8 minutes per mil/km

In [86]:
df[['Hours_pace','Minutes_pace','Seconds_pace']]  = df['Pace'].str.split(':', expand=True)
df['Pace_duration'] = df['Hours_pace'].astype(int)*3600 + df['Minutes_pace'].astype(int)*60 + df['Seconds_pace'].astype(int)
df = df.drop(['Hours','Minutes','Seconds','Hours_pace','Minutes_pace','Seconds_pace'], axis=1)

In [87]:
df.head()

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,40K,Pace,Proj Time,Official Time,Overall,Gender,Division,Unnamed: 8,Overall_duration,Pace_duration
0,0.0,3,"Desisa, Lelisa",25,M,Ambo,,ETH,,,...,2:02:39,0:04:56,-,2:09:17,1,1,1,,7757,296
1,1.0,4,"Tsegay, Yemane Adhane",30,M,Addis Ababa,,ETH,,,...,2:02:42,0:04:58,-,2:09:48,2,2,2,,7788,298
2,2.0,8,"Chebet, Wilson",29,M,Marakwet,,KEN,,,...,2:03:01,0:04:59,-,2:10:22,3,3,3,,7822,299
3,3.0,11,"Kipyego, Bernard",28,M,Eldoret,,KEN,,,...,2:03:47,0:05:00,-,2:10:47,4,4,4,,7847,300
4,4.0,10,"Korir, Wesley",32,M,Kitale,,KEN,,,...,2:03:27,0:05:00,-,2:10:49,5,5,5,,7849,300


In [88]:
# Define the condition
condition_A = df['Overall_duration'] < 3*60*60  #Finish within 3 hours
condition_B = df['Pace_duration'] < 8*60    # Pace less than 8 minutes per kilometer


In [89]:
# Calculate the Probabilities
P_B = len(df[condition_B]) / len(df)
P_A_and_B = len(df[condition_A & condition_B]) / len(df)

# Conditional Probabilty P(A|B)
P_A_given_B = P_A_and_B / P_B
print(f"P(A | B) = {P_A_given_B:.4f}")

P(A | B) = 0.2282
