# Python Programming: Systematic Sampling

In [None]:
# Importing the libraries used in this notebook
#
import pandas as pd
import numpy as np
import string
import random

## Examples 

In [None]:

# The idea in systematic sampling is that, given the population units numbered from 1 to N,
# we compute the sampling interval k = N/n, where n is the number of units needed for the sample.
# After that, we choose the random start, a number between 1 and k.
# This random start is the first unit in the sample; the second unit is obtained by adding
# the sampling interval to the random start, and so on.
# There are two types of systematic sampling: linear and circular.
# Circular systematic sampling treats the population units numbered from 1 to N as a circle,
# so that if an increment step goes past N, say to N+2,
# the sample unit is the 2nd element in the population, and so on.
# The code that we will be working with can be used for both linear and circular sampling.
# It does not satisfy every rule of linear sampling (for example, the case where k is not a whole number),
# but we can always extend it to a more general function.
# ---
# Question: Perform systematic sampling given the following dataset:
# ---
#

# The data: base figures first, then the same figures scaled by a factor of 1000
sal_dat = np.array([
    25, 15, 20, 25, 18, 12, 24, 30,
    15, 20, 10, 10, 11, 14, 22, 16,
])
salary = 1000 * sal_dat

# Function for systematic sampling
def sys_sample(df, r, n):
    """Draw a circular systematic sample of ``n`` units from ``df``.

    The sampling interval is ``k = N // n`` with ``N = df.shape[0]``.  Starting
    at position ``r``, every ``k``-th position is selected; a position that
    runs past the last valid one (``N - 1``) wraps around to the beginning.

    Parameters
    ----------
    df : object exposing ``shape`` (NumPy array, pandas Series/DataFrame)
        The population to sample from.
    r : int
        Zero-based position of the first sampled unit.  Expected to satisfy
        ``0 <= r < N``; this is not validated here.
    n : int
        Number of units to draw; must satisfy ``1 <= n <= N``.

    Returns
    -------
    dict
        ``"Data"``  - the sampled units,
        ``"Index"`` - list of their zero-based positions,
        ``"K"``     - the sampling interval.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or larger than the population size.
    """
    size = df.shape[0]
    if not 1 <= n <= size:
        raise ValueError(
            f"n must be between 1 and the population size ({size}); got {n}"
        )
    k = size // n

    a = r
    b = [a]
    for _ in range(1, n):
        a = a + k
        # Positions are zero-based, so `size` itself is already out of range
        # and has to wrap as well (the previous `>` test let it through).
        if a >= size:
            a = a - size
        b.append(a)

    # pandas objects are selected by position; plain `obj[b]` would look up labels.
    data = df.iloc[b] if hasattr(df, "iloc") else df[b]
    return {"Data": data, "Index": b, "K": k}

# Do the sampling with random start r = 1
# and number of sample units n = 8
sys_sample(salary, r = 1, n = 8)

{'Data': array([15000, 25000, 12000, 30000, 20000, 10000, 14000, 16000]),
 'Index': [1, 3, 5, 7, 9, 11, 13, 15],
 'K': 2}

## <font color="green">Challenges</font>

In [None]:
# Challenge 1
# ---
# Question: Perform systematic sampling given the following dataset
# ---
# Consecutive integers 33 through 55
Dataset = np.arange(33, 56)
 
# Function for systematic sampling
def sys_sample(df, r, n):
    """Draw a circular systematic sample of ``n`` units from ``df``.

    The sampling interval is ``k = N // n`` with ``N = df.shape[0]``.  Starting
    at position ``r``, every ``k``-th position is selected; a position that
    runs past the last valid one (``N - 1``) wraps around to the beginning.

    Parameters
    ----------
    df : object exposing ``shape`` (NumPy array, pandas Series/DataFrame)
        The population to sample from.
    r : int
        Zero-based position of the first sampled unit.  Expected to satisfy
        ``0 <= r < N``; this is not validated here.
    n : int
        Number of units to draw; must satisfy ``1 <= n <= N``.

    Returns
    -------
    dict
        ``"Data"``  - the sampled units,
        ``"Index"`` - list of their zero-based positions,
        ``"K"``     - the sampling interval.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or larger than the population size.
    """
    size = df.shape[0]
    if not 1 <= n <= size:
        raise ValueError(
            f"n must be between 1 and the population size ({size}); got {n}"
        )
    k = size // n

    a = r
    b = [a]
    for _ in range(1, n):
        a = a + k
        # Positions are zero-based, so `size` itself is already out of range
        # and has to wrap as well (the previous `>` test let it through).
        if a >= size:
            a = a - size
        b.append(a)

    # pandas objects are selected by position; plain `obj[b]` would look up labels.
    data = df.iloc[b] if hasattr(df, "iloc") else df[b]
    return {"Data": data, "Index": b, "K": k}

# Do the sampling with random start r = 1
# and number of sample units n = 4
sys_sample(Dataset, r = 1, n = 4)

{'Data': array([34, 39, 44, 49]), 'Index': [1, 6, 11, 16], 'K': 5}

In [None]:
# Challenge 2
# ---
# Question: You're given data ranging from 175 to 1000. Select a sample from this data by performing systematic sampling.
# ---
# 
# Population: the integers 175 through 1000, both endpoints included.
# range() excludes its stop value, so the stop must be 1001 to keep 1000.
num = range(175, 1001)
df = np.array(num)
# Function for systematic sampling
def sys_sample(df, r, n):
    """Draw a circular systematic sample of ``n`` units from ``df``.

    The sampling interval is ``k = N // n`` with ``N = df.shape[0]``.  Starting
    at position ``r``, every ``k``-th position is selected; a position that
    runs past the last valid one (``N - 1``) wraps around to the beginning.

    Parameters
    ----------
    df : object exposing ``shape`` (NumPy array, pandas Series/DataFrame)
        The population to sample from.
    r : int
        Zero-based position of the first sampled unit.  Expected to satisfy
        ``0 <= r < N``; this is not validated here.
    n : int
        Number of units to draw; must satisfy ``1 <= n <= N``.

    Returns
    -------
    dict
        ``"Data"``  - the sampled units,
        ``"Index"`` - list of their zero-based positions,
        ``"K"``     - the sampling interval.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or larger than the population size.
    """
    size = df.shape[0]
    if not 1 <= n <= size:
        raise ValueError(
            f"n must be between 1 and the population size ({size}); got {n}"
        )
    k = size // n

    a = r
    b = [a]
    for _ in range(1, n):
        a = a + k
        # Positions are zero-based, so `size` itself is already out of range
        # and has to wrap as well (the previous `>` test let it through).
        if a >= size:
            a = a - size
        b.append(a)

    # pandas objects are selected by position; plain `obj[b]` would look up labels.
    data = df.iloc[b] if hasattr(df, "iloc") else df[b]
    return {"Data": data, "Index": b, "K": k}

# Do the sampling with random start r = 1
# and number of sample units n = 3
sys_sample(df, r = 1, n = 3)

{'Data': array([176, 451, 726]), 'Index': [1, 276, 551], 'K': 275}

In [None]:
# Challenge 3
# ---
# Question: There are 19 students in this class. Let’s choose a 1-in-3 systematic sample from the 19 students in the class.
# ---
# 
# Student numbers 1 through 19
students = np.arange(1, 20)
 
# Function for systematic sampling
def sys_sample(students, r, n):
    """Draw a circular systematic sample of ``n`` units from ``students``.

    The sampling interval is ``k = N // n`` with ``N = students.shape[0]``.
    Starting at position ``r``, every ``k``-th position is selected; a
    position that runs past the last valid one (``N - 1``) wraps around to
    the beginning.

    Parameters
    ----------
    students : object exposing ``shape`` (NumPy array, pandas Series/DataFrame)
        The population to sample from.
    r : int
        Zero-based position of the first sampled unit.  Expected to satisfy
        ``0 <= r < N``; this is not validated here.
    n : int
        Number of units to draw; must satisfy ``1 <= n <= N``.

    Returns
    -------
    dict
        ``"Data"``  - the sampled units,
        ``"Index"`` - list of their zero-based positions,
        ``"K"``     - the sampling interval.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or larger than the population size.
    """
    size = students.shape[0]
    if not 1 <= n <= size:
        raise ValueError(
            f"n must be between 1 and the population size ({size}); got {n}"
        )
    k = size // n

    a = r
    b = [a]
    for _ in range(1, n):
        a = a + k
        # Positions are zero-based, so `size` itself is already out of range
        # and has to wrap as well (the previous `>` test let it through).
        if a >= size:
            a = a - size
        b.append(a)

    # pandas objects are selected by position; plain `obj[b]` would look up labels.
    data = students.iloc[b] if hasattr(students, "iloc") else students[b]
    return {"Data": data, "Index": b, "K": k}

# Do the sampling with random start r = 1
# and number of sample units n = 5 (interval k = 19 // 5 = 3)
sys_sample(students, r = 1, n =5)

{'Data': array([ 2,  5,  8, 11, 14]), 'Index': [1, 4, 7, 10, 13], 'K': 3}

In [None]:
# Challenge 4
# ---
# Question: Select a sample of n = 12 members from a population of size N = 287.
# ---
# 
# NOTE(review): range(12, 287) produces 275 values (12 through 286), whereas the
# question describes a population of size N = 287 — confirm the intended population.
member = range(12,287)
df = np.array(member)
 
# Function for systematic sampling
def sys_sample(df, r, n):
    """Draw a circular systematic sample of ``n`` units from ``df``.

    The sampling interval is ``k = N // n`` with ``N = df.shape[0]``.  Starting
    at position ``r``, every ``k``-th position is selected; a position that
    runs past the last valid one (``N - 1``) wraps around to the beginning.

    Parameters
    ----------
    df : object exposing ``shape`` (NumPy array, pandas Series/DataFrame)
        The population to sample from.
    r : int
        Zero-based position of the first sampled unit.  Expected to satisfy
        ``0 <= r < N``; this is not validated here.
    n : int
        Number of units to draw; must satisfy ``1 <= n <= N``.

    Returns
    -------
    dict
        ``"Data"``  - the sampled units,
        ``"Index"`` - list of their zero-based positions,
        ``"K"``     - the sampling interval.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or larger than the population size.
    """
    size = df.shape[0]
    if not 1 <= n <= size:
        raise ValueError(
            f"n must be between 1 and the population size ({size}); got {n}"
        )
    k = size // n

    a = r
    b = [a]
    for _ in range(1, n):
        a = a + k
        # Positions are zero-based, so `size` itself is already out of range
        # and has to wrap as well (the previous `>` test let it through).
        if a >= size:
            a = a - size
        b.append(a)

    # pandas objects are selected by position; plain `obj[b]` would look up labels.
    data = df.iloc[b] if hasattr(df, "iloc") else df[b]
    return {"Data": data, "Index": b, "K": k}

# Do the sampling with random start r = 1
# and number of sample units n = 3
# NOTE(review): the question asks for n = 12 — confirm which sample size is intended.
sys_sample(df, r = 1, n = 3)

{'Data': array([ 13, 104, 195]), 'Index': [1, 92, 183], 'K': 91}

In [None]:
# Challenge 5
# ---
# Question: You work for the Olympics Data Analytics in Geneva and would like to perform a study on the performance of the top marathon
# Olympic athletes. For reasons beyond your control, you resort to performing systematic sampling from the given Boston 2017 marathon dataset.
# ---
# Dataset: http://bit.ly/BostonMarathonDataset
# ---
# 
# Load the Boston Marathon results from the remote CSV and keep only the 'Overall' column
BostonMarathon = pd.read_csv("http://bit.ly/BostonMarathonDataset")
BostonMarathon1 =BostonMarathon['Overall']
# Function for systematic sampling
def sys_sample(BostonMarathon1, r, n):
    """Draw a circular systematic sample of ``n`` units from ``BostonMarathon1``.

    The sampling interval is ``k = N // n`` with
    ``N = BostonMarathon1.shape[0]``.  Starting at position ``r``, every
    ``k``-th position is selected; a position that runs past the last valid
    one (``N - 1``) wraps around to the beginning.

    Parameters
    ----------
    BostonMarathon1 : object exposing ``shape`` (pandas Series/DataFrame, NumPy array)
        The population to sample from.
    r : int
        Zero-based position of the first sampled unit.  Expected to satisfy
        ``0 <= r < N``; this is not validated here.
    n : int
        Number of units to draw; must satisfy ``1 <= n <= N``.

    Returns
    -------
    dict
        ``"Data"``  - the sampled units,
        ``"Index"`` - list of their zero-based positions,
        ``"K"``     - the sampling interval.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or larger than the population size.
    """
    size = BostonMarathon1.shape[0]
    if not 1 <= n <= size:
        raise ValueError(
            f"n must be between 1 and the population size ({size}); got {n}"
        )
    k = size // n

    a = r
    b = [a]
    for _ in range(1, n):
        a = a + k
        # Positions are zero-based, so `size` itself is already out of range
        # and has to wrap as well (the previous `>` test let it through).
        if a >= size:
            a = a - size
        b.append(a)

    # The computed values are positions, so pandas objects must go through
    # `.iloc`; plain `obj[b]` would look the values up as index labels.
    if hasattr(BostonMarathon1, "iloc"):
        data = BostonMarathon1.iloc[b]
    else:
        data = BostonMarathon1[b]
    return {"Data": data, "Index": b, "K": k}

# Do the sampling with random start r = 1
# and number of sample units n = 3
sys_sample(BostonMarathon1, r = 1, n = 3)

{'Data': 1            2
 8804      8806
 17607    17609
 Name: Overall, dtype: int64, 'Index': [1, 8804, 17607], 'K': 8803}

In [None]:
# Read the CSV again (same URL as in Challenge 5) and preview the first rows of the full table
BostonMarathon = pd.read_csv("http://bit.ly/BostonMarathonDataset")
BostonMarathon.head()

Unnamed: 0.1,Unnamed: 0,Bib,Name,Age,M/F,City,State,Country,Citizen,Unnamed: 9,...,25K,30K,35K,40K,Pace,Proj Time,Official Time,Overall,Gender,Division
0,0,11,"Kirui, Geoffrey",24,M,Keringet,,KEN,,,...,1:16:59,1:33:01,1:48:19,2:02:53,0:04:57,-,2:09:37,1,1,1
1,1,17,"Rupp, Galen",30,M,Portland,OR,USA,,,...,1:16:59,1:33:01,1:48:19,2:03:14,0:04:58,-,2:09:58,2,2,2
2,2,23,"Osako, Suguru",25,M,Machida-City,,JPN,,,...,1:17:00,1:33:01,1:48:31,2:03:38,0:04:59,-,2:10:28,3,3,3
3,3,21,"Biwott, Shadrack",32,M,Mammoth Lakes,CA,USA,,,...,1:17:00,1:33:01,1:48:58,2:04:35,0:05:03,-,2:12:08,4,4,4
4,4,9,"Chebet, Wilson",31,M,Marakwet,,KEN,,,...,1:16:59,1:33:01,1:48:41,2:05:00,0:05:04,-,2:12:35,5,5,5
