In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

titanic = pd.read_csv('../00_datasets/titanic.csv')
print(titanic.head())

   survived  pclass     sex   age  sibsp  parch     fare embarked  class  \
0         0       3    male  22.0      1      0   7.2500        S  Third   
1         1       1  female  38.0      1      0  71.2833        C  First   
2         1       3  female  26.0      0      0   7.9250        S  Third   
3         1       1  female  35.0      1      0  53.1000        S  First   
4         0       3    male  35.0      0      0   8.0500        S  Third   

     who  adult_male deck  embark_town alive  alone  
0    man        True  NaN  Southampton    no  False  
1  woman       False    C    Cherbourg   yes  False  
2  woman       False  NaN  Southampton   yes   True  
3  woman       False    C  Southampton   yes  False  
4    man        True  NaN  Southampton    no   True  


## Simple random sampling

In [65]:
# print(titanic.sample(n=5)) # pick 5 random samples from complete dataset
# print(titanic.sample(frac=1)) # randomize all rows in dataset
# print(titanic.sample(frac=0.1)) # pick 10% random rows
print(titanic.sample(frac=0.1, random_state=42)) # set seed to get always same result

titanic_SRS = titanic.sample(frac=0.1, random_state=42)

     survived  pclass     sex   age  sibsp  parch      fare embarked   class  \
709         1       3    male   NaN      1      1   15.2458        C   Third   
439         0       2    male  31.0      0      0   10.5000        S  Second   
840         0       3    male  20.0      0      0    7.9250        S   Third   
720         1       2  female   6.0      0      1   33.0000        S  Second   
39          1       3  female  14.0      1      0   11.2417        C   Third   
..        ...     ...     ...   ...    ...    ...       ...      ...     ...   
174         0       1    male  56.0      0      0   30.6958        C   First   
493         0       1    male  71.0      0      0   49.5042        C   First   
215         1       1  female  31.0      1      0  113.2750        C   First   
309         1       1  female  30.0      0      0   56.9292        C   First   
822         0       1    male  38.0      0      0    0.0000        S   First   

       who  adult_male deck  embark_tow

## Stratified sampling

In it, samples are randomly collected from each sub-group

In [66]:
# Proportion of passengers by who
who_counts_pop = titanic.value_counts('who', normalize=True)
print(who_counts_pop)

print('-' * 100)

# Proportional stratified sampling for 40% of each 'who' group
titanic_strat = titanic.groupby('who').sample(frac=.4, random_state=42) # Get random values: 40% from man, 40% from woman, 40% from child
print(titanic_strat.value_counts('who', normalize=True))

print('-' * 100)

# Get 20 random values from each 'who' group
titanic_strat = titanic.groupby('who').sample(n=20, random_state=42)
print(titanic_strat.value_counts('who', normalize=True)) # each group contains same proportion, because we selected 20 values from each group

who
man      0.602694
woman    0.304153
child    0.093154
Name: proportion, dtype: float64
----------------------------------------------------------------------------------------------------
who
man      0.603933
woman    0.303371
child    0.092697
Name: proportion, dtype: float64
----------------------------------------------------------------------------------------------------
who
child    0.333333
man      0.333333
woman    0.333333
Name: proportion, dtype: float64


## Cluster Sampling

randomly select a few sub-groups from a group and then randomly select a few values from those subgroups

In below example, we are randomly selecting only two sub-groups from `who`, and then; randomly selecting 5 rows from each selected sub-group

`The main benefit of cluster sampling over stratified sampling is that you can save time and money by not including every subgroup in your sample.`



In [67]:
import random
random.seed(42)

titanic['who'] = titanic['who'].astype('category')

who_pop = list(titanic['who'].unique()) # Create a list of unique 'who' values
who_samp = random.sample(who_pop, k=2) # Randomly sample two 'who' values
print("who_samp: ", who_samp)

# Filter for rows where 'who' is in who_samp
who_condition = titanic['who'].isin(who_samp)
titanic_filtered = titanic[who_condition]

print("Selected Group values0: ", who_samp)
print("Selected Group values1: ", titanic_filtered['who'].unique(), end='\n\n')
print("Selected Group values2: ", titanic_filtered['who'].cat.categories) # In titanic_filtered dataset, we have only two used categories for 'who'. Means, one subgroup does not contain any row in dataset

print('-' * 100)

titanic_filtered['who'] = titanic_filtered['who'].cat.remove_unused_categories() # Remove categories with no rows
titanic_clust = titanic_filtered.groupby('who').sample(n=20, random_state=2022) # Randomly sample 20 passengers from each sampled 'who' group
print(titanic_clust[['survived', 'sex', 'who', 'age', 'fare']]) # Print the sample

who_samp:  ['child', 'man']
Selected Group values0:  ['child', 'man']
Selected Group values1:  ['man', 'child']
Categories (3, object): ['child', 'man', 'woman']

Selected Group values2:  Index(['child', 'man', 'woman'], dtype='object')
----------------------------------------------------------------------------------------------------
     survived     sex    who    age      fare
182         0    male  child   9.00   31.3875
22          1  female  child  15.00    8.0292
14          0  female  child  14.00    7.8542
731         0    male  child  11.00   18.7875
479         1  female  child   2.00   12.2875
374         0  female  child   3.00   21.0750
278         0    male  child   7.00   29.1250
819         0    male  child  10.00   27.9000
480         0    male  child   9.00   46.9000
720         1  female  child   6.00   33.0000
16          0    male  child   2.00   29.1250
297         0  female  child   2.00  151.5500
58          1  female  child   5.00   27.7500
788         1    m

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_filtered['who'] = titanic_filtered['who'].cat.remove_unused_categories() # Remove categories with no rows
  titanic_clust = titanic_filtered.groupby('who').sample(n=20, random_state=2022) # Randomly sample 20 passengers from each sampled 'who' group


## calculate mean of fare by who for original dataset / SRS / SS / CS

In [68]:
titanic_who_by_fare_mean = titanic.groupby('who')['fare'].mean()
titanic_SRS_who_by_fare_mean = titanic_SRS.groupby('who')['fare'].mean()
titanic_strat_who_by_fare_mean = titanic_strat.groupby('who')['fare'].mean()
titanic_clust_who_by_fare_mean = titanic_clust.groupby('who')['fare'].mean()

print("Original Dataset Mean of fare: ", titanic_who_by_fare_mean, sep='\n')
print('-'*100)
print("Simple random Sampling, Mean of fare: ", titanic_SRS_who_by_fare_mean, sep='\n')
print('-'*100)
print("Stratified Sampling, Mean of fare: ", titanic_strat_who_by_fare_mean, sep='\n')
print('-'*100)
print("Cluster Sampling, Mean of fare: ", titanic_clust_who_by_fare_mean, sep='\n')

Original Dataset Mean of fare: 
who
child    32.785795
man      24.864182
woman    46.570711
Name: fare, dtype: float64
----------------------------------------------------------------------------------------------------
Simple random Sampling, Mean of fare: 
who
child    36.971429
man      22.033586
woman    49.062379
Name: fare, dtype: float64
----------------------------------------------------------------------------------------------------
Stratified Sampling, Mean of fare: 
who
child    32.650000
man      18.898325
woman    35.951465
Name: fare, dtype: float64
----------------------------------------------------------------------------------------------------
Cluster Sampling, Mean of fare: 
who
child    29.71813
man      22.18750
Name: fare, dtype: float64


  titanic_who_by_fare_mean = titanic.groupby('who')['fare'].mean()
  titanic_clust_who_by_fare_mean = titanic_clust.groupby('who')['fare'].mean()


## Relative Error (b/w means of original and sample datasets)

In [82]:
# Calculate the relative error in percentage for mean of fare between original dataset and Simple random sampling dataset
titanic_fare_mean = titanic['fare'].mean()
titanic_SRS_fare_mean = titanic_SRS['fare'].mean()
total_error_original_vs_SRS = abs(titanic_fare_mean - titanic_SRS_fare_mean)
print("difference of mean b/w sampled dataset and original dataset: ", total_error_original_vs_SRS)

rel_error_in_percentage = (total_error_original_vs_SRS / titanic_fare_mean) * 100
print("Relative error: ", rel_error_in_percentage) # how much percent mean is different

difference of mean b/w sampled dataset and original dataset:  1.0261763010882845
Relative error:  3.186466508009274
