# Introducción a técnicas de muestreo
---

- Análisis de ciencias de datos
- Grupo 100
- Profesora: Rubí Isela Gutiérrez López
- Integrantes:

|Nombre | Matrícula |
|----|----|
| Juan Pablo Echeagaray González | A00830646 |
| Grace Aviance Silva Aróstegui | A01285158 |
| Ricardo de Jesús Balam Ek | A00831262 |
| Taurino López González | A01284076 |
| Emily Rebeca Méndez Cruz | A00830768 |
| Eugenio Santisteban Zolezzi | A01720932 |



In [1]:
# Dependencias
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [4]:
# Lectura de datos
# Debe de hacerse por partes, ya que el archivo original es muy grande
chunk_size = 10_000
list_df = []

for df in pd.read_csv('210507COVID19MEXICO.csv', chunksize=chunk_size):
    list_df.append(df)

df = pd.concat(list_df)


In [5]:
df.shape


(6734249, 40)

In [6]:
print(df.columns)


Index(['FECHA_ACTUALIZACION', 'ID_REGISTRO', 'ORIGEN', 'SECTOR', 'ENTIDAD_UM',
       'SEXO', 'ENTIDAD_NAC', 'ENTIDAD_RES', 'MUNICIPIO_RES', 'TIPO_PACIENTE',
       'FECHA_INGRESO', 'FECHA_SINTOMAS', 'FECHA_DEF', 'INTUBADO', 'NEUMONIA',
       'EDAD', 'NACIONALIDAD', 'EMBARAZO', 'HABLA_LENGUA_INDIG', 'INDIGENA',
       'DIABETES', 'EPOC', 'ASMA', 'INMUSUPR', 'HIPERTENSION', 'OTRA_COM',
       'CARDIOVASCULAR', 'OBESIDAD', 'RENAL_CRONICA', 'TABAQUISMO',
       'OTRO_CASO', 'TOMA_MUESTRA_LAB', 'RESULTADO_LAB',
       'TOMA_MUESTRA_ANTIGENO', 'RESULTADO_ANTIGENO', 'CLASIFICACION_FINAL',
       'MIGRANTE', 'PAIS_NACIONALIDAD', 'PAIS_ORIGEN', 'UCI'],
      dtype='object')


### Métricas de la población

In [5]:
df['EDAD'].describe()


count    6.734249e+06
mean     4.088934e+01
std      1.686857e+01
min      0.000000e+00
25%      2.800000e+01
50%      3.900000e+01
75%      5.200000e+01
max      1.210000e+02
Name: EDAD, dtype: float64

In [6]:
df['INTUBADO'].value_counts() / len(df) * 100


97    88.883467
2      9.759292
1      1.217909
99     0.139333
Name: INTUBADO, dtype: float64

## Estadísticos de la población


## Muestreos aleatorios

### Muestreo aleatorio simple

In [7]:
n_1, n_2 = 0.1, 0.05
simple_sample_1 = df.sample(n=int(df.shape[0] * n_1))
simple_sample_2 = df.sample(n=int(df.shape[0] * n_2))


In [8]:
print(f"""
Tamaño de la muestra 1 = {simple_sample_1.shape[0]} ~ {n_1 * 100}%
Tamaño de la muestra 2 = {simple_sample_2.shape[0]} ~ {n_2 * 100}%""")



Tamaño de la muestra 1 = 673424 ~ 10.0%
Tamaño de la muestra 2 = 336712 ~ 5.0%


#### Métricas

In [9]:
simple_sample_1['EDAD'].describe()


count    673424.000000
mean         40.943602
std          16.873723
min           0.000000
25%          28.000000
50%          39.000000
75%          52.000000
max         121.000000
Name: EDAD, dtype: float64

In [10]:
simple_sample_2['EDAD'].describe()


count    336712.000000
mean         40.892668
std          16.873728
min           0.000000
25%          28.000000
50%          39.000000
75%          52.000000
max         120.000000
Name: EDAD, dtype: float64

In [11]:
simple_sample_1['INTUBADO'].value_counts() / len(simple_sample_1) * 100


97    88.833484
2      9.792642
1      1.233398
99     0.140476
Name: INTUBADO, dtype: float64

In [12]:
simple_sample_2['INTUBADO'].value_counts() / len(simple_sample_2) * 100


97    88.840909
2      9.803630
1      1.216173
99     0.139288
Name: INTUBADO, dtype: float64

### Muestro aleatorio estratificado

In [13]:
def stratified_sample_gen(df, percent):
    df['strata'] = np.random.choice(['1', '2'], size=df.shape[0])
    from sklearn.model_selection import StratifiedShuffleSplit
    
    split = StratifiedShuffleSplit(n_splits=1, train_size=percent, random_state=0)
    for i, j in split.split(df, df['strata']):
        stratified_sample = df.iloc[j]

    return stratified_sample
    

In [14]:
strat_sample_1 = stratified_sample_gen(df, 0.1)

In [15]:
strat_sample_2 = stratified_sample_gen(df, 0.05)

#### Una mejor aproximación, usar en otras ocasiones

In [19]:
test_strat_sample = df.groupby('INTUBADO', group_keys=False).apply(lambda x: x.sample(10))

In [24]:
def stratified_sample_gen(df: pd.DataFrame, key: str, n=0.1) -> pd.DataFrame:
    if type(n) == float:
        strat_sample = df.groupby(key, group_keys=False).apply(lambda x: x.sample(frac=n))
    else:
        strat_sample = df.groupby(key, group_keys=False).apply(lambda x: x.sample(n))
        
    return strat_sample

In [25]:
test_strat_sample = stratified_sample_gen(df, 'INTUBADO', 10)

In [26]:
test_strat_sample['INTUBADO'].value_counts()

99    10
97    10
2     10
1     10
Name: INTUBADO, dtype: int64

#### Métrica

In [16]:
strat_sample_1['EDAD'].describe()

count    6.060825e+06
mean     4.088946e+01
std      1.686655e+01
min      0.000000e+00
25%      2.800000e+01
50%      3.900000e+01
75%      5.200000e+01
max      1.210000e+02
Name: EDAD, dtype: float64

In [17]:
strat_sample_2['EDAD'].describe()

count    6.397537e+06
mean     4.089333e+01
std      1.686911e+01
min      0.000000e+00
25%      2.800000e+01
50%      3.900000e+01
75%      5.200000e+01
max      1.210000e+02
Name: EDAD, dtype: float64

In [18]:
strat_sample_1['INTUBADO'].value_counts() / len(strat_sample_1) * 100

97    88.884319
2      9.758539
1      1.218316
99     0.138826
Name: INTUBADO, dtype: float64

In [19]:
strat_sample_2['INTUBADO'].value_counts() / len(strat_sample_2) * 100

97    88.879283
2      9.763210
1      1.218656
99     0.138850
Name: INTUBADO, dtype: float64

### Muestro aleatorio sistemático

In [20]:
def systematic_sampling(df, step: int) -> pd.DataFrame:
    i = np.arange(0, df.shape[0], step)
    systematic_sample = df.iloc[i]
    return systematic_sample
    

In [21]:
sys_sample_1 = systematic_sampling(df, step=27)
sys_sample_2 = systematic_sampling(df, step=92)


#### Métricas

In [22]:
sys_sample_1['EDAD'].describe()


count    249417.000000
mean         40.865005
std          16.844224
min           0.000000
25%          28.000000
50%          39.000000
75%          52.000000
max         119.000000
Name: EDAD, dtype: float64

In [23]:
sys_sample_2['EDAD'].describe()


count    73199.000000
mean        40.897704
std         16.885825
min          0.000000
25%         28.000000
50%         39.000000
75%         52.000000
max        120.000000
Name: EDAD, dtype: float64

In [24]:
sys_sample_1['INTUBADO'].value_counts() / len(sys_sample_1) * 100

97    88.904926
2      9.764371
1      1.191579
99     0.139124
Name: INTUBADO, dtype: float64

In [25]:
sys_sample_2['INTUBADO'].value_counts() / len(sys_sample_2) * 100

97    89.001216
2      9.685925
1      1.188541
99     0.124319
Name: INTUBADO, dtype: float64

### Muestra por conglomerados (cluster)

Subconjunto de personas mayores de 30 años

In [26]:
data = df[df['EDAD'] > 30]

In [27]:
cluster_sample_1 = data.sample(n=int(data.shape[0] * 0.1))
cluster_sample_2 = data.sample(n=int(data.shape[0] * 0.05))

#### Métricas

In [28]:
cluster_sample_1['EDAD'].describe()

count    467494.000000
mean         48.981764
std          13.243081
min          31.000000
25%          38.000000
50%          47.000000
75%          57.000000
max         121.000000
Name: EDAD, dtype: float64

In [29]:
cluster_sample_2['EDAD'].describe()

count    233747.000000
mean         48.991906
std          13.226628
min          31.000000
25%          38.000000
50%          47.000000
75%          57.000000
max         121.000000
Name: EDAD, dtype: float64

In [30]:
cluster_sample_1['INTUBADO'].value_counts() / len(cluster_sample_1) * 100

97    85.964740
2     12.288928
1      1.611785
99     0.134547
Name: INTUBADO, dtype: float64

In [31]:
cluster_sample_2['INTUBADO'].value_counts() / len(cluster_sample_2) * 100

97    85.960889
2     12.278660
1      1.601732
99     0.158719
Name: INTUBADO, dtype: float64

## Muestreos no probabilísticos

### Muestreo intencional

Seleccionando a todos los individuos de edades entre 20 y 30 años

In [32]:
int_sample_1 = df[(df['EDAD'] < 30) & (df['EDAD'] > 20)]

Seleccionando a individuos mayores de 80 años

In [33]:
int_sample_2 = df[df['EDAD'] > 85]

#### Métricas

In [34]:
int_sample_1['EDAD'].describe()

count    1.311240e+06
mean     2.537181e+01
std      2.500117e+00
min      2.100000e+01
25%      2.300000e+01
50%      2.600000e+01
75%      2.800000e+01
max      2.900000e+01
Name: EDAD, dtype: float64

In [35]:
int_sample_2['EDAD'].describe()

count    46409.000000
mean        89.965395
std          4.379072
min         86.000000
25%         87.000000
50%         89.000000
75%         91.000000
max        121.000000
Name: EDAD, dtype: float64

In [36]:
int_sample_1['INTUBADO'].value_counts() / len(int_sample_1) * 100

97    97.188768
2      2.602575
1      0.155578
99     0.053080
Name: INTUBADO, dtype: float64

In [37]:
int_sample_2['INTUBADO'].value_counts() / len(int_sample_2) * 100

97    47.544657
2     47.320563
1      4.380616
99     0.754164
Name: INTUBADO, dtype: float64

### Muestreo por conveniencia

In [38]:
conv_sample_1 = int_sample_1[int_sample_1['INTUBADO'] == 1]
conv_sample_2 = int_sample_2[int_sample_2['INTUBADO'] == 1]

#### Métricas

In [39]:
conv_sample_1['EDAD'].describe()

count    2040.000000
mean       25.728922
std         2.513826
min        21.000000
25%        24.000000
50%        26.000000
75%        28.000000
max        29.000000
Name: EDAD, dtype: float64

In [40]:
conv_sample_2['EDAD'].describe()

count    2033.000000
mean       89.097885
std         3.266503
min        86.000000
25%        87.000000
50%        88.000000
75%        90.000000
max       116.000000
Name: EDAD, dtype: float64

In [41]:
conv_sample_1['INTUBADO'].value_counts() / len(conv_sample_1) * 100

1    100.0
Name: INTUBADO, dtype: float64

In [42]:
conv_sample_2['INTUBADO'].value_counts() / len(conv_sample_2) * 100

1    100.0
Name: INTUBADO, dtype: float64

###

### Muestreo accidental

Seleccionando los primeros $n_1$ y $n_2$ para que hayan muestras del 10% y 5%

In [43]:
n1 = int(df.shape[0] * 0.1)
n2 = int(df.shape[0] * 0.05)

acc_sample_1 = df.head(n=n1)
acc_sample_2 = df.head(n=n2)

#### Métricas

In [44]:
acc_sample_1['EDAD'].describe()

count    673424.000000
mean         41.458423
std          17.112849
min           0.000000
25%          30.000000
50%          40.000000
75%          52.000000
max         121.000000
Name: EDAD, dtype: float64

In [45]:
acc_sample_2['EDAD'].describe()

count    336712.000000
mean         40.026524
std          17.556302
min           0.000000
25%          28.000000
50%          39.000000
75%          51.000000
max         121.000000
Name: EDAD, dtype: float64

In [46]:
acc_sample_1['INTUBADO'].value_counts() / len(acc_sample_1) * 100

97    79.666599
2     15.955921
1      3.416124
99     0.961356
Name: INTUBADO, dtype: float64

In [47]:
acc_sample_2['INTUBADO'].value_counts() / len(acc_sample_2) * 100

97    80.540937
2     14.918684
1      2.856150
99     1.684229
Name: INTUBADO, dtype: float64