In [13]:
import pandas as pd
import numpy as np


### example 1

In [17]:
data = {
    'Student_ID': range(1, 101),
    'School': np.random.choice(['School_A', 'School_B', 'School_C', 'School_D', 'School_E'], size=100),
    'Test_Score': np.random.randint(50, 100, size=100)
}
df=pd.DataFrame(data)
df.head()


Unnamed: 0,Student_ID,School,Test_Score
0,1,School_E,67
1,2,School_C,82
2,3,School_A,52
3,4,School_A,79
4,5,School_E,53


In [23]:
# Group students by school (clusters)
print("\nNumber of students in each school:")

print(df['School'].value_counts())


Number of students in each school:
School
School_D    25
School_C    22
School_E    18
School_B    18
School_A    17
Name: count, dtype: int64


In [29]:
# random select cluster (2 school)
selected_cluster=np.random.choice(df['School'].unique(),size=2,replace=False)
print("selected cluster(school)\n",selected_cluster)


selected cluster(school)
 ['School_A' 'School_D']


In [31]:
# filter dataset from selected cluster (school)
sample=df[df["School"].isin(selected_cluster)]
print(sample)

    Student_ID    School  Test_Score
2            3  School_A          52
3            4  School_A          79
6            7  School_A          80
8            9  School_A          79
11          12  School_A          52
12          13  School_D          94
13          14  School_A          65
14          15  School_A          96
15          16  School_A          51
21          22  School_D          58
23          24  School_D          77
27          28  School_D          56
28          29  School_D          80
30          31  School_A          55
31          32  School_D          95
36          37  School_D          71
38          39  School_A          78
43          44  School_D          92
46          47  School_D          91
49          50  School_D          90
51          52  School_D          90
52          53  School_D          83
53          54  School_D          61
58          59  School_A          91
59          60  School_D          97
60          61  School_D          58
6

In [39]:
# Calculate the mean test score from the sample
mean_test_score=sample['Test_Score'].mean()
print("mean score test from sample :  ",mean_test_score)


mean score test from sample :   74.33333333333333


### example 2

In [46]:


# Step 1: Create a synthetic dataset
np.random.seed(42)

# Create regions, districts, and villages
regions = ['North', 'South', 'East', 'West']
districts_per_region = 5
villages_per_district = 10
individuals_per_village = 100

# Generate dataset
data = []
for region in regions:
    for district in range(1, districts_per_region + 1):
        for village in range(1, villages_per_district + 1):
            for individual in range(1, individuals_per_village + 1):
                data.append({
                    'Region': region,
                    'District': f"{region}_District_{district}",
                    'Village': f"{region}_District_{district}_Village_{village}",
                    'Individual_ID': f"{region}_D{district}_V{village}_I{individual}",
                    'Diabetes': np.random.choice([0, 1], p=[0.85, 0.15])  # 15% prevalence rate
                })

# Create DataFrame
df = pd.DataFrame(data)

# Step 2: Perform Cluster Sampling
# Select 2 random regions
selected_regions = np.random.choice(df['Region'].unique(), size=2, replace=False)
print("Selected Regions:", selected_regions)

# Filter for selected regions
region_sample = df[df['Region'].isin(selected_regions)]

# Select 3 random districts from each selected region
selected_districts = region_sample['District'].drop_duplicates().groupby(region_sample['Region']).apply(
    lambda x: x.sample(3, random_state=42)).values
print("Selected Districts:", selected_districts)

# Filter for selected districts
district_sample = region_sample[region_sample['District'].isin(selected_districts)]

# Select 5 random villages from each selected district
selected_villages = district_sample['Village'].drop_duplicates().groupby(district_sample['District']).apply(
    lambda x: x.sample(5, random_state=42)).values
print("Selected Villages:", selected_villages)

# Filter for selected villages
final_sample = district_sample[district_sample['Village'].isin(selected_villages)]

# Step 3: Analyze the Sample
print("\nFinal Sample Size:", final_sample.shape[0])
diabetes_rate = final_sample['Diabetes'].mean()
print("Estimated Diabetes Prevalence in Sample:", diabetes_rate)


Selected Regions: ['North' 'East']
Selected Districts: ['East_District_2' 'East_District_5' 'East_District_3' 'North_District_2'
 'North_District_5' 'North_District_3']
Selected Villages: ['East_District_2_Village_9' 'East_District_2_Village_2'
 'East_District_2_Village_6' 'East_District_2_Village_1'
 'East_District_2_Village_8' 'East_District_3_Village_9'
 'East_District_3_Village_2' 'East_District_3_Village_6'
 'East_District_3_Village_1' 'East_District_3_Village_8'
 'East_District_5_Village_9' 'East_District_5_Village_2'
 'East_District_5_Village_6' 'East_District_5_Village_1'
 'East_District_5_Village_8' 'North_District_2_Village_9'
 'North_District_2_Village_2' 'North_District_2_Village_6'
 'North_District_2_Village_1' 'North_District_2_Village_8'
 'North_District_3_Village_9' 'North_District_3_Village_2'
 'North_District_3_Village_6' 'North_District_3_Village_1'
 'North_District_3_Village_8' 'North_District_5_Village_9'
 'North_District_5_Village_2' 'North_District_5_Village_6'
 