In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the full diabetes dataset; every later cell treats this frame as the
# "population" that samples are drawn from and compared against.
DIABETES_CSV_PATH = '/content/diabetes.csv'
raw_data = pd.read_csv(DIABETES_CSV_PATH)

In [None]:
# Seed NumPy's global random generator so the random draws made in later cells
# are repeatable when the notebook is run top to bottom.
np.random.seed(seed=2121)

In [None]:
# Draw a random sample of 25 observations (rows) from the population dataframe.
# No random_state is passed here, so the draw relies on the seed set earlier.
sample_data_size = 25
sample_dataframe = raw_data.sample(sample_data_size)

In [None]:
# Show the sampled rows and confirm the sample size.
# A notebook cell only echoes its LAST expression, so the dataframe has to be
# printed explicitly; a bare `sample_dataframe` line above `.shape` shows nothing.
print(sample_dataframe)
sample_dataframe.shape

In [None]:
# Mean and highest Glucose value within the sample.
sample_glucose = sample_dataframe['Glucose']

sample_mean_glucose_value = sample_glucose.mean()
sample_max_glucose_value = sample_glucose.max()

print('Sample mean value is : ', sample_mean_glucose_value)
print('Sample highest value is : ', sample_max_glucose_value)

In [None]:
# Mean and highest Glucose value across the whole population dataframe,
# for comparison with the sample statistics.
population_glucose = raw_data['Glucose']

population_mean_glucose_value = population_glucose.mean()
population_max_glucose_value = population_glucose.max()

print('Population of mean value is : ', population_mean_glucose_value)
print('Population highest value is : ', population_max_glucose_value)

In [None]:
# Bar chart comparing the sample and population Glucose statistics.
# Each label is paired with its value in one mapping so the two cannot drift apart.
glucose_stats = {
    'Sample Mean': sample_mean_glucose_value,
    'Population Mean': population_mean_glucose_value,
    'Sample Max': sample_max_glucose_value,
    'Population Max': population_max_glucose_value,
}
x_values = list(glucose_stats.keys())
y_values = list(glucose_stats.values())

# One colour per statistic: both means in blue, both maxima in green.
colors = ['blue', 'blue', 'green', 'green']

glucose_ax = sns.barplot(x=x_values, y=y_values, palette=colors)
glucose_ax.set_xlabel('Statistics')
glucose_ax.set_ylabel('Glucose level')
glucose_ax.set_title('Comparison of the Mean & Max  Statistics of Glucose (Sample vs. Population)')
plt.show()

In [None]:
# Question (b): 98th percentile of BMI, computed for the sample drawn above
# (same seed, same sample) and for the full population.
sample_bmi = sample_dataframe['BMI']
population_bmi = raw_data['BMI']

samplebmi_98thpercentile = np.percentile(sample_bmi, 98)
populationbmi_98thpercentile = np.percentile(population_bmi, 98)

# Report both percentile values.
print("The 98th percentile of BMI value of sampledata is : ", samplebmi_98thpercentile)
print("The 98th percentile of BMI value of populationdata is : ", populationbmi_98thpercentile)

In [None]:
# Plot style for this figure.
sns.set_style('whitegrid')

# Overlay the BMI histograms on one set of axes: the sample first, then the
# population drawn onto the same axes.
bmi_ax = sns.histplot(data=sample_dataframe, x='BMI', element='step', color='red', label='Sample')
sns.histplot(data=raw_data, x='BMI', element='step', color='pink', label='Population', ax=bmi_ax)

# Mark each group's 98th BMI percentile with a solid vertical line in the
# same colour as its histogram.
bmi_ax.axvline(samplebmi_98thpercentile, color='red', linestyle='-', label='Sample 98th Percentile')
bmi_ax.axvline(populationbmi_98thpercentile, color='pink', linestyle='-', label='Population 98th Percentile')

# Axis labels, title and legend.
bmi_ax.set_xlabel('BMI value')
bmi_ax.set_ylabel('Count')
bmi_ax.set_title('BMI Distribution of Sample vs. Population (with 98th Percentile)')
bmi_ax.legend(loc='upper right')
plt.show()

In [None]:
# Question (c): population-level BloodPressure statistics -- mean, standard
# deviation (pandas default, ddof=1) and the 50th / 75th / 90th percentiles.
bp_mean = raw_data['BloodPressure'].mean()
bp_std = raw_data['BloodPressure'].std()
# Series indexed by the quantile level (0.5, 0.75, 0.9).
bp_percentile = raw_data['BloodPressure'].quantile([0.5, 0.75, 0.90])

print('BloodPressure statistics in the population data:')
print(f'Mean: {bp_mean:.2f}')
print(f'Standard deviation: {bp_std:.2f}')
# The label names the 90th percentile because 0.90 is the quantile computed
# above. Each value is formatted individually so the output does not depend on
# the tuple/NumPy scalar repr.
print(
    '50th, 75th & 90th percentiles: '
    f'{bp_percentile[0.50]:.2f}, {bp_percentile[0.75]:.2f}, {bp_percentile[0.90]:.2f}'
)

In [None]:
# Generate bootstrap samples of BloodPressure and record, for each one, its
# mean, standard deviation and 50th/75th/90th percentiles.
n_bootstrap_samples = 500
n_obs_per_bootstrap_sample = 150

# Result arrays are sized from n_bootstrap_samples so that changing the number
# of bootstrap draws cannot leave them too short or padded with garbage.
bloodpressure_mean_bootstrap = np.empty(n_bootstrap_samples)
bloodpressure_sd_bootstrap = np.empty(n_bootstrap_samples)
# Row 0 = 50th, row 1 = 75th, row 2 = 90th percentile; one column per sample.
bloodpressure_percentile_bootstrap = np.empty((3, n_bootstrap_samples))

# Extract the column once instead of on every loop iteration.
bp_values = raw_data['BloodPressure'].to_numpy()

for i in range(n_bootstrap_samples):
    # Resample with replacement from the population values.
    sample_data_i = np.random.choice(bp_values, size=n_obs_per_bootstrap_sample, replace=True)
    bloodpressure_mean_bootstrap[i] = np.mean(sample_data_i)
    # ddof=1 matches the pandas .std() default used for the population SD, so
    # the bootstrap and population standard deviations are directly comparable.
    bloodpressure_sd_bootstrap[i] = np.std(sample_data_i, ddof=1)
    bloodpressure_percentile_bootstrap[:, i] = np.quantile(sample_data_i, [0.5, 0.75, 0.90])

# Plot style for this figure.
sns.set_style('whitegrid')

# Histogram of the bootstrap sample means.
sns.histplot(bloodpressure_mean_bootstrap, element='step', color='pink', label='Bootstrap samples mean')
# Vertical lines at the mean of the bootstrap means and at the population mean.
plt.axvline(np.mean(bloodpressure_mean_bootstrap), color='yellow', linestyle='--', label='BootStrap mean')
plt.axvline(bp_mean, color='black', linestyle='--', label='population mean')

# Axis labels and title.
plt.xlabel('Blood Pressure')
plt.ylabel('Count')
plt.title('Blood Pressure Distribution among bootstrap sample mean vs. Population mean ')

# Legend.
plt.legend(loc='upper right')
plt.show()


In [None]:
# Compare the bootstrap standard deviations with the population standard deviation.
sns.set_style('whitegrid')

# Histogram of the per-sample standard deviations from the bootstrap.
sd_ax = sns.histplot(bloodpressure_sd_bootstrap, element='step', color='pink', label='Bootstrap samples sd')

# Dashed reference lines: the average bootstrap SD and the population SD.
sd_ax.axvline(np.mean(bloodpressure_sd_bootstrap), color='red', linestyle='--', label='BootStrap sd')
sd_ax.axvline(bp_std, color='blue', linestyle='--', label='population sd')

# Axis labels, title and legend.
sd_ax.set_xlabel('Blood Pressure')
sd_ax.set_ylabel('Count')
sd_ax.set_title('BP Distribution of bootstrap samples SD vs. Population SD')
sd_ax.legend(loc='upper right')
plt.show()

In [None]:
# Compare the 75th and 90th BloodPressure percentiles: average over the
# bootstrap samples vs. the population value.
# A 1x2 grid is used (one panel per percentile, in ascending order) so that no
# panel is left blank, and the shared title is placed on the figure rather
# than on a single subplot.
fig, (ax_p75, ax_p90) = plt.subplots(1, 2, figsize=(10, 4))

# 75th percentile: row 1 of the bootstrap percentile array.
sns.barplot(
    x=['Bootstrap Sample ', 'Population '],
    y=[np.mean(bloodpressure_percentile_bootstrap[1]), bp_percentile[0.75]],
    color='blue',
    ax=ax_p75,
)
ax_p75.set_ylabel('75th Percentile of BP')

# 90th percentile: row 2 of the bootstrap percentile array.
sns.barplot(
    x=['Bootstrap Sample ', 'Population'],
    y=[np.mean(bloodpressure_percentile_bootstrap[2]), bp_percentile[0.9]],
    color='black',
    ax=ax_p90,
)
ax_p90.set_ylabel('90th Percentile of BP')

fig.suptitle('Comparing 75th & 90th Percentile of BP: Bootstrap Samples vs Population')
# Keep the y-labels of neighbouring panels from overlapping.
fig.tight_layout()
plt.show()

