<a href="https://colab.research.google.com/github/KeziahAcheampong/Keziah_A/blob/main/week5_pandas_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the pandas library.
# Pandas allows us to load, organize, clean, and analyze structured data using DataFrames.
import pandas as pd


In [None]:
# Location of the NHANES CSV hosted on GitHub.
NHANES_CSV_URL = 'https://raw.githubusercontent.com/osoliman/DSC110/refs/heads/main/Datasets/nhanes_final.csv'
# Read the CSV into the DataFrame 'df', which the rest of the notebook works on.
df = pd.read_csv(NHANES_CSV_URL)
# Peek at the top of the table (first 5 rows, then just the first row) to check
# the column names and layout.
# NOTE: a notebook cell renders only its final expression, so these two previews
# are evaluated but not shown unless run in their own cell.
df.head(5)
df.head(1)
# Final expression of the cell: the last 5 rows, which is what gets rendered.
# Seeing the tail confirms the file was read through to the end.
df.tail(5)




Unnamed: 0,id,gender,age,race_ethnicity,education,bmi,height_cm,systolic_bp,has_diabetes,hba1c,fasting_glucose
11928,142306.0,1.0,9.0,2.0,,15.4,128.0,,2.0,,
11929,142307.0,2.0,49.0,4.0,5.0,,143.8,127.0,1.0,6.2,
11930,142308.0,1.0,50.0,2.0,4.0,26.4,173.3,106.0,2.0,,
11931,142309.0,1.0,40.0,2.0,4.0,25.5,179.1,127.0,2.0,5.2,96.0
11932,142310.0,2.0,80.0,3.0,3.0,27.6,161.7,128.0,2.0,5.3,


In [None]:
# df.shape is a (rows, columns) tuple; unpack it once so both numbers have names.
row_count, column_count = df.shape
# Bare expressions kept for interactive inspection: the full shape tuple and
# the number of observations. Only a cell's final expression is rendered, so
# these produce no output when followed by the print below.
df.shape
row_count
# Human-readable summary of the dataset size.
print(f'this file has {row_count} rows and {column_count} columns')


this file has 11933 rows and 11 columns


In [None]:
# Boolean comparison demo: '==' tests equality and evaluates to True or False.
# This one is True, but it is not rendered because it is not the cell's
# final expression.
1 + 1 == 2
# This one is False, and it is the value the cell displays.
1 + 1 == 3


False

In [None]:
# Build the boolean "is missing" table once and reuse it for both summaries.
missing_mask = df.isna()
# Number of missing (NaN) entries per column; shows which variables need cleaning.
# (Not rendered: only the cell's final expression is displayed.)
missing_mask.sum()
# Share of missing entries per column, as a percentage of all rows.
# The mean of a boolean column is the fraction of True values.
missing_mask.mean().mul(100)


Unnamed: 0,0
id,0.0
gender,0.0
age,0.0
race_ethnicity,0.0
education,34.685326
bmi,29.011984
height_cm,28.77734
systolic_bp,37.00662
has_diabetes,1.617364
hba1c,43.727478


In [None]:
# Keep only the rows that have a recorded 'has_diabetes' value.
# .copy() makes df_clean an independent DataFrame, so later edits to it
# do not touch (or warn about) the original df.
has_diabetes_answer = df['has_diabetes'].notna()
df_clean = df.loc[has_diabetes_answer].copy()
# Report the row counts before and after the filter to show how many rows were dropped.
print(f"Started with {len(df)} rows")
print(f"Now have {len(df_clean)} rows")


Started with 11933 rows
Now have 11740 rows


In [None]:
# Mean imputation for systolic blood pressure on the cleaned dataset.
systolic = df_clean['systolic_bp']
# Average of the observed readings; this is the fill value.
mean_bp = systolic.mean()
# Replace every missing reading with that average so the rows are kept
# instead of being dropped.
df_clean['systolic_bp'] = systolic.fillna(mean_bp)
# Verify the result: the count of remaining missing values should be 0.
df_clean['systolic_bp'].isna().sum()


np.int64(0)

In [None]:
# Create a new categorical variable called 'bmi_category'.
# pd.cut assigns each BMI value to one of four clinical ranges:
#   Underweight: below 18.5
#   Normal:      18.5 up to (not including) 25
#   Overweight:  25 up to (not including) 30
#   Obese:       30 and above
# right=False makes every bin include its lower edge and exclude its upper edge.
# With pd.cut's default (right=True) a BMI of exactly 18.5, 25.0 or 30.0 would
# be placed in the category below the one it belongs to.
# The last edge is infinity so that no large BMI value falls outside the bins
# and silently becomes NaN. Rows with a missing BMI still get a NaN category.
df['bmi_category'] = pd.cut(
    df['bmi'],
    bins=[0, 18.5, 25, 30, float('inf')],
    labels=['Underweight', 'Normal', 'Overweight', 'Obese'],
    right=False,
)
# Count how many individuals fall into each BMI category.
# This helps summarize weight classification in the dataset.
df['bmi_category'].value_counts()


Unnamed: 0_level_0,count
bmi_category,Unnamed: 1_level_1
Obese,2666
Normal,2400
Overweight,2189
Underweight,1216


In [None]:
# Display unique values in the education column.
# This helps us understand how education is coded numerically.
# (Only a cell's final expression is rendered, so run this line in its own
# cell to see the codes.)
df['education'].unique()
# Dictionary converting numeric education codes into readable labels.
# The keys must be numbers, not strings: the column holds floats (1.0, 2.0, ...)
# and NaN, so string keys such as '1.0' or 'None' never match and every row
# would end up unlabeled (an empty value_counts table).
education_map = {
    1.0: 'Less than 9th grade',
    2.0: '9-11th grade',
    3.0: 'High school grad',
    4.0: 'Some college or AA degree',
    5.0: 'College graduate',
    9.0: 'Do not know',
}
# Build the readable 'education_label' column:
#   1. map() translates each code; codes not in the dictionary stay NaN.
#   2. mask() then writes 'Missing' only where the original code itself is NaN,
#      so genuinely missing answers are labeled without hiding unexpected codes.
df['education_label'] = (
    df['education']
    .map(education_map)
    .mask(df['education'].isna(), 'Missing')
)
# Count the number of individuals in each education category.
df['education_label'].value_counts()


Unnamed: 0_level_0,count
education_label,Unnamed: 1_level_1


In [None]:
# Descriptive statistics for systolic blood pressure, taken from the original
# df (not the imputed df_clean).
systolic_bp = df['systolic_bp']
# Average reading. Not rendered, because it is not the cell's final expression.
systolic_bp.mean()
# Standard deviation: how widely the readings spread around the average.
# This is the value the cell displays.
systolic_bp.std()


18.561052201733208

In [None]:
# Frequency table of two categorical variables: one row per diabetes status
# code, one column per BMI category, each cell holding the number of people
# with that combination.
pd.crosstab(index=df['has_diabetes'], columns=df['bmi_category'])


bmi_category,Underweight,Normal,Overweight,Obese
has_diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1.0,4,109,233,504
2.0,1211,2251,1886,2042
3.0,0,39,70,120
9.0,1,1,0,0


In [None]:
# Average BMI within each diabetes status group.
# The BMI column is split by the values of 'has_diabetes' and the mean is taken
# per group, giving one average per status code for comparison.
df['bmi'].groupby(df['has_diabetes']).mean()


Unnamed: 0_level_0,bmi
has_diabetes,Unnamed: 1_level_1
1.0,32.981412
2.0,26.439797
3.0,32.068559
9.0,19.25
