In [3]:
import pandas as pd
import numpy as np

# Read the healthcare records CSV into a DataFrame.
# The path is relative, so the file is looked up from the current working directory.
csv_path = 'healthcare_dataset.csv'
df = pd.read_csv(csv_path)

# Preview the top of the table (DataFrame.head() returns the first 5 rows by default)
preview = df.head()
print(preview)

            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306    

In [6]:
# Report the number of missing values in every column
print(df.isnull().sum())

# Impute missing 'Test Results' values, if there are any.
# median() is only defined for numeric data, so a text/categorical column
# falls back to its most frequent value (mode) instead of raising TypeError.
# NOTE(review): the dtype of 'Test Results' is not visible in the preview output;
# confirm which branch is the intended one for this dataset.
test_results = df['Test Results']
if test_results.isnull().any():
    if pd.api.types.is_numeric_dtype(test_results):
        fill_value = test_results.median()
    else:
        most_frequent = test_results.mode()
        # mode() is empty when every value is missing; nothing sensible to fill with then
        fill_value = most_frequent.iloc[0] if not most_frequent.empty else None
    if fill_value is not None:
        # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
        # the chained in-place form operates on an intermediate object and is
        # deprecated in pandas 2.x (it does not update `df` under Copy-on-Write).
        df['Test Results'] = test_results.fillna(fill_value)

# Drop any rows that still contain missing values in any column
df = df.dropna()


Name                  0
Age                   0
Gender                0
Blood Type            0
Medical Condition     0
Date of Admission     0
Doctor                0
Hospital              0
Insurance Provider    0
Billing Amount        0
Room Number           0
Admission Type        0
Discharge Date        0
Medication            0
Test Results          0
dtype: int64


In [7]:
# Quick structural overview of the DataFrame
print(df.head())           # First 5 rows

# Data types and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None,
# so it must not be wrapped in print() (that would emit a stray "None" line).
df.info()

print(df.describe())       # Summary stats for numeric columns


            Name  Age  Gender Blood Type Medical Condition Date of Admission  \
0  Bobby JacksOn   30    Male         B-            Cancer        2024-01-31   
1   LesLie TErRy   62    Male         A+           Obesity        2019-08-20   
2    DaNnY sMitH   76  Female         A-           Obesity        2022-09-22   
3   andrEw waTtS   28  Female         O+          Diabetes        2020-11-18   
4  adrIENNE bEll   43  Female        AB+            Cancer        2022-09-19   

             Doctor                    Hospital Insurance Provider  \
0     Matthew Smith             Sons and Miller         Blue Cross   
1   Samantha Davies                     Kim Inc           Medicare   
2  Tiffany Mitchell                    Cook PLC              Aetna   
3       Kevin Wells  Hernandez Rogers and Vang,           Medicare   
4    Kathleen Hanna                 White-White              Aetna   

   Billing Amount  Room Number Admission Type Discharge Date   Medication  \
0    18856.281306    

In [8]:
# Basic pandas operation: column selection.
# Indexing with a list of labels returns a DataFrame holding only those columns.
selected_columns = ['Name', 'Age', 'Gender']
print(df[selected_columns])


                    Name  Age  Gender
0          Bobby JacksOn   30    Male
1           LesLie TErRy   62    Male
2            DaNnY sMitH   76  Female
3           andrEw waTtS   28  Female
4          adrIENNE bEll   43  Female
...                  ...  ...     ...
55495  eLIZABeTH jaCkSOn   42  Female
55496         KYle pEREz   61  Female
55497       HEATher WaNG   38  Female
55498     JENniFER JOneS   43    Male
55499       jAMES GARCiA   53  Female

[55500 rows x 3 columns]


In [9]:
# Keep only the rows for patients strictly older than 50 (age 50 itself is excluded)
is_over_50 = df['Age'] > 50
older_patients = df[is_over_50]
print(older_patients)

                   Name  Age  Gender Blood Type Medical Condition  \
1          LesLie TErRy   62    Male         A+           Obesity   
2           DaNnY sMitH   76  Female         A-           Obesity   
8       JASmINe aGuIlaR   82    Male        AB+            Asthma   
9      ChRISTopher BerG   58  Female        AB-            Cancer   
10     mIchElLe daniELs   72    Male         O+            Cancer   
...                 ...  ...     ...        ...               ...   
55492  LAuREn MCcormIcK   51    Male         O+         Arthritis   
55493      gLoRIA BOWen   57  Female         B-         Arthritis   
55494     JEssIcA WHiTe   67    Male         O+         Arthritis   
55496        KYle pEREz   61  Female        AB-           Obesity   
55499      jAMES GARCiA   53  Female         O+         Arthritis   

      Date of Admission            Doctor         Hospital Insurance Provider  \
1            2019-08-20   Samantha Davies          Kim Inc           Medicare   
2        

In [10]:
# Frequency of each distinct value in the 'Gender' column
# (value_counts() lists the most common value first)
gender_counts = df['Gender'].value_counts()
print(gender_counts)

Gender
Male      27774
Female    27726
Name: count, dtype: int64


In [11]:
# Average 'Billing Amount' per hospital: split the billing column into
# one group per distinct 'Hospital' value, then take the mean of each group.
billing_by_hospital = df.groupby('Hospital')['Billing Amount']
avg_billing = billing_by_hospital.mean()
print(avg_billing)

Hospital
Abbott Inc                       38052.041917
Abbott Ltd                       29877.586483
Abbott Moore and Williams,       24799.596339
Abbott and Thompson, Sullivan    16738.569765
Abbott, Peters and Hoffman       18842.396863
                                     ...     
and Zimmerman Sons               32706.652625
and Zuniga Davis Carlson,        42867.041298
and Zuniga Francis Peterson,     33689.630726
and Zuniga Sons                  33950.170483
and Zuniga Thompson, Blake       22067.428763
Name: Billing Amount, Length: 39876, dtype: float64


In [12]:
# Order the rows by 'Age' from oldest to youngest (descending);
# sort_values returns a new DataFrame and leaves `df` itself unchanged.
sorted_df = df.sort_values(by='Age', ascending=False)

# Show the five oldest patients
oldest_rows = sorted_df.head()
print(oldest_rows)

                    Name  Age  Gender Blood Type Medical Condition  \
53825      mIchaeL POtTs   89    Male         A+      Hypertension   
52857    THomAs PHIllIpS   89  Female         A+      Hypertension   
52372    DeBORah McBrIdE   89  Female         O+           Obesity   
54044  MiChAEL DOmINGuEz   89    Male         O+            Cancer   
54813  JerEmY hArdIN JR.   89    Male         A+          Diabetes   

      Date of Admission          Doctor                     Hospital  \
53825        2023-01-19     Mary Vaughn  and Perez, Bennett Townsend   
52857        2022-05-22      Mark White     and Martin, Davidson Cox   
52372        2021-05-28   Robert Garcia      and Miller, Olson Olson   
54044        2021-09-07   Bridget Irwin                    PLC White   
54813        2019-11-05  Willie Stevens                 Gray-Solomon   

      Insurance Provider  Billing Amount  Room Number Admission Type  \
53825              Cigna     9542.739709          289      Emergency   
52

In [None]:
# Apply NumPy Operations

In [13]:
# Extract the 'Billing Amount' column as a plain NumPy ndarray
billing_series = df['Billing Amount']
billing_array = billing_series.to_numpy()
print(billing_array)

[18856.28130598 33643.32728658 27955.09607884 ... 27620.76471743
 32451.09235849  4010.13417164]


In [14]:
# Largest, smallest and mean billing amount, computed with the ndarray's own reductions
max_billing = billing_array.max()
min_billing = billing_array.min()
mean_billing = billing_array.mean()

# NOTE(review): the recorded output shows a negative minimum; confirm whether
# negative billing amounts are valid (e.g. refunds) or data errors.
print("Max Billing:", max_billing)
print("Min Billing:", min_billing)
print("Average Billing:", mean_billing)

Max Billing: 52764.276736469175
Min Billing: -2008.4921398591305
Average Billing: 25539.316097211795


In [15]:
# Min-max normalize 'Billing Amount' to the 0-1 range:
#     normalized = (x - min) / (max - min)
# The values are read from the DataFrame column itself rather than from a
# NumPy array extracted in an earlier cell, so the result always matches the
# current rows of `df` (same length, same order) even if `df` changed in between.
billing = df['Billing Amount']
billing_min = billing.min()
billing_range = billing.max() - billing_min  # min is computed once and reused

if billing_range == 0:
    # All values are identical: max - min is 0 and the formula would divide by zero.
    # `billing - billing_min` maps every value to 0.0 (missing values stay missing).
    df['Billing_Normalized'] = billing - billing_min
else:
    df['Billing_Normalized'] = (billing - billing_min) / billing_range

print(df[['Billing Amount', 'Billing_Normalized']].head())

   Billing Amount  Billing_Normalized
0    18856.281306            0.380933
1    33643.327287            0.650904
2    27955.096079            0.547053
3    37909.782410            0.728798
4    14238.317814            0.296622


In [16]:
# Derive a categorical 'Age Group' column from 'Age' with vectorized NumPy logic.
# np.select evaluates the conditions in order and uses the first one that matches:
#   Age < 18          -> 'Child'
#   18 <= Age < 45    -> 'Adult'
#   everything else   -> 'Senior'
age = df['Age']
age_conditions = [age < 18, age < 45]
age_labels = ['Child', 'Adult']
df['Age Group'] = np.select(age_conditions, age_labels, default='Senior')

print(df[['Name', 'Age', 'Age Group']].head())

            Name  Age Age Group
0  Bobby JacksOn   30     Adult
1   LesLie TErRy   62    Senior
2    DaNnY sMitH   76    Senior
3   andrEw waTtS   28     Adult
4  adrIENNE bEll   43     Adult
