In [None]:
# Data Science for Medicine and Biology
#
# Nezu Life Sciences
#
# Feel free to modify, redistribute and above all,
# create something with this code.
#
# Tiago Lopes, PhD
# March 2024

# Import a large dataset from the Nezu Life Sciences repository
# Print its "shape" (number of rows and columns)
# and its first 5 rows.

import pandas as pd

URL = "https://raw.githubusercontent.com/Nezu-life/Machine_Learning_Course_English/main/Intro_to_Python/datasets/synthetic_patient_records.csv"

# Download the comma-separated file and parse it into a data frame.
patients = pd.read_csv(URL, sep=",")

# Report the frame's (rows, columns) shape and preview its first 5 rows.
print(patients.shape)
print(patients.head(5))



In [None]:
# Give the "Patient ID" column a name without a space ("Patient_ID")
column_renames = {"Patient ID": "Patient_ID"}
patients = patients.rename(columns=column_renames)

# Preview the first 5 rows to confirm the new column name
print(patients.head(n=5))


  Patient_ID   Age  Gender  Height  Weight Blood_Type  Cholesterol  \
0   PID-0001  69.0    Male   177.1   146.6         A-        220.2   
1   PID-0002   NaN    Male   192.1   105.8         B+          NaN   
2   PID-0003  89.0   Other   174.2    84.7         O+        136.3   
3   PID-0004  78.0  Female   141.8   108.0        AB-        147.8   
4   PID-0005  38.0    Male   195.9    63.9         A-        224.2   

   Blood_Pressure_h  Blood_Pressure_l  Diabetes  Smoker  
0             138.0              84.0     False   False  
1              99.0              81.0     False   False  
2             105.0              75.0      True    True  
3              97.0              88.0      True   False  
4             128.0              87.0     False   False  


In [None]:
# Adds a new column (BMI) to the data frame
#
# BMI is computed as Weight / Height ** 2 after Height has been divided
# by 100 (presumably a centimetre -> metre conversion; confirm the units
# of the raw data).
#
# The Height rescaling overwrites the column in place, so it must happen
# only once: without the guard below, re-running this cell would divide
# Height by 100 a second time and inflate every BMI value by 10,000.
# The presence of the BMI column is used as the marker that the rescaling
# has already been applied.

if "BMI" not in patients.columns:
    patients["Height"] = patients["Height"] / 100

# Rows with a missing Weight or Height yield NaN here.
patients["BMI"] = patients["Weight"] / patients["Height"] ** 2

print(patients.head(5))


  Patient_ID   Age  Gender  Height  Weight Blood_Type  Cholesterol  \
0   PID-0001  69.0    Male   1.771   146.6         A-        220.2   
1   PID-0002   NaN    Male   1.921   105.8         B+          NaN   
2   PID-0003  89.0   Other   1.742    84.7         O+        136.3   
3   PID-0004  78.0  Female   1.418   108.0        AB-        147.8   
4   PID-0005  38.0    Male   1.959    63.9         A-        224.2   

   Blood_Pressure_h  Blood_Pressure_l  Diabetes  Smoker        BMI  
0             138.0              84.0     False   False  46.740876  
1              99.0              81.0     False   False  28.670214  
2             105.0              75.0      True    True  27.911753  
3              97.0              88.0      True   False  53.711996  
4             128.0              87.0     False   False  16.650680  


In [None]:
# Discard the "Height" column from the data frame
patients = patients.drop(columns=["Height"])

# Preview the first 5 rows of the narrower frame
print(patients.head(n=5))


  Patient_ID   Age  Gender  Weight Blood_Type  Cholesterol  Blood_Pressure_h  \
0   PID-0001  69.0    Male   146.6         A-        220.2             138.0   
1   PID-0002   NaN    Male   105.8         B+          NaN              99.0   
2   PID-0003  89.0   Other    84.7         O+        136.3             105.0   
3   PID-0004  78.0  Female   108.0        AB-        147.8              97.0   
4   PID-0005  38.0    Male    63.9         A-        224.2             128.0   

   Blood_Pressure_l  Diabetes  Smoker        BMI  
0              84.0     False   False  46.740876  
1              81.0     False   False  28.670214  
2              75.0      True    True  27.911753  
3              88.0      True   False  53.711996  
4              87.0     False   False  16.650680  


In [None]:
# Keep only the rows whose "Smoker" value is not True,
# then renumber the remaining rows from 0 (the old index is discarded)
not_smoker = patients["Smoker"].ne(True)
patients = patients.loc[not_smoker].reset_index(drop=True)

print(patients)


    Patient_ID   Age  Gender  Weight Blood_Type  Cholesterol  \
0     PID-0001  69.0    Male   146.6         A-        220.2   
1     PID-0002   NaN    Male   105.8         B+          NaN   
2     PID-0004  78.0  Female   108.0        AB-        147.8   
3     PID-0005  38.0    Male    63.9         A-        224.2   
4     PID-0006  92.0   Other    94.4         O-        231.2   
..         ...   ...     ...     ...        ...          ...   
501   PID-0994  63.0   Other   148.1         O-        215.3   
502   PID-0995  29.0  Female   115.0         A-        232.9   
503   PID-0997  79.0  Female   121.5         B+        209.0   
504   PID-0998  99.0  Female     NaN         B+        216.6   
505   PID-0999  77.0  Female    55.3         B+        221.4   

     Blood_Pressure_h  Blood_Pressure_l  Diabetes  Smoker        BMI  
0               138.0              84.0     False   False  46.740876  
1                99.0              81.0     False   False  28.670214  
2                9

In [None]:
# Count the missing (NaN, "not a number") entries in each column
missing_per_column = patients.isna().sum()
print(missing_per_column)


Patient_ID          18
Age                 12
Gender              15
Weight               7
Blood_Type           9
Cholesterol         10
Blood_Pressure_h    13
Blood_Pressure_l    13
Diabetes             0
Smoker               0
BMI                 23
dtype: int64


In [None]:
# Discard every row whose Patient_ID is missing (NaN),
# then renumber the surviving rows from 0
has_patient_id = patients["Patient_ID"].notna()
patients = patients.loc[has_patient_id].reset_index(drop=True)

print(patients)


    Patient_ID   Age  Gender  Weight Blood_Type  Cholesterol  \
0     PID-0001  69.0    Male   146.6         A-        220.2   
1     PID-0002   NaN    Male   105.8         B+          NaN   
2     PID-0004  78.0  Female   108.0        AB-        147.8   
3     PID-0005  38.0    Male    63.9         A-        224.2   
4     PID-0006  92.0   Other    94.4         O-        231.2   
..         ...   ...     ...     ...        ...          ...   
483   PID-0994  63.0   Other   148.1         O-        215.3   
484   PID-0995  29.0  Female   115.0         A-        232.9   
485   PID-0997  79.0  Female   121.5         B+        209.0   
486   PID-0998  99.0  Female     NaN         B+        216.6   
487   PID-0999  77.0  Female    55.3         B+        221.4   

     Blood_Pressure_h  Blood_Pressure_l  Diabetes  Smoker        BMI  
0               138.0              84.0     False   False  46.740876  
1                99.0              81.0     False   False  28.670214  
2                9

In [None]:
# Load two patient-record files and stack them into a single data frame
URL_1 = "https://raw.githubusercontent.com/Nezu-life/Machine_Learning_Course_English/main/Intro_to_Python/datasets/synthetic_patient_records.csv"
URL_2 = "https://raw.githubusercontent.com/Nezu-life/Machine_Learning_Course_English/main/Intro_to_Python/datasets/synthetic_patient_records_b.csv"

patients_1 = pd.read_csv(URL_1, sep=",")
patients_2 = pd.read_csv(URL_2, sep=",")

# Stack the rows of both frames; ignore_index=True numbers the combined
# rows 0..n-1 instead of keeping each file's own row labels.
# pd.concat aligns on column names, and a column present in only one
# frame is filled with NaN for the other frame's rows.
# NOTE(review): the recorded output lists 12 columns, including a
# "Blood_Pressure" column that is NaN for the first rows - presumably the
# second file uses a different column layout; confirm before analysing.
patients_final = pd.concat([patients_1, patients_2], ignore_index=True)

# Report the combined (rows, columns) shape and preview the first 5 rows
print(patients_final.shape)

print(patients_final.head(n=5))

(2000, 12)
  Patient ID   Age  Gender  Height  Weight Blood_Type  Cholesterol  \
0   PID-0001  69.0    Male   177.1   146.6         A-        220.2   
1   PID-0002   NaN    Male   192.1   105.8         B+          NaN   
2   PID-0003  89.0   Other   174.2    84.7         O+        136.3   
3   PID-0004  78.0  Female   141.8   108.0        AB-        147.8   
4   PID-0005  38.0    Male   195.9    63.9         A-        224.2   

   Blood_Pressure_h  Blood_Pressure_l  Diabetes  Smoker Blood_Pressure  
0             138.0              84.0     False   False            NaN  
1              99.0              81.0     False   False            NaN  
2             105.0              75.0      True    True            NaN  
3              97.0              88.0      True   False            NaN  
4             128.0              87.0     False   False            NaN  


In [None]:
# Lesson exercise: discard the "Weight" column from the data frame
patients = patients.drop(columns=["Weight"])

# Preview the first 5 rows of the result
print(patients.head(n=5))


  Patient_ID   Age  Gender Blood_Type  Cholesterol  Blood_Pressure_h  \
0   PID-0001  69.0    Male         A-        220.2             138.0   
1   PID-0002   NaN    Male         B+          NaN              99.0   
2   PID-0004  78.0  Female        AB-        147.8              97.0   
3   PID-0005  38.0    Male         A-        224.2             128.0   
4   PID-0006  92.0   Other         O-        231.2               NaN   

   Blood_Pressure_l  Diabetes  Smoker        BMI  
0              84.0     False   False  46.740876  
1              81.0     False   False  28.670214  
2              88.0      True   False  53.711996  
3              87.0     False   False  16.650680  
4               NaN      True   False  28.687812  
