# Methods and imports

In [1]:
import numpy as np
import pandas as pd
from tabulate import tabulate

def mean_gender(gender):
    return np.nanmean(gender_heart[1][gender_heart[0] == gender])# mean of the heart rate of a specefic gender
    
def std_gender(gender):
    return np.nanstd(gender_heart[1][gender_heart[0] == gender]) # std of the heart rate of a specefic gender

def IQR(arr):
    return np.subtract(np.nanpercentile(arr, 75, axis=0), np.nanpercentile(arr, 25, axis=0))

def display(data, header='keys', format='psql'): # header = 'firstrow'
    return tabulate(data, headers=header, tablefmt=format) # tablefmt='fancy_grid'

def total_blood_pressure(Systolic, Diastolic):
    return np.nansum([Systolic , Diastolic], axis=0)

def matrix_distances(matrix):
    return np.nansum((matrix[:, np.newaxis, :] - matrix[np.newaxis, :, :]) ** 2, axis=-1)

### 1 - Read the "heartFailureDataset.txt" dataset into a Numpy array. You can use the appropriate Numpy function for reading tabular data.


In [2]:
try:
    file = pd.read_csv('HeartFailureDataset 1.csv') # read the csv file
    arr_data = np.array(file.values)     #store the data without attributes in a numpy array
    arr_features = np.array(file.columns.tolist())   #store the attributes only in a numpy array
    
except (Exception, BaseException):
    print('Error, search about it  ♠☻♠')

### 2 -  Calculate the mean and standard deviation of the “heart rate” values for each “gender” across all patients. Then, identify the “gender: with the lowest mean heart rate.

In [3]:
gender_heart = np.array([arr_data[:,3], arr_data[:,14]], dtype = float) # [[gender_column],[heart...]]
# take the genderia and heart rate and put them together in a numpy array of float dtype to deal with them

mean_1 , mean_2 = mean_gender(1) , mean_gender(2) # To not calculate it again in the comparison part

print(f'mean gender 1 = {mean_1}')
print(f'standard deviation gender 1 = {std_gender(1)}\n')
print(f'mean gender 2 = {mean_2}')
print(f'standard deviation gender 1 = {std_gender(2)}\n')
print(f"The gender of the lowest mean heart rate is {'2' if mean_1 > mean_2 else '1'}") # comparison part

mean gender 1 = 84.80731991110707
standard deviation gender 1 = 16.072054208238853

mean gender 2 = 84.36778837735726
standard deviation gender 1 = 15.954616665632308

The gender of the lowest mean heart rate is 2


### 3 - Calculate the median and interquartile range of all features. Then, identify the feature with the smallest and largest interquartile range.

**(axis = 0) ==> column ||  (axis = 1) ==> row**

In [4]:
median_columns = np.nanmedian(arr_data[:,1:], axis=0)
IQR_columns = IQR(arr_data[:,1:]) #depending on the box plot concept....

# print(np.shape(arr_features))
# print(np.shape(IQR_columns))

max_feature = arr_features[np.argmax(IQR_columns) + 1] # +1 to avoid the ID from arr_feature
min_feature = arr_features[np.argmin(IQR_columns) + 1] # +1 to avoid the ID from arr_feature

print(f'The max interquartil = {max_feature} and the min interquartil = {min_feature}\n')

print(display({"Median": median_columns, "IQR": IQR_columns}))

The max interquartil = NT-proBNP and the min interquartil = outcome

+------------+---------------+
|     Median |           IQR |
|------------+---------------|
|    0       |     0         |
|   77       |    20         |
|    2       |     1         |
|   28.3125  |     9.30705   |
|    1       |     1         |
|    0       |     1         |
|    0       |     0         |
|    0       |     1         |
|    0       |     1         |
|    0       |     0         |
|    0       |     1         |
|    0       |     1         |
|    0       |     0         |
|   83.6108  |    23.5359    |
|  116.128   |    23.2337    |
|   58.4615  |    13.2904    |
|   20.3723  |     5.46551   |
|   36.6508  |     0.735946  |
|   96.4523  |     2.9175    |
| 1675       |  1520         |
|   30.8     |     6.8525    |
|    3.49    |     0.78      |
|   29.75    |     2.99      |
|   32.9857  |     1.81389   |
|   90       |     7.60714   |
|   15.5062  |     2.4775    |
|    9.68    |     5.3       |
|

### 4 - Using the np.sum function, calculate the total “blood pressure” (“Systolic blood  pressure” and “Diastolic blood pressure”) value for each patient. Then, print the top 10 “patient IDs” with the highest blood pressure values.

In [5]:
# print(arr_features[15]) # ==> Systolic blood pressure
# print(arr_features[16]) # ==> Diastolic blood pressure
total_blood = total_blood_pressure(arr_data[:,15], arr_data[:,16])
IDs = np.array(arr_data[:,0]).astype(int) #take the IDs and put them in a seperate numpy array
indexes = np.argpartition(total_blood, -10, axis=0)[-10:] #Take the top 10 indeces of the total_blood

print(display({'IDs_Top10':IDs[indexes], 'Total_blood':total_blood[indexes]}))

+-------------+---------------+
|   IDs_Top10 |   Total_blood |
|-------------+---------------|
|      114085 |       248.96  |
|      184453 |       249.909 |
|      196981 |       251.97  |
|      118932 |       258.8   |
|      196856 |       259.739 |
|      163199 |       280.031 |
|      190823 |       268.842 |
|      169263 |       258.278 |
|      178001 |       283.2   |
|      189112 |       252.391 |
+-------------+---------------+


### 5 - Sort the patients in descending order based on their “glucose” values. Print the patient IDs of the top 10 patients based on their "glucose" values.

In [6]:
# print(arr_features[38])  # ==>  glucose
glucose = np.argsort(arr_data[:,38], axis=0)[::-1] 
max_val = np.nanargmax(arr_data[:,38][glucose]) # index of the largest value

# for i in arr_data[:,38][glucose]:
#     print(i)
# print(arr_data[:,38][glucose][max_val]) #414.1

IDs_top_10 = IDs[glucose][max_val:max_val+10]
glucose_top_10 = arr_data[:,38][glucose][max_val:max_val+10]

print(display({'IDs_Top10':IDs_top_10, 'Values':glucose_top_10}))

+-------------+----------+
|   IDs_Top10 |   Values |
|-------------+----------|
|      194828 |  414.1   |
|      173649 |  369     |
|      121701 |  365.75  |
|      159518 |  352.333 |
|      193287 |  349.667 |
|      114936 |  348.444 |
|      101851 |  345.2   |
|      166548 |  342.5   |
|      192990 |  339.909 |
|      174478 |  338.846 |
+-------------+----------+


### 6 - Using fancy indexing, substitute the "Creatinine" values of the top 5 patients, with the corresponding median value of that feature.

In [7]:
# print(arr_features[36]) # ==> creatinine column
median_creatinine = np.nanmedian(arr_data[:,36], axis=0) # ==> 1.2875
indexes = np.argpartition(arr_data[:,36], -5, axis=0)[-5:] # get the indexes of the top 5

top_5_before_edit = arr_data[:,36][indexes]
arr_data[indexes,36] = median_creatinine

print(display({'IDs':IDs[indexes],'Top 5 before edit':top_5_before_edit,'Top 5 after edit':arr_data[:,36][indexes]}))

+--------+---------------------+--------------------+
|    IDs |   Top 5 before edit |   Top 5 after edit |
|--------+---------------------+--------------------|
| 182383 |              9.0125 |             1.2875 |
| 154590 |              9.2    |             1.2875 |
| 125433 |             12.45   |             1.2875 |
| 194346 |             15.5273 |             1.2875 |
| 122477 |             12.8375 |             1.2875 |
+--------+---------------------+--------------------+


### 7 - Using np.argpartition, identify the top 100 values of each feature. Subsequently, compute the mean of each feature, considering only the 100 highest values.

In [8]:
arr_without_nan = np.nan_to_num(arr_data[:,1:], nan=0) # to remove the warning

indexes_top_100 = np.argpartition(arr_without_nan, -100, axis=0)[-100:]
top_100_values = np.take_along_axis(arr_without_nan, indexes_top_100, axis=0)
mean_for_top_100 = np.mean(top_100_values, axis=0)

print(display({'mean':mean_for_top_100, 'feature':arr_features[1:]}))

+-------------+--------------------------+
|        mean | feature                  |
|-------------+--------------------------|
|     1       | outcome                  |
|    90.37    | age                      |
|     2       | gendera                  |
|    50.3714  | BMI                      |
|     1       | hypertensive             |
|     1       | atrialfibrillation       |
|     1       | CHD with no MI           |
|     1       | diabetes                 |
|     1       | deficiencyanemias        |
|     1       | depression               |
|     1       | Hyperlipemia             |
|     1       | Renal failure            |
|     0.89    | COPD                     |
|   115.36    | heart rate               |
|   154.173   | Systolic blood pressure  |
|    82.072   | Diastolic blood pressure |
|    28.8595  | Respiratory rate         |
|    37.8776  | temperature              |
|    99.5513  | SP O2                    |
|  4791.88    | Urine output             |
|    43.435

### 8 - Calculate the mean and standard deviation of “Respiratory rate” values. Then, identify the patient IDs with 2 standard deviation away from the mean of “Respiratory rate” feature.


In [9]:
# print(arr_features[17])  # ==>  Respiratory rate
mean_Respiratory = np.nanmean(arr_data[:,17], axis=0)
std_Respiratory = np.nanstd(arr_data[:,17], axis=0)

lower , upper = mean_Respiratory - 2 * std_Respiratory , mean_Respiratory + 2 * std_Respiratory
range_out = np.where((arr_data[:,17] < lower) | (arr_data[:,17] > upper))

print(display({'IDs with 2 std away':IDs[range_out]}))

+-----------------------+
|   IDs with 2 std away |
|-----------------------|
|                127360 |
|                191289 |
|                116888 |
|                113812 |
|                138440 |
|                190054 |
|                153366 |
|                116367 |
|                133975 |
|                152960 |
|                145790 |
|                108084 |
|                107777 |
|                126474 |
|                107462 |
|                141222 |
|                130354 |
|                128899 |
|                180135 |
|                173649 |
|                151364 |
|                154468 |
|                155273 |
|                109577 |
|                153207 |
|                166585 |
|                155044 |
|                110347 |
|                120626 |
|                128969 |
|                133499 |
|                192198 |
|                110335 |
|                175630 |
|                166387 |
|           

### 9 - Compute the distance matrix among patients and determine the k-NN (k-nearest neighbors), where k = 3, for each patient using the np.argpartition function.

In [10]:
distances = matrix_distances(arr_data[:,1:])
K = 3
nearest_partition = np.argpartition(distances, K+1, axis=1)
nearest_partition

array([[ 875,    0,  835, ..., 1174, 1175, 1176],
       [   1,  959,   27, ..., 1174, 1175, 1176],
       [   2,  735, 1013, ..., 1174, 1175, 1176],
       ...,
       [1174,  817,  816, ...,    7,    3,  588],
       [ 822, 1176,  919, ...,    7,    3,  588],
       [ 558, 1176,  643, ...,    7,    3,  588]], dtype=int64)

### 10 - Compute the Pearson correlation coefficient between each pair of patients. Subsequently, identify the pair of patients with the highest correlation coefficient. Provide the correlation matrix, which should be a square matrix.


In [11]:
corr = np.corrcoef(arr_data[:,1:], rowvar=False)
np.fill_diagonal(corr, 0)

index = np.nanargmax(corr) # take the highest correlation coefficient index as ravel index from the corr
row, col = np.unravel_index(index, corr.shape) # we want to unravel this index to row and column

value = corr[row, col]

print(f"({row},{col})")
print(value)

(20,21)
0.8980516424873554
