In [1]:
# Import the relevant libraries (Pandas, os & Counter)

import pandas as pd
import os
from collections import Counter

In [2]:
# Name of the raw dataset CSV; it is only a path here, the file is read later by create_dataframe().
main_csv = 'diabetes_risk_prediction_raw_dataset.csv'

In [3]:
# Column names of the signs and symptoms whose 'Yes'/'No' answers are tallied
# by count_signs_and_symptoms_frequency().
signs_and_symptoms = (
    "Polyuria",
    "Polydipsia",
    "Sudden_weight_loss",
    "Weakness",
    "Polyphagia",
    "Genital_thrush",
    "Visual_blurring",
    "Itching",
    "Irritability",
    "Delayed_healing",
    "Partial_paresis",
    "Muscle_stiffness",
    "Alopecia",
    "Obesity",
)


In [4]:
# create the main dataframe using the csv file

def create_dataframe(csv):
    """Load a CSV into a pandas DataFrame.

    Parameters
    ----------
    csv
        Passed unchanged to ``pd.read_csv`` (a path or a file-like object).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``csv``.
    """
    return pd.read_csv(csv)

In [5]:
# We'll split the population into 2 groups: those aged 45 years or below and those above 45 years

def split_population_by_age(df, age_threshold=45):
    """Split ``df`` into two age groups around ``age_threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``'Age'`` column.
    age_threshold : int or float, default 45
        Boundary age. The default reproduces the original fixed 45-year split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(rows with Age <= age_threshold, rows with Age > age_threshold)``.
        The first group includes the boundary age itself, even though callers
        label it "below". Rows with a missing ``Age`` satisfy neither
        comparison and therefore appear in neither group.
    """
    ages = df['Age']
    df_at_or_below = df.loc[ages <= age_threshold]  # boundary age belongs to this group
    df_above = df.loc[ages > age_threshold]
    return df_at_or_below, df_above

In [6]:
# We'll split the population by gender: males and females

def split_population_by_gender(df):
    """Partition ``df`` by the value of its ``'Gender'`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(rows where Gender == 'Male', rows where Gender == 'Female')``.
        Rows holding any other value end up in neither frame.
    """
    gender = df['Gender']
    is_male = gender == 'Male'
    is_female = gender == 'Female'
    return df.loc[is_male], df.loc[is_female]

In [7]:
# We'll split the population by the patients' diabetes status (positive or negative)

def split_population_by_diabetes_status(df):
    """Partition ``df`` by the value of its ``'Status'`` column.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(rows where Status == 'Positive', rows where Status == 'Negative')``.
        Rows holding any other value end up in neither frame.
    """
    status = df['Status']
    positive_rows, negative_rows = (
        df.loc[status == label] for label in ('Positive', 'Negative')
    )
    return positive_rows, negative_rows

In [8]:
# Count the occurrences of positive and negative cases

def aggregate_patients_status(df):
    """Count the 'Positive' and 'Negative' entries of ``df['Status']``.

    Returns
    -------
    tuple of (int, int)
        ``(number of 'Positive' rows, number of 'Negative' rows)``; a label
        that never occurs is reported as 0.
    """
    tally = Counter(df['Status'])
    return tally['Positive'], tally['Negative']

In [9]:
# Build a summary dataframe after splitting the main dataframe based on age and gender

def build_summary_dataframe(dict_from_split):
    """Turn a column-name -> values mapping into a pandas DataFrame.

    Parameters
    ----------
    dict_from_split : dict
        Handed unchanged to the ``pd.DataFrame`` constructor.

    Returns
    -------
    pandas.DataFrame
    """
    return pd.DataFrame(data=dict_from_split)

In [10]:
# Save the summary dataframe to file in csv format

def save_dataframe_to_csv(df, csv_filename):
    """Write ``df`` to ``csv_filename`` unless something already exists at that path.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to save; the index is not written.
    csv_filename : str
        Destination path. It may include directory components.

    Returns
    -------
    bool
        Always ``True``, whether the file was written or skipped; the printed
        message is the only indication of which happened.
    """
    # os.path.exists checks the path itself. The former membership test
    # against os.listdir() only matched bare names in the current directory,
    # so any path with a directory part was silently overwritten.
    if os.path.exists(csv_filename):
        print(f'\n {csv_filename} already exists!')
    else:
        df.to_csv(csv_filename, index=False)
        print(f'\n {csv_filename} saved successfully.')
    return True

In [11]:
# Count the frequency of the signs and symptoms

def count_signs_and_symptoms_frequency(df, signs_and_symptoms):
    """Tally how often each symptom column holds 'Yes' and 'No'.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain one column per name in ``signs_and_symptoms``.
    signs_and_symptoms : iterable of str
        Column names to tally.

    Returns
    -------
    dict
        ``{'Present': ['Yes', 'No'], <symptom>: [yes_count, no_count], ...}``
        with symptoms in iteration order. The ``'Present'`` entry labels the
        two positions of every count list.
    """
    frequency = {'Present': ['Yes', 'No']}
    for symptom in signs_and_symptoms:
        tally = Counter(df[symptom])
        # Position 0: symptom present ('Yes'); position 1: symptom absent ('No').
        frequency[symptom] = [tally['Yes'], tally['No']]
    return frequency

In [12]:
def _report_symptom_frequencies(df, heading, csv_filename):
    """Print ``heading``, tally the Yes/No symptom counts of ``df`` and save them to ``csv_filename``."""
    print(heading)
    frequency_dict = count_signs_and_symptoms_frequency(df, signs_and_symptoms)
    frequency_df = build_summary_dataframe(frequency_dict)
    save_dataframe_to_csv(frequency_df, csv_filename)


def main():
    """Run the whole analysis: load the dataset, split it, print and save summaries.

    Steps, in order:
      1. Load ``main_csv`` and split it by age; print the positive/negative
         counts per age group and save the summary plus both age subsets.
      2. Split by gender; print the counts per gender and save the summary
         plus both gender subsets.
      3. For each of the six groups (two age groups, two genders, two
         diabetes statuses) print a heading and save the Yes/No frequency
         of every sign and symptom.

    Files are written through ``save_dataframe_to_csv``, which skips any
    file that already exists. Returns ``None``.
    """
    # Filenames used when saving CSV files to disk.
    csv_summaries_dict = {'Age':'Age_diabetes_summary.csv', 'Gender': 'Gender_diabetes_summary.csv'}    # summary based on age and gender
    csv_splits_age_dict = {'Below_45': 'Diabetes_status_below_45.csv', 'Above_45': 'Diabetes_status_above_45.csv'} # split by age
    csv_splits_gender_dict = {'Male': 'Diabetes_status_males.csv', 'Female': 'Diabetes_status_females.csv'} # split by gender
    # Filenames for the symptom-frequency tables of each group (age, gender and diabetes status).
    csv_signs_and_symptoms_frequency_dict = {'Below_45': 'Signs_and_symptoms_below_45.csv', 'Above_45': 'Signs_and_symptoms_above_45.csv',
                                             "Males": 'Signs_and_symptoms_males.csv', "Females": 'Signs_and_symptoms_females.csv',
                                             "Positive": "Signs_and_symptoms_positive_status.csv", "Negative": "Signs_and_symptoms_negative_status.csv"}

    # ---------------------------------------------------------------------------------------------
    # First split category: by age. NOTE: the "below 45" group also contains patients aged
    # exactly 45 (see split_population_by_age); the printed labels are kept as they were.
    main_df = create_dataframe(main_csv)
    df_below_45, df_above_45 = split_population_by_age(main_df)
    print('\n SUMMARY OF DIABETES STATUS BASED ON AGE ( <45 years)')
    positive_status_below_45, negative_status_below_45 = aggregate_patients_status(df_below_45)
    print(f'\n Number of positive cases (below 45) = {positive_status_below_45}')
    print(f'\n Number of negative cases (below 45) = {negative_status_below_45}')

    print('\n SUMMARY OF DIABETES STATUS BASED ON AGE ( >45 years)')
    positive_status_above_45, negative_status_above_45 = aggregate_patients_status(df_above_45)
    print(f'\n Number of positive cases (above 45) = {positive_status_above_45}')
    print(f'\n Number of negative cases (above 45) = {negative_status_above_45}')

    dict_by_age = {'Age': ['Below_45', 'Above_45'],
                   'Positive': [positive_status_below_45, positive_status_above_45],
                   'Negative': [negative_status_below_45, negative_status_above_45]}
    summary_df_age = build_summary_dataframe(dict_by_age)
    save_dataframe_to_csv(summary_df_age, csv_summaries_dict['Age']) # concise summary of diabetes status based on age
    save_dataframe_to_csv(df_below_45, csv_splits_age_dict['Below_45']) # all datapoints of the "below 45" group
    save_dataframe_to_csv(df_above_45, csv_splits_age_dict['Above_45']) # all datapoints of patients above 45

    # ---------------------------------------------------------------------------------------------
    # Second split category: by gender.
    df_males, df_females = split_population_by_gender(main_df)
    positive_status_males, negative_status_males = aggregate_patients_status(df_males)
    positive_status_females, negative_status_females = aggregate_patients_status(df_females)
    print('\n SUMMARY OF DIABETES STATUS BASED ON GENDER (MALES)')
    print(f'\n Number of male positive cases = {positive_status_males}')
    print(f'\n Number of male negative cases = {negative_status_males}')

    print('\n SUMMARY OF DIABETES STATUS BASED ON GENDER (FEMALES)')
    print(f'\n Number of female positive cases = {positive_status_females}')
    print(f'\n Number of female negative cases = {negative_status_females}')

    dict_by_gender = {'Gender': ['Male', 'Female'],
                      'Positive': [positive_status_males, positive_status_females],
                      'Negative': [negative_status_males, negative_status_females]}
    summary_df_gender = build_summary_dataframe(dict_by_gender)
    save_dataframe_to_csv(summary_df_gender, csv_summaries_dict['Gender']) # concise summary of diabetes status based on gender
    save_dataframe_to_csv(df_males, csv_splits_gender_dict['Male']) # all datapoints of males
    save_dataframe_to_csv(df_females, csv_splits_gender_dict['Female']) # all datapoints of females

    # ---------------------------------------------------------------------------------------------
    # Frequency of signs and symptoms per group. The third split (by diabetes status) is only
    # needed here. Every group goes through the same print / count / build / save sequence,
    # in the order listed.
    df_positive_status, df_negative_status = split_population_by_diabetes_status(main_df)
    symptom_reports = (
        (df_below_45, '\n SIGNS AND SYMPTOMS FOR PATIENTS BELOW 45YRS OF AGE', 'Below_45'),
        (df_above_45, '\n SIGNS AND SYMPTOMS FOR PATIENTS ABOVE 45YRS OF AGE', 'Above_45'),
        (df_males, '\n SIGNS AND SYMPTOMS FOR ALL MALE PATIENTS', 'Males'),
        (df_females, '\n SIGNS AND SYMPTOMS FOR ALL FEMALE PATIENTS', 'Females'),
        (df_positive_status, '\n SIGNS AND SYMPTOMS FOR ALL PATIENTS WITH POSITIVE DIABETES STATUS', 'Positive'),
        (df_negative_status, '\n SIGNS AND SYMPTOMS FOR ALL PATIENTS WITH NEGATIVE DIABETES STATUS', 'Negative'),
    )
    for group_df, heading, filename_key in symptom_reports:
        _report_symptom_frequencies(group_df, heading, csv_signs_and_symptoms_frequency_dict[filename_key])


# Run the full analysis only when this code is executed as the top-level program
# (__name__ is '__main__'); importing it as a module does not call main().
if __name__ == '__main__':
    main()




 SUMMARY OF DIABETES STATUS BASED ON AGE ( <45 years)

 Number of positive cases (below 45) = 128

 Number of negative cases (below 45) = 103

 SUMMARY OF DIABETES STATUS BASED ON AGE ( >45 years)

 Number of positive cases (above 45) = 192

 Number of negative cases (above 45) = 97

 Age_diabetes_summary.csv saved successfully.

 Diabetes_status_below_45.csv saved successfully.

 Diabetes_status_above_45.csv saved successfully.

 SUMMARY OF DIABETES STATUS BASED ON GENDER (MALES)

 Number of male positive cases = 147

 Number of male negative cases = 181

 SUMMARY OF DIABETES STATUS BASED ON GENDER (FEMALES)

 Number of female positive cases = 173

 Number of female negative cases = 19

 Gender_diabetes_summary.csv saved successfully.

 Diabetes_status_males.csv saved successfully.

 Diabetes_status_females.csv saved successfully.

 SIGNS AND SYMPTOMS FOR PATIENTS BELOW 45YRS OF AGE

 Signs_and_symptoms_below_45.csv saved successfully.

 SIGNS AND SYMPTOMS FOR PATIENTS ABOVE 45YRS OF