In [28]:
import pandas as pd

# NOTE(review): absolute, machine-specific path -- the cell only works on a
# machine with exactly this folder layout. Consider a configurable data dir.
file_path = r"C:\Users\ashraf deen\OneDrive\Desktop\Heart Disease\Heart-Disease-Prediction---Cognitives\Datasets\dataset4.csv"

# Column names imposed on the CSV; with header=0 they replace the names found
# in the file's own first row.
column_names = ['id', 'age', 'gender', 'height', 'weight', 'ap_hi', 'ap_lo', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

# Feature groups used by later cells. They are defined outside the try block so
# they exist even when loading fails and `df` ends up as None.
categorical_features = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']
numerical_features = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

try:
    # The file is semicolon-separated; its first row is treated as a header.
    df = pd.read_csv(file_path, sep=';', names=column_names, header=0)

    print(f"Successfully loaded data from {file_path}")
    # Display the first few rows
    display(df.head())
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in display() (that only rendered a stray "None").
    df.info()
    # Check for and report any missing values
    display(df.isnull().sum())

    print("\nPotential categorical features:")
    print(categorical_features)

    print("\nPotential numerical features:")
    print(numerical_features)

    # Discuss potential preprocessing steps
    print("\nPotential Preprocessing Steps:")
    print("For categorical features (gender, cholesterol, gluc, smoke, alco, active, cardio):")
    print("- One-Hot Encoding or Label Encoding depending on the nature of the feature and the model.")
    print("- Handling potential inconsistencies in categorical values if any are discovered.")
    print("For numerical features (age, height, weight, ap_hi, ap_lo):")
    print("- Scaling (e.g., StandardScaler or MinMaxScaler) to bring features to a similar range.")
    print("- Handling outliers in features like 'ap_hi' and 'ap_lo' (blood pressure) which can have physiologically impossible values.")
    print("- Checking for and addressing potential data entry errors or inconsistencies.")

except FileNotFoundError:
    print(f"Error: The file '{file_path}' was not found. Please ensure the file is in the correct directory.")
    df = None
except Exception as e:
    print(f"An error occurred during file loading: {e}")
    df = None

Successfully loaded data from C:\Users\ashraf deen\OneDrive\Desktop\Heart Disease\Heart-Disease-Prediction---Cognitives\Datasets\dataset4.csv


Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           70000 non-null  int64  
 1   age          70000 non-null  int64  
 2   gender       70000 non-null  int64  
 3   height       70000 non-null  int64  
 4   weight       70000 non-null  float64
 5   ap_hi        70000 non-null  int64  
 6   ap_lo        70000 non-null  int64  
 7   cholesterol  70000 non-null  int64  
 8   gluc         70000 non-null  int64  
 9   smoke        70000 non-null  int64  
 10  alco         70000 non-null  int64  
 11  active       70000 non-null  int64  
 12  cardio       70000 non-null  int64  
dtypes: float64(1), int64(12)
memory usage: 6.9 MB


None

id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64


Potential categorical features:
['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active', 'cardio']

Potential numerical features:
['age', 'height', 'weight', 'ap_hi', 'ap_lo']

Potential Preprocessing Steps:
For categorical features (gender, cholesterol, gluc, smoke, alco, active, cardio):
- One-Hot Encoding or Label Encoding depending on the nature of the feature and the model.
- Handling potential inconsistencies in categorical values if any are discovered.
For numerical features (age, height, weight, ap_hi, ap_lo):
- Scaling (e.g., StandardScaler or MinMaxScaler) to bring features to a similar range.
- Handling outliers in features like 'ap_hi' and 'ap_lo' (blood pressure) which can have physiologically impossible values.
- Checking for and addressing potential data entry errors or inconsistencies.


In [29]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Check for missing values
print("Checking for missing values...")
missing_values = df.isnull().sum()
print(missing_values)

if missing_values.sum() > 0:
    print("\nHandling missing values...")
    # Numerical columns: fill gaps with the column mean.
    # The result is assigned back to the column instead of calling
    # fillna(inplace=True) on df[col]: the in-place form works on an
    # intermediate selection, which is deprecated chained assignment in
    # pandas 2.x and leaves df unchanged under copy-on-write.
    for col in numerical_features:
        if col in df.columns and df[col].isnull().any():
            df[col] = df[col].fillna(df[col].mean())
    # Categorical columns: fill gaps with the most frequent value.
    for col in categorical_features:
        if col in df.columns and df[col].isnull().any():
            df[col] = df[col].fillna(df[col].mode()[0])
    print("Missing values handled.")
else:
    print("\nNo missing values found.")


# One transformer per feature group: standardisation for the numeric columns,
# one-hot encoding for the coded categorical columns.
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Restrict both feature lists to columns that are really present in df.
# 'id' is explicitly kept out of the scaled columns.
numerical_features_for_scaling = [
    name for name in numerical_features
    if name != 'id' and name in df.columns
]
categorical_features_in_df = [
    name for name in categorical_features
    if name in df.columns
]
# NOTE(review): categorical_features contains the target 'cardio', so it is
# one-hot encoded together with the predictors -- confirm this is intended
# before df_processed is used for modelling.

# Route each column group to its transformer. Columns named in neither group
# (here: 'id') are appended unchanged because of remainder='passthrough'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features_for_scaling),
        ('cat', categorical_transformer, categorical_features_in_df),
    ],
    remainder='passthrough',
)

# Single-step pipeline wrapping the column transformer, fitted and applied to df.
preprocess_pipeline = Pipeline(steps=[('preprocessor', preprocessor)])
df_processed = preprocess_pipeline.fit_transform(df)

# Report the shape and a small sample of the transformed data. The result is
# an array, not a DataFrame, so the columns no longer carry names here.
print("\nShape of the processed data:")
print(df_processed.shape)

print("\nFirst 5 rows of the processed data (numpy array):")
print(df_processed[:5])

# Feature names after one-hot encoding are not extracted in this cell; only
# the raw processed array is shown.

Checking for missing values...
id             0
age            0
gender         0
height         0
weight         0
ap_hi          0
ap_lo          0
cholesterol    0
gluc           0
smoke          0
alco           0
active         0
cardio         0
dtype: int64

No missing values found.

Shape of the processed data:
(70000, 22)

First 5 rows of the processed data (numpy array):
[[-0.43606151  0.44345206 -0.84787326 -0.12218198 -0.0882385   0.
   1.          1.          0.          0.          1.          0.
   0.          1.          0.          1.          0.          0.
   1.          1.          0.          0.        ]
 [ 0.30768633 -1.01816804  0.74983117  0.07261016 -0.03517999  1.
   0.          0.          0.          1.          1.          0.
   0.          1.          0.          1.          0.          0.
   1.          0.          1.          1.        ]
 [-0.24799666  0.07804703 -0.70894244  0.00767945 -0.14129701  1.
   0.          0.          0.          1.          1

In [30]:
# This code block focuses on preparing data for prompting a generative model.
# Actual LLM interaction will be in subsequent steps.

# Display the processed data (if you want to see how it looks before using it for prompts)
# print("\nProcessed Data (first 5 rows):")
# print(df_processed[:5])

# Example of how you might structure data from your dataset to create prompts.
# This is a conceptual example and will need to be adapted based on the specific LLM API used
# and the desired format of the input and output.

# Let's assume we want to create prompts that provide health advice based on a user's
# health attributes, using examples from the dataset to guide the LLM.

# First, let's look at the original DataFrame again to understand the structure
# display(df.head())

# We can create prompt examples by pairing health attributes with the 'cardio' outcome
# and potentially deriving some simple advice based on the attributes.

def create_health_summary_advice_prompt(health_data):
    """
    Creates a prompt for an LLM to generate a health summary and advice
    based on provided health data.

    The prompt consists of a fixed introduction, one "key: value" line per
    entry of ``health_data`` (in the dict's iteration order), and a fixed
    instruction paragraph.

    Args:
        health_data (dict): A dictionary containing the user's health attributes.

    Returns:
        str: The formatted prompt string.
    """
    attribute_lines = "".join(f"{key}: {value}\n" for key, value in health_data.items())

    # The final two instruction sentences are separate literals; the leading
    # space on the last one keeps them from running together ("attributes.Keep").
    return (
        "Based on the following health data, provide a health summary and advice:\n\n"
        f"{attribute_lines}"
        "\nProvide a summary of potential health risks based on this data, focusing on cardiovascular health. Then, offer actionable advice to maintain or improve health, considering the provided attributes."
        " Keep the summary concise and the advice practical."
    )

# Demonstration: build a prompt from the first row of the loaded DataFrame.
# Nothing is sent to an LLM here; that happens in a later step.
if df is None or df.empty:
    print("\nDataFrame is not loaded or is empty. Cannot generate example prompt.")
else:
    # The 'id' entry is dropped so the identifier does not end up in the prompt.
    first_individual_data = df.iloc[0].drop('id').to_dict()
    example_prompt = create_health_summary_advice_prompt(first_individual_data)

    print("\nExample Prompt generated from the first row of the dataset:")
    print(example_prompt)


Example Prompt generated from the first row of the dataset:
Based on the following health data, provide a health summary and advice:

age: 18393.0
gender: 2.0
height: 168.0
weight: 62.0
ap_hi: 110.0
ap_lo: 80.0
cholesterol: 1.0
gluc: 1.0
smoke: 0.0
alco: 0.0
active: 1.0
cardio: 0.0

Provide a summary of potential health risks based on this data, focusing on cardiovascular health. Then, offer actionable advice to maintain or improve health, considering the provided attributes.Keep the summary concise and the advice practical.


In [34]:
# Import the necessary library for interacting with the LLM
# This example uses Google's Generative AI library.
# You might need to install it: !pip install google-generativeai
import google.generativeai as genai
import os
# from google.colab import userdata
# --- LLM Setup ---
# Configure the API key.
# SECURITY: the key is read from the GOOGLE_API_KEY environment variable and is
# never written into the notebook. Any key that was previously embedded in a
# saved or shared copy of this notebook must be treated as leaked and revoked.
# (On Colab, the key can be copied from the secrets manager into this
# environment variable before running the cell.)
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")

if GOOGLE_API_KEY:
    try:
        genai.configure(api_key=GOOGLE_API_KEY)
    except Exception as e:
        print(f"An error occurred during API key configuration: {e}")
        genai = None  # the `genai is not None` guards below skip all LLM steps
else:
    print("API key not found. Please set the GOOGLE_API_KEY environment variable before running this cell.")
    genai = None  # the `genai is not None` guards below skip all LLM steps

# Print every model the API reports, together with the generation methods it
# supports. Skipped when the client was disabled (genai is None).
if genai is not None:
    print("Listing available models:")
    try:
        for model_info in genai.list_models():
            model_name = model_info.name
            methods = model_info.supported_generation_methods
            print(f"Name: {model_name}, Supported Generation Methods: {methods}")
    except Exception as err:
        # A failure while listing is reported but does not stop the cell.
        print(f"An error occurred while listing models: {err}")

# Create the generative model handle. `gemini_model` stays None whenever the
# client is unavailable or construction fails, so later code can test for it.
gemini_model = None
if genai is not None:
    try:
        # 'gemini-1.5-flash-latest' is a model that supports 'generateContent'.
        gemini_model = genai.GenerativeModel('gemini-1.5-flash-latest')
        print("Generative Model initialized.")
    except Exception as err:
        print(f"An error occurred during model initialization: {err}")
        gemini_model = None


# --- Summarizer and Advisor Function ---
def get_health_summary_and_advice(health_data):
    """
    Ask the LLM for a health summary and advice for one set of health attributes.

    The prompt is built with create_health_summary_advice_prompt and sent to
    the module-level ``gemini_model``. Failures are reported through the
    returned string rather than raised.

    Args:
        health_data (dict): A dictionary containing the user's health attributes.

    Returns:
        str: The text generated by the model, or a message describing why no
        text could be produced (model not initialized, or an error raised
        while calling the model / reading its response).
    """
    # Without a model there is nothing to call.
    if gemini_model is None:
        return "LLM is not initialized. Cannot generate summary and advice."

    llm_prompt = create_health_summary_advice_prompt(health_data)

    try:
        reply = gemini_model.generate_content(llm_prompt)
        generated_text = reply.text
    except Exception as err:
        return f"An error occurred during LLM interaction: {err}"

    return generated_text

# --- Example Usage (using data from the dataset) ---
# Both the dataset and the model are required; the unavailable one is reported.
if df is None or df.empty:
    print("\nDataFrame is not loaded or is empty. Cannot generate example summary and advice.")
elif gemini_model is None:
    print("\nGenerative Model is not initialized. Cannot generate example summary and advice.")
else:
    # First row of the dataset, without its 'id' entry, serves as the example.
    first_individual_data = df.iloc[0].drop('id').to_dict()
    print("\nGenerating health summary and advice for the first individual in the dataset...")
    health_summary_advice = get_health_summary_and_advice(first_individual_data)
    print("\n--- Health Summary and Advice ---")
    print(health_summary_advice)

Listing available models:
Name: models/embedding-gecko-001, Supported Generation Methods: ['embedText', 'countTextTokens']
Name: models/gemini-1.5-pro-latest, Supported Generation Methods: ['generateContent', 'countTokens']
Name: models/gemini-1.5-pro-002, Supported Generation Methods: ['generateContent', 'countTokens', 'createCachedContent']
Name: models/gemini-1.5-pro, Supported Generation Methods: ['generateContent', 'countTokens']
Name: models/gemini-1.5-flash-latest, Supported Generation Methods: ['generateContent', 'countTokens']
Name: models/gemini-1.5-flash, Supported Generation Methods: ['generateContent', 'countTokens']
Name: models/gemini-1.5-flash-002, Supported Generation Methods: ['generateContent', 'countTokens', 'createCachedContent']
Name: models/gemini-1.5-flash-8b, Supported Generation Methods: ['createCachedContent', 'generateContent', 'countTokens']
Name: models/gemini-1.5-flash-8b-001, Supported Generation Methods: ['createCachedContent', 'generateContent', 'count

In [None]:
def get_user_input_and_provide_advice():
    """
    Interactively collect health attributes, query the LLM, and print its answer.

    Each attribute is read with input() and converted with float(). The first
    value that cannot be converted aborts the session with an
    "Invalid input" message; any other exception is reported with its text.
    Nothing is returned.
    """
    # (dictionary key, console prompt) pairs, in the order they are asked.
    questions = (
        ('age', "Age (in days): "),
        ('gender', "Gender (1: female, 2: male): "),
        ('height', "Height (in cm): "),
        ('weight', "Weight (in kg): "),
        ('ap_hi', "Systolic blood pressure (ap_hi): "),
        ('ap_lo', "Diastolic blood pressure (ap_lo): "),
        ('cholesterol', "Cholesterol (1: normal, 2: above normal, 3: well above normal): "),
        ('gluc', "Glucose (1: normal, 2: above normal, 3: well above normal): "),
        ('smoke', "Smoking (0: no, 1: yes): "),
        ('alco', "Alcohol intake (0: no, 1: yes): "),
        ('active', "Physical activity (0: no, 1: yes): "),
        ('cardio', "Presence or absence of cardiovascular disease (0: no, 1: yes): "),
    )

    print("Please enter your health data:")

    try:
        user_health_data = {}
        for field, question in questions:
            user_health_data[field] = float(input(question))

        print("\nGenerating health summary and advice based on your input...")
        health_summary_advice = get_health_summary_and_advice(user_health_data)

        print("\n--- Your Health Summary and Advice ---")
        print(health_summary_advice)

    except ValueError:
        print("Invalid input. Please enter numerical values where expected.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Start the interactive session only when a model is available.
if gemini_model is None:
    print("\nGenerative Model is not initialized. Cannot test with user input.")
else:
    get_user_input_and_provide_advice()

Please enter your health data:



=== Enhanced Summary with Personalized Suggestions ===

You are a clinical AI assistant. Based on the patient details and risk predictions below, provide: A concise clinical summary of the patient's cardiovascular risk. At least three clear and actionable health suggestions tailored to the patient’s specific metrics, including diet, exercise, medication, and lifestyle changes.


KeyError: 'age_years'

['id;age;gender;height;weight;ap_hi;ap_lo;cholesterol;gluc;smoke;alco;active;cardio']
