### Load Data

In [None]:
import numpy as np
import pandas as pd

# Download the CSV files, copy them into the runtime (or your Google Drive),
# then change the paths below to wherever the files are located.

data_path = "/content/train.csv"
test_path = "/content/test.csv"
# Alternative training-file location when running on Kaggle:
# data_path = "/kaggle/input/hearth-disease-recognition/train.csv"

# Training data and the rows that predictions are later generated for.
df = pd.read_csv(data_path)
df_submission = pd.read_csv(test_path)

In [None]:
# Preview the first rows of the raw training data
df.head()

Unnamed: 0,ID,History of HeartDisease or Attack,High Blood Pressure,Told High Cholesterol,Cholesterol Checked,Body Mass Index,Smoked 100+ Cigarettes,Diagnosed Stroke,Diagnosed Diabetes,Leisure Physical Activity,Heavy Alcohol Consumption,Health Care Coverage,Doctor Visit Cost Barrier,General Health,Difficulty Walking,Sex,Education Level,Income Level,Age,Vegetable or Fruit Intake (1+ per Day)
0,train_000001,No,Yes,Yes,Yes,40.68,Yes,No,No,No,No,Yes,No,Very Poor,Yes,Female,High school graduate,"$15,000 to less than $20,000",64,Yes
1,train_000002,No,No,No,No,24.36,Yes,No,No,Yes,No,No,Yes,Fair,No,Female,College graduate,"Less than $10,000",50,No
2,train_000003,No,Yes,Yes,Yes,27.33,No,No,No,No,No,Yes,Yes,Very Poor,Yes,Female,High school graduate,"$75,000 or more",61,Yes
3,train_000004,No,Yes,No,Yes,27.01,No,No,No,Yes,No,Yes,No,Good,No,Female,Some high school,"$35,000 to less than $50,000",74,Yes
4,train_000005,,Yes,Yes,Yes,34.56,Yes,No,No,Yes,No,Yes,Yes,Very Poor,Yes,Male,Some high school,"$15,000 to less than $20,000",98,Yes


In [None]:
# Count training rows whose target label is missing
df["History of HeartDisease or Attack"].isnull().sum()

1694

In [None]:
# Rows without a target label cannot be used for supervised training.
# Set them aside in their own DataFrame. `.copy()` makes `dropped_rows` an
# independent frame, so adding encoded columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning (it would otherwise be a slice of `df`).
target_is_missing = df['History of HeartDisease or Attack'].isnull()
dropped_rows = df[target_is_missing].copy()

# Keep only the labeled rows in the working DataFrame
df = df.dropna(subset=["History of HeartDisease or Attack"])

# Sanity check: no missing target values should remain
null_count = df["History of HeartDisease or Attack"].isnull().sum()

print(f"Remaining null values in 'History of HeartDisease or Attack': {null_count}")

Remaining null values in 'History of HeartDisease or Attack': 0


As a first approach, we simply drop the rows whose target label is missing.

In [None]:
# List the column labels of the training frame
df.columns

Index(['ID', 'History of HeartDisease or Attack', 'High Blood Pressure',
       'Told High Cholesterol', 'Cholesterol Checked', 'Body Mass Index',
       'Smoked 100+ Cigarettes', 'Diagnosed Stroke', 'Diagnosed Diabetes',
       'Leisure Physical Activity', 'Heavy Alcohol Consumption',
       'Health Care Coverage', 'Doctor Visit Cost Barrier', 'General Health',
       'Difficulty Walking', 'Sex', 'Education Level', 'Income Level', 'Age',
       'Vegetable or Fruit Intake (1+ per Day)'],
      dtype='object')

In [None]:
# Show the dtype pandas inferred for each column
df.dtypes

Unnamed: 0,0
ID,object
History of HeartDisease or Attack,object
High Blood Pressure,object
Told High Cholesterol,object
Cholesterol Checked,object
Body Mass Index,float64
Smoked 100+ Cigarettes,object
Diagnosed Stroke,object
Diagnosed Diabetes,object
Leisure Physical Activity,object


In [None]:
# Descriptive statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,Body Mass Index,Age
count,209665.0,221390.0
mean,28.181723,54.660215
std,6.686984,17.773171
min,11.21,18.0
25%,23.87,42.0
50%,27.06,56.0
75%,31.08,67.0
max,98.63,100.0


In [None]:
# Class distribution of the target label
df["History of HeartDisease or Attack"].value_counts()

Unnamed: 0_level_0,count
History of HeartDisease or Attack,Unnamed: 1_level_1
No,203322
Yes,18068


In [None]:
# Partition the columns by dtype: 'object' columns hold text, all other
# columns are treated as numeric.
text_columns = [name for name in df.columns if df[name].dtype == 'object']
numeric_columns = [name for name in df.columns if not (df[name].dtype == 'object')]

print("Text Columns:", text_columns)
print("Numeric Columns:", numeric_columns)

Text Columns: ['ID', 'History of HeartDisease or Attack', 'High Blood Pressure', 'Told High Cholesterol', 'Cholesterol Checked', 'Smoked 100+ Cigarettes', 'Diagnosed Stroke', 'Diagnosed Diabetes', 'Leisure Physical Activity', 'Heavy Alcohol Consumption', 'Health Care Coverage', 'Doctor Visit Cost Barrier', 'General Health', 'Difficulty Walking', 'Sex', 'Education Level', 'Income Level', 'Vegetable or Fruit Intake (1+ per Day)']
Numeric Columns: ['Body Mass Index', 'Age']


In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Income brackets listed from the highest (code 0) down to the lowest (code 7).
# NOTE(review): the '($10,000 ...' label carries a leading parenthesis —
# presumably this mirrors the raw data's spelling; verify against the CSV.
income_order = [
    '$75,000 or more',
    '$50,000 to less than $75,000',
    '$35,000 to less than $50,000',
    '$25,000 to less than $35,000',
    '$20,000 to less than $25,000',
    '$15,000 to less than $20,000',
    '($10,000 to less than $15,000',
    'Less than $10,000',
]

# Bracket label -> position in `income_order`
income_mapping = dict(zip(income_order, range(len(income_order))))

# Add the integer-coded bracket to the training frame, the unlabeled rows
# and the submission frame alike.
for frame in (df, dropped_rows, df_submission):
    frame['Income_Level_Encoded'] = frame['Income Level'].map(income_mapping)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Income brackets in ascending order, so a larger ordinal code means a higher income.
income_levels_ascending = [
    'Less than $10,000',
    '($10,000 to less than $15,000',
    '$15,000 to less than $20,000',
    '$20,000 to less than $25,000',
    '$25,000 to less than $35,000',
    '$35,000 to less than $50,000',
    '$50,000 to less than $75,000',
    '$75,000 or more',
]
encoder = OrdinalEncoder(categories=[income_levels_ascending])

# Fit once on the training frame, then only *transform* the other frames so
# that a single fitted encoder defines the codes everywhere.
# `.ravel()` flattens the (n, 1) encoder output to a 1-D column.
df['Income_Level_Ordinal'] = encoder.fit_transform(df[['Income Level']]).ravel()

df_submission['Income_Level_Ordinal'] = encoder.transform(df_submission[['Income Level']]).ravel()

dropped_rows['Income_Level_Ordinal'] = encoder.transform(dropped_rows[['Income Level']]).ravel()

In [None]:
# Distinct ordinal codes produced for the income bracket
df['Income_Level_Ordinal'].unique()

array([2., 0., 7., 5., 3., 1., 6., 4.])

In [None]:
# Label Encoding

from sklearn.preprocessing import LabelEncoder

# Name of the target column; it only exists (with usable values) in `df`.
target_column = 'History of HeartDisease or Attack'

# Columns to integer-encode in the training frame (target + categorical features)
columns_to_encode = ['History of HeartDisease or Attack', 'High Blood Pressure', 'Income_Level_Encoded',
                     'Told High Cholesterol', 'Cholesterol Checked', 'Smoked 100+ Cigarettes',
                     'Diagnosed Stroke', 'Diagnosed Diabetes', 'Leisure Physical Activity',
                     'Heavy Alcohol Consumption', 'Health Care Coverage', 'Doctor Visit Cost Barrier',
                     'General Health', 'Difficulty Walking', 'Sex', 'Education Level',
                     'Vegetable or Fruit Intake (1+ per Day)']

# Same columns minus the target, for the frames that have no label to encode
submission_to_encode = [column for column in columns_to_encode if column != target_column]

# One fitted encoder per column, kept so the integer codes can be decoded later.
label_encoders = {}

# Fit each encoder on the TRAINING data only and reuse it for the other frames.
# Re-fitting on every frame (as before) lets the same category receive different
# integer codes in train and submission whenever the sets of observed values differ.
# `transform` raises ValueError if a frame contains a label never seen in training.
for column in columns_to_encode:
    encoder = LabelEncoder()
    df[column + '_encoded'] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder

    if column in submission_to_encode:
        df_submission[column + '_encoded'] = encoder.transform(df_submission[column])
        dropped_rows[column + '_encoded'] = encoder.transform(dropped_rows[column])

In [None]:
# Remove the raw columns now that their '_encoded' counterparts exist.
# Note: re-running this cell raises KeyError because the columns are already gone.
df = df.drop(columns=columns_to_encode)
df_submission = df_submission.drop(columns=submission_to_encode)
dropped_rows = dropped_rows.drop(columns=submission_to_encode)

In [None]:
# Preview the training frame after encoding
df.head()

Unnamed: 0,ID,Body Mass Index,Income Level,Age,Income_Level_Ordinal,History of HeartDisease or Attack_encoded,High Blood Pressure_encoded,Income_Level_Encoded_encoded,Told High Cholesterol_encoded,Cholesterol Checked_encoded,...,Diagnosed Diabetes_encoded,Leisure Physical Activity_encoded,Heavy Alcohol Consumption_encoded,Health Care Coverage_encoded,Doctor Visit Cost Barrier_encoded,General Health_encoded,Difficulty Walking_encoded,Sex_encoded,Education Level_encoded,Vegetable or Fruit Intake (1+ per Day)_encoded
0,train_000001,40.68,"$15,000 to less than $20,000",64,2.0,0,1,5,1,1,...,0,0,0,1,0,4,1,0,2,1
1,train_000002,24.36,"Less than $10,000",50,0.0,0,0,7,0,0,...,0,1,0,0,1,1,0,0,0,0
2,train_000003,27.33,"$75,000 or more",61,7.0,0,1,0,1,1,...,0,0,0,1,1,4,1,0,2,1
3,train_000004,27.01,"$35,000 to less than $50,000",74,5.0,0,1,2,0,1,...,0,1,0,1,0,2,0,0,5,1
5,train_000006,25.11,"$75,000 or more",67,7.0,0,1,0,1,1,...,0,1,0,1,0,2,0,1,0,1


In [None]:
# Distinct integer codes of the encoded target
df["History of HeartDisease or Attack_encoded"].unique()

array([0, 1])

In [None]:
# Distinct integer codes of the encoded income bracket
df["Income_Level_Encoded_encoded"].unique()

array([5, 7, 0, 2, 4, 6, 1, 3])

In [None]:
from sklearn.preprocessing import StandardScaler

# Columns to standardize. This replaces the dtype-derived `numeric_columns`
# list built earlier. 'Income_Level_Ordinal' is left unscaled.
# NOTE(review): the previous comment said "Don't include the income!" while the
# list does include 'Income_Level_Encoded_encoded' — confirm which was intended.
numeric_columns = ['Body Mass Index', 'Age', 'Income_Level_Encoded_encoded']

scaler = StandardScaler()

# Learn mean/std from the training frame only ...
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

# ... and apply those same statistics to the other frames. Re-fitting on each
# frame would scale the submission features differently from the data the
# model is trained on.
df_submission[numeric_columns] = scaler.transform(df_submission[numeric_columns])

dropped_rows[numeric_columns] = scaler.transform(dropped_rows[numeric_columns])

In [None]:
# Preview the training frame after standardization
df.head(5)

Unnamed: 0,ID,Body Mass Index,Income Level,Age,Income_Level_Ordinal,History of HeartDisease or Attack_encoded,High Blood Pressure_encoded,Income_Level_Encoded_encoded,Told High Cholesterol_encoded,Cholesterol Checked_encoded,...,Diagnosed Diabetes_encoded,Leisure Physical Activity_encoded,Heavy Alcohol Consumption_encoded,Health Care Coverage_encoded,Doctor Visit Cost Barrier_encoded,General Health_encoded,Difficulty Walking_encoded,Sex_encoded,Education Level_encoded,Vegetable or Fruit Intake (1+ per Day)_encoded
0,train_000001,1.86905,"$15,000 to less than $20,000",0.5255,2.0,0,1,1.371969,1,1,...,0,0,0,1,0,4,1,0,2,1
1,train_000002,-0.571518,"Less than $10,000",-0.262206,0.0,0,0,2.313398,0,0,...,0,1,0,0,1,1,0,0,0,0
2,train_000003,-0.127371,"$75,000 or more",0.356706,7.0,0,1,-0.981603,1,1,...,0,0,0,1,1,4,1,0,2,1
3,train_000004,-0.175225,"$35,000 to less than $50,000",1.088147,5.0,0,1,-0.040174,0,1,...,0,1,0,1,0,2,0,0,5,1
5,train_000006,-0.45936,"$75,000 or more",0.694294,7.0,0,1,-0.981603,1,1,...,0,1,0,1,0,2,0,1,0,1


In [None]:
# Count the missing values remaining in each column of the training frame
null_counts = df.isnull().sum()
print("Null values in each column:\n", null_counts)

Null values in each column:
 ID                                                    0
Body Mass Index                                   11725
Income Level                                          0
Age                                                   0
Income_Level_Ordinal                                  0
History of HeartDisease or Attack_encoded             0
High Blood Pressure_encoded                           0
Income_Level_Encoded_encoded                          0
Told High Cholesterol_encoded                         0
Cholesterol Checked_encoded                           0
Smoked 100+ Cigarettes_encoded                        0
Diagnosed Stroke_encoded                              0
Diagnosed Diabetes_encoded                            0
Leisure Physical Activity_encoded                     0
Heavy Alcohol Consumption_encoded                     0
Health Care Coverage_encoded                          0
Doctor Visit Cost Barrier_encoded                     0
General Health_enco

In [None]:
# Pairwise correlation matrix over the numeric columns only
correlation_matrix = df.select_dtypes(include=np.number).corr()

# Correlation of every numeric column with BMI (used to pick imputation features)
bmi_correlations = correlation_matrix['Body Mass Index']

# Print the correlations
print(bmi_correlations)

Body Mass Index                                   1.000000
Age                                               0.003893
Income_Level_Ordinal                             -0.081775
History of HeartDisease or Attack_encoded         0.056948
High Blood Pressure_encoded                       0.217642
Income_Level_Encoded_encoded                      0.081775
Told High Cholesterol_encoded                     0.001457
Cholesterol Checked_encoded                       0.084248
Smoked 100+ Cigarettes_encoded                    0.017905
Diagnosed Stroke_encoded                          0.024072
Diagnosed Diabetes_encoded                        0.208739
Leisure Physical Activity_encoded                -0.138235
Heavy Alcohol Consumption_encoded                -0.046348
Health Care Coverage_encoded                     -0.007983
Doctor Visit Cost Barrier_encoded                 0.048792
General Health_encoded                            0.151496
Difficulty Walking_encoded                        0.1910

In [None]:
# Rank the features by the strength of their correlation with BMI:
# take absolute values, then sort in descending order.
sorted_bmi_correlations = bmi_correlations.abs().sort_values(ascending=False)

sorted_bmi_correlations


Unnamed: 0,Body Mass Index
Body Mass Index,1.0
High Blood Pressure_encoded,0.217642
Diagnosed Diabetes_encoded,0.208739
Difficulty Walking_encoded,0.191098
General Health_encoded,0.151496
Leisure Physical Activity_encoded,0.138235
Cholesterol Checked_encoded,0.084248
Education Level_encoded,0.083545
Income_Level_Encoded_encoded,0.081775
Income_Level_Ordinal,0.081775


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# 1. Prepare the data for the BMI imputation model.
# NOTE(review): the feature list includes the classification target
# ('History of HeartDisease or Attack_encoded'); imputed BMI values therefore
# depend on the label that is predicted later — confirm this is intended.
bmi_feature_columns = [
    'High Blood Pressure_encoded',
    'Diagnosed Diabetes_encoded',
    'General Health_encoded',
    'Difficulty Walking_encoded',
    'Leisure Physical Activity_encoded',
    'History of HeartDisease or Attack_encoded',
]

# Only rows with a known BMI can be used to train the imputer.
df_train = df[df['Body Mass Index'].notna()]
X = df_train[bmi_feature_columns]
y = df_train['Body Mass Index']

# Hold out 20% of those rows for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted regressor used to impute BMI from the selected features
model = GradientBoostingRegressor(random_state=42)

# Fit on the training split and measure the error on the validation split.
# BMI was standardized in an earlier cell, so the MSE is in standardized units.
model.fit(X_train, y_train)
y_pred_val = model.predict(X_val)
mse = mean_squared_error(y_val, y_pred_val)
print(f"Mean Squared Error: {mse}")

# Show the validation predictions
print(y_pred_val)

Mean Squared Error: 0.8975238876030172
[ 0.13533188 -0.23750031 -0.19515408 ... -0.06664785 -0.25297796
  0.20978957]


In [None]:
# 4. Predict missing BMI values.
# The mask is computed once and reused both to select the rows and to write back.
bmi_is_missing = df['Body Mass Index'].isnull()
df_missing = df[bmi_is_missing]
X_missing = df_missing[[
    'High Blood Pressure_encoded',
    'Diagnosed Diabetes_encoded',
    'General Health_encoded',
    'Difficulty Walking_encoded',
    'Leisure Physical Activity_encoded',
    'History of HeartDisease or Attack_encoded',
]]
predicted_bmi = model.predict(X_missing)

# Fill the gaps in the training frame with the model's predictions
# (these are in standardized units, like the rest of the column).
# NOTE(review): only `df` is imputed here; no cell in view fills missing BMI
# in `df_submission` — confirm that is handled downstream.
df.loc[bmi_is_missing, 'Body Mass Index'] = predicted_bmi

In [None]:
# List the columns available for feature selection
df.keys()

Index(['ID', 'Body Mass Index', 'Income Level', 'Age', 'Income_Level_Ordinal',
       'History of HeartDisease or Attack_encoded',
       'High Blood Pressure_encoded', 'Income_Level_Encoded_encoded',
       'Told High Cholesterol_encoded', 'Cholesterol Checked_encoded',
       'Smoked 100+ Cigarettes_encoded', 'Diagnosed Stroke_encoded',
       'Diagnosed Diabetes_encoded', 'Leisure Physical Activity_encoded',
       'Heavy Alcohol Consumption_encoded', 'Health Care Coverage_encoded',
       'Doctor Visit Cost Barrier_encoded', 'General Health_encoded',
       'Difficulty Walking_encoded', 'Sex_encoded', 'Education Level_encoded',
       'Vegetable or Fruit Intake (1+ per Day)_encoded'],
      dtype='object')

In [None]:
# Target plus every candidate feature, used for the feature-importance check.
# Both income encodings ('Income_Level_Encoded_encoded' and 'Income_Level_Ordinal')
# are included.
keyschosen = ['Body Mass Index', 'Age',
       'History of HeartDisease or Attack_encoded',
       'High Blood Pressure_encoded',
        'Income_Level_Encoded_encoded',
        'Income_Level_Ordinal',
       'Told High Cholesterol_encoded', 'Cholesterol Checked_encoded',
       'Smoked 100+ Cigarettes_encoded', 'Diagnosed Stroke_encoded',
       'Diagnosed Diabetes_encoded', 'Leisure Physical Activity_encoded',
       'Heavy Alcohol Consumption_encoded', 'Health Care Coverage_encoded',
       'Doctor Visit Cost Barrier_encoded', 'General Health_encoded',
       'Difficulty Walking_encoded', 'Sex_encoded', 'Education Level_encoded',
       'Vegetable or Fruit Intake (1+ per Day)_encoded']

In [None]:
# Restrict the frame to the candidate columns, then separate the label from the features.
target_col = 'History of HeartDisease or Attack_encoded'
df_chosen_test = df[keyschosen]

X_forest = df_chosen_test.drop(columns=target_col)
y_forest = df_chosen_test[target_col]

# 80/20 split; this rebinds X_train / y_train from the BMI-imputation step.
X_train, X_test, y_train, y_test = train_test_split(
    X_forest, y_forest, test_size=0.2, random_state=42
)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train the Random Forest Classifier.
# A fixed random_state makes the forest — and therefore the feature
# importances below — reproducible across runs.
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Impurity-based importance of each feature, in the order of X_train.columns
feature_importances = rf_classifier.feature_importances_

# Print feature importances
for feature_name, importance in zip(X_train.columns, feature_importances):
  print(f"{feature_name}: {importance}")

Body Mass Index: 0.2849142163899669
Age: 0.21368634601119482
High Blood Pressure_encoded: 0.028127399016211117
Income_Level_Encoded_encoded: 0.051462089106463334
Income_Level_Ordinal: 0.05124182285013313
Told High Cholesterol_encoded: 0.025828291769934668
Cholesterol Checked_encoded: 0.007992269450936496
Smoked 100+ Cigarettes_encoded: 0.02087980202587731
Diagnosed Stroke_encoded: 0.027536598812935336
Diagnosed Diabetes_encoded: 0.02094665416898476
Leisure Physical Activity_encoded: 0.02774094816230919
Heavy Alcohol Consumption_encoded: 0.00867751933047226
Health Care Coverage_encoded: 0.008101429764631497
Doctor Visit Cost Barrier_encoded: 0.015170731420630922
General Health_encoded: 0.07132263795502429
Difficulty Walking_encoded: 0.026103673542667644
Sex_encoded: 0.022032851771286804
Education Level_encoded: 0.06724113305406164
Vegetable or Fruit Intake (1+ per Day)_encoded: 0.02099358539627783


In [None]:
# Tabulate the importances and order them from most to least important.
feature_importance_df = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)
print(feature_importance_df)

                                           Feature  Importance
0                                  Body Mass Index    0.284914
1                                              Age    0.213686
14                          General Health_encoded    0.071323
17                         Education Level_encoded    0.067241
3                     Income_Level_Encoded_encoded    0.051462
4                             Income_Level_Ordinal    0.051242
2                      High Blood Pressure_encoded    0.028127
10               Leisure Physical Activity_encoded    0.027741
8                         Diagnosed Stroke_encoded    0.027537
15                      Difficulty Walking_encoded    0.026104
5                    Told High Cholesterol_encoded    0.025828
16                                     Sex_encoded    0.022033
18  Vegetable or Fruit Intake (1+ per Day)_encoded    0.020994
9                       Diagnosed Diabetes_encoded    0.020947
7                   Smoked 100+ Cigarettes_encoded    0

---

In [None]:
# Final column selection for modelling: the target plus the retained features.
# Compared with the earlier list, 'Cholesterol Checked_encoded',
# 'Heavy Alcohol Consumption_encoded' and 'Health Care Coverage_encoded' are left out.
keyschosen = ['Body Mass Index', 'Age',
       'History of HeartDisease or Attack_encoded',
       'High Blood Pressure_encoded',
        'Income_Level_Encoded_encoded',
        'Income_Level_Ordinal',
       'Told High Cholesterol_encoded',
       'Smoked 100+ Cigarettes_encoded', 'Diagnosed Stroke_encoded',
       'Diagnosed Diabetes_encoded', 'Leisure Physical Activity_encoded',
       'General Health_encoded',
       'Difficulty Walking_encoded', 'Sex_encoded', 'Education Level_encoded',
       'Vegetable or Fruit Intake (1+ per Day)_encoded',
      'Doctor Visit Cost Barrier_encoded'
       ]
# Training frame restricted to the selected columns
df_chosen = df[keyschosen]
df_chosen.head()

Unnamed: 0,Body Mass Index,Age,History of HeartDisease or Attack_encoded,High Blood Pressure_encoded,Income_Level_Encoded_encoded,Income_Level_Ordinal,Told High Cholesterol_encoded,Smoked 100+ Cigarettes_encoded,Diagnosed Stroke_encoded,Diagnosed Diabetes_encoded,Leisure Physical Activity_encoded,General Health_encoded,Difficulty Walking_encoded,Sex_encoded,Education Level_encoded,Vegetable or Fruit Intake (1+ per Day)_encoded,Doctor Visit Cost Barrier_encoded
0,1.86905,0.5255,0,1,1.371969,2.0,1,1,0,0,0,4,1,0,2,1,0
1,-0.571518,-0.262206,0,0,2.313398,0.0,0,1,0,0,1,1,0,0,0,0,1
2,-0.127371,0.356706,0,1,-0.981603,7.0,1,0,0,0,0,4,1,0,2,1,1
3,-0.175225,1.088147,0,1,-0.040174,5.0,0,0,0,0,1,2,0,0,5,1,0
5,-0.45936,0.694294,0,1,-0.981603,7.0,1,1,0,0,1,2,0,1,0,1,0


In [None]:
# (rows, columns) of the selected training frame
df_chosen.shape

(221390, 17)

In [None]:
# Feature columns to pull from the submission frame (no target column there).
# NOTE(review): this list omits 'Income_Level_Encoded_encoded', which is part of
# `keyschosen`; a later cell rebuilds `subkeyschosen` / `df_submission_chosen`
# before predicting — confirm this earlier version is still needed.
subkeyschosen =['Body Mass Index', 'Age',
       'High Blood Pressure_encoded', 'Income_Level_Ordinal',
       'Told High Cholesterol_encoded', 'Diagnosed Stroke_encoded',
        'Leisure Physical Activity_encoded',
      'General Health_encoded',
       'Difficulty Walking_encoded', 'Sex_encoded', 'Education Level_encoded',
       'Vegetable or Fruit Intake (1+ per Day)_encoded',
       'Diagnosed Diabetes_encoded',
       'Doctor Visit Cost Barrier_encoded',
       'Smoked 100+ Cigarettes_encoded'
       ]
df_submission_chosen = df_submission[subkeyschosen]

In [None]:
# (rows, columns) of the selected submission frame
df_submission_chosen.shape

(74361, 15)

In [None]:
# Preview the selected submission features
df_submission_chosen.head()

Unnamed: 0,Body Mass Index,Age,High Blood Pressure_encoded,Income_Level_Ordinal,Told High Cholesterol_encoded,Diagnosed Stroke_encoded,Leisure Physical Activity_encoded,General Health_encoded,Difficulty Walking_encoded,Sex_encoded,Education Level_encoded,Vegetable or Fruit Intake (1+ per Day)_encoded,Diagnosed Diabetes_encoded,Doctor Visit Cost Barrier_encoded,Smoked 100+ Cigarettes_encoded
0,-0.539876,0.807178,1,3.0,1,0,1,2,0,0,4,1,0,0,0
1,0.102688,0.199267,1,6.0,0,0,0,1,0,0,0,0,0,0,1
2,1.034708,0.564014,1,0.0,1,0,0,1,1,0,4,1,0,0,1
3,-0.548969,-0.469436,0,6.0,0,0,0,1,0,0,4,1,0,0,1
4,-0.12615,-1.077348,0,4.0,0,0,0,1,0,1,4,1,0,0,1


In [None]:
!pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Collecting imbalanced-learn (from imblearn)
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn->imblearn)
  Downloading sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m238.4/238.4 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn, imblearn
Successfully installed imbalanced-learn-0.13.0 imblearn-0.0 sklearn-compat-0.1.3


In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly under-sampling the majority class.
# NOTE(review): resampling happens BEFORE the train/test split in the next cell,
# so the held-out split is balanced too and does not reflect the original class
# ratio — confirm the evaluation is meant to be read that way.

# Separate features (X) and target variable (y)
X = df_chosen.drop('History of HeartDisease or Attack_encoded', axis=1)
y = df_chosen['History of HeartDisease or Attack_encoded']

# Default sampling strategy (see the recorded class counts below for the result).
# Alternative that keeps more majority rows:
# rus = RandomUnderSampler(sampling_strategy=0.5 , random_state=42)
rus = RandomUnderSampler(random_state=42)

# Resample the data
X_resampled, y_resampled = rus.fit_resample(X, y)

# Reassemble features and target into a single balanced DataFrame
df_balanced = pd.DataFrame(X_resampled, columns=X.columns)
df_balanced['History of HeartDisease or Attack_encoded'] = y_resampled

In [None]:
# Verify the class counts after under-sampling
df_balanced['History of HeartDisease or Attack_encoded'].value_counts()

Unnamed: 0_level_0,count
History of HeartDisease or Attack_encoded,Unnamed: 1_level_1
0,18068
1,18068


In [None]:
from sklearn.model_selection import train_test_split

# Split the resampled (balanced) data 80/20 into training and testing sets.
# This rebinds X_train / X_test / y_train / y_test from the feature-importance step.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42
)

In [None]:
# Only the last expression of a cell is displayed, so the len() result is not shown.
len(X_train)
# Class counts within the training split
y_train.value_counts()

Unnamed: 0_level_0,count
History of HeartDisease or Attack_encoded,Unnamed: 1_level_1
1,14454
0,14454


In [None]:
!pip -q install -U pip
!pip -q install -U setuptools wheel
!pip -q install autogluon

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m22.6 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [

Define a custom F2-score metric for AutoGluon

In [None]:
from sklearn.metrics import fbeta_score

def f2_score(y_true, y_pred):
  """Return the F-beta score with beta=2 (the F2 score).

  Args:
    y_true: Ground-truth labels.
    y_pred: Predicted labels.

  Returns:
    The value computed by ``fbeta_score(y_true, y_pred, beta=2)``.
  """
  return fbeta_score(y_true, y_pred, beta=2)  # beta=2 for F2 score

In [None]:
from autogluon.core.metrics import make_scorer

# Wrap the F2 function as an AutoGluon scorer so it can be passed as `eval_metric`.
ag_f2_scorer = make_scorer(
    name='f2_score',  # Name shown in AutoGluon logs and leaderboards
    score_func=f2_score,  # The F2 score function defined earlier
    optimum=1,  # Best achievable value of the metric
    greater_is_better=True,  # Higher F2 is better
    needs_class=True  # F2 is computed from predicted class labels, not probabilities
)

In [None]:
from autogluon.tabular import TabularPredictor

# AutoGluon expects the label as a column of the training frame, so attach the
# target to copies of the feature splits.
train_data = X_train.copy()
train_data['History of HeartDisease or Attack_encoded'] = y_train

test_data = X_test.copy()
test_data['History of HeartDisease or Attack_encoded'] = y_test

# Name of the column AutoGluon should learn to predict
label_column = 'History of HeartDisease or Attack_encoded'

# Train with AutoGluon's defaults (no presets, default evaluation metric).
predictor = TabularPredictor(label=label_column).fit(train_data)
# Alternative: optimize the custom F2 scorer instead of the default metric.
# predictor = TabularPredictor(label=label_column, eval_metric=ag_f2_scorer).fit(train_data)

No path specified. Models will be saved in: "AutogluonModels/ag-20250216_005259"
Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          96
Memory Avail:       328.85 GB / 334.56 GB (98.3%)
Disk Space Avail:   207.40 GB / 225.33 GB (92.0%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong

In [None]:
# Mount Google Drive into the Colab runtime.
# NOTE(review): the recorded run of this cell failed with "ValueError: mount failed",
# and no later cell in view reads from /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

ValueError: mount failed

In [None]:
from sklearn.metrics import classification_report

# Score every trained model on the held-out (balanced) split
leaderboard = predictor.leaderboard(test_data)
print(leaderboard)

# Per-class precision / recall / F1 of the predictor on the held-out split
y_pred = predictor.predict(test_data)
print(classification_report(y_test,y_pred))

In [None]:
# Save the predictor.
# `TabularPredictor.save()` does not take a destination: it always writes to the
# predictor's own directory (`predictor.path`). The string previously passed here
# was interpreted as the `silent` flag, so the hard-coded, stale run directory
# was never used.
predictor.save()
print(f"Predictor saved to: {predictor.path}")

In [None]:
# Feature columns taken from the submission frame for prediction.
# Fix: a comma was missing after 'Smoked 100+ Cigarettes_encoded', so Python
# concatenated it with the next string into one non-existent column name
# ('Smoked 100+ Cigarettes_encodedHeavy Alcohol Consumption_encoded'),
# which makes the column selection below fail with a KeyError.
subkeyschosen = [
    'Body Mass Index', 'Age',
    'High Blood Pressure_encoded',
    'Income_Level_Encoded_encoded',
    'Income_Level_Ordinal',
    'Told High Cholesterol_encoded', 'Diagnosed Stroke_encoded',
    'Leisure Physical Activity_encoded',
    'General Health_encoded',
    'Difficulty Walking_encoded', 'Sex_encoded', 'Education Level_encoded',
    'Vegetable or Fruit Intake (1+ per Day)_encoded',
    'Diagnosed Diabetes_encoded',
    'Doctor Visit Cost Barrier_encoded',
    'Smoked 100+ Cigarettes_encoded',
    'Heavy Alcohol Consumption_encoded',
    'Health Care Coverage_encoded',
    'Cholesterol Checked_encoded',
]
df_submission_chosen = df_submission[subkeyschosen]

In [None]:
from sklearn.metrics import classification_report

# Predict the encoded target (0/1) for every row of the submission features
df_submission_predictions = predictor.predict(df_submission_chosen)

  # that has no feature names.
  # that has no feature names.


In [None]:
# Number of predicted rows
num_rows = len(df_submission_predictions)

# Prefer the real IDs shipped with the submission data, so each prediction stays
# attached to its own row even if the file is not numbered 1..N in order.
# Fall back to synthesized 'test_000001'-style IDs only when no ID column exists.
if 'ID' in df_submission.columns:
    formatted_ids = df_submission.loc[df_submission_predictions.index, 'ID'].tolist()
else:
    formatted_ids = [f'test_{i:06}' for i in range(1, num_rows + 1)]

# Create a submission DataFrame with the IDs and the encoded predictions
submission_df = pd.DataFrame({'ID': formatted_ids,
                             'History of HeartDisease or Attack': df_submission_predictions})

# Translate the encoded predictions back to the original labels
mapping = {0: 'No', 1: 'Yes'}
submission_df['History of HeartDisease or Attack'] = submission_df['History of HeartDisease or Attack'].map(mapping)

# `.map` silently turns any value outside the mapping into NaN; fail loudly
# instead of writing blank predictions to the submission file.
if submission_df['History of HeartDisease or Attack'].isnull().any():
    raise ValueError("Predictions contain values other than 0/1; cannot map them to 'No'/'Yes'.")

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission.csv', index=False)

In [None]:
# Distribution of the predicted labels in the submission.
# NOTE(review): compare the predicted 'Yes' share with the class ratio of the
# original training data — the model was trained on under-sampled, balanced data.
submission_df["History of HeartDisease or Attack"].value_counts()

Unnamed: 0_level_0,count
History of HeartDisease or Attack,Unnamed: 1_level_1
No,49266
Yes,25095


In [None]:
# Write the submission file again (same call as in the cell that builds
# `submission_df`; it overwrites 'submission.csv' with identical content).
submission_df.to_csv('submission.csv', index=False)