### Importing the Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

In [None]:
# --- Notebook configuration ---
from IPython.core.interactiveshell import InteractiveShell
import warnings

# 'all' makes IPython display the value of every top-level expression in a
# cell instead of only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Raise the pandas display limit to 500 columns.
pd.set_option("display.max_columns", 500)

# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [None]:
def read_yaml_to_dict(file_path):
    """Read a YAML file and return its parsed content.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file, or ``None`` when
        the file does not exist or cannot be parsed as YAML. In both failure
        cases a message is printed instead of raising.
    """
    import yaml

    try:
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default text encoding.
        with open(file_path, 'r', encoding='utf-8') as file:
            return yaml.safe_load(file)
    except FileNotFoundError:
        print(f"Error: File not found: {file_path}")
    except yaml.YAMLError as e:
        print(f"Error parsing YAML file: {e}")
    return None

In [None]:
# Load the YAML file into `features` and the CSV file into `df`.
yaml_path = "Modified Data/variable.yaml"
csv_path = "Modified Data/imputed_data.csv"

features = read_yaml_to_dict(yaml_path)
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

In [None]:
# Readable column names, grouped by how each column is treated later on.
binary_names = {
    "SEXVAR": "Gender",
    "BPHIGH6": "High Blood Pressure",
    "CHCKDNY2": "Kidney Disease",
    "CHOLMED3": "Taking medicine for high cholesterol",
    "CVDCRHD4": "Angina or Coronary Heart Disease",
    "CVDSTRK3": "Stroke",
    "DIABETE4": "Diabetes",
    "HAVARTH4": "Arthritis",
    "SMOKE100": "Smoked at Least 100 Cigarettes",
    "TOLDHI3": "Cholesterol Is High",
    "EXERANY2": "Exercise in Past 30 Days",
    "CVDINFR4": "Heart Attack",  # Target
}
ordinal_names = {
    "ECIGNOW2": "E-cigarettes Frequency",
    "GENHLTH": "General Health",
    "USENOW3": "Smokeless Tobacco Products",
    "_AGEG5YR": "Age Range",
}
numeric_names = {
    "_AGE80": "Age",
    "PHYSHLTH": "Number of Days Physical Health Not Good",
    "MENTHLTH": "Number of Days Mental Health Not Good",
    "STRENGTH": "Physical activities frequence",
    "ALCDAY4": "Days in past 30 had alcoholic beverage",
    "WEIGHT2": "Weight in Pounds",
    "HEIGHT3": "Reported Height in Feet",
    "_BMI5": "BMI",
}

# Single mapping (binary, then ordinal, then numeric) applied to the frame.
rename = {**binary_names, **ordinal_names, **numeric_names}
df = df.rename(columns=rename)
df.head()

In [None]:
# Show the per-category counts for Gender, then draw them as bars.
gender_counts = df["Gender"].value_counts()
gender_counts
sns.countplot(data=df, x="Gender")

In [None]:
# Age brackets in ascending order; this list is also passed to the
# OrdinalEncoder further down, so its order defines the encoded integers.
age_range_order = [
    "Age 18 to 24",
    "Age 25 to 29",
    "Age 30 to 34",
    "Age 35 to 39",
    "Age 40 to 44",
    "Age 45 to 49",
    "Age 50 to 54",
    "Age 55 to 59",
    "Age 60 to 64",
    "Age 65 to 69",
    "Age 70 to 74",
    "Age 75 to 79",
    "Age 80 or older",
]
df["Age Range"].value_counts()

# Count of respondents per age bracket, split by gender.
g = sns.countplot(x="Age Range", hue="Gender", order=age_range_order, data=df)
g.set_xticklabels(g.get_xticklabels(), rotation=90)
# The original line was a bare `g.set_title` attribute access, which never
# set a title; call the method so the plot is actually labelled.
g.set_title("Age Range Distribution by Gender")

In [None]:
# Self-rated health levels, listed from worst to best.
general_health_order = [
    "Poor",
    "Fair",
    "Good",
    "Very good",
    "Excellent",
]
health_counts = df["General Health"].value_counts()
health_counts

# Bars per health level, split by gender, in the order defined above.
g = sns.countplot(data=df, x="General Health", hue="Gender", order=general_health_order)
health_tick_labels = g.get_xticklabels()
g.set_xticklabels(health_tick_labels, rotation=90)

In [None]:
# E-cigarette usage levels, listed from least to most frequent.
E_cigarettes_frequency_order = [
    "Never",
    "Not Right Now",
    "Some days",
    "Everyday",
]
ecig_counts = df["E-cigarettes Frequency"].value_counts()
ecig_counts

# Bars per usage level, split by gender, in the order defined above.
g = sns.countplot(
    data=df,
    x="E-cigarettes Frequency",
    hue="Gender",
    order=E_cigarettes_frequency_order,
)
ecig_tick_labels = g.get_xticklabels()
g.set_xticklabels(ecig_tick_labels, rotation=90)

In [None]:
# Smokeless-tobacco usage levels, listed from least to most frequent.
smokeless_tobacco_order = [
    "Not at all",
    "Some days",
    "Every day",
]
smokeless_counts = df["Smokeless Tobacco Products"].value_counts()
smokeless_counts

# Bars per usage level, split by gender, in the order defined above.
g = sns.countplot(
    data=df,
    x="Smokeless Tobacco Products",
    hue="Gender",
    order=smokeless_tobacco_order,
)
smokeless_tick_labels = g.get_xticklabels()
g.set_xticklabels(smokeless_tick_labels, rotation=90)


### Encode the Data (Categorical Features)

In [None]:
# Each ordinal column paired with its ordered category list, so the column
# names and category orders cannot drift out of alignment.
ordinal_spec = {
    "Age Range": age_range_order,
    "General Health": general_health_order,
    "E-cigarettes Frequency": E_cigarettes_frequency_order,
    "Smokeless Tobacco Products": smokeless_tobacco_order,
}
ordinal_features = list(ordinal_spec.keys())
ordinal_categories = list(ordinal_spec.values())

# Replace each category with its position in the corresponding list.
ordinal_encoder = OrdinalEncoder(categories=ordinal_categories)
encoded_ordinals = ordinal_encoder.fit_transform(df[ordinal_features])
df[ordinal_features] = encoded_ordinals
# Cast the encoded columns to integers.
df[ordinal_features] = df[ordinal_features].astype(int)

df.head()

In [None]:
# Columns that are label-encoded below (includes the target, "Heart Attack").
binary_features = [
    "Gender",
    "High Blood Pressure",
    "Kidney Disease",
    "Taking medicine for high cholesterol",
    "Angina or Coronary Heart Disease",
    "Heart Attack",
    "Stroke",
    "Diabetes",
    "Arthritis",
    "Smoked at Least 100 Cigarettes",
    "Cholesterol Is High",
    "Exercise in Past 30 Days",
]

# The encoder is re-fitted for every column, so after the loop it only holds
# the classes of the last column.
label_encoder = LabelEncoder()
# The loop variable is named `feature` (not `features`) so it does not
# overwrite the `features` object loaded from variable.yaml earlier.
for feature in binary_features:
    df[feature] = label_encoder.fit_transform(df[feature])

df.head()

In [None]:
def summarize_correlations(corr_matrix):
    """Group the entries of a correlation matrix by absolute strength.

    Every cell whose row label differs from its column label is placed in
    exactly one bucket according to ``abs(value)``:

    * ``>= 0.9`` -> "Very Strong"
    * ``>= 0.7`` -> "Strong"
    * ``>= 0.5`` -> "Moderate"
    * ``>= 0.3`` -> "Weak"
    * anything else (including values that fail every comparison, such as
      NaN) -> "Very Weak/No Correlation"

    Args:
        corr_matrix: DataFrame indexed by ``.loc[row_label, column_label]``.

    Returns:
        dict mapping each bucket name to a list of
        ``(row_label, column_label, value)`` tuples, filled column by column.
        Both orientations of a pair are visited, so in a square matrix with
        matching labels each unordered pair is listed twice.
    """
    # Checked from strongest to weakest; the first cutoff that is met wins.
    bands = (
        ("Very Strong", 0.9),
        ("Strong", 0.7),
        ("Moderate", 0.5),
        ("Weak", 0.3),
    )
    fallback = "Very Weak/No Correlation"

    summary = {label: [] for label, _ in bands}
    summary[fallback] = []

    for column_label in corr_matrix.columns:
        for row_label in corr_matrix.index:
            if row_label == column_label:
                # Skip the diagonal (a variable against itself).
                continue
            value = corr_matrix.loc[row_label, column_label]
            magnitude = abs(value)
            bucket = next(
                (label for label, cutoff in bands if magnitude >= cutoff),
                fallback,
            )
            summary[bucket].append((row_label, column_label, value))

    return summary

In [None]:
# Pairwise correlations of all columns, drawn as a heatmap.
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, cmap="coolwarm")

# Bucket the correlations by strength and print one section per bucket.
correlation_summary = summarize_correlations(correlation_matrix)

for category in correlation_summary:
    correlations = correlation_summary[category]
    print(f"\n{category} Correlations:")
    for feature1, feature2, value in correlations:
        print(f"{feature1} - {feature2}: {value:.2f}")


### Creating New Features

In [None]:
# Interaction terms built as element-wise products of existing columns.
bmi_values = df["BMI"]
alcohol_days = df["Days in past 30 had alcoholic beverage"]
smoked_100 = df["Smoked at Least 100 Cigarettes"]

# Age x BMI
df["Age_BMI"] = df["Age"] * bmi_values
# Exercise flag x BMI
df["Exercise_BMI"] = df["Exercise in Past 30 Days"] * bmi_values
# Drinking days x smoking flag
df["Alcohol_Smoking"] = alcohol_days * smoked_100

In [None]:
# Row-wise sum of four encoded condition columns.
risk_flag_columns = [
    "High Blood Pressure",
    "Diabetes",
    "Taking medicine for high cholesterol",
    "Stroke",
]
df["Health_Risk_Score"] = df[risk_flag_columns].sum(axis=1)

In [None]:
# Preview the first five rows, now including the engineered columns.
df.head(n=5)

In [None]:
# Recompute the correlations now that the engineered columns exist.
correlation_matrix = df.corr()
sns.heatmap(correlation_matrix, cmap="coolwarm")
correlation_summary = summarize_correlations(correlation_matrix)

# One printed section per strength bucket, one line per (row, column) entry.
for category, correlations in correlation_summary.items():
    header = f"\n{category} Correlations:"
    print(header)
    for entry in correlations:
        feature1, feature2, value = entry
        print(f"{feature1} - {feature2}: {value:.2f}")

In [None]:
# From https://scikit-learn.org/stable/auto_examples/linear_model/plot_ols.html
# Code source: Jaques Grobler
# License: BSD 3 clause

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler

target = "Heart Attack"
X = df.drop(columns=[target])
y = df[target]

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training rows only and
# then applied to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Original variable names, kept as aliases in case another cell refers to them.
diabetes_X_train = X_train_scaled
diabetes_X_test = X_test_scaled

####################################################################
# Quick Helper Method - Find Best Parameters
# Define parameter grid
param_grid = {'alpha': [0.001, 0.01, 0.1, 1, 10]}

# Perform grid search with cross-validation.
# Fitted on the SCALED training data: the L1 penalty is scale-sensitive, and
# the original cell computed the scaled matrices but never used them.
lasso_cv = GridSearchCV(linear_model.Lasso(), param_grid, cv=5)
lasso_cv.fit(X_train_scaled, y_train)

# Print best parameter values and score
print("Best Parameters:", lasso_cv.best_params_)
print("Best Score:", lasso_cv.best_score_)
####################################################################

# Fit the final Lasso model with the alpha selected by the grid search.
# The original hard-coded alpha=1 regardless of the search result; on
# standardized features and a 0/1 target that penalty is presumably strong
# enough to shrink every coefficient to zero.
best_alpha = lasso_cv.best_params_["alpha"]
lasso = linear_model.Lasso(alpha=best_alpha)
lasso.fit(X_train_scaled, y_train)

# Evaluate model performance on the (scaled) test set
y_pred = lasso.predict(X_test_scaled)

# The coefficients (one per column of X, in X's column order)
print("Coefficients: \n", lasso.coef_)
# The mean squared error
print("Mean squared error: %.2f" % mean_squared_error(y_test, y_pred))
# The coefficient of determination: 1 is perfect prediction
print("Coefficient of determination: %.2f" % lasso.score(X_test_scaled, y_test))

In [None]:
from sklearn.cluster import KMeans

# Candidate cluster counts: 2 through 10.
K_values = range(2, 11)
inertia = []

# Fit one K-Means model per candidate K and record its inertia.
# NOTE(review): `df` is clustered as-is here (no scaling, target column still
# present) - confirm that this is intended.
for k in K_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(df)
    inertia.append(kmeans.inertia_)

# Elbow curve: inertia as a function of K.
plt.plot(K_values, inertia, marker="o")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal K")
plt.show()

In [None]:
from kneed import KneeLocator

# Use the same K grid that produced `inertia`, so the x and y sequences cannot
# get out of sync if the candidate range is changed in the elbow cell.
knee_locator = KneeLocator(list(K_values), inertia, curve="convex", direction="decreasing")
optimal_k = knee_locator.knee

# `knee` is None when no elbow is detected; fail here with a clear message
# rather than later inside KMeans(n_clusters=None).
if optimal_k is None:
    raise ValueError(
        "KneeLocator found no elbow in the inertia curve; choose the number of clusters manually."
    )
optimal_k = int(optimal_k)

print(f"Optimal number of clusters: {optimal_k}")

In [None]:
# Final K-Means model using the cluster count chosen from the elbow curve.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)

# NOTE(review): the model is fitted on every column of `df`; re-running this
# cell would also feed the existing "Cluster" column back in.
cluster_labels = kmeans.fit_predict(df)
df["Cluster"] = cluster_labels

# Number of rows assigned to each cluster.
df["Cluster"].value_counts()

In [None]:
# Visualize clusters based on important features
sns.boxplot(x="Cluster", y="BMI", data=df)
plt.title("BMI Distribution Across Clusters")
plt.show()

sns.boxplot(x="Cluster", y="Age", data=df)
plt.title("Age Distribution Across Clusters")
plt.show()

In [None]:
# Reduce to 2D using PCA for visualization
pca = PCA(n_components=2)
reduced_features = pca.fit_transform(df)
df["PCA1"] = reduced_features[:, 0]
df["PCA2"] = reduced_features[:, 1]

# Plot clusters with colors
plt.figure(figsize=(10, 6))
sns.scatterplot(x=df["PCA1"], y=df["PCA2"], hue=df["Cluster"], palette="tab10", alpha=0.7)
plt.xlabel("Principal Component 1")
plt.ylabel("Principal Component 2")
plt.title("K-Means Clustering Visualization (PCA Reduced)")
plt.legend(title="Cluster")
plt.show()

In [None]:
target = "Heart Attack"

# "Cluster", "PCA1" and "PCA2" were fitted on a frame that still contained the
# target column, so keeping them as predictors would leak the target into X.
# "Cluster_SVD" only exists when the notebook is re-run; errors="ignore" makes
# the drop a no-op for any of these that are absent.
derived_columns = ["Cluster", "PCA1", "PCA2", "Cluster_SVD"]
X = df.drop(columns=[target]).drop(columns=derived_columns, errors="ignore")
y = df[target]

# Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [None]:
from sklearn.decomposition import TruncatedSVD


# Apply SVD for dimensionality reduction
num_components = 10  # Adjust based on explained variance
# Seeded like the splits and other estimators in this notebook, so the reduced
# features (and every model trained on them) are reproducible across runs.
svd = TruncatedSVD(n_components=num_components, random_state=42)
X_svd = svd.fit_transform(X_scaled)

# Split data into train and test sets
# NOTE(review): the scaler and the SVD are fitted on all rows before this
# split, so test-set statistics influence the transformed training features.
X_train, X_test, y_train, y_test = train_test_split(X_svd, y, test_size=0.2, random_state=42)

# # Choose the number of components (dimensions)
# num_components = 2  # Reduce data to 2D for visualization

# svd = TruncatedSVD(n_components=num_components)
# X_svd = svd.fit_transform(df)

# # Explained variance ratio
# explained_variance = np.cumsum(svd.explained_variance_ratio_)

# # Plot explained variance to decide on the number of components
# plt.plot(range(1, num_components + 1), explained_variance, marker="o")
# plt.xlabel("Number of Components")
# plt.ylabel("Explained Variance")
# plt.title("Explained Variance vs. Number of SVD Components")
# plt.show()


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a default logistic regression on the SVD-reduced training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred = log_reg.predict(X_test)

# Overall accuracy followed by the per-class report.
log_reg_accuracy = accuracy_score(y_test, y_pred)
log_reg_report = classification_report(y_test, y_pred)
print("Logistic Regression Accuracy:", log_reg_accuracy)
print(log_reg_report)

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier fitted on the training split.
svm_model = SVC(C=1.0, kernel="rbf", gamma="scale")
svm_model.fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_svm = svm_model.predict(X_test)

# Overall accuracy followed by the per-class report.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)
print(svm_report)

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Seeded 100-tree random forest fitted on the training split.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

# Predicted labels for the held-out split.
y_pred_rf = rf_model.predict(X_test)

# Overall accuracy followed by the per-class report.
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_report = classification_report(y_test, y_pred_rf)
print("Random Forest Accuracy:", rf_accuracy)
print(rf_report)

In [None]:
import numpy as np

# svd.components_ holds one row per SVD component and one column per input
# feature, so summing absolute values over axis 0 gives one score per ORIGINAL
# feature (its total absolute loading), not per component.
feature_importance = np.abs(svd.components_).sum(axis=0)
sorted_features = np.argsort(-feature_importance)

# Report feature names rather than bare column indices. The SVD is fitted
# without the target, so these scores describe contribution to the components,
# not to heart-attack risk; the heading is worded accordingly.
top_indices = sorted_features[:5]
print("Top 5 features by total absolute SVD loading:")
for name, score in zip(X.columns[top_indices], feature_importance[top_indices]):
    print(f"{name}: {score:.3f}")

In [None]:
from sklearn.cluster import KMeans

# Elbow method on the SVD-reduced features: inertia for K = 2..10.
K_values = range(2, 11)
inertia = []

for k in K_values:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_svd)
    inertia.append(kmeans.inertia_)

# Inertia as a function of K.
plt.plot(K_values, inertia, marker="o")
plt.xlabel("Number of Clusters (K)")
plt.ylabel("Inertia")
plt.title("Elbow Method for Optimal K (After SVD)")
plt.show()

# Cluster count is fixed by hand here (example choice read off the elbow plot).
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
clusters = kmeans.fit_predict(X_svd)

# Store the resulting labels alongside the original rows.
df["Cluster_SVD"] = clusters


In [None]:
# Scatter of the first two SVD components, coloured by cluster label.
first_component = X_svd[:, 0]
second_component = X_svd[:, 1]
plt.scatter(
    first_component,
    second_component,
    c=clusters,
    cmap="coolwarm",
    edgecolors="k",
    alpha=0.7,
)
plt.xlabel("SVD Component 1")
plt.ylabel("SVD Component 2")
plt.title("Clusters After SVD Reduction")
plt.colorbar(label="Cluster")
plt.show()