# PhawAI

In [None]:
# I tried multiple methods to get the most accurate predictions, measured by the F1-score.
# Tried with NN, Random Forest, AdaBoost and XGBoost.
# The best results were obtained with XGBoost, ADASYN and MinMaxScaler in the last section of this file.
# That section is named "## Improved XGBOOST model with Hyperparameter Tuning".

## First try: XGBoost

### Install required libraries for running the whole .ipynb file without issues

In [None]:
pip install scikit-learn==1.3.0 pandas numpy matplotlib seaborn xgboost imblearn optuna torch

### Data Preprocessing and Cleaning

In [None]:
import pandas as pd

# Read the training split and the two test splits from the working directory.
train, test_public, test_private = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test_public.csv", "test_private.csv")
)

# Keep the test IDs aside: the test frames are reduced to the selected features
# later on, and the IDs are needed again to build the submission file.
test_public_ids = test_public.loc[:, "ID"].copy()
test_private_ids = test_private.loc[:, "ID"].copy()

print("Dataset shapes:")
for _label, _frame in (
    ("Train:", train),
    ("Test Public:", test_public),
    ("Test Private:", test_private),
):
    print(_label, _frame.shape)

In [None]:
# Columns whose every value is missing carry no information, so drop them first.
train_null_cols = train.columns[train.isna().all().to_numpy()].tolist()
print("Columns in train that are completely null:", train_null_cols)
if len(train_null_cols) > 0:
    train.drop(columns=train_null_cols, inplace=True)
    print("Dropped columns", train_null_cols)
print("Updated training set shape after deleting columns:", train.shape)

# Then discard every remaining row that still has at least one missing value.
train.dropna(axis=0, how="any", inplace=True)
print("Updated training set shape after deleting rows with null values:", train.shape)

In [None]:
print("Training set info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
train.info()

In [None]:
# Preview the first rows of the cleaned training frame under a short title.
print("Head rows", train.head(), sep="\n")

### Correlation matrix and Feature Selection

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Report the current shape of every split.
for _label, _frame in (
    ("Training:", train),
    ("Test Public:", test_public),
    ("Test Private:", test_private),
):
    print(_label, _frame.shape)

# Correlations are computed on the numeric columns only.
numeric_cols = list(train.select_dtypes(include=[np.number]).columns)
corr_matrix = train.loc[:, numeric_cols].corr()
print("Correlation matrix table", corr_matrix, sep="\n")

In [None]:
# Draw the correlation matrix as an un-annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix Graph")
plt.show()

In [None]:
# Rank the numeric columns by their correlation with the target "CHD_OR_MI".
if "CHD_OR_MI" in corr_matrix.columns:
    target_corr = corr_matrix["CHD_OR_MI"].sort_values(ascending=False)
    print("Correlation with target variable:")
    print(target_corr)
else:
    print("Target column Missing")

# Minimum absolute correlation a column needs to be kept.
# Modify this value as needed to improve feature selection and model performance for F1 score.
threshold = 0.05
if "CHD_OR_MI" in corr_matrix.columns:
    # The target correlates perfectly with itself, so it is always part of the selection.
    selected_cols = target_corr[target_corr.abs() >= threshold].index.tolist()
    print(f"Features with absolute correlation greater or equal than {threshold}:")
    print(selected_cols)
    # .copy() makes `train` an independent frame. Later cells assign scaled values
    # back into it, which on a derived selection triggers pandas'
    # SettingWithCopyWarning.
    train = train[selected_cols].copy()
    print("Updated Train Shape", train.shape)
    print("Remaining columns", list(train.columns))
else:
    print("Missing Target Column")

In [None]:
# Reduce both test frames to the columns that survived feature selection in train.
# The target is absent from the test files, so it simply drops out here.
train_columns = list(train.columns)

test_public_aligned_cols = [col for col in test_public.columns if col in train_columns]
test_private_aligned_cols = [col for col in test_private.columns if col in train_columns]

# .copy() yields independent frames: the normalization cell assigns scaled values
# back into them, which on a derived selection triggers SettingWithCopyWarning.
test_public = test_public[test_public_aligned_cols].copy()
test_private = test_private[test_private_aligned_cols].copy()

print("Test Public shape", test_public.shape)
print("Test Private shape", test_private.shape)

In [None]:
# First five rows of the training frame after feature selection.
print(train.head(n=5))

In [None]:
# First five rows of the public test frame after column alignment.
print(test_public.head(n=5))

In [None]:
# First five rows of the private test frame after column alignment.
print(test_private.head(n=5))

### Data Normalization with MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

print("Train shape before MinMaxScaler:", train.shape)
print("Test Public shape before MinMaxScaler:", test_public.shape)
print("Test Private shape before MinMaxScaler:", test_private.shape)

# The target and the row identifier are not features, so they are never scaled.
exclude_cols = ["CHD_OR_MI", "ID"]
numeric_cols_to_normalize = [col for col in train.select_dtypes(include=[np.number]).columns if col not in exclude_cols]

print("Numeric columns to be normalized:", numeric_cols_to_normalize)

scaler = MinMaxScaler()
if len(numeric_cols_to_normalize) > 0:
    # Fit on the training data only, then reuse the fitted ranges for the test sets.
    # The fit lives inside the guard because fitting on an empty column selection fails.
    train[numeric_cols_to_normalize] = scaler.fit_transform(train[numeric_cols_to_normalize])

    if set(numeric_cols_to_normalize).issubset(test_public.columns):
        test_public[numeric_cols_to_normalize] = scaler.transform(test_public[numeric_cols_to_normalize])
    else:
        # Report the skip: an unscaled test set would not match the scaled training features.
        print("Warning: Test Public lacks some of the normalized columns and was left unscaled")

    if set(numeric_cols_to_normalize).issubset(test_private.columns):
        test_private[numeric_cols_to_normalize] = scaler.transform(test_private[numeric_cols_to_normalize])
    else:
        print("Warning: Test Private lacks some of the normalized columns and was left unscaled")
else:
    print("No numeric columns to normalize; MinMaxScaler was not applied")

In [None]:
# Shapes must be unchanged by scaling; print them as a sanity check.
for _label, _frame in (
    ("Train after normalization:", train),
    ("Public after normalization:", test_public),
    ("Private after normalization:", test_private),
):
    print(_label, _frame.shape)

In [None]:
# Preview the scaled training frame under a short title.
print("Normalized Training ds", train.head(), sep="\n")

### Training the Model and Handling imbalanced data with ADASYN

In [None]:
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Features are everything except the target; "ID" is removed as well when it is
# still present (errors="ignore" tolerates its absence).
X = train.drop(columns=["CHD_OR_MI", "ID"], errors="ignore")
y = train["CHD_OR_MI"]

for _label, _part in (("X shape:", X), ("y shape:", y)):
    print(_label, _part.shape)

# Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

for _label, _part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
):
    print(_label, _part.shape)

In [None]:
print("Handling class imbalance with ADASYN")
# Oversample the training split only; the validation split keeps its original class mix.
adasyn = ADASYN(n_neighbors=5, random_state=42, sampling_strategy="minority")
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)
for _label, _part in (
    ("X_train_balanced shape:", X_train_balanced),
    ("y_train_balanced shape:", y_train_balanced),
):
    print(_label, _part.shape)

In [None]:
print("Training XGBoost")
# Baseline classifier with library defaults apart from the metric and the seed.
xgb_model = XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)
xgb_model.fit(X_train_balanced, y_train_balanced)

# Score the untouched (non-oversampled) validation split.
y_val_pred = xgb_model.predict(X_val)
conf_mat = confusion_matrix(y_val, y_val_pred)
print("Classification Report:", classification_report(y_val, y_val_pred), sep="\n")
print("Confusion Matrix:", conf_mat)

In [None]:
# Render the validation confusion matrix as a colored grid with "No"/"Yes" ticks.
plt.figure(figsize=(5, 4))
plt.imshow(conf_mat, cmap="Blues", interpolation="nearest")
plt.title("Confusion Matrix Validation Set")
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("Actual")
for _set_ticks in (plt.xticks, plt.yticks):
    _set_ticks([0, 1], ["No", "Yes"])

# Write each count into its cell; dark cells (above half the maximum) get white text.
half_max = conf_mat.max() / 2
for (row, column), count in np.ndenumerate(conf_mat):
    text_color = "white" if count > half_max else "black"
    plt.text(column, row, str(count), ha="center", va="center", color=text_color)

plt.tight_layout()
plt.show()

### Model Evaluation

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Rebuild the feature matrix and target from the complete training frame.
selected_columns = X.columns
X_full = train.loc[:, selected_columns].copy()
y_full = train.loc[:, "CHD_OR_MI"].copy()

for _label, _part in (("X_full shape:", X_full), ("y_full shape:", y_full)):
    print(_label, _part.shape)

# Oversample the minority class on all rows.
# Modify n_neighbors as needed to improve F1 score.
adasyn_full = ADASYN(n_neighbors=5, random_state=42, sampling_strategy="minority")
X_full_balanced, y_full_balanced = adasyn_full.fit_resample(X_full, y_full)

for _label, _part in (
    ("X_full_balanced shape:", X_full_balanced),
    ("y_full_balanced shape:", y_full_balanced),
):
    print(_label, _part.shape)

### Save predictions to csv file

In [None]:
# Put the test features in the same column order the model was trained on.
X_test_public = test_public.loc[:, selected_columns].copy()
X_test_private = test_private.loc[:, selected_columns].copy()

# Predict with the baseline model trained above.
y_test_public_pred = xgb_model.predict(X_test_public)
y_test_private_pred = xgb_model.predict(X_test_private)

# Stack public then private rows, pairing each prediction with its saved ID.
submission_ids = pd.concat([test_public_ids, test_private_ids], ignore_index=True)
submission_labels = np.concatenate([y_test_public_pred, y_test_private_pred])
final_predictions = pd.DataFrame({"ID": submission_ids, "CHD_OR_MI": submission_labels})

# Write the submission file without the index column.
final_predictions.to_csv("resultados.csv", index=False)
print("Predictions saved to 'resultados.csv'")

## Improved XGBOOST model with Hyperparameter Tuning

### Data Preprocessing and Cleaning

In [None]:
import pandas as pd

# Start the improved pipeline from freshly loaded, unmodified CSV files.
train, test_public, test_private = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test_public.csv", "test_private.csv")
)

# Keep the test IDs aside for the submission file built at the end.
test_public_ids = test_public.loc[:, "ID"].copy()
test_private_ids = test_private.loc[:, "ID"].copy()

print("Dataset shapes:")
for _label, _frame in (
    ("Train:", train),
    ("Test Public:", test_public),
    ("Test Private:", test_private),
):
    print(_label, _frame.shape)

In [None]:
# Columns whose every value is missing carry no information, so drop them first.
train_null_cols = train.columns[train.isna().all().to_numpy()].tolist()
print("Columns in train that are completely null:", train_null_cols)
if len(train_null_cols) > 0:
    train.drop(columns=train_null_cols, inplace=True)
    print("Dropped columns", train_null_cols)
print("Updated training set shape after deleting columns:", train.shape)

# Then discard every remaining row that still has at least one missing value.
train.dropna(axis=0, how="any", inplace=True)
print("Updated training set shape after deleting rows with null values:", train.shape)

In [None]:
print("Training set info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
train.info()

In [None]:
# Preview the first rows of the cleaned training frame under a short title.
print("Head rows", train.head(), sep="\n")

### Correlation matrix and Feature Selection

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Print shapes
print("Training:", train.shape)
print("Test Public:", test_public.shape)
print("Test Private:", test_private.shape)

# Use only numerical columns for the correlation matrix.
numeric_cols = train.select_dtypes(include=[np.number]).columns.tolist()
corr_matrix = train[numeric_cols].corr()
# (A leftover debug print of an emoticon was removed here; it was unrelated to the analysis.)
print("Correlation matrix table")
print(corr_matrix)

In [None]:
# Draw the correlation matrix as an un-annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Matrix Graph")
plt.show()

In [None]:
# Rank the numeric columns by their correlation with the target "CHD_OR_MI".
if "CHD_OR_MI" in corr_matrix.columns:
    target_corr = corr_matrix["CHD_OR_MI"].sort_values(ascending=False)
    print("Correlation with target variable:")
    print(target_corr)
else:
    print("Target column Missing")

# Very low cut-off so that nearly every numeric column is kept. Columns whose
# correlation is NaN or below 0.0001 in absolute value are still excluded,
# because the comparison below is False for them.
threshold = 0.0001
if "CHD_OR_MI" in corr_matrix.columns:
    selected_cols = target_corr[target_corr.abs() >= threshold].index.tolist()
    print(f"Features with absolute correlation greater or equal than {threshold}:")
    print(selected_cols)
    # .copy() makes `train` an independent frame. Later cells assign scaled values
    # back into it, which on a derived selection triggers pandas'
    # SettingWithCopyWarning.
    train = train[selected_cols].copy()
    print("Updated Train Shape", train.shape)
    print("Remaining columns", list(train.columns))
else:
    print("Missing Target Column")

In [None]:
# Reduce both test frames to the columns that survived feature selection in train.
train_columns = list(train.columns)
test_public_aligned_cols = [col for col in test_public.columns if col in train_columns]
test_private_aligned_cols = [col for col in test_private.columns if col in train_columns]

# .copy() yields independent frames: the normalization cell assigns scaled values
# back into them, which on a derived selection triggers SettingWithCopyWarning.
test_public = test_public[test_public_aligned_cols].copy()
test_private = test_private[test_private_aligned_cols].copy()

print("Test Public shape", test_public.shape)
print("Test Private shape", test_private.shape)

In [None]:
# First five rows of the training frame after feature selection.
print(train.head(n=5))

In [None]:
# First five rows of the public test frame after column alignment.
print(test_public.head(n=5))

In [None]:
# First five rows of the private test frame after column alignment.
print(test_private.head(n=5))

### Data Normalization with MinMaxScaler

In [None]:
from sklearn.preprocessing import MinMaxScaler

print("Train shape before MinMaxScaler:", train.shape)
print("Test Public shape before MinMaxScaler:", test_public.shape)
print("Test Private shape before MinMaxScaler:", test_private.shape)

# The target and the row identifier are not features, so they are never scaled.
exclude_cols = ["CHD_OR_MI", "ID"]
numeric_cols_to_normalize = [
    col for col in train.select_dtypes(include=[np.number]).columns
    if col not in exclude_cols
]

print("Numeric columns to be normalized:", numeric_cols_to_normalize)

scaler = MinMaxScaler()
if len(numeric_cols_to_normalize) > 0:
    # Fit on the training data only, then reuse the fitted ranges for the test sets.
    # The fit lives inside the guard because fitting on an empty column selection fails.
    train[numeric_cols_to_normalize] = scaler.fit_transform(train[numeric_cols_to_normalize])

    if set(numeric_cols_to_normalize).issubset(test_public.columns):
        test_public[numeric_cols_to_normalize] = scaler.transform(test_public[numeric_cols_to_normalize])
    else:
        # Report the skip: an unscaled test set would not match the scaled training features.
        print("Warning: Test Public lacks some of the normalized columns and was left unscaled")

    if set(numeric_cols_to_normalize).issubset(test_private.columns):
        test_private[numeric_cols_to_normalize] = scaler.transform(test_private[numeric_cols_to_normalize])
    else:
        print("Warning: Test Private lacks some of the normalized columns and was left unscaled")
else:
    print("No numeric columns to normalize; MinMaxScaler was not applied")

In [None]:
# Shapes must be unchanged by scaling; print them as a sanity check.
for _label, _frame in (
    ("Train after normalization:", train),
    ("Public after normalization:", test_public),
    ("Private after normalization:", test_private),
):
    print(_label, _frame.shape)

In [None]:
# Preview the scaled training frame under a short title.
print("Normalized Training ds", train.head(), sep="\n")

### Training the Model and Handling imbalanced data with ADASYN

In [None]:
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Features are everything except the target; "ID" is removed as well when it is
# still present (errors="ignore" tolerates its absence).
X = train.drop(columns=["CHD_OR_MI", "ID"], errors="ignore")
y = train["CHD_OR_MI"]

for _label, _part in (("X shape:", X), ("y shape:", y)):
    print(_label, _part.shape)

# Stratified split that holds out only 0.1% of the rows (test_size=0.001), so
# almost all data is used for training. This is deliberate on the author's part;
# validation metrics computed on such a small hold-out are indicative at best.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.001,
    stratify=y,
    random_state=42,
)

for _label, _part in (
    ("X_train shape:", X_train),
    ("y_train shape:", y_train),
    ("X_val shape:", X_val),
    ("y_val shape:", y_val),
):
    print(_label, _part.shape)

In [None]:

print("Handling class imbalance with ADASYN")
# Oversample the training split only. Modify n_neighbors as needed to improve F1 score.
adasyn = ADASYN(n_neighbors=5, random_state=42, sampling_strategy="minority")
X_train_balanced, y_train_balanced = adasyn.fit_resample(X_train, y_train)

for _label, _part in (
    ("X_train_balanced shape:", X_train_balanced),
    ("y_train_balanced shape:", y_train_balanced),
):
    print(_label, _part.shape)

In [None]:
print("Training the XGBoost model")
# Hand-picked configuration: many shallow-ish trees with a small learning rate,
# row and column subsampling, and a minimum split gain (gamma) of 1.
xgb_model = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=7,
    gamma=1,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    use_label_encoder=False,
    random_state=42,
)
xgb_model.fit(X_train_balanced, y_train_balanced)

# Score the untouched (non-oversampled) validation split.
y_val_pred = xgb_model.predict(X_val)
conf_mat = confusion_matrix(y_val, y_val_pred)
print("Classification Report:", classification_report(y_val, y_val_pred), sep="\n")
print("Confusion Matrix", conf_mat)

In [None]:
# Render the validation confusion matrix as a colored grid with "No"/"Yes" ticks.
plt.figure(figsize=(5, 4))
plt.imshow(conf_mat, cmap="Blues", interpolation="nearest")
plt.title("Confusion Matrix Validation Set")
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("Actual")
for _set_ticks in (plt.xticks, plt.yticks):
    _set_ticks([0, 1], ["No", "Yes"])

# Write each count into its cell; dark cells (above half the maximum) get white text.
half_max = conf_mat.max() / 2
for (row, column), count in np.ndenumerate(conf_mat):
    text_color = "white" if count > half_max else "black"
    plt.text(column, row, str(count), ha="center", va="center", color=text_color)

plt.tight_layout()
plt.show()

### Hyperparameter Tuning

In [None]:
pip uninstall scikit-learn

In [None]:
pip install scikit-learn==1.3.0

In [None]:

from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
param_distributions = {
    "n_estimators": [100, 300, 500, 800, 1000, 1600],
    "max_depth": [3, 5, 7, 9, 11],
    "learning_rate": [0.1, 0.05, 0.01, 0.005, 0.0001],
    "subsample": [0.7, 0.8, 1.0],
    "colsample_bytree": [0.7, 0.8, 1.0],
    "gamma": [0, 1, 5, 10]
}

base_xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

random_search = RandomizedSearchCV(
    estimator=base_xgb,
    param_distributions=param_distributions,
    n_iter=30, # Number of sampled configurations; modify as needed to improve F1 score
    scoring='f1',
    n_jobs=-1,
    cv=3,
    verbose=1,
    random_state=42
)

print("Randomized Search for best hyperparameters")
# NOTE(review): the search cross-validates on data that ADASYN has already
# oversampled, so synthetic points derived from a fold's rows can land in the
# other folds; the CV F1 is likely optimistic.
random_search.fit(X_train_balanced, y_train_balanced)

best_model = random_search.best_estimator_
print("Best hyperparameters:")
print(random_search.best_params_)

# Evaluate the tuned model on the untouched validation split.
y_val_pred_best = best_model.predict(X_val)

print("Classification Report:")
print(classification_report(y_val, y_val_pred_best))

conf_mat_best = confusion_matrix(y_val, y_val_pred_best)
print("Confusion Matrix:", conf_mat_best)

plt.figure(figsize=(5, 4))
plt.imshow(conf_mat_best, interpolation="nearest", cmap="Blues")
plt.title("Confusion Matrix Validation Set")
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.xticks([0, 1], ["No", "Yes"])
plt.yticks([0, 1], ["No", "Yes"])

# Annotate with the tuned model's own matrix (conf_mat_best), i.e. the one that is
# plotted, not the matrix of the earlier hand-tuned model.
for i in range(conf_mat_best.shape[0]):
    for j in range(conf_mat_best.shape[1]):
        plt.text(j, i, str(conf_mat_best[i, j]),
                 ha="center", va="center",
                 color="white" if conf_mat_best[i, j] > conf_mat_best.max() / 2 else "black")

plt.tight_layout()
plt.show()



### Model Evaluation

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Rebuild the feature matrix and target from the complete training frame.
selected_columns = X.columns
X_full = train.loc[:, selected_columns].copy()
y_full = train.loc[:, "CHD_OR_MI"].copy()

for _label, _part in (("X_full shape:", X_full), ("y_full shape:", y_full)):
    print(_label, _part.shape)

# Oversample the minority class on all rows.
# Modify n_neighbors as needed to improve F1 score.
adasyn_full = ADASYN(n_neighbors=5, random_state=42, sampling_strategy="minority")
X_full_balanced, y_full_balanced = adasyn_full.fit_resample(X_full, y_full)

for _label, _part in (
    ("X_full_balanced shape:", X_full_balanced),
    ("y_full_balanced shape:", y_full_balanced),
):
    print(_label, _part.shape)

In [None]:
# Cross-validation of the tuned model on the oversampled full training set.
# NOTE(review): the folds are drawn from already-oversampled data, so the scores
# are likely optimistic.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42) # Modify n_splits as needed to improve F1 score
cv_scores = cross_val_score(best_model, X_full_balanced, y_full_balanced, cv=cv, scoring='f1')
print("F1 score from cross-validation:", cv_scores)
print("Mean F1-score:", np.mean(cv_scores))

print("Retraining the best model on the entire balanced dataset")
best_model.fit(X_full_balanced, y_full_balanced)

# This report is computed on the original rows of the data the model was just
# fitted on, so it measures training fit rather than generalization.
y_full_pred = best_model.predict(X_full)
print("Classification Report Final:")
print(classification_report(y_full, y_full_pred))

final_cm = confusion_matrix(y_full, y_full_pred)
print("Confusion Matrix Final:", final_cm)

plt.figure(figsize=(12, 8))
plt.imshow(final_cm, interpolation="nearest", cmap="Blues")
plt.title("Confusion Matrix Tuned")
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.xticks([0, 1], ["No", "Yes"])
plt.yticks([0, 1], ["No", "Yes"])

# Annotate with final_cm, the matrix that is actually plotted, instead of the
# validation matrix left over from an earlier cell.
for i in range(final_cm.shape[0]):
    for j in range(final_cm.shape[1]):
        plt.text(j, i, str(final_cm[i, j]),
                 ha="center", va="center",
                 color="white" if final_cm[i, j] > final_cm.max() / 2 else "black")

# Lay out once every artist, including the cell labels, has been added.
plt.tight_layout()
plt.show()


### Save predictions to csv file

In [None]:

# Put the test features in the same column order the model was trained on.
X_test_public = test_public[selected_columns].copy()
X_test_private = test_private[selected_columns].copy()

# Predict with the tuned model refitted on the full balanced training set.
y_test_public_pred = best_model.predict(X_test_public)
y_test_private_pred = best_model.predict(X_test_private)

# Stack public then private rows, pairing each prediction with its saved ID.
final_predictions = pd.DataFrame({
    "ID": pd.concat([test_public_ids, test_private_ids], ignore_index=True),
    "CHD_OR_MI": np.concatenate([y_test_public_pred, y_test_private_pred])
})

# Ensure the format is correct
print(final_predictions.head())

# Save to csv. A single variable holds the file name so the confirmation message
# always reports the file that was actually written.
output_path = "resultadosXGBoost8.csv"
final_predictions.to_csv(output_path, index=False)
print(f"Final predictions saved to '{output_path}'.") # Renamed it manually to resultados.csv