# COVID-19 Prediction Model with Enhanced Techniques

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.feature_selection import SelectKBest, f_classif
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline

In [2]:

train_files = ['lbp-train-fold_0.csv', 'lbp-train-fold_1.csv', 'lbp-train-fold_2.csv', 'lbp-train-fold_3.csv', 'lbp-train-fold_4.csv']
test_file = 'lbp-test.csv'

# Carregando os dados de treino
train_dfs = [pd.read_csv(file) for file in train_files]
df = pd.concat(train_dfs, ignore_index=True)

# Carregando os dados de teste
test_df = pd.read_csv(test_file)

# Preprocessamento
df['ExerciseAngina'] = df['ExerciseAngina'].map({'Y': 1, 'N': 0})
df['ChestPainType'] = df['ChestPainType'].map({'ATA': 1, 'NAP': 2, 'ASY': 3, 'TA': 4})
df['ST_Slope'] = df['ST_Slope'].map({'Up': 1, 'Flat': 2, 'Down': 3})
test_df['ExerciseAngina'] = test_df['ExerciseAngina'].map({'Y': 1, 'N': 0})
test_df['ChestPainType'] = test_df['ChestPainType'].map({'ATA': 1, 'NAP': 2, 'ASY': 3, 'TA': 4})
test_df['ST_Slope'] = test_df['ST_Slope'].map({'Up': 1, 'Flat': 2, 'Down': 3})


X = df.drop(columns='HeartDisease')
y = df['HeartDisease']
X_test = test_df.drop(columns='HeartDisease')
y_test = test_df['HeartDisease']


KeyError: 'ExerciseAngina'

In [None]:

# Balancing data using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [None]:

# Feature selection with SelectKBest
selector = SelectKBest(score_func=f_classif, k=5)  # Select top 5 features
X_resampled = selector.fit_transform(X_resampled, y_resampled)
X_test_selected = selector.transform(X_test)  # Apply same transformation to test set


In [None]:

# Hyperparameter tuning with cross-validation using GridSearchCV
param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
clf = DecisionTreeClassifier(random_state=42)
grid_search = GridSearchCV(clf, param_grid, cv=StratifiedKFold(n_splits=5), scoring='accuracy', n_jobs=-1)
grid_search.fit(X_resampled, y_resampled)

# Best parameters
best_params = grid_search.best_params_
print("Best Parameters: ", best_params)
print("Best Cross-Validation Score: ", grid_search.best_score_)


In [None]:

# Train a Random Forest model on the resampled dataset
rf = RandomForestClassifier(n_estimators=100, random_state=42, **best_params)
rf.fit(X_resampled, y_resampled)
y_pred_rf = rf.predict(X_test_selected)

# Evaluation
accuracy_rf = accuracy_score(y_test, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
print("Random Forest Accuracy: ", accuracy_rf)
print("Confusion Matrix:\n", conf_matrix_rf)


In [None]:

# Train an XGBoost model on the resampled dataset
xgb = XGBClassifier(random_state=42, eval_metric='mlogloss')
xgb.fit(X_resampled, y_resampled)
y_pred_xgb = xgb.predict(X_test_selected)

# Evaluation
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb)
print("XGBoost Accuracy: ", accuracy_xgb)
print("Confusion Matrix:\n", conf_matrix_xgb)
