In [30]:
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.discriminant_analysis import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, mean_squared_error, accuracy_score
import seaborn as sns
import category_encoders as ce 
pd.set_option('future.no_silent_downcasting', True)

# TRAINING PART
# IMPORT CSV
hearts = pd.read_csv('train_heart.csv', sep=',')
# FILTER VALUES
chol_median = hearts.loc[hearts['Cholesterol'] != 0, 'Cholesterol'].median()
hearts['Cholesterol'].replace(0, chol_median, inplace=True)

resting_median = hearts.loc[hearts['RestingBP'] != 0, 'RestingBP'].median()
hearts['RestingBP'].replace(0, resting_median, inplace=True)


# # DROP UNNECESSARY COLS AND FIND y
X = hearts.copy()
categorical_columns = X.select_dtypes(include=['object']).columns

ordinal_encoder  = ce.OrdinalEncoder(cols=categorical_columns)
features_encoded  = ordinal_encoder.fit_transform(X, axis=1)
X = X.drop(['id', 'HeartDisease'], axis=1)
poly_features_transformer = PolynomialFeatures(degree=2, include_bias=False)
features_poly = poly_features_transformer.fit_transform(features_encoded)
polynomial_feature_names = poly_features_transformer.get_feature_names_out(input_features=ordinal_encoder.get_feature_names_out())

standard_scaler = StandardScaler()
X = standard_scaler.fit_transform(features_poly)


y = hearts['HeartDisease']



# # ENCODE X
encoder = ce.OrdinalEncoder(cols=X.columns)
X_encoded = encoder.fit_transform(X)

# # DATA SPLICING (TEST AND TRAIN)
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.3)

# # FIT MODEL



# Decision Tree Hyperparameter Grid
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Decision Tree Grid Search Setup
dt_grid_search = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1,
    n_jobs=-1
)

# Conduct Grid Search
dt_grid_search.fit(X_train, y_train)

# Optimal Decision Tree Estimator
optimal_dt = dt_grid_search.best_estimator_
y_pred = optimal_dt.predict(X_test)
# # Calculating the accuracy
accuracy_ada = accuracy_score(y_test, y_pred)
print(f'RandomForest Accuracy: {accuracy_ada}')
ada_boost_classifier = AdaBoostClassifier(estimator=optimal_dt)
ada_boost_classifier.fit(X_train, y_train)
y_pred = ada_boost_classifier.predict(X_test)
# # Calculating the accuracy
accuracy_ada = accuracy_score(y_test, y_pred)
print(f'AdaBoost Decision Tree Accuracy: {accuracy_ada}')



confusion_mat = confusion_matrix(y_test, y_pred)

# Visualizing the Confusion Matrix
plt.figure(figsize=(8, 6))
sns.heatmap(confusion_mat, annot=True, fmt="d", cmap="Blues", xticklabels=['Predicted No', 'Predicted Yes'], yticklabels=['Actual No', 'Actual Yes'])
plt.title('Confusion Matrix')
plt.ylabel('Actual Label')
plt.xlabel('Predicted Label')
plt.show()
plt.show()

# TESTING PART
# IMPORT CSV
test = pd.read_csv('test_heart.csv', sep=',')
#FILTER VALUES
chol_median = test.loc[test['Cholesterol'] != 0, 'Cholesterol'].median()
test['Cholesterol'].replace(0, chol_median, inplace=True)

resting_median = test.loc[test['RestingBP'] != 0, 'RestingBP'].median()
test['RestingBP'].replace(0, resting_median, inplace=True)

X_new = test.drop(['id'], axis=1)

# # ENCODE X
X_new_encoded = encoder.transform(X_new)

# # PREDICT AND PUT INTO PANDAS DATAFRAME
predictions = optimal_dt.predict(X_new_encoded)
# Round up the predictions to 1 or 0
rounded_predictions = [1 if pred > 0.5 else 0 for pred in predictions]
id_to_prediction_df = pd.DataFrame({
    'id': test['id'],
    'HeartDisease': rounded_predictions
})

# # OUTPUT VIA CSV
file_name = './submissiontest.csv'
id_to_prediction_df.to_csv(file_name, index=False)

print(f"File saved as {file_name}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hearts['Cholesterol'].replace(0, chol_median, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  hearts['RestingBP'].replace(0, resting_median, inplace=True)


AttributeError: 'numpy.ndarray' object has no attribute 'columns'