In [2]:
# @title
# Perform preprocessing
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:


# Load the labelled training data
heart_df = pd.read_csv("train_heart.csv", sep=',')

# Separate the model inputs from the label column
feature_cols = ['ChestPainType', 'Cholesterol', 'RestingECG', 'MaxHR', 'Oldpeak', 'ST_Slope']
X, y = heart_df[feature_cols], heart_df['HeartDisease']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Column groups: each group is routed to its own transformer below
categorical_features = ['ChestPainType', 'RestingECG', 'ST_Slope']
numerical_features = ['Cholesterol', 'MaxHR', 'Oldpeak']

# One-hot encode the categorical columns (handle_unknown='ignore' so categories
# not seen during fit do not raise at predict time) and standardize the numeric ones
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_features),
    ('num', numerical_transformer, numerical_features),
])

# Chain preprocessing and the XGBoost binary classifier into a single estimator
classifier = xgb.XGBClassifier(objective='binary:logistic', random_state=42)
xgb_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

# Fit on the training split, then score the held-out split
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.8372093023255814


Generate predictions for the rows in test_heart.csv and write them to a CSV file (predictions.csv).

In [17]:
# Read the unlabelled data that predictions are required for
test_df = pd.read_csv("test_heart.csv", sep=',')

# Feature columns the fitted pipeline expects (same list used for training)
feature_cols = ['ChestPainType', 'Cholesterol', 'RestingECG', 'MaxHR', 'Oldpeak', 'ST_Slope']

# Keep the row identifiers so each prediction can be matched back to its row
ids = test_df['id']

# Select exactly the model's feature columns rather than "everything except id",
# so the model input does not depend on which other columns the file contains.
# Use new names instead of rebinding X_test / y_pred: those still refer to the
# hold-out split that pairs with y_test, and overwriting them would leave the
# notebook state inconsistent for any re-run of the evaluation.
X_submission = test_df[feature_cols]

# Predict with the pipeline fitted in the training cell
submission_pred = xgb_model.predict(X_submission)

# Create a DataFrame with 'id' and predicted values
predictions_df = pd.DataFrame({'id': ids, 'predicted': submission_pred})

# Show the predictions and write them to disk (no index column in the CSV)
print(predictions_df)
predictions_df.to_csv('predictions.csv', index=False)


      id  predicted
0    637          0
1    430          1
2    711          1
3    375          0
4    183          1
..   ...        ...
271  133          1
272   66          0
273  470          1
274  898          0
275  182          1

[276 rows x 2 columns]
