# GPT4

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Update the file path
heart_data = pd.read_csv(file_path)

# Splitting the dataset into features (X) and target (y)
X = heart_data.drop(['id', 'num', 'dataset'], axis=1)  # Dropping irrelevant columns
y = heart_data['num']

# Encoding 'num' to binary (presence or absence of heart disease)
y = (y > 0).astype(int)

# Identifying categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Creating transformers for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combining transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Creating a pipeline with preprocessor and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=0))
])

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Fitting the model
model.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.81


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Explicitly define numerical and categorical columns
numerical_cols = ['trestbps', 'chol', 'thalch', 'oldpeak', 'ca']  # Update this based on your dataset
categorical_cols = ['sex', 'dataset', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'thal']  # Update this based on your dataset

# Preprocessing for numerical features
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Define feature and target variables
X = heart_data.drop(['num', 'id'], axis=1)  # Exclude 'id' as it's not a feature
y = heart_data['num']

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Define and train the baseline model
baseline_model = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', RandomForestClassifier(random_state=0))])
baseline_model.fit(X_train, y_train)

# Predicting on the test set and evaluating
y_pred = baseline_model.predict(X_test)
baseline_accuracy = accuracy_score(y_test, y_pred)
print("Baseline Model Accuracy:", baseline_accuracy)


Baseline Model Accuracy: 0.5108695652173914


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Identify categorical and numerical columns
categorical_cols = heart_data.select_dtypes(include=['object', 'bool']).columns
numerical_cols = heart_data.select_dtypes(include=[np.number]).columns.drop('num')

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
])

# Define the target variable
y = heart_data['num']

# Convert target variable to binary (presence or absence of heart disease)
y = (y > 0).astype(int)

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(heart_data.drop('num', axis=1), y, test_size=0.2, random_state=0)

# Create the model
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Create and train the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Fit the pipeline to the training data
pipeline.fit(X_train, y_train)

# Predict on the test data
y_pred = pipeline.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Model Accuracy: 0.89


In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Dropping the 'id' and 'dataset' columns as they are not relevant for prediction
heart_data = heart_data.drop(columns=['id', 'dataset'])

# Converting 'num' to a binary classification problem (0 = no heart disease, 1 = heart disease)
heart_data['num'] = heart_data['num'].apply(lambda x: 1 if x > 0 else 0)

# Identifying numerical and categorical features
numerical_features = heart_data.select_dtypes(include=['int64', 'float64']).drop(columns=['num']).columns
categorical_features = heart_data.select_dtypes(include=['object']).columns

# Creating preprocessing pipelines
numerical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Handling missing values for numerical features
    ('scaler', StandardScaler())  # Standardizing numerical features
])

categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handling missing values for categorical features
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # Encoding categorical features
])

# Combining preprocessing steps
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features)
    ])

# Splitting the data into training and testing sets
X = heart_data.drop('num', axis=1)
y = heart_data['num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature selection
feature_selection = SelectKBest(score_func=f_classif, k='all')

# Model selection
model = RandomForestClassifier(random_state=42)

# Creating a full pipeline with preprocessing, feature selection, and model
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selection),
    ('classifier', model)
])

# Fitting the model
pipeline.fit(X_train, y_train)

# Predicting and evaluating the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.842391304347826


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your dataset path
heart_data = pd.read_csv(file_path)

# Dropping 'id' column if present
heart_data = heart_data.drop(columns=['id'], errors='ignore')

# Identifying numerical and categorical columns dynamically
numerical_features = heart_data.select_dtypes(include=['int64', 'float64']).drop(columns=['num']).columns
categorical_features = heart_data.select_dtypes(include=['object', 'bool']).columns

# Preprocessing pipelines for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)])

# Separating features and target variable
X = heart_data.drop('num', axis=1)
y = heart_data['num']

# Split the dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Applying the transformations
X_train_transformed = preprocessor.fit_transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

# Logistic Regression Model
logistic_model = LogisticRegression(max_iter=1000, random_state=0)
logistic_model.fit(X_train_transformed, y_train)

# Making predictions and evaluating the model
y_pred = logistic_model.predict(X_test_transformed)
accuracy = accuracy_score(y_test, y_pred)
cv_scores = cross_val_score(logistic_model, X_train_transformed, y_train, cv=5)

# Output results
print(f"Test Accuracy: {accuracy}")
print(f"Cross-Validation Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean()}")


Test Accuracy: 0.532608695652174
Cross-Validation Scores: [0.58783784 0.63945578 0.63265306 0.6122449  0.58503401]
Mean CV Score: 0.6114451185879759


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Separate features and target variable
X = heart_data.drop(['num', 'id'], axis=1)  # 'num' is the target, 'id' is not needed
y = heart_data['num']

# Identify numerical and categorical columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Initialize and train the Gradient Boosting Classifier
grad_boost = GradientBoostingClassifier(random_state=0)
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', grad_boost)])

# Fit the model
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy of the Gradient Boosting model: {accuracy:.2f}")


Accuracy of the Gradient Boosting model: 0.53


In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Identifying categorical and numerical columns
categorical_cols = heart_data.select_dtypes(include=['object', 'bool']).columns
numerical_cols = heart_data.select_dtypes(include=['float64', 'int64']).columns.drop(['id', 'num'])

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
])

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Create and evaluate the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

# Split data into train and test sets
X = heart_data.drop(['num', 'id'], axis=1)
y = heart_data['num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Preprocessing of training data, fit model 
pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, preds)
print(f'Accuracy: {accuracy}')


Accuracy: 0.5271739130434783


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import SelectKBest, f_classif

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Handling missing values and encoding categorical variables
categorical_features = ['sex', 'cp', 'restecg', 'slope', 'thal', 'fbs', 'exang']
numerical_features = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak', 'ca']

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Define the model
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Create and evaluate the pipeline
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('feature_selection', SelectKBest(f_classif, k=10)),
                           ('model', model)])

# Split data into train and test sets
X = heart_data.drop(['id', 'dataset', 'num'], axis=1)
y = heart_data['num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Train the pipeline
pipeline.fit(X_train, y_train)

# Predict and evaluate the model
y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")


Model Accuracy: 0.4891304347826087


In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Data Preprocessing

# Separating the features and target variable
X = heart_data.drop(columns=['num', 'id', 'dataset'])  # 'id' and 'dataset' seem irrelevant for prediction
y = heart_data['num']

# Identifying categorical and numerical columns
categorical_cols = X.select_dtypes(include=['object', 'bool']).columns
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns

# Creating transformers for numerical and categorical data
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Imputing missing values with mean for numerical data
    ('scaler', StandardScaler())  # Standardizing the numerical data
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Imputing missing values with most frequent value for categorical data
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-hot encoding for categorical data
])

# Combining transformers into a preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Model Selection
# Using Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=0)

# Creating a full pipeline with preprocessing and the model
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('model', model)])

# Splitting data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Model Training and Evaluation
# Training the model
clf.fit(X_train, y_train)

# Predicting on test data
y_pred = clf.predict(X_test)

# Calculating accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy}")


Model Accuracy: 0.532608695652174


In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load the dataset
file_path = 'heart_disease_uci.csv'  # Replace with your file path
heart_data = pd.read_csv(file_path)

# Preprocessing for numerical features
numeric_features = heart_data.select_dtypes(include=['int64', 'float64']).columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Preprocessing for categorical features
categorical_features = heart_data.select_dtypes(include=['object', 'bool']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Combine preprocessing for numerical and categorical features
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Define target variable and features
X = heart_data.drop(['num', 'id', 'dataset'], axis=1)  # 'id' and 'dataset' columns are excluded
y = heart_data['num']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply the transformations
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(random_state=42)

# Train the model on the training data
rf_classifier.fit(X_train, y_train)

# Make predictions on the test data
y_pred = rf_classifier.predict(X_test)

# Calculate and print the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of the Random Forest Classifier:", accuracy)


ValueError: A given column is not a column of the dataframe

# Bard

In [27]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download the dataset
url = "heart_disease_uci.csv"
data = pd.read_csv(url)

# Separate numerical and categorical features
numerical_features = data.select_dtypes(include=[np.number])
categorical_features = data.select_dtypes(exclude=[np.number])

# Apply StandardScaler to numerical features
scaler = StandardScaler()
numerical_features_scaled = scaler.fit_transform(numerical_features)

# Encode categorical features
le = LabelEncoder()
categorical_features_encoded = le.fit_transform(categorical_features)

# Combine transformed features
X = np.hstack((numerical_features_scaled, categorical_features_encoded))

# Extract target variable and reshape it into a one-dimensional array
y = le.fit_transform(data["target"])
y = y.reshape(-1)  # Reshape y into a one-dimensional array

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Print the accuracy
print("Accuracy:", accuracy)


ValueError: y should be a 1d array, got an array of shape (920, 8) instead.

In [29]:
import pandas as pd

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/"
filename = "processed.cleveland.data"

data = pd.read_csv(url + filename)

In [30]:
data.head()

Unnamed: 0,63.0,1.0,1.0.1,145.0,233.0,1.0.2,2.0,150.0,0.0,2.3,3.0,0.0.1,6.0,0
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


In [33]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Download the UCI Heart Classification dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
df = pd.read_csv(url)

df.head()


Unnamed: 0,63.0,1.0,1.0.1,145.0,233.0,1.0.2,2.0,150.0,0.0,2.3,3.0,0.0.1,6.0,0
0,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
1,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
2,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
3,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
4,56.0,1.0,2.0,120.0,236.0,0.0,0.0,178.0,0.0,0.8,1.0,0.0,3.0,0


In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Download the UCI Heart Classification dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data = pd.read_csv(url, header=None)

# Rename the columns
data.columns = ["age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "diag"]

# Convert categorical variables to numerical
data["sex"] = data["sex"].map({"M": 1, "F": 0})
data["cp"] = data["cp"].map({1: 0, 2: 1, 3: 2, 4: 3})
data["fbs"] = data["fbs"].map({"F": 0, "T": 1})
data["restecg"] = data["restecg"].map({0: 0, 1: 1, 2: 2})
data["exang"] = data["exang"].map({"N": 0, "Y": 1})
data["slope"] = data["slope"].map({1: 0, 2: 1, 3: 2})
data["thal"] = data["thal"].map({0: 0, 1: 1, 2: 2})

# Separate the features and the target variable
X = data.drop("diag", axis=1)
y = data["diag"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a random forest classifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)

# Evaluate the model on the test set
score = model.score(X_test, y_test)
print("Accuracy:", score)

# Make predictions on new data
new_data = pd.DataFrame({
    "age": [50],
    "sex": [1],
    "cp": [1],
    "trestbps": [120],
    "chol": [200],
    "fbs": [0],
    "restecg": [0],
    "thalach": [130],
    "exang": [0],
    "oldpeak": [2],
    "slope": [2],
    "ca": [0],
    "diag": [0]  # Add the missing "diag" feature with a placeholder value of 0
})

predictions = model.predict(new_data)
print("Predicted diagnosis:", predictions[0])


ValueError: Length mismatch: Expected axis has 14 elements, new values have 13 elements

In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download the UCI Heart Classification dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data = pd.read_csv(url, header=None)

# Extract features and target variable
features = data.iloc[:, 1:-1]
target = data.iloc[:, -1]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.25, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the testing set
y_pred = model.predict(X_test_scaled)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


ValueError: could not convert string to float: '?'

In [37]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download the UCI Heart Classification dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data = pd.read_csv(url, header=None, names=[
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "diag"
])

# Separate features and target variable
X = data.drop("diag", axis=1)
y = data["diag"]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test_scaled)

# Evaluate model performance using accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


ValueError: could not convert string to float: '?'

In [38]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Download the UCI Heart Classification dataset
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data', delimiter=',')

# Extract features and target variable
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate the model on the test set
predictions = model.predict(X_test)
accuracy = accuracy_score(y_test, predictions)
print('Accuracy:', accuracy)

# Print the model's coefficients
print('Coefficients:')
for ind, coef in enumerate(model.coef_):
    print(f'Feature {ind+1}: {coef:.3f}')


ValueError: could not convert string to float: '?'

In [43]:
np.unique(X_train['0.0.1'])

array(['0.0', '1.0', '2.0', '3.0', '?'], dtype=object)

In [41]:
X_train.dtypes

63.0     float64
1.0      float64
1.0.1    float64
145.0    float64
233.0    float64
1.0.2    float64
2.0      float64
150.0    float64
0.0      float64
2.3      float64
3.0      float64
0.0.1     object
6.0       object
dtype: object

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Download the UCI Heart Classification dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data = pd.read_csv(url, header=None, names=[
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "diag"])

# Separate features and target variable
X = data.drop("diag", axis=1)
y = data["diag"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the numerical features with error handling and row removal
try:
    # Remove rows containing invalid values (represented as '?')
    X_train.replace('?', pd.NA, inplace=True)
    X_train.dropna(inplace=True)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
except Exception as e:
    print("Error during standardization:", e)

# Train a KNeighborsClassifier model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


ValueError: Found input variables with inconsistent numbers of samples: [237, 242]

In [47]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Download the UCI Heart Classification dataset
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data"
data = pd.read_csv(url, header=None, names=[
    "age", "sex", "cp", "trestbps", "chol", "fbs", "restecg", "thalach", "exang", "oldpeak", "slope", "ca", "diag"])

# Separate features and target variable
X = data.drop("diag", axis=1)
y = data["diag"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Handle invalid values and ensure consistent sample count
try:
    # Remove rows containing invalid values (represented as '?') from both X and y
    X_train.replace('?', pd.NA, inplace=True)
    X_test.replace('?', pd.NA, inplace=True)
    X_train.dropna(inplace=True)
    X_test.dropna(inplace=True)
    y_train.dropna(inplace=True)
    y_test.dropna(inplace=True)

    # Standardize the numerical features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
except Exception as e:
    print("Error during data cleaning:", e)

# Train a KNeighborsClassifier model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


ValueError: Found input variables with inconsistent numbers of samples: [237, 242]

In [3]:
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# Load the dataset
df = pd.read_csv('heart_disease_uci.csv')
# Preprocess the data
X = df.drop('num', axis=1) # Drop target variable
X = X.drop(columns=['age']) # Drop age feature
X = X.drop(columns=['sex']) # Drop sex feature
X = scaler.fit_transform(X) # Scale the data
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, df['target'], test_size=0.2, random_state=42)
# Define the model architecture
model = Sequential()
model.add(Dense(64, activation='relu', input_shape=(X.shape[1],)))
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(4, activation='softmax'))
# Compile the model
model.compile(optimizer=Adam(lr=0.001), loss='categorical_crossentropy', metrics=['accuracy'])
# Train the model
model.fit(X_train, to_categorical(y_train), epochs=10, batch_size=32, validation_data=(X_test, to_categorical(y_test)))
# Evaluate the model
y_pred = model.predict(X_test)
y_pred_class = y_pred.argmax(-1)
accuracy = accuracy_score(y_test, y_pred_class)
print(f'Accuracy: {accuracy:.3f}')
print(classification_report(y_test, y_pred_class))


ValueError: could not convert string to float: 'Cleveland'

In [15]:
df.columns

Index(['age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
       'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')

In [14]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
df = pd.read_csv('heart_disease_uci.csv')
# Preprocess the data
df = df.drop(columns=['id']) # drop id column
X = df.drop(columns=['num']) # features
y = df['num'] # target variable
# Split the data into training and testing sets
X['sex'] = pd.get_dummies(X['sex']).values.squeeze()
X['hospital'] = pd.get_dummies(X['hospital']).values.squeeze()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a logistic regression model on the training data
model = LogisticRegression()
model.fit(X_train, y_train)
# Evaluate the model on the testing data
y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
# Print the evaluation metrics
print("Confusion matrix:")
print(conf_mat)
print(f"Accuracy: {accuracy:.3f}")
print(f"Report: {report}")

KeyError: 'hospital'

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load the dataset
df = pd.read_csv('heart_disease_uci.csv')
# Preprocess the data
X = df.drop('num', axis=1)
y = df['num']
# Split the data into training and testing sets
X['sex'] = pd.get_dummies(X['sex']).values.squeeze()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)
# Evaluate the model
y_pred = model.predict(X_test)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Classification Report:')
print(classification_report(y_test, y_pred))
print('Confusion Matrix:')
print(confusion_matrix(y_test, y_pred))

ValueError: could not convert string to float: 'VA Long Beach'

In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Load the dataset
df = pd.read_csv("heart_disease_uci.csv")
# Preprocess the data
X = df[['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num']]
y = df['heart_disease']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a logistic regression model on the training data
model = LogisticRegression()
model.fit(X_train, y_train)
# Evaluate the model on the testing data
y_pred = model.predict(X_test)
conf_mat = confusion_matrix(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
# Print the evaluation metrics
print("Confusion matrix:")
print(conf_mat)
print(f"Accuracy: {accuracy:.3f}")
print(f"Report: {report}")

KeyError: 'heart_disease'

In [22]:
df.columns

Index(['id', 'age', 'sex', 'dataset', 'cp', 'trestbps', 'chol', 'fbs',
       'restecg', 'thalch', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'num'],
      dtype='object')