# 1. Linear Regression

In this notebook, we will build a linear regression model to predict the ST depression induced by exercise relative to rest (oldpeak).

First we will import the libraries we will use:

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import matplotlib.pyplot as plt

Then we will read our dataset csv file and see the columns inside. 
In the 'UCI Heart Disease Data - Column Descriptions.txt' you can find the description of its column

In [8]:
df = pd.read_csv('heart_disease_uci.csv')
dataset_columns = df.columns.tolist()
dataset_columns

['id',
 'age',
 'sex',
 'dataset',
 'cp',
 'trestbps',
 'chol',
 'fbs',
 'restecg',
 'thalch',
 'exang',
 'oldpeak',
 'slope',
 'ca',
 'thal',
 'num']

Handle missing values, encode categorical variables, and scale numerical features:

In [9]:
# Checking the dataset's first few rows and missing values
print(df.head())
print(df.isnull().sum())

# Define columns that need encoding and scaling
categorical_features = ['cp', 'thal']  # Add other categorical features as needed
numerical_features = ['age', 'trestbps', 'chol', 'thalach']  # Add other numerical features as needed

# One-Hot Encoding for categorical variables and scaling for numerical variables
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Display the updated DataFrame to check transformations (optional)
print(df.describe(include='all'))



   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True      2.6         flat  2.0   
3          normal   187.0  False      3.5  downsloping  0.0   
4  lv hypertrophy   172.0  False      1.4    upsloping  0.0   

                thal  num  
0       fixed defect    0  
1             normal    2  
2  reversable defect    1  
3             normal    0  
4             normal    0  
id      

Check transformed data types and preview data

In [10]:
print(df.dtypes)
print(df.head())

id            int64
age           int64
sex          object
dataset      object
cp           object
trestbps    float64
chol        float64
fbs          object
restecg      object
thalch      float64
exang        object
oldpeak     float64
slope        object
ca          float64
thal         object
num           int64
dtype: object
   id  age     sex    dataset               cp  trestbps   chol    fbs  \
0   1   63    Male  Cleveland   typical angina     145.0  233.0   True   
1   2   67    Male  Cleveland     asymptomatic     160.0  286.0  False   
2   3   67    Male  Cleveland     asymptomatic     120.0  229.0  False   
3   4   37    Male  Cleveland      non-anginal     130.0  250.0  False   
4   5   41  Female  Cleveland  atypical angina     130.0  204.0  False   

          restecg  thalch  exang  oldpeak        slope   ca  \
0  lv hypertrophy   150.0  False      2.3  downsloping  0.0   
1  lv hypertrophy   108.0   True      1.5         flat  3.0   
2  lv hypertrophy   129.0   True

Divide the dataset into training and testing sets:

In [11]:
# Preparing features and target variable
X = df.drop(['id', 'num', 'oldpeak'], axis=1)  # Adjust according to the features chosen
y = df['oldpeak']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


Train the logistic regression model:

In [12]:
# Create a pipeline that includes preprocessing and the regression model
pipeline = make_pipeline(preprocessor, LinearRegression())

# Train the model
pipeline.fit(X_train, y_train)

ValueError: A given column is not a column of the dataframe

Evaluate the model's performance using accuracy and other metrics:

In [None]:
# Make predictions using the testing set
y_pred = pipeline.predict(X_test)

# The coefficients and evaluation metrics
print('Coefficients:', pipeline.named_steps['linearregression'].coef_)
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

Visualize the model results 

In [None]:
# Visualize the actual vs predicted values
plt.scatter(X_test['thalach'], y_test, color='black', label='Actual values')
plt.scatter(X_test['thalach'], y_pred, color='blue', label='Predicted values')
plt.xlabel('Maximum Heart Rate Achieved (Scaled)')
plt.ylabel('Oldpeak (Scaled)')
plt.title('Test vs Prediction for Oldpeak')
plt.legend()
plt.show()