# STEP 1: Setup & Imports

# Task
Logistic Regression (Member 1)

Reason: Simple and interpretable, good for binary classification.

Libraries: Scikit-learn

Metrics: Accuracy, Precision, Recall, F1-score

Task: Train the model and evaluate its performance.

## Data loading

### Subtask:
Load the "heart_disease_uci.csv" dataset into a pandas DataFrame.


**Reasoning**:
Load the "heart_disease_uci.csv" dataset into a pandas DataFrame.



In [36]:
import pandas as pd
try:
    df = pd.read_csv('heart_disease_uci.csv')
    display(df.head())
except FileNotFoundError:
    print("Error: 'heart_disease_uci.csv' not found. Please upload the file to your Google Colab environment.")
    df = None

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0


## Data exploration

### Subtask:
Explore the loaded dataset to understand its structure and characteristics.


**Reasoning**:
Explore the dataset by examining its shape, info, descriptive statistics, target variable, missing values, and potential predictor variables.



In [37]:
# Data Shape and Info
print("Shape of the DataFrame:", df.shape)
df.info()

# Summary Statistics
print("\nDescriptive Statistics:\n", df.describe())

# Target Variable Identification
print("\nUnique values in 'num' column:", df['num'].unique())

# Missing Value Analysis
print("\nMissing values per column:\n", df.isnull().sum())

# Potential Predictor Variables (Initial Assessment)
print("\nPotential predictor variables:")
#  List potential predictor variables and note any potential issues.
#  For example, columns with a large number of missing values or highly skewed distributions might be problematic.
#  This is a preliminary assessment; further analysis might be needed.
for col in df.columns:
    if col != 'num': # Exclude the target variable
        print(f"- {col}: {df[col].dtype}")

Shape of the DataFrame: (920, 16)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 920 entries, 0 to 919
Data columns (total 16 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        920 non-null    int64  
 1   age       920 non-null    int64  
 2   sex       920 non-null    object 
 3   dataset   920 non-null    object 
 4   cp        920 non-null    object 
 5   trestbps  861 non-null    float64
 6   chol      890 non-null    float64
 7   fbs       830 non-null    object 
 8   restecg   918 non-null    object 
 9   thalch    865 non-null    float64
 10  exang     865 non-null    object 
 11  oldpeak   858 non-null    float64
 12  slope     611 non-null    object 
 13  ca        309 non-null    float64
 14  thal      434 non-null    object 
 15  num       920 non-null    int64  
dtypes: float64(5), int64(3), object(8)
memory usage: 115.1+ KB

Descriptive Statistics:
                id         age    trestbps        chol      thalch    

## Data cleaning

### Subtask:
Handle missing values in the dataset.


**Reasoning**:
Impute missing values in the 'ca', 'thal', and 'slope' columns using the mode, as these are categorical features, and then impute other missing values using the mean or median.



In [38]:
# Impute missing values in 'ca', 'thal', and 'slope' using mode
for col in ['ca', 'thal', 'slope']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Impute other missing values using mean or median
for col in df.columns:
    if df[col].isnull().any():
        if pd.api.types.is_numeric_dtype(df[col]):
            if df[col].skew() > 1 or df[col].skew() < -1:
                df[col] = df[col].fillna(df[col].median())
            else:
                df[col] = df[col].fillna(df[col].mean())
        else:
            df[col] = df[col].fillna(df[col].mode()[0])
df_cleaned = df.copy()

# Verify no missing values
print("\nMissing values per column after imputation:\n", df_cleaned.isnull().sum())


Missing values per column after imputation:
 id          0
age         0
sex         0
dataset     0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalch      0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
num         0
dtype: int64


  df[col] = df[col].fillna(df[col].mode()[0])


## Data preparation

### Subtask:
Prepare the data for model training by converting categorical features to numerical representations and scaling numerical features.


**Reasoning**:
Prepare the data for model training by converting categorical features to numerical representations and scaling numerical features. This involves one-hot encoding categorical features and scaling numerical features using StandardScaler.



In [39]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Identify categorical and numerical features
categorical_cols = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal', 'dataset']
numerical_cols = ['age', 'trestbps', 'chol', 'thalch', 'oldpeak']

# One-hot encode categorical features
df_encoded = pd.get_dummies(df_cleaned, columns=categorical_cols, drop_first=True)

# Separate features and target
X = df_encoded.drop('num', axis=1)
y = df_encoded['num']

# Scale numerical features
scaler = StandardScaler()
X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

display(X.head())

Unnamed: 0,id,age,trestbps,chol,thalch,oldpeak,sex_Male,cp_atypical angina,cp_non-anginal,cp_typical angina,...,slope_flat,slope_upsloping,ca_1.0,ca_2.0,ca_3.0,thal_normal,thal_reversable defect,dataset_Hungary,dataset_Switzerland,dataset_VA Long Beach
0,1,1.007386,0.698041,0.311021,0.495698,1.368109,True,False,False,True,...,False,False,False,False,False,False,False,False,False,False
1,2,1.432034,1.511761,0.797713,-1.175955,0.611589,True,False,False,False,...,True,False,False,False,True,True,False,False,False,False
2,3,1.432034,-0.658158,0.274289,-0.340128,1.651804,True,False,False,False,...,True,False,False,True,False,False,True,False,False,False
3,4,-1.752828,-0.115679,0.46713,1.968345,2.502889,True,False,True,False,...,False,False,False,False,False,True,False,False,False,False
4,5,-1.32818,-0.115679,0.044717,1.371326,0.517024,False,True,False,False,...,False,True,False,False,False,True,False,False,False,False


## Data splitting

### Subtask:
Split the data into training and testing sets.


**Reasoning**:
Split the data into training and testing sets using train_test_split.



In [40]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Model training

### Subtask:
Train a Logistic Regression model on the prepared training data.


**Reasoning**:
Train a Logistic Regression model using the prepared training data.



In [41]:
from sklearn.linear_model import LogisticRegression

# Initialize the Logistic Regression model
logreg_model = LogisticRegression(max_iter=1000, multi_class='multinomial')

# Train the model
logreg_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Model evaluation

### Subtask:
Evaluate the trained Logistic Regression model using accuracy, precision, recall, and F1-score.


**Reasoning**:
Evaluate the trained Logistic Regression model using accuracy, precision, recall, and F1-score.



In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions on the test data
y_pred = logreg_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=1)
recall = recall_score(y_test, y_pred, average='macro', zero_division=1)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=1)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.6141304347826086
Precision: 0.5351232271564752
Recall: 0.35864495988748174
F1-score: 0.3410305123645199


## Summary:

### 1. Q&A

The task implicitly asks for the performance of a Logistic Regression model on the heart disease dataset.

* **How well does the Logistic Regression model perform?**  The model achieved a moderate accuracy of 0.614 on the test set.  Precision, recall, and F1-score were 0.535, 0.359, and 0.341 respectively, indicating potential room for improvement in correctly classifying positive cases.  A convergence warning during training suggests the model might not have fully optimized.


### 2. Data Analysis Key Findings

* **Missing Data:**  Significant missing values were present in 'ca', 'thal', and 'slope' (611, 486, and 309 respectively) which were imputed using the mode. Other missing values were imputed using the mean or median depending on the skewness of the numerical feature.
* **Model Convergence:** The Logistic Regression model issued a convergence warning, suggesting that the model may not have fully converged during training.  Increasing `max_iter` or scaling the data could address this.
* **Model Performance:** The model achieved an accuracy of 0.614 on the test set, with a precision of 0.535, recall of 0.359 and F1-score of 0.341. The 'macro' average was used for precision, recall and F1-score, to address potential class imbalance.


### 3. Insights or Next Steps

* **Address Model Convergence:** Increase the `max_iter` parameter of the Logistic Regression model or explore different solvers to ensure convergence and potentially improve performance.
* **Investigate Feature Engineering:** Explore feature engineering techniques to create more informative features or to remove less relevant ones.  The relatively low precision, recall, and F1-score suggest this may be beneficial.


# Test a custom input

In [46]:
import pandas as pd

# Create a custom input dictionary
custom_input = {
    'id': 1001,
    'age': 55,
    'sex': 1,
    'trestbps': 130.0,
    'chol': 250.0,
    'thalch': 140.0,
    'oldpeak': 1.2,
    'slope': 1,
    'ca': 0.0,
    'thal': 2,
    'cp_atypical angina': False,
    'cp_non-anginal': True,
    'cp_typical angina': False,
    'fbs_True': False,
    'restecg_normal': True,
    'restecg_st-t abnormality': False,
    'exang_True': True,
}

# Assuming your model is named 'logreg_model'
# Create a DataFrame from the custom input
custom_input_df = pd.DataFrame([custom_input])

# Remove 'id' column if it's not in your training data
custom_input_df = custom_input_df.drop(columns=['id'], errors='ignore')

# Ensure the custom input DataFrame has the same columns as the training data
missing_cols = set(X_train.columns) - set(custom_input_df.columns)
for col in missing_cols:
    custom_input_df[col] = 0  # Fill missing columns with 0

extra_cols = set(custom_input_df.columns) - set(X_train.columns)
custom_input_df = custom_input_df.drop(columns=list(extra_cols), errors='ignore')

custom_input_df = custom_input_df[X_train.columns]  # Reorder columns

# Make prediction on the custom input using 'logreg_model'
custom_prediction = logreg_model.predict(custom_input_df) # Changed model to logreg_model

# Print the prediction and interpretation
if custom_prediction[0] == 1:
    print("Prediction: The person is likely to have heart disease (target = 1).")
else:
    print("Prediction: The person is not likely to have heart disease (target = 0).")
    print("Data types do not match X_train.")

Prediction: The person is likely to have heart disease (target = 1).


In [53]:
import pandas as pd

# Create a custom input dictionary with modified features
custom_input = {
    'id': 1,
    'age': 63,  # Lower age
    'sex': 1,  # Male (assuming 1 represents male)
    'trestbps': 145.0,  # Lower blood pressure
    'chol': 233.0,  # Lower cholesterol
    'thalch': 150.0,  # Higher maximum heart rate achieved
    'oldpeak': 0.5,  # Lower ST depression
    'slope': 2,  # Upsloping ST segment (better prognosis)
    'ca': 0.0,  # No major vessels colored by fluoroscopy
    'thal': 2,  # Normal thallium stress result
    'cp_atypical angina': False,  # No atypical angina
    'cp_non-anginal': False,  # No non-anginal pain
    'cp_typical angina': False,  # No typical angina
    'fbs_True': False,  # Fasting blood sugar < 120 mg/dl
    'restecg_normal': True,  # Normal resting electrocardiographic results
    'restecg_st-t abnormality': False,  # No ST-T wave abnormality
    'exang_True': False,  # No exercise-induced angina
}

# Assuming your model is named 'logreg_model'
# Create a DataFrame from the custom input
custom_input_df = pd.DataFrame([custom_input])

# Remove 'id' column if it's not in your training data
custom_input_df = custom_input_df.drop(columns=['id'], errors='ignore')

# Ensure the custom input DataFrame has the same columns as the training data
missing_cols = set(X_train.columns) - set(custom_input_df.columns)
for col in missing_cols:
    custom_input_df[col] = 0  # Fill missing columns with 0

extra_cols = set(custom_input_df.columns) - set(X_train.columns)
custom_input_df = custom_input_df.drop(columns=list(extra_cols), errors='ignore')

custom_input_df = custom_input_df[X_train.columns]  # Reorder columns

# Make prediction on the custom input using 'logreg_model'
custom_prediction = logreg_model.predict(custom_input_df) # Changed model to logreg_model

# Print the prediction and interpretation
if custom_prediction[0] == 1:
    print("Prediction: The person is likely to have heart disease (target = 1).")
else:
    print("Prediction: The person is not likely to have heart disease (target = 0).")
    print("Data types do not match X_train.")

Prediction: The person is likely to have heart disease (target = 1).
