In [1]:
import pandas as pd

# Define the path to the raw data
file_path = 'D:/ai internship/disease_prediction_project/data/raw/heart_disease.csv'

# Define column names from UCI documentation
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol',
    'fbs', 'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'target'
]

# Load the dataset
df = pd.read_csv(file_path, names=columns)

# Replace '?' with NaN and convert all columns to numeric
df.replace('?', pd.NA, inplace=True)
df = df.apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values
df.dropna(inplace=True)

# Convert target values > 0 to 1 (indicating heart disease)
df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0)

# Save the cleaned version
df.to_csv('../data/raw/heart_disease.csv', index=False)

print("Cleaned dataset saved successfully!")

Cleaned dataset saved successfully!


In [2]:
import pandas as pd

# Load the cleaned dataset
df = pd.read_csv('D:/ai internship/disease_prediction_project/data/raw/heart_disease.csv')

# Show the first few rows
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Separate features and target
X = df.drop('target', axis=1)
y = df['target']

# Scale the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the model
model = LogisticRegression()
model.fit(X_train, y_train)

# Evaluate
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
print(f"Testing Accuracy: {test_accuracy * 100:.2f}%")

Training Accuracy: 83.97%
Testing Accuracy: 86.67%


In [4]:
import joblib

# Save the model and scaler
joblib.dump(model, '../models/heart_model.pkl')
joblib.dump(scaler, '../models/heart_scaler.pkl')

print("Model and scaler saved successfully!")

Model and scaler saved successfully!


In [5]:
import pandas as pd
import numpy as np

# Load data
column_names = [
    "age", "sex", "cp", "trestbps", "chol", "fbs", 
    "restecg", "thalach", "exang", "oldpeak", "slope", 
    "ca", "thal", "target"
]

df = pd.read_csv('D:/ai internship/disease_prediction_project/data/raw/heart_disease.csv', names=column_names)

# Replace missing values marked as "?" with NaN
df.replace("?", np.nan, inplace=True)

# Convert to correct types
df = df.apply(pd.to_numeric, errors='coerce')

# Drop rows with missing values
df.dropna(inplace=True)

# Convert target to binary: 0 (no disease) and 1 (disease)
df['target'] = df['target'].apply(lambda x: 1 if x > 0 else 0)

# Save cleaned version
df.to_csv('D:/ai internship/disease_prediction_project/data/processed/heart_cleaned.csv', index=False)

df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
1,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
2,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
3,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
4,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
5,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0
