# Model Building


##          Model Training


In [121]:
import numpy as np
import pandas as pd
import joblib


dataset = pd.read_csv('../dataset/heart_disease.csv')
dataset_copy = dataset.copy()
dataset.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [122]:
dataset.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,0.966997,131.623762,246.264026,0.148515,0.528053,149.646865,0.326733,1.039604,1.39934,0.729373,2.313531,0.544554
std,9.082101,0.466011,1.032052,17.538143,51.830751,0.356198,0.52586,22.905161,0.469794,1.161075,0.616226,1.022606,0.612277,0.498835
min,29.0,0.0,0.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,0.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0,2.0,0.0
50%,55.0,1.0,1.0,130.0,240.0,0.0,1.0,153.0,0.0,0.8,1.0,0.0,2.0,1.0
75%,61.0,1.0,2.0,140.0,274.5,0.0,1.0,166.0,1.0,1.6,2.0,1.0,3.0,1.0
max,77.0,1.0,3.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,2.0,4.0,3.0,1.0


### Splitting Data

In [123]:
from sklearn.model_selection import train_test_split
y = dataset.loc[:, dataset.columns == 'target']
X = dataset.loc[:, dataset.columns != 'target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Features Engineering

In [124]:
print(X_train.isnull().sum())
print(X_test.isnull().sum())

selected_features = ['thalach','cp','ca','thal']

X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]


age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64
age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
dtype: int64


### Scaling Numerical Features


In [125]:
from sklearn.preprocessing import StandardScaler

def scaling_features(data, is_train, scaler=None):
    
    data = data[selected_features]
    
    if is_train:
        scaler = StandardScaler()
        scaler.fit(data)
        scaled_data = scaler.transform(data)

    else:
        if scaler is None:
            raise ValueError("A fitted scaler must be provided for scaling test data.")
        scaled_data = scaler.transform(data)
        
    scaled_data = pd.DataFrame(scaled_data, index=data.index)

    return scaled_data, scaler

X_train_scaled, scaler = scaling_features(X_train_selected, is_train=True, scaler=None)
joblib.dump(scaler, '../models/scaler.joblib')

X_train_scaled.head()

Unnamed: 0,0,1,2,3
132,0.532781,0.008099,-0.689701,-0.509048
202,-1.753582,-0.971891,-0.689701,1.17848
196,-0.139679,0.988089,-0.689701,-0.509048
75,0.48795,0.008099,-0.689701,-0.509048
176,0.443119,-0.971891,1.333421,1.17848


### Model Training 

In [137]:
from sklearn.linear_model import LogisticRegression

def model_train(X_train_scaled,y_train):
    model = LogisticRegression()
    model.fit(X_train_scaled, y_train)
    return model

model = model_train(X_train_scaled,y_train)

joblib.dump(model, '../models/model.joblib')

  y = column_or_1d(y, warn=True)


['../models/model.joblib']

## Model Evaluation

### Scaling Numerical Features 

In [127]:
X_test_scaled, _= scaling_features(X_test_selected, is_train=False, scaler = scaler)
X_test_scaled.head()



Unnamed: 0,0,1,2,3
179,-1.708752,-0.971891,0.32186,-2.196576
228,0.398289,1.968079,-0.689701,1.17848
111,1.025918,0.988089,0.32186,1.17848
246,-0.005187,-0.971891,1.333421,1.17848
60,-0.9018,0.988089,0.32186,-0.509048


### Evaluation

In [138]:
from sklearn.metrics import log_loss

def compute_log_loss(y_test: np.ndarray, y_pred_proba: np.ndarray, precision: int = 2) -> float:
    loss = log_loss(y_test, y_pred_proba)
    return round(loss, precision)

y_pred = model.predict(X_test_scaled)

log_loss_value = compute_log_loss(y_test, y_pred)
print(log_loss_value)


4.14


# Model Inference

In [129]:
file_path = "../dataset/test.csv"
test_file = pd.read_csv(file_path)

Scaler = joblib.load('../models/scaler.joblib')

Preprocessed_Test, _= scaling_features(test_file, is_train=False, scaler = Scaler)

model = joblib.load('../models/model.joblib')

predicted_house_prices = model.predict(Preprocessed_Test)
predicted_house_prices

pd.DataFrame(predicted_house_prices).head().astype(int)


Unnamed: 0,0
0,0
1,0
2,0
3,0
4,0


In [139]:
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression

def build_model(data: pd.DataFrame):
    y = data.loc[:, data.columns == 'target']
    X = data.loc[:, data.columns != 'target']

    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    x_train = x_train[selected_features]


    x_train_preprocessed,Scaler = scaling_features(x_train, is_train=True, scaler=None)
    x_test_preprocessed,_ = scaling_features(x_test, is_train=False, scaler=Scaler)
    model = LogisticRegression()

    model.fit(x_train_preprocessed, y_train)

    y_pred = model.predict(x_test_preprocessed)

    loss = compute_log_loss(y_test, y_pred)

    return {'loss': round(loss, 2)},model

dataset = pd.read_csv('../dataset/heart_disease.csv')
print(build_model(dataset))

({'loss': 4.14}, LogisticRegression())


  y = column_or_1d(y, warn=True)


In [146]:
dataset = pd.read_csv('../dataset/test.csv')


In [164]:
def make_predictions(data):
    data = data[selected_features]
    Scaler = joblib.load('../models/scaler.joblib')

    data_preprocessed,_ = scaling_features(data, is_train=False, scaler=Scaler)
    
    model = joblib.load('../models/model.joblib')
    prediction = model.predict(data_preprocessed)
    return pd.DataFrame(prediction, columns=['Prediction']).astype(int)


In [165]:
make_predictions(dataset)


Unnamed: 0,Prediction
0,0
1,0
2,0
3,0
4,0
...,...
1020,1
1021,0
1022,0
1023,1
