Attribute Information
1) age
2) sex
3) chest pain type (4 values)
4) resting blood pressure
5) serum cholesterol in mg/dl
6) fasting blood sugar > 120 mg/dl
7) resting electrocardiographic results (values 0,1,2)
8) maximum heart rate achieved
9) exercise induced angina
10) oldpeak = ST depression induced by exercise relative to rest
11) the slope of the peak exercise ST segment
12) number of major vessels (0-3) colored by fluoroscopy
13) thal: 0 = normal; 1 = fixed defect; 2 = reversible defect
14) target: 0 = less chance of heart attack; 1 = more chance of heart attack

`target` refers to the presence of heart disease in the patient. It is integer valued 0 = no/less chance of heart attack and 1 = more chance of heart attack.

Create a classification model, using Optuna to find the best model and parameters. After that, create a class that wraps the best model Optuna found during the hyper-parameter optimization process. Make sure this class includes all the pre-processing steps (if any) that Optuna used. Make sure that this class has a `get_prediction()` function that can be called (by the front-end user interface, for example) with the input parameters and that returns the prediction of the target.

In [None]:
!pip install optuna



In [None]:
import optuna
import pandas as pd

import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm
from sklearn import tree
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler, PowerTransformer,Normalizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.model_selection import cross_val_score

# Location of the heart-disease CSV read by this notebook.
path = "/work/data/homework 26/heart.csv"

# Load the dataset once for a quick look at its contents.
df = pd.read_csv(path)
df.head()
# Print the column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    int64  
 1   sex       303 non-null    int64  
 2   cp        303 non-null    int64  
 3   trestbps  303 non-null    int64  
 4   chol      303 non-null    int64  
 5   fbs       303 non-null    int64  
 6   restecg   303 non-null    int64  
 7   thalach   303 non-null    int64  
 8   exang     303 non-null    int64  
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    int64  
 11  ca        303 non-null    int64  
 12  thal      303 non-null    int64  
 13  target    303 non-null    int64  
dtypes: float64(1), int64(13)
memory usage: 33.3 KB


In [None]:
def load_data():
    """Read the CSV at the module-level ``path`` and separate features from label.

    Returns:
        A ``(X, y)`` tuple: ``X`` is the frame without the ``"target"``
        column and ``y`` is the ``"target"`` column on its own.
    """
    frame = pd.read_csv(path)
    features = frame.drop(columns=["target"])
    labels = frame["target"]
    return features, labels

def objective(trial):
    """Optuna objective: score one classifier/scaler combination.

    The trial picks a classifier (with its hyper-parameters) and an optional
    feature scaler, then evaluates the combination with 5-fold stratified
    cross-validation on the module-level ``X`` and ``y``.

    Args:
        trial: The ``optuna.trial.Trial`` used to sample the search space.

    Returns:
        The mean weighted F1 score across the five folds (to be maximised).

    Raises:
        ValueError: If the sampled classifier name is not handled below.
    """
    # Function-scope import so this cell does not depend on an extra
    # top-of-file import being present.
    from sklearn.pipeline import make_pipeline

    classifier_name = trial.suggest_categorical(
        "classifier",
        ["SVC", "RandomForest", "DecisionTreeClassifier", "AdaBoostClassifier"],
    )
    if classifier_name == "SVC":
        svc_c = trial.suggest_float("svc_c", 1e-5, 1e5, log=True)
        model = sklearn.svm.SVC(C=svc_c, gamma="auto")
    elif classifier_name == "RandomForest":
        rf_max_depth = trial.suggest_int("rf_max_depth", 2, 12, log=True)
        model = sklearn.ensemble.RandomForestClassifier(
            max_depth=rf_max_depth,
            n_estimators=10,
        )
    elif classifier_name == "DecisionTreeClassifier":
        dt_criteria = trial.suggest_categorical("dt_criteria", ["gini", "entropy"])
        dt_max_depth = trial.suggest_int("dt_max_depth", 2, 12)
        model = tree.DecisionTreeClassifier(
            criterion=dt_criteria,
            max_depth=dt_max_depth,
        )
    elif classifier_name == "AdaBoostClassifier":
        # suggest_float replaces the deprecated suggest_uniform; the
        # parameter name and range are unchanged.
        learning_rate = trial.suggest_float("learning_rate", 1e-3, 1)
        sug_ada_estims = trial.suggest_int("estimators", 2, 32)
        model = AdaBoostClassifier(
            n_estimators=sug_ada_estims,
            learning_rate=learning_rate,
        )
    else:
        raise ValueError(f"Unhandled classifier choice: {classifier_name!r}")

    # Explicit name -> class table instead of eval() on the sampled string.
    # Each scaler is listed once; a duplicated choice would only skew sampling.
    scaler_factories = {
        "no_scaler": None,
        "StandardScaler": StandardScaler,
        "RobustScaler": RobustScaler,
        "MinMaxScaler": MinMaxScaler,
        "MaxAbsScaler": MaxAbsScaler,
        "PowerTransformer": PowerTransformer,
        "Normalizer": Normalizer,
    }
    scaler_string = trial.suggest_categorical(
        "------------------------------------_scaler",
        list(scaler_factories),
    )
    scaler_class = scaler_factories[scaler_string]

    # Put the scaler in a pipeline so cross_val_score re-fits it on the
    # training part of every fold. Fitting it on all of X up front would leak
    # validation-fold statistics into training and inflate the score.
    if scaler_class is None:
        estimator = model
    else:
        estimator = make_pipeline(scaler_class(), model)

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    scores = cross_val_score(estimator, X, y, cv=cv, scoring="f1_weighted")
    return scores.mean()

# Load features/labels once at module level; objective() reads these globals.
X, y = load_data()
# objective() returns a mean weighted F1 score, so the study maximises it.
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)
# Show the best trial found (its value and the sampled parameters).
print(study.best_trial)



[32m[I 2021-10-09 19:16:09,938][0m A new study created in memory with name: no-name-3f45d014-1334-4f15-bd04-4f58230c8137[0m
[32m[I 2021-10-09 19:16:09,973][0m Trial 0 finished with value: 0.8040695064824256 and parameters: {'classifier': 'DecisionTreeClassifier', 'dt_criteria': 'entropy', 'dt_max_depth': 3, '------------------------------------_scaler': 'RobustScaler'}. Best is trial 0 with value: 0.8040695064824256.[0m
[32m[I 2021-10-09 19:16:10,025][0m Trial 1 finished with value: 0.7923026574461376 and parameters: {'classifier': 'SVC', 'svc_c': 0.06436649351644572, '------------------------------------_scaler': 'StandardScaler'}. Best is trial 0 with value: 0.8040695064824256.[0m
  loglike = -n_samples / 2 * np.log(x_trans.var())
[32m[I 2021-10-09 19:16:10,133][0m Trial 2 finished with value: 0.7967720537548736 and parameters: {'classifier': 'SVC', 'svc_c': 41510.02520093985, '------------------------------------_scaler': 'PowerTransformer'}. Best is trial 0 with value: 0

In [None]:
'classifier': 'AdaBoostClassifier', 
'learning_rate': 0.2501994775312749,'estimators': 26, 
'------------------------------------_scaler': 'no_scaler'


In [None]:
import optuna
import pandas as pd

import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm
from sklearn import tree
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler, PowerTransformer,Normalizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.model_selection import cross_val_score
import pickle
path = "/work/data/homework 26/heart.csv"


class PredictHeartProblems():
    """Train, persist and serve the AdaBoost heart-disease classifier.

    Constructing an instance reads the CSV, fits the model with the fixed
    hyper-parameters below and pickles it to ``MODEL_SAVE_LOCATION``. The
    model is trained on the raw columns; no scaler is applied.
    """

    # NOTE: the file is written with pickle even though the name ends in
    # ".joblib"; the path is kept as-is so existing artefacts stay valid.
    MODEL_SAVE_LOCATION = "/work/data/homework 26/classification_model.joblib"

    # Feature order expected by predict(); "target" is the label column.
    FEATURE_COLUMNS = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                       'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']

    # Human-readable label for each predicted class index (0 and 1).
    PREDICTION_LABELS = ["No heart problem", "Possible heart problem"]

    def __init__(self, path):
        """Load the data at ``path``, train the model and save it to disk."""
        self.df = self.load_model(path)
        self.X, self.y = self.preprocessing()
        self.train_best_model()
        self.save_model()

    def load_model(self, path):
        """Read the training CSV at ``path`` and return it as a DataFrame.

        Despite the name this loads the dataset, not a model; the name is
        kept so existing callers keep working.
        """
        return pd.read_csv(path)

    def preprocessing(self):
        """Split ``self.df`` into features and label.

        Returns:
            ``(X, y)``: the frame without ``"target"`` and the ``"target"``
            column.
        """
        # Use the instance's own frame rather than a notebook-level global
        # ``df``, which may be absent or hold different data.
        X = self.df.drop(["target"], axis=1)
        y = self.df["target"]
        return X, y

    def train_best_model(self):
        """Fit the AdaBoost classifier on ``self.X`` / ``self.y``."""
        self.model = AdaBoostClassifier(n_estimators=26, learning_rate=0.2501994775312749)
        self.model.fit(self.X, self.y)

    def save_model(self):
        """Pickle the fitted model to ``MODEL_SAVE_LOCATION``."""
        # Context manager so the handle is closed even if dumping fails.
        with open(self.MODEL_SAVE_LOCATION, 'wb') as model_file:
            pickle.dump(self.model, model_file)

    def predict(self, value_to_predict):
        """Predict the label for one patient.

        Args:
            value_to_predict: The 13 feature values in ``FEATURE_COLUMNS``
                order.

        Returns:
            ``"No heart problem"`` for class 0, ``"Possible heart problem"``
            for class 1.

        Raises:
            ValueError: Raised by pandas when the number of values does not
                match ``FEATURE_COLUMNS``.
        """
        # The in-memory model is used directly; re-reading the pickle on
        # every call was wasted I/O whose result was never used.
        frame = pd.DataFrame([value_to_predict], columns=self.FEATURE_COLUMNS)
        prediction = int(self.model.predict(frame)[0])
        return self.PREDICTION_LABELS[prediction]

    def get_prediction(self, value_to_predict):
        """Alias of :meth:`predict`, the entry point intended for a front end."""
        return self.predict(value_to_predict)


# Constructing the object reads the CSV, trains the classifier and pickles it.
model = PredictHeartProblems(path)

In [None]:
# Sanity check on one row. The row comes from model.X, which the model was
# fitted on, so this is not an estimate of generalisation.
index = 200
value_to_predict = list(model.X.iloc[index].values)
predicted_class = model.predict(value_to_predict)

# Ground-truth label for the same row.
true_class = model.y.iloc[index]

print(f"Prediction is: {predicted_class}\nTrue class is: {true_class}")

Prediction is: No heart problem
True class is: 0
  "X does not have valid feature names, but"


In [None]:
# Ad-hoc prediction for one hand-entered patient; the 13 values are matched
# positionally to the feature columns named inside predict().
model.predict([44.0, 1.0, 0.0, 110.0, 197.0, 0.0, 0.0, 177.0, 0.0, 0.0, 2.0, 1.0, 2.0])

In [None]:
import optuna
import pandas as pd

import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import sklearn.datasets
import sklearn.ensemble
import sklearn.model_selection
import sklearn.svm
from sklearn import tree
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, StandardScaler, PowerTransformer,Normalizer
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold, KFold,GroupKFold
from sklearn.model_selection import cross_val_score

path = "/work/data/homework 26/heart.csv"

class Model:
    """AdaBoost heart-disease classifier with a train/test split workflow.

    Typical use: ``Model()`` -> ``split(test_size)`` -> ``fit()`` ->
    ``predict(...)``.
    """

    def __init__(self, datafile = "/work/data/homework 26/heart.csv"):
        """Read the dataset and create the (unfitted) classifier.

        Args:
            datafile: Path to the CSV containing the features and a
                ``"target"`` column.
        """
        self.df = pd.read_csv(datafile)
        self.user_defined_model = AdaBoostClassifier(n_estimators=26, learning_rate=0.2501994775312749)

    def split(self, test_size):
        """Split ``self.df`` into train and test sets (``random_state=42``).

        Args:
            test_size: Forwarded to ``train_test_split`` as ``test_size``.
        """
        # Use the instance's own frame rather than a notebook-level global
        # ``df``, which may be absent or hold different data.
        X = self.df.drop(["target"], axis=1)
        y = self.df["target"]

        # Remembered so predict() can label ad-hoc input with the same names.
        self.feature_names = list(X.columns)

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

    def fit(self):
        """Fit the classifier on the training split created by ``split``."""
        self.model = self.user_defined_model.fit(self.X_train, self.y_train)

    def predict(self, input_value=None):
        """Predict for the held-out test set or for one sample.

        Args:
            input_value: ``None`` (the default) to predict on ``self.X_test``;
                otherwise one sample's feature values in training-column
                order. Numeric strings are accepted and converted to float.

        Returns:
            Whatever the classifier's ``predict`` returns for that input.

        Raises:
            ValueError: If a value in ``input_value`` cannot be converted to
                float.
        """
        # ``is None`` rather than ``== None``: comparing a NumPy array with
        # ``==`` is element-wise and cannot be used as a condition.
        if input_value is None:
            return self.user_defined_model.predict(self.X_test)

        row = np.asarray(input_value, dtype=float).reshape(1, -1)
        columns = getattr(self, "feature_names", None)
        if columns is not None:
            # Carry the training column names with the sample so the
            # estimator sees named features, as it did when fitted.
            row = pd.DataFrame(row, columns=columns)
        return self.user_defined_model.predict(row)

# Demo run: hold out 30% of the rows, train, predict one sample, report accuracy.
if __name__ == '__main__':
    model_instance = Model()
    model_instance.split(0.3)
    model_instance.fit()    
    # NOTE(review): the sample is given as strings, and some values look out of
    # place for their positional column (e.g. "2.3" in the 9th slot, "5" in the
    # 1st and 13th) - confirm the intended feature order and values.
    print(model_instance.predict(["5","1","3","145","273","1","0","150","2.3","0","0","1","5"]))
    # Accuracy of the fitted classifier on the held-out test split.
    print("Accuracy: ", model_instance.model.score(model_instance.X_test, model_instance.y_test))

[0]
Accuracy:  0.8461538461538461
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7d3ce7c8-a514-49e4-9ba4-a5899ac52ea5' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>