In [7]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

def data_cleaning(df):
    """Recode the integer-coded clinical columns of ``df`` as string labels.

    The numeric codes are identifiers, not measurements: they carry no
    distance interpretation, so each one is replaced by a string label that
    can later be treated as a plain category.

    Recoding applied (any value not listed, including a missing value, falls
    into the final "otherwise" label):

    * ``Sex``: 1 -> 'M', otherwise 'F'
    * ``Chest pain type``: 1 -> 'Light', 2 -> 'Common', 3 -> 'Hard',
      otherwise 'Dangerous'
    * ``FBS over 120``: 1 -> 'Y', otherwise 'N'
    * ``Exercise angina``: 1 -> 'Y', otherwise 'N'
    * ``Slope of ST``: 1 -> 'Upsloping', 2 -> 'Flat', otherwise 'Downsloping'
    * ``EKG results``: 0 or 1 -> 'N', otherwise 'P'

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the six columns above in their raw coded form. Other
        columns are passed through unchanged.

    Returns
    -------
    pandas.DataFrame
        A recoded copy of ``df``. The input frame is not modified, so the
        function can safely be re-run on the same raw frame. (Feeding an
        already-recoded frame back in would send every value to the
        "otherwise" label, because no label equals a numeric code.)

    Raises
    ------
    KeyError
        If one of the six expected columns is missing.
    """
    # Work on a copy so the caller's raw frame stays intact and re-running the
    # calling cell always starts from the same data.
    cleaned = df.copy()

    cleaned['Sex'] = np.where(cleaned['Sex'] == 1, 'M', 'F')

    chest_pain = cleaned['Chest pain type']
    cleaned['Chest pain type'] = np.select(
        [chest_pain == 1, chest_pain == 2, chest_pain == 3],
        ['Light', 'Common', 'Hard'],
        default='Dangerous',
    )

    cleaned['FBS over 120'] = np.where(cleaned['FBS over 120'] == 1, 'Y', 'N')

    cleaned['Exercise angina'] = np.where(cleaned['Exercise angina'] == 1, 'Y', 'N')

    st_slope = cleaned['Slope of ST']
    cleaned['Slope of ST'] = np.select(
        [st_slope == 1, st_slope == 2],
        ['Upsloping', 'Flat'],
        default='Downsloping',
    )

    # Codes 0 and 1 are merged into a single 'N' label; per the original
    # author's note they relate to the heart-disease target in the same way.
    cleaned['EKG results'] = np.where(cleaned['EKG results'].isin([0, 1]), 'N', 'P')

    return cleaned


NameError: name 'num_features' is not defined

In [15]:
# Load the training table and separate the label column from the features.
target = 'Heart Disease'
df = pd.read_csv('df_att.csv')
y = df[target]
X = df.drop(columns=[target])

# Features are routed by dtype: numeric columns are standardised, every other
# column is one-hot encoded.
num_features = list(X.select_dtypes(include='number').columns)
cat_features = list(X.select_dtypes(exclude='number').columns)

# NOTE(review): only test.csv is passed through data_cleaning here; df_att.csv
# is presumably stored already recoded - confirm.
test = data_cleaning(pd.read_csv('test.csv'))

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_features),
        # Categories seen only in the test data are ignored instead of raising.
        ("cat", OneHotEncoder(handle_unknown='ignore'), cat_features),
    ]
)

# Fit the preprocessing on the training features only, then apply that same
# fitted transform to the test features.
X = preprocessor.fit_transform(X)
test = preprocessor.transform(test)


In [16]:
# Hyperparameters for the L1-penalised logistic regression.
# NOTE(review): the C value looks like the output of a hyperparameter search -
# confirm its provenance; it is reproduced here verbatim.
# random_state pins the data shuffling performed by the liblinear solver so a
# fresh Restart & Run All yields the same fitted model and saved probabilities.
params = {
    'C': 0.061907906899253747,
    'penalty': 'l1',
    'solver': 'liblinear',
    'random_state': 42,
}

model = LogisticRegression(**params)


In [17]:
# Train on the preprocessed training features and labels.
model.fit(X, y)

# predict_proba returns one column per class, ordered as in model.classes_;
# keep the column at index 1.
class_probabilities = model.predict_proba(test)
LR_prob = class_probabilities[:, 1]
LR_prob




array([0.94736289, 0.00490409, 0.9948972 , ..., 0.05260028, 0.25624798,
       0.02204165], shape=(270000,))

In [18]:
# Write LR_prob to disk in NumPy's binary .npy format.
output_path = 'LR_prob.npy'
np.save(output_path, LR_prob)
