In [9]:
#general
import pandas as pd
import numpy as np
import tensorflow as tf
import wandb

# EDA
import matplotlib.pyplot as plt
from pandas_profiling import ProfileReport

#preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

#modelling
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Activation,Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam, RMSprop, SGD
from tensorflow.keras.losses import BinaryCrossentropy
from tensorflow_addons.optimizers import RectifiedAdam
from sklearn.metrics import accuracy_score
#feature engineering and data augmentation
from tabgan.sampler import OriginalGenerator, GANGenerator


In [2]:
# Load the raw dataset; the CSV uses ';' as its field separator.
df = pd.read_csv('./cardio_train.csv', sep=';')

# EDA 

In [None]:
# Preview the first rows of the raw frame (rendered as the cell output).
df.head()

In [None]:
# Box plots of the raw continuous features, one panel per column so that each
# feature keeps its own y-scale.
#
# The previous version plotted `cpy.iloc[:, 0:7]`, but `cpy` is only created by
# the preprocessing cell further down, so this cell raised NameError on a fresh
# "Restart & Run All". It now reads the raw `df` loaded above and selects the
# columns by name. `plt` comes from the import cell at the top of the notebook.
boxplot_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo']

fig, axes = plt.subplots(1, len(boxplot_cols), figsize=(10, 7))
for ax, col in zip(axes, boxplot_cols):
    # .to_numpy() avoids matplotlib indexing the Series by label.
    ax.boxplot(df[col].to_numpy())
    ax.set_title(col)
    ax.set_xticks([])

fig.suptitle('Raw continuous features (before outlier removal and scaling)')
fig.tight_layout()

# show plot
plt.show()

In [None]:
# Build a pandas-profiling report for the raw frame; the bare `profile`
# expression on the last line makes it the cell's displayed output.
profile = ProfileReport(df)
profile

# Preprocessing

In [3]:
def preprocessing_p1(df, lower_q=0.025, upper_q=0.975):
    """Clean the raw frame and add a numeric BMI feature.

    Steps, in order:

    1. Drop the ``id`` column (an identifier, not a feature).
    2. Divide ``age`` by 365 (assumes the raw column is in days -- TODO confirm).
    3. Add ``bmi = weight / (height / 100) ** 2`` (assumes height in cm and
       weight in kg -- TODO confirm). BMI is kept numeric; it is not binned.
    4. For ``height``, ``weight``, ``ap_hi`` and ``ap_lo`` -- in that order --
       remove rows whose value is above the ``upper_q`` quantile or below the
       ``lower_q`` quantile of that column. Quantiles are recomputed on the
       rows that survived the previous column's filter.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing at least ``id``, ``age``, ``height``, ``weight``,
        ``ap_hi`` and ``ap_lo``. It is not modified.
    lower_q, upper_q : float
        Quantile bounds used for outlier removal (defaults keep the central
        95% of each column).

    Returns
    -------
    pandas.DataFrame
        A new frame; surviving rows keep their original index labels.

    Raises
    ------
    KeyError
        If ``id`` or any of the columns listed above is missing.
    """
    df = df.drop('id', axis=1)
    df = df.assign(
        age=df['age'] / 365,
        bmi=df['weight'] / ((df['height'] / 100) ** 2),
    )

    for column in ('height', 'weight', 'ap_hi', 'ap_lo'):
        low = df[column].quantile(lower_q)
        high = df[column].quantile(upper_q)
        is_outlier = (df[column] > high) | (df[column] < low)
        # Filter positionally with a boolean mask rather than dropping by index
        # label: dropping by label would also remove non-outlier rows that
        # happen to share a label with an outlier.
        df = df.loc[~is_outlier.to_numpy()]

    # Return an independent frame so later column assignments by callers do not
    # trigger SettingWithCopyWarning.
    return df.copy()


def preprocessing_p2(df):
    """Standardise the continuous features and one-hot encode the categorical ones.

    Continuous columns (``age``, ``height``, ``weight``, ``ap_hi``, ``ap_lo``,
    ``bmi``) are each scaled with ``StandardScaler`` fitted on the rows of
    ``df`` itself. Categorical columns (``gender``, ``cholesterol``, ``gluc``,
    ``smoke``, ``alco``, ``active``) are replaced by indicator columns via
    ``pd.get_dummies``. All other columns are passed through unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing all twelve columns named above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with scaled continuous columns and ``uint8`` indicator
        columns.

    Notes
    -----
    NOTE(review): the scaler is fitted inside this function, so calling it on
    train and test data separately scales them with different statistics, and
    calling it before a train/test split leaks test statistics into training.
    """
    continuous_cols = ['age', 'height', 'weight', 'ap_hi', 'ap_lo', 'bmi']
    categorical_cols = ['gender', 'cholesterol', 'gluc', 'smoke', 'alco', 'active']

    # Work on a copy so the caller's frame keeps its unscaled values.
    df = df.copy()

    # StandardScaler standardises every column independently, so one call over
    # all continuous columns gives the same result as one scaler per column.
    scaled = np.asarray(StandardScaler().fit_transform(df[continuous_cols]))
    for position, column in enumerate(continuous_cols):
        df[column] = scaled[:, position]

    # Pin the indicator dtype: pandas >= 2.0 defaults to bool, which mixed with
    # float columns converts to an object array. uint8 is the pre-2.0 default.
    return pd.get_dummies(data=df, columns=categorical_cols, dtype='uint8')


# Run both preprocessing stages on the raw frame.
temp = preprocessing_p1(df)
cpy = preprocessing_p2(temp)
# NOTE(review): preprocessing_p2 fits its scaler on all rows before the split
# below, so test-set statistics leak into the training features. Consider
# splitting first and fitting the scaler on the training rows only.

# Initialize X with every column except the label, and y with the label we
# want to predict.
X = cpy.drop('cardio', axis=1)
y = cpy['cardio']

# Train-test split with sklearn: hold out 10% for testing. A fixed
# random_state makes the split reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)
X_train.head()

#we could've used tf to load the data like:
# df = tf.keras.datasets.<dataset>
# (X_train,y_train),(X_test,y_test) = df.load_data()


Unnamed: 0,age,height,weight,ap_hi,ap_lo,bmi,gender_1,gender_2,cholesterol_1,cholesterol_2,cholesterol_3,gluc_1,gluc_2,gluc_3,smoke_0,smoke_1,alco_0,alco_1,active_0,active_1
27369,1.583538,1.822098,1.254719,1.033989,1.086733,0.208503,0,1,1,0,0,1,0,0,1,0,1,0,0,1
967,0.935293,-0.227647,-1.41431,1.760641,1.086733,-1.297988,1,0,1,0,0,1,0,0,1,0,1,0,0,1
39309,0.400034,0.065174,0.565937,0.815993,-0.612484,0.502691,1,0,1,0,0,1,0,0,1,0,1,0,1,0
5997,-1.615012,-1.39893,0.910328,-0.419317,-0.126993,1.769295,1,0,1,0,0,0,1,0,1,0,0,1,0,1
40563,-0.104337,-0.666878,-0.725529,-0.419317,-0.126993,-0.404128,1,0,0,0,1,1,0,0,1,0,1,0,0,1


# Modelling

In [14]:
# Fully-connected binary classifier.
# Each hidden block is Dense -> ReLU -> BatchNormalization -> Dropout, described
# here as (units, dropout_rate). The layers are created in the same order as
# before, so the architecture and auto-generated layer names are unchanged.
# NOTE(review): the markdown note below this cell lists BatchNorm *before* the
# activation; this cell applies it after. Confirm which order is intended.
hidden_blocks = [(8, 0.2), (16, 0.5), (16, 0.5), (8, 0.5)]

model = Sequential()

for block_index, (block_units, block_dropout) in enumerate(hidden_blocks):
    if block_index == 0:
        # Only the first layer needs to know the number of input features.
        model.add(Dense(units=block_units, input_dim=X_train.shape[1]))
    else:
        model.add(Dense(units=block_units))
    model.add(Activation('relu'))
    model.add(BatchNormalization())
    model.add(Dropout(rate=block_dropout))

# Single sigmoid unit for the binary label.
model.add(Dense(units=1))
model.add(Activation('sigmoid'))

optimizer = SGD(learning_rate=0.05, momentum=0.5)
loss = BinaryCrossentropy(label_smoothing=0.1)
# `metrics` is passed as a list: a bare string is rejected by newer Keras
# releases, while a list is accepted by all of them.
model.compile(loss=loss, optimizer=optimizer, metrics=['accuracy'])

# > CONV/FC -> BatchNorm -> ReLU (or other activation) -> Dropout -> CONV/FC -> ...

In [15]:
# Start a Weights & Biases run; the training hyper-parameters live in its config.
wandb.init(
    project='my-test-project',
    config={'epochs': 100, 'batch_size': 128},
)

# Train on the training split, reading epochs/batch size back from the run
# config and streaming metrics to W&B through its Keras callback.
model.fit(
    X_train,
    y_train,
    epochs=wandb.config.epochs,
    batch_size=wandb.config.batch_size,
    callbacks=[wandb.keras.WandbCallback()],
)

VBox(children=(Label(value='0.004 MB of 0.004 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016933333332417533, max=1.0…

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100

KeyboardInterrupt: 

In [None]:
#model.evaluate(X_test,y_test)

#prediction and saving model 

#y_hat=model.predict(X_test)
#y_hat=[0 if val<0.5 else 1 for val in y_hat]
#accuracy_score(y_test,y_hat)

#model.save('<nameoffolder>')
#model = load_model('<nameofmodel>')

In [7]:
# Random demo frames for trying out tabgan. The global NumPy seed is set first
# and the three frames are drawn in a fixed order (train, target, test), so the
# generated inputs are the same on every run.
np.random.seed(0)
train = pd.DataFrame(
    np.random.randint(-10, 150, size=(150, 4)),
    columns=list("ABCD"),
)
target = pd.DataFrame(
    np.random.randint(0, 2, size=(150, 1)),
    columns=list("Y"),
)
test = pd.DataFrame(
    np.random.randint(0, 100, size=(100, 4)),
    columns=list("ABCD"),
)

# Generate additional training rows with the GAN-based generator.
# Alternative kept for reference:
#new_train1, new_target1 = OriginalGenerator().generate_data_pipe(train, target, test, )
new_train, new_target = GANGenerator().generate_data_pipe(train, target, test)
#df_merged = a.append(b, ignore_index=True)


Fitting CTGAN transformers for each column:   0%|          | 0/5 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]



In [5]:
# Augment the training split with GAN-generated rows, then scale/encode it.
# The cell starts again from the raw `df`, so it can be re-run on its own.
copy = preprocessing_p1(df)

X = copy.drop('cardio', axis=1)
y = copy['cardio']
# Fixed random_state so the split is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

# The generator is given the target as a single-column DataFrame.
y_train = y_train.to_frame(name='cardio')

new_train, new_target = GANGenerator().generate_data_pipe(X_train, y_train, X_test)

new_target = new_target.to_frame(name='cardio')

# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat(..., ignore_index=True) produces the same stacked frame.
X_train = pd.concat([X_train, new_train], ignore_index=True)
y_train = pd.concat([y_train, new_target], ignore_index=True)

X_train = preprocessing_p2(X_train)
# NOTE(review): X_test is not passed through preprocessing_p2 in this cell, so
# it is neither scaled nor one-hot encoded and does not match X_train's columns.


Fitting CTGAN transformers for each column:   0%|          | 0/13 [00:00<?, ?it/s]

Training CTGAN, epochs::   0%|          | 0/500 [00:00<?, ?it/s]

