In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

import os, sys

# Relative paths
# Short aliases; `sep` is reused further down to build data file paths.
dirname = os.path.dirname
sep = os.sep

# Parent directory of the current working directory. It is appended to sys.path
# so that the `src` package imported just below can be resolved.
ml_folder = dirname(os.getcwd())
sys.path.append(ml_folder)

# Project-local helpers (must come after the sys.path tweak above).
from src.utils import mining_data_tb as md
from src.utils import visualization_tb as vi

import warnings

# Silences every warning for the rest of the session, including deprecation
# warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [52]:
class ml_model:
    """Small training / evaluation harness around a scikit-learn style classifier.

    Typical usage::

        m = ml_model(estimator)
        m.model_data(df, features, split = 0.2, cv = 4)
        m.ml_trainer()   # cross-validation on the training split
        m.ml_tester()    # evaluation on the hold-out split

    The wrapped estimator only needs ``fit``, ``score`` and ``predict``;
    ``feature_importances_`` is used when the estimator exposes it.
    """

    def __init__(self, model):
        """Store the estimator and initialise every result slot to an empty value.

        Args:
            model: estimator exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
        """
        # Data
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.features = None
        self.kfold = None
        # Fitted StandardScaler when model_data(..., scaler = True) is used, else None.
        self.scaler = None

        # Training
        self.model = model
        self.train_scores = []
        self.val_scores = []
        self.train_set_structures = []
        self.val_set_structures = []
        self.feature_importances = None

        # Test
        self.train_score = None
        self.test_score = None
        self.train_structure = None
        self.test_structure = None
        self.prediction = None
        self.cm = None

    #########
    @staticmethod
    def _class_shares(y, scale = 1.0):
        """Return ``{label: share}`` for the labels in ``y`` (share multiplied by ``scale``)."""
        labels, counts = np.unique(y, return_counts = True)
        return dict(zip(labels, counts / len(y) * scale))

    #########
    def model_data(self, df, features, split, cv, epochs = 1, scaler = False, balance = None, seed = 42):
        """Split ``df`` into train/test sets and prepare the cross-validation splitter.

        Args:
            df: DataFrame whose first column is the target and whose remaining
                columns are the predictors.
            features: names/descriptions of the predictors. ``ml_trainer`` pairs
                them positionally with ``feature_importances_``, so they should
                follow the order of ``df.iloc[:, 1:]``.
            split: ``test_size`` forwarded to ``train_test_split``.
            cv: number of folds (``n_splits``).
            epochs: number of cross-validation repetitions (``n_repeats``).
            scaler: when truthy, standardise the predictors.
            balance: ``sampling_strategy`` forwarded to SMOTE; ``None`` disables
                oversampling.
            seed: random state shared by the split, SMOTE and the CV splitter.
        """
        self.features = features

        ### Independent variables
        X = np.array(df.iloc[:, 1:])

        ### Dependent variable
        y = np.array(df.iloc[:, 0])

        ### Train-test
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size = split, random_state = seed
        )

        ### Data scaling
        # The scaler is fitted on the training rows only and then applied to the
        # test rows; fitting it on the full matrix before the split would leak
        # test-set statistics into training.
        if scaler:
            self.scaler = StandardScaler()
            self.X_train = self.scaler.fit_transform(self.X_train)
            self.X_test = self.scaler.transform(self.X_test)
        else:
            self.scaler = None

        ### Balancing data
        # Only the training split is oversampled; the test split keeps its
        # original class distribution.
        if balance is not None:
            # `n_jobs` is intentionally not passed: it is deprecated in recent
            # imbalanced-learn releases and has no effect on the resampled output.
            sm = SMOTE(sampling_strategy = balance, random_state = seed)
            self.X_train, self.y_train = sm.fit_resample(self.X_train, self.y_train)

        ### Cross validation
        self.kfold = RepeatedStratifiedKFold(n_splits = cv, n_repeats = epochs, random_state = seed)

    #########
    def ml_trainer(self):
        """Fit the estimator on every cross-validation fold and record the results.

        For each fold the class shares of the train and validation parts and the
        train/validation scores are appended to ``train_set_structures``,
        ``val_set_structures``, ``train_scores`` and ``val_scores``. Results are
        appended, so calling this method again accumulates on top of earlier runs.

        After the loop, ``feature_importances`` is filled with a DataFrame sorted
        by importance when the estimator exposes ``feature_importances_``.

        Raises:
            RuntimeError: if ``model_data`` has not been called yet.
        """
        if self.kfold is None:
            raise RuntimeError("model_data() must be called before ml_trainer()")

        for count, (train, val) in enumerate(self.kfold.split(self.X_train, self.y_train), start = 1):
            # Train-Validation sets
            x_t, y_t = self.X_train[train], self.y_train[train]
            x_v, y_v = self.X_train[val], self.y_train[val]

            # Internal structure
            train_structure = self._class_shares(y_t)
            val_structure = self._class_shares(y_v)

            self.train_set_structures.append(train_structure)
            self.val_set_structures.append(val_structure)

            # Training
            self.model.fit(x_t, y_t)

            # Scores
            train_score = self.model.score(x_t, y_t)
            val_score = self.model.score(x_v, y_v)

            self.train_scores.append(train_score)
            self.val_scores.append(val_score)

            print(f"\n-- Model {count} --")
            print("-" * 25)
            print("Set structure:")
            print("Train structure:", train_structure)
            print("Validation structure:", val_structure)
            print("-" * 25)
            print(">train score:", train_score)
            # The label says "test" but the value is the fold's validation score;
            # the text is kept unchanged so existing logs stay comparable.
            print(">test score:", val_score)
            print("#" * 75)

        # Feature importances are optional: linear or neighbour-based estimators
        # do not expose them, in which case the attribute is left untouched.
        importances = getattr(self.model, "feature_importances_", None)
        if importances is None or self.features is None:
            return

        feature_importances = list(zip(self.features, importances))
        self.feature_importances = pd.DataFrame(
            feature_importances, columns = ["features", "importance"]
        ).sort_values(by = "importance", ascending = False)

    def ml_tester(self):
        """Evaluate the wrapped estimator on the full train split and the hold-out split.

        Fills ``train_structure`` / ``test_structure`` (class shares in percent),
        ``train_score`` / ``test_score``, ``prediction`` (labels predicted for
        ``X_test``) and ``cm`` (confusion matrix of ``y_test`` vs ``prediction``).
        """
        # Internal structure
        self.train_structure = self._class_shares(self.y_train, scale = 100)
        self.test_structure = self._class_shares(self.y_test, scale = 100)

        # Scores
        # Always use the estimator owned by this instance, never a notebook global.
        self.train_score = self.model.score(self.X_train, self.y_train)
        self.test_score = self.model.score(self.X_test, self.y_test)

        # Prediction
        self.prediction = self.model.predict(self.X_test)

        # Confusion matrix
        self.cm = metrics.confusion_matrix(self.y_test, self.prediction)

    def ml_predictions(self, to_predict):
        """Return the wrapped estimator's predictions for ``to_predict``.

        ``to_predict`` is forwarded as-is, so it must already be in the feature
        space used for training (transformed with ``self.scaler`` when scaling
        was enabled in ``model_data``).
        """
        new_predictions = self.model.predict(to_predict)
        return new_predictions

In [21]:
# Variable catalogue used later to describe the selected columns.
# The leading integer passed to var_data is defined by mining_data_tb
# (NOTE(review): its meaning is not visible from this notebook - confirm there).
vardata = md.variables_data()
var_data_path = sep.join(["data", "6_variables", "0_final_variables.csv"])

vardata.var_data(2, var_data_path)

In [22]:
# Read every data group, stack and merge them into a single frame.
data = md.dataset()
data.read_all_data(
    0,
    ["1_demographics", "2_dietary", "3_examination", "4_laboratory", "5_questionnaire"],
)
data.concatenate_all_dfs()
data.merge_dfs()

# Map the "<name>_x" columns back to their plain names
# (presumably suffixes left over from the merge step - verify in mining_data_tb).
columns_correction = {
    f"{column}_x": column
    for column in ("WTDRD1", "WTDR2D", "DRABF", "DRDINT", "WTSAF2YR", "LBXHCT")
}
data.clean_columns(columns_correction)
data.heart_disease()

df = data.full_df

In [23]:
# Sanity check: (rows, columns) of `data.full_df`.
df.shape

(29285, 951)

In [24]:
# Modelling frame: the first column (MCQ010) is read as the target by
# ml_model.model_data, every following column is a predictor.
# Rows with any missing value are dropped.
# NOTE(review): the training log below shows a third label (9.0) in well under
# 1% of the rows - presumably a survey "don't know" code; confirm and consider
# filtering it out before modelling.
df2 = df.loc[
    :,
    [
        "MCQ010",
        "RIAGENDR", "RIDAGEYR",
        "DR1TCHOL", "DR1TTFAT", "DR1TSFAT", "DR1TSUGR",
        "DR2TCHOL", "DR2TTFAT", "DR2TSFAT", "DR2TSUGR",
        "BPXDI1", "BPXSY1",
        "BMXWT", "DXDTOPF", "BMXWAIST",
        "LBXTR", "LBXTC", "LBXSGL",
    ],
].dropna()
df2.shape

(3754, 19)

In [25]:
# Describe the predictor columns only. Column 0 of df2 is the target, and
# ml_model pairs `features` positionally with the estimator's
# feature_importances_ (one value per predictor column), so including the
# target here would shift every name by one position.
# NOTE(review): assumes vars_descr_detector returns one entry per input name,
# in the same order - confirm in mining_data_tb.
features = vardata.vars_descr_detector(list(df2.columns[1:]))

In [53]:
# Baseline run: random forest, 80/20 hold-out split, one pass of 4-fold stratified CV.
model = RandomForestClassifier(random_state = 42, n_jobs = -1)

my_model = ml_model(model)
my_model.model_data(df2, features, 0.2, 4, epochs = 1)
my_model.ml_trainer()  # per-fold fit with train / validation scores
my_model.ml_tester()   # hold-out scores, predictions and confusion matrix


-- Model 1 --
-------------------------
Set structure:
Train structure: {1.0: 0.16651865008880995, 2.0: 0.8325932504440497, 9.0: 0.0008880994671403197}
Validation structure: {1.0: 0.1677762982689747, 2.0: 0.8322237017310253}
-------------------------
>train score: 0.9995559502664298
>test score: 0.8308921438082557
###########################################################################

-- Model 2 --
-------------------------
Set structure:
Train structure: {1.0: 0.1669626998223801, 2.0: 0.8325932504440497, 9.0: 0.0004440497335701599}
Validation structure: {1.0: 0.16644474034620507, 2.0: 0.8322237017310253, 9.0: 0.0013315579227696406}
-------------------------
>train score: 1.0
>test score: 0.8229027962716379
###########################################################################

-- Model 3 --
-------------------------
Set structure:
Train structure: {1.0: 0.1669626998223801, 2.0: 0.8325932504440497, 9.0: 0.0004440497335701599}
Validation structure: {1.0: 0.16644474034620507, 

In [56]:
# First 15 rows of the hold-out split, used as sample input for the prediction call below.
test = my_model.X_test[:15]

In [57]:
# Predict labels for the sample rows (ml_predictions forwards to the wrapped model's predict()).
# NOTE(review): the recorded output is 2.0 for every row, i.e. only the majority
# class is predicted for this sample - worth checking recall on the minority class.
my_model.ml_predictions(test)

array([2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])