In [1]:
import pandas as pd
from pandas.plotting import scatter_matrix
import numpy as np
import joblib

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler

from imblearn.over_sampling import SMOTE

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, callbacks

import os, sys

# Path helpers: short aliases reused by the cells below when building paths
abspath, dirname, sep = os.path.abspath, os.path.dirname, os.sep

# Working directory of the kernel and the folder that contains it
file_ = os.getcwd()
ml_folder = os.path.dirname(file_)

# Put the parent folder on the import path (the `src` imports that follow
# are presumably resolved from there)
sys.path.append(ml_folder)

from src.utils import mining_data_tb as md
from src.utils import visualization_tb as vi
from src.utils import folder_tb as fo
from src.utils import models as mo


import warnings

# Install a blanket "ignore" filter: from here on no warning of any category is shown.
# NOTE(review): this also hides warnings that may point at real problems in later cells;
# consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

In [2]:
# The dataset columns are coded (for instance, "RIAGENDR" is Gender), so the variable
# descriptions are loaded first into a helper object that can translate codes into
# readable names whenever necessary.

# Relative location of the file holding the variable descriptions
vardata_path = sep.join(["data", "6_variables", "0_final_variables.csv"])

# Build the helper object and feed it the descriptions
vardata = md.variables_data()
vardata.load_data(2, vardata_path)

# Load the actual dataset that the ML models will use

# Sub-folders to read the data from
folders = [
    "1_demographics",
    "2_dietary",
    "3_examination",
    "4_laboratory",
    "5_questionnaire",
]

# Columns carrying an "_x" suffix mapped to their plain name
# (presumably applied as a rename by load_data - TODO confirm)
columns_correction = dict([
    ("WTDRD1_x", "WTDRD1"),
    ("WTDR2D_x", "WTDR2D"),
    ("DRABF_x", "DRABF"),
    ("DRDINT_x", "DRDINT"),
    ("WTSAF2YR_x", "WTSAF2YR"),
    ("LBXHCT_x", "LBXHCT"),
])

# Create the dataset object and load everything into it
dataset = md.dataset()
dataset.load_data(2, folders, columns_correction)

# The loaded frame is quite wide, so keep only the columns used in this analysis.
# Original note: "We will start by trying to predict Asthma".
# NOTE(review): confirm which condition the MCQ160H column actually encodes.
features = ["MCQ160H", "RIAGENDR", "RIDAGEYR", "DR1TCHOL", "DR1TTFAT", "DR1TSFAT", "DR1TSUGR", "DR2TCHOL", "DR2TTFAT", "DR2TSFAT", "DR2TSUGR", "BPXDI1", "BPXSY1", "BMXWT", "DXDTOPF", "BMXWAIST", "LBXTR", "LBXTC", "LBXSGL"]

dataset.filter_columns(features, inplace = True)

# Discard two of the selected columns, then every row that still has a missing value
dataset.df = dataset.df.drop(["LBXTR", "DXDTOPF"], axis = 1)
dataset.df = dataset.df.dropna()

# The readable names must be derived AFTER the drop above: otherwise the list would still
# contain entries for LBXTR and DXDTOPF and would no longer line up with the columns that
# are handed to the models together with `features_names`.
features_names = vardata.vars_descr_detector(list(dataset.df.columns), 40, True)

In [11]:
# Build a NEW frame with readable gender labels. A plain `test = dataset.df` only aliases the
# dataset's frame, so assigning to `test.RIAGENDR` would overwrite the numeric gender codes
# inside `dataset.df` itself, which later cells rely on for modelling.
test = dataset.df.assign(RIAGENDR = dataset.df.RIAGENDR.replace([1, 2], ["Male", "Female"]))

# One indicator column per gender label, named after the label itself (no prefix)
test = pd.get_dummies(test, prefix = "", prefix_sep = "", columns = ["RIAGENDR"])
test.head()

Unnamed: 0_level_0,MCQ160H,RIDAGEYR,DR1TCHOL,DR1TTFAT,DR1TSFAT,DR1TSUGR,DR2TCHOL,DR2TTFAT,DR2TSFAT,DR2TSUGR,BPXDI1,BPXSY1,BMXWT,BMXWAIST,LBXTC,LBXSGL,Female,Male
SEQN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
83732,0,62,138.0,79.24,23.43,42.31,635.0,121.59,40.42,118.4,70.0,128.0,94.8,101.1,173.0,94.0,0,1
83733,0,53,407.0,77.91,25.722,180.84,773.0,154.55,32.969,34.52,88.0,146.0,90.4,107.9,265.0,94.0,0,1
83734,1,78,506.0,91.97,23.122,62.87,326.0,90.14,39.829,94.51,46.0,138.0,83.4,116.5,229.0,103.0,0,1
83736,0,42,96.0,19.63,4.581,71.84,181.0,50.88,21.936,73.63,70.0,100.0,55.2,80.4,204.0,83.0,1,0
83737,0,72,233.0,43.08,11.817,22.31,186.0,19.28,4.639,23.99,58.0,116.0,64.4,92.9,190.0,97.0,1,0


In [12]:
# Locate the pickled logistic regression inside the "model_comparison_scale_balance" folder
comparison_folder = fo.path_to_folder(2, "models" + sep + "model_comparison_scale_balance")
pickle_name = "LogisticRegression(max_iter=300, n_jobs=-1, random_state=42).pkl"
loaded_model = joblib.load(comparison_folder + pickle_name)

# Re-run the dataset preparation, then count how often each class is predicted on X_test.
# NOTE(review): the folder name suggests the model was fitted on scaled/balanced data -
# confirm that model_data() applies the same preprocessing here.
dataset.model_data(split = .2, cv = 3, epochs = 1)
test_predictions = pd.Series(loaded_model.predict(dataset.X_test))
test_predictions.value_counts()

1    1386
0    1304
dtype: int64

In [15]:
# Look up the description of each of the variable codes listed below
vardata.vars_descr_detector(
    [
        "MCQ160H", "RIAGENDR", "RIDAGEYR",
        "DR1TCHOL", "DR1TTFAT", "DR1TSFAT", "DR1TSUGR",
        "DR2TCHOL", "DR2TTFAT", "DR2TSFAT", "DR2TSUGR",
        "BPXDI1", "BPXSY1",
        "BMXWT", "BMXWAIST",
        "LBXTC", "LBXSGL",
    ],
    nom_included = True,
)

['MCQ160H',
 'RIAGENDR: Gender of the participant.',
 'RIDAGEYR: Age in years of the participant at the time of screening. Individuals 80 and over are topcoded at 80 years of age.',
 'DR1TCHOL: Cholesterol (mg)',
 'DR1TTFAT: Total fat (gm)',
 'DR1TSFAT: Total saturated fatty acids (gm)',
 'DR1TSUGR: Total sugars (gm)',
 'DR2TCHOL: Cholesterol (mg)',
 'DR2TTFAT: Total fat (gm)',
 'DR2TSFAT: Total saturated fatty acids (gm)',
 'DR2TSUGR: Total sugars (gm)',
 'BPXDI1: Diastolic:  Blood pressure (first reading) mm Hg',
 'BPXSY1: Systolic:  Blood pressure (first reading) mm Hg',
 'BMXWT: Weight (kg)',
 'BMXWAIST: Waist Circumference (cm)',
 'LBXTC: Total Cholesterol (mg/dL)',
 'LBXSGL: Glucose, refrigerated serum (mg/dL)']

# Prepare data for the ML models
# We won't scale or balance the data for this first round
dataset.model_data(split = .2, cv = 3, epochs = 1)

# Choose the models to test against each other
# Same seed for all of them
seed = 42

log1 = LogisticRegression(n_jobs = -1, random_state = seed)
log2 = LogisticRegression(n_jobs = -1, max_iter = 300, random_state = seed)

# Kept under `candidate_models` rather than `models`, so that the `models` module imported
# from tensorflow.keras at the top of the notebook is not overwritten by this list.
candidate_models = [log1, log2]

# Models comparison
# Ensemble models
ensembler = mo.model_ensembler(candidate_models)
# Load data
ensembler.load_data(dataset.X_train, dataset.X_test, dataset.y_train, dataset.y_test, features_names, dataset.kfold)
# Train them
ensembler.models_tester()


ensembler.metrics

In [5]:
#path_to_save = fo.path_to_folder(2, "models") + "first_attempt"
#ensembler.models_saver(path_to_save)

'Succesfully saved'

In [6]:
# Re-run the dataset preparation before working with a single model
dataset.model_data(split = .2, cv = 3, epochs = 1)

# Wrap the first logistic regression in the project's ml_model helper and hand it the data
my_model = mo.ml_model(log1)
my_model.load_data(
    dataset.X_train,
    dataset.X_test,
    dataset.y_train,
    dataset.y_test,
    features_names,
    dataset.kfold,
)

# Train first, then test
my_model.ml_trainer()
my_model.ml_tester()

In [7]:
#my_model.model_saver(fo.path_to_folder(2, "models") + "second_attempt")

'Succesfully saved'

# Show what is currently stored in the "models" folder
models_folder = fo.path_to_folder(2, "models")
os.listdir(models_folder)

# Read back the pickle named "second_attempt.pkl" from that folder and display it
second_attempt_path = fo.path_to_folder(2, "models") + "second_attempt.pkl"
loaded_model = joblib.load(second_attempt_path)
loaded_model

# Let's prepare the data and choose a model

# 1) Prepare data for ml models
# For this, we have a useful method in our dataset object. We'll try a basic split and validation, without making any further change to the data (for this attempt)
dataset.model_data(split = .2, cv = 5, epochs = 1)

# 2) Choose model
model = LogisticRegression(n_jobs = -1, max_iter = 300)

# 3) We create a ml_model object and load the data
# `ml_model` is taken from `mo` (src.utils.models), the same module the other cells of this
# notebook use for it; `md` (mining_data_tb) is only used for the data objects.
my_model = mo.ml_model(model)

# Load data
my_model.load_data(dataset.X_train, dataset.X_test, dataset.y_train, dataset.y_test, features_names, dataset.kfold)

# 4) We train the model
my_model.ml_trainer()

my_model.model.intercept_

# Persist the estimator held by my_model as "First_attempt.pkl" in the "models" folder
models_folder = fo.path_to_folder(2, "models")
path_to_save = models_folder + "First_attempt.pkl"
joblib.dump(my_model.model, path_to_save)

# Read the pickle back and display the restored estimator
loaded_model = joblib.load(path_to_save)
loaded_model

# Intercept of the restored estimator
loaded_model.intercept_