In [1]:
# Standard libraries
from datetime import datetime

# Scikit learn
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.decomposition import PCA



# External modules
from module_path import test_data_path, train_data_path, plots_data_path
from module_data import Dataset
from module_graph import graph_tree
from module_model import ModelEvaluation, ModelEvaluationXG, ModelSubmission
from model_params2 import Models

Plots directory found in  ../plots


In [2]:
# Capture the wall-clock time at which this run starts and echo it to the cell output.
start_datetime = f"{datetime.now():%Y-%m-%d %H:%M:%S}"
print("Start date & time : ", start_datetime)

# Instantiate the project Dataset wrapper with imputation switched on and
# standardisation / relevant-feature filtering switched off.
# NOTE(review): the exact effect of each flag lives in module_data — confirm there.
df = Dataset(
    data_imputed=True,
    data_standarized=False,
    relevant_data=False,
)

# process() yields three objects, unpacked here as the training features,
# the test features and the target labels (presumably DataFrames — TODO confirm).
df_train, df_test, labels = df.process()

Start date & time :  2025-04-30 13:52:56
Train data directory found in  ../data/TRAIN_NEW
Test data directory found in  ../data/TEST
NaN values processed for every dataset by kNNImputer algorithm considering 5 neighbors.


In [3]:
# Names of the label columns to model; index 0 is the ADHD outcome, index 1 the sex flag.
targets = ['ADHD_Outcome', 'Sex_F']

# Hyperparameter grid searched by GridSearchCV for Logistic Regression.
param_grid_lr = dict(
    C=[0.1, 1.0, 10.0],
    solver=['lbfgs', 'liblinear'],
)

# Hyperparameter grid searched by GridSearchCV for Random Forest.
param_grid_rf = dict(
    n_estimators=[100, 500, 1000],
    max_depth=[3, 5, 10, 20],
    bootstrap=[True, False],
    criterion=['gini', 'entropy'],
)

# Iteration budget ("iteraciones" = iterations); presumably the solver's max_iter,
# since it matches the 5000 passed to the models in later cells — TODO confirm.
iteraciones = 5000

# Metric name handed to the model-selection routines.
scoring = 'f1'

In [4]:
# Model wrapper for the ADHD target: features come from the training frame and
# the label column is targets[0] ('ADHD_Outcome').
# NOTE(review): test_size / shuffle / random_state look like train-test split
# settings forwarded by Models — confirm in model_params2.
lr_adhd = Models(
    X=df_train,
    y=labels[targets[0]],
    tag='adhd',
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

In [5]:
# Build the Logistic Regression model for the ADHD target without grid search
# (gridsearch=False, so no param_grid is supplied) and without running the
# evaluation step (model_evaluation=False).
# The iteration budget and metric come from the configuration cell
# (`iteraciones`, `scoring`) instead of repeating the literals 5000 and 'f1',
# so tuning the config cell actually affects this call.
model_lr_adhd = lr_adhd.log_regression(
    gridsearch=False,
    max_iter=iteraciones,
    param_grid=None,
    scoring=scoring,
    cv=5,
    model_evaluation=False,
)

In [6]:
# Show the `model` attribute of the object returned by log_regression; as the
# cell's last bare expression it is rendered as the cell output.
model_lr_adhd.model

sklearn.linear_model._logistic.LogisticRegression

In [None]:
 # Grid-search evaluation of Logistic Regression, run once per target.
# Each evaluate_with_gridsearch call returns two values, unpacked here as the
# selected model and its score (presumably the F1 score, given scoring — TODO confirm).
# The two scores are stored under distinct names; previously both runs wrote to
# `f1_lr`, so the ADHD score was lost as soon as the Sex_F run finished.
# The iteration budget and metric are taken from the configuration cell
# (`iteraciones`, `scoring`) rather than repeating 5000 and 'f1'.

# --- ADHD target (targets[0]) ---
# NOTE(review): this rebinds `lr_adhd`, which an earlier cell bound to a Models
# instance; the name is kept so later cells that expect it keep working, but
# re-running the earlier log_regression cell after this one will fail.
lr_adhd = ModelEvaluation(X=df_train, y=labels[targets[0]], tag='adhd')
best_model_lr_adhd, f1_lr_adhd = lr_adhd.evaluate_with_gridsearch(
    base_model=LogisticRegression(max_iter=iteraciones),
    param_grid=param_grid_lr,
    scoring=scoring,
)

# --- Sex_F target (targets[1]) ---
lr_sex_f = ModelEvaluation(X=df_train, y=labels[targets[1]], tag='sex_f')
best_model_lr_sex_f, f1_lr_sex_f = lr_sex_f.evaluate_with_gridsearch(
    base_model=LogisticRegression(max_iter=iteraciones),
    param_grid=param_grid_lr,
    scoring=scoring,
)

# Backward-compatible alias: the original cell left `f1_lr` holding the Sex_F score.
f1_lr = f1_lr_sex_f
