# Importing

In [2]:
#!pip install pingouin
import pandas as pd 
import os 
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import category_encoders as ce
from sklearn.linear_model import LinearRegression
import plotly.express as px 
import dash
from scipy.stats import pearsonr
from dash import dcc, html, Input, Output
import numpy as np
import scipy.stats as stats
from scipy.stats import levene
from scipy.stats import skew, kurtosis
from statsmodels.graphics.gofplots import qqplot
import pingouin as pg
from sklearn.model_selection import cross_val_score

In [3]:
# Build the CSV path from its components so the separator is chosen by the OS
# (a literal backslash only resolves on Windows). The path is no longer bound
# to `dir`, which would shadow Python's built-in dir() for the rest of the
# notebook.
df = pd.read_csv(os.path.join('datasets', 'human_cognitive_performance.csv'))

In [4]:
# Display the freshly loaded DataFrame (pandas truncates the rendering).
df

Unnamed: 0,User_ID,Age,Gender,Sleep_Duration,Stress_Level,Diet_Type,Daily_Screen_Time,Exercise_Frequency,Caffeine_Intake,Reaction_Time,Memory_Test_Score,Cognitive_Score,AI_Predicted_Score
0,U1,57,Female,6.5,3,Non-Vegetarian,6.5,Medium,41,583.33,65,36.71,39.77
1,U2,39,Female,7.6,9,Non-Vegetarian,10.8,High,214,368.24,58,54.35,57.68
2,U3,26,Male,8.2,6,Vegetarian,5.7,Low,429,445.21,49,32.57,29.54
3,U4,32,Male,7.8,9,Vegetarian,8.3,Low,464,286.33,94,70.15,74.59
4,U5,50,Male,9.7,2,Non-Vegetarian,11.3,Medium,365,237.65,62,87.54,91.78
...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,U79996,55,Male,7.8,9,Non-Vegetarian,9.4,Low,157,312.34,55,50.80,46.49
79996,U79997,35,Female,9.8,6,Vegetarian,6.5,Medium,130,290.17,86,94.89,92.41
79997,U79998,32,Female,7.3,10,Non-Vegetarian,7.5,Low,462,249.57,72,63.51,67.35
79998,U79999,34,Male,8.7,8,Vegetarian,7.2,Medium,96,319.48,82,81.43,77.95


# Cleaning

In [6]:
# Remove the two columns that are not used in the analysis.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps
# this cell re-runnable: executing it a second time no longer raises KeyError
# because the columns are already gone.
df = df.drop(columns=['User_ID', 'AI_Predicted_Score'], errors='ignore')

In [7]:
# Rows that contain at least one missing value (an empty result means none).
df.loc[df.isna().any(axis="columns")]

Unnamed: 0,Age,Gender,Sleep_Duration,Stress_Level,Diet_Type,Daily_Screen_Time,Exercise_Frequency,Caffeine_Intake,Reaction_Time,Memory_Test_Score,Cognitive_Score


In [8]:
# Rows that are exact repeats of an earlier row (an empty result means none).
df.loc[df.duplicated()]

Unnamed: 0,Age,Gender,Sleep_Duration,Stress_Level,Diet_Type,Daily_Screen_Time,Exercise_Frequency,Caffeine_Intake,Reaction_Time,Memory_Test_Score,Cognitive_Score


#### There were no duplicate or missing values. The columns that are irrelevant for the analysis (User_ID, AI_Predicted_Score) have been removed.

# Bivariate Analysis

In [11]:
# Display the DataFrame after the cleaning step.
df

Unnamed: 0,Age,Gender,Sleep_Duration,Stress_Level,Diet_Type,Daily_Screen_Time,Exercise_Frequency,Caffeine_Intake,Reaction_Time,Memory_Test_Score,Cognitive_Score
0,57,Female,6.5,3,Non-Vegetarian,6.5,Medium,41,583.33,65,36.71
1,39,Female,7.6,9,Non-Vegetarian,10.8,High,214,368.24,58,54.35
2,26,Male,8.2,6,Vegetarian,5.7,Low,429,445.21,49,32.57
3,32,Male,7.8,9,Vegetarian,8.3,Low,464,286.33,94,70.15
4,50,Male,9.7,2,Non-Vegetarian,11.3,Medium,365,237.65,62,87.54
...,...,...,...,...,...,...,...,...,...,...,...
79995,55,Male,7.8,9,Non-Vegetarian,9.4,Low,157,312.34,55,50.80
79996,35,Female,9.8,6,Vegetarian,6.5,Medium,130,290.17,86,94.89
79997,32,Female,7.3,10,Non-Vegetarian,7.5,Low,462,249.57,72,63.51
79998,34,Male,8.7,8,Vegetarian,7.2,Medium,96,319.48,82,81.43


In [12]:
# Report the dimensions of the cleaned DataFrame.
n_rows, n_cols = df.shape
print(f'number of rows: {n_rows} ,number of columns : {n_cols}')

number of rows: 80000 ,number of columns : 11


In [13]:
# Numeric columns offered as x-axis choices in the dashboards below.
numerical_columns = [
    'Age',
    'Sleep_Duration',
    'Daily_Screen_Time',
    'Caffeine_Intake',
    'Reaction_Time',
    'Memory_Test_Score',
]

In [14]:
# Dash app: a dropdown to pick a numeric column and a graph that is filled in
# by the callback registered on the 'matrix' component.
app = dash.Dash(__name__)

x_axis_dropdown = dcc.Dropdown(
    id='x-axis-dropdown',
    options=[{'label': col, 'value': col} for col in numerical_columns],
    value='Age',
)

app.layout = html.Div([
    html.H2('Relation With Cognitive Score'),
    x_axis_dropdown,
    dcc.Graph(id='matrix'),
])

@app.callback(
    Output('matrix', 'figure'),
    Input('x-axis-dropdown', 'value')
)
def update_graph(selected_columns):
    """Build the figure shown in the 'matrix' graph for the chosen column.

    Parameters
    ----------
    selected_columns : str or None
        Current value of the 'x-axis-dropdown' component (a single column
        name despite the plural parameter name).

    Returns
    -------
    plotly figure
        A 40x40-bin density heatmap of ``Cognitive_Score`` against the
        selected column, or an empty titled scatter when nothing is selected.
    """
    if not selected_columns:
        return px.scatter(title="Relation with cognitive score")

    # The unused `selected_var` alias from the earlier version was removed.
    return px.density_heatmap(
        data_frame=df,
        x=selected_columns,
        y='Cognitive_Score',
        nbinsx=40,
        nbinsy=40,
        color_continuous_scale='Viridis',
    )

if __name__ == '__main__':
    # `app.run` is the supported entry point; the legacy `run_server` alias
    # was removed in Dash 3 and fails there.
    app.run(port=8054, debug=True)





In [None]:
# Second Dash app: same dropdown/graph layout; its callback draws a line chart
# of the binned mean score instead of a heatmap.
app = dash.Dash(__name__)

dropdown_options = [{'label': col, 'value': col} for col in numerical_columns]

app.layout = html.Div([
    html.H2('Relation With Cognitive Score'),
    dcc.Dropdown(id='x-axis-dropdown', options=dropdown_options, value='Age'),
    dcc.Graph(id='matrix'),
])

@app.callback(
    Output('matrix', 'figure'),
    Input('x-axis-dropdown', 'value')
)
def update_graph(selected_column):
    """Plot the mean Cognitive_Score across equal-width bins of a column.

    Parameters
    ----------
    selected_column : str or None
        Current value of the 'x-axis-dropdown' component.

    Returns
    -------
    plotly figure
        A line chart with one point per non-empty bin, or an empty titled
        scatter when nothing is selected.
    """
    if not selected_column:
        return px.scatter(title="Relation with Cognitive Score")

    # Sturges-style bin count: int(log2(n) + 1).
    n_bins = int(np.log2(df.shape[0]) + 1)

    # Keep the bin assignment in a local Series. Writing it into the shared
    # global `df` (as before) added a 'binned' column that leaked into every
    # later cell and was overwritten on each dropdown change.
    binned = pd.cut(df[selected_column], bins=n_bins).rename('binned')

    # Mean Cognitive_Score per bin; observed=True skips empty bins.
    agg_df = (
        df.groupby(binned, observed=True)['Cognitive_Score']
        .mean()
        .reset_index()
    )

    # Readable labels such as "30.0–35.0". One decimal is kept because
    # truncating the edges to int made neighbouring bins share a label
    # whenever a bin is narrower than one unit (e.g. Sleep_Duration).
    agg_df['binned_str'] = [
        f"{interval.left:.1f}–{interval.right:.1f}" for interval in agg_df['binned']
    ]

    fig = px.line(
        agg_df,
        x='binned_str',
        y='Cognitive_Score',
        markers=True,
        title=f'Promedio del Cognitive Score por rangos de {selected_column}'
    )

    fig.update_traces(
        mode="lines+markers",
        marker=dict(size=6, color="white", line=dict(width=2.5, color='Skyblue'))
    )

    fig.update_layout(
        plot_bgcolor="Black",
        paper_bgcolor="Black",
        xaxis_title=selected_column,
        yaxis_title='Promedio Cognitive Score',
        font=dict(family="Dni light", size=14, color="White")
    )

    return fig

if __name__ == '__main__':
    # `app.run` is the supported entry point; the legacy `run_server` alias
    # was removed in Dash 3 and fails there.
    app.run(port=8054, debug=True)


# Correlation Analysis

In [None]:
# List the available column names before selecting the ones to correlate.
df.columns

In [None]:
# Subset of columns used for the correlation analysis (target included).
correlation_columns = [
    'Age',
    'Sleep_Duration',
    'Daily_Screen_Time',
    'Caffeine_Intake',
    'Reaction_Time',
    'Memory_Test_Score',
    'Cognitive_Score',
]
df_correlation = df[correlation_columns]

In [None]:
# Pairwise correlation matrix of the selected columns (DataFrame.corr defaults).
corr_matrix = df_correlation.corr()

In [None]:
# Annotated correlation heatmap, saved to disk and then shown inline.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Mapa de calor de correlaciones')
fig.savefig("heatmap_correlaciones.png", dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Pearson correlation of every predictor column against Cognitive_Score.
target_scores = df_correlation['Cognitive_Score']
predictor_columns = df_correlation.columns.drop('Cognitive_Score')

pearson_by_column = {
    name: pearsonr(df_correlation[name], target_scores)
    for name in predictor_columns
}

# One record per predictor; significance is decided on the unrounded p-value.
correlation_results = [
    {
        'Variable': name,
        'Coeficiente de correlación (r)': round(r, 3),
        'Valor p': round(p, 4),
        'Significativo (p < 0.05)': p < 0.05,
    }
    for name, (r, p) in pearson_by_column.items()
]

# Collect the records into a DataFrame for display.
df_resultados = pd.DataFrame(correlation_results)
print(df_resultados)

# Validación de supuestos -- ANOVA

## Normalidad

In [None]:
# Categorical factors plus the continuous outcome used by the ANOVA checks.
df_categorical=df[['Gender','Diet_Type','Exercise_Frequency','Cognitive_Score']]

In [None]:
# Display the categorical subset.
df_categorical

In [None]:
# Number of observations in every Gender x Diet_Type x Exercise_Frequency cell.
factor_columns = ['Gender', 'Diet_Type', 'Exercise_Frequency']
group_counts = df_categorical.groupby(factor_columns).size()
print(group_counts)

In [None]:


def analizar_normalidad_por_grupo(df, grupo, variable_continua='Cognitive_Score',
                                  output_dir='Images'):
    """Inspect the normality of a continuous variable within each category.

    For every distinct value of ``df[grupo]`` the function computes skewness
    and kurtosis (scipy defaults), draws a histogram with KDE next to a
    QQ-plot, saves the figure as ``<grupo>_<categoria>_normalidad.png`` inside
    ``output_dir`` and shows it.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both ``grupo`` and ``variable_continua``.
    grupo : str
        Name of the categorical column that defines the groups.
    variable_continua : str, default 'Cognitive_Score'
        Name of the continuous column to inspect.
    output_dir : str, default 'Images'
        Directory where figures are written; created if it does not exist.

    Returns
    -------
    pandas.DataFrame
        One row per category with columns ``grupo``, ``categoria``, ``n``,
        ``asimetria`` and ``curtosis``. The previous version built an empty
        ``resultados`` list and implicitly returned None.
    """
    # savefig fails when the target folder is missing, so create it up front.
    os.makedirs(output_dir, exist_ok=True)

    resultados = []

    for categoria in df[grupo].unique():
        datos = df.loc[df[grupo] == categoria, variable_continua]

        # A NaN category never matches the equality mask and yields no rows.
        if datos.empty:
            continue

        s = skew(datos)
        k = kurtosis(datos)
        resultados.append({
            'grupo': grupo,
            'categoria': categoria,
            'n': len(datos),
            'asimetria': s,
            'curtosis': k,
        })

        fig, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(12, 4))

        # Histogram with KDE overlay.
        sns.histplot(datos, kde=True, ax=ax_hist)
        ax_hist.set_title(f'{grupo}={categoria}\nAsimetría={s:.2f}, Curtosis={k:.2f}')

        # QQ-plot against the normal distribution with a standardized line.
        qqplot(datos, line='s', ax=ax_qq)
        ax_qq.set_title('QQ-plot')

        fig.tight_layout()

        filename = f'{grupo}_{categoria}_normalidad.png'
        fig.savefig(os.path.join(output_dir, filename))
        plt.show()
        plt.close(fig)  # release the figure's memory once it is saved and shown

    return pd.DataFrame(resultados)
            

# Run the normality inspection once per categorical factor, in this order.
resultados_gender, resultados_diet, resultados_exercise = (
    analizar_normalidad_por_grupo(df_categorical, factor)
    for factor in ('Gender', 'Diet_Type', 'Exercise_Frequency')
)

## Homogeneidad de varianzas

In [None]:
# Levene's test for equality of variances of Cognitive_Score across the levels
# of each categorical factor. `levene` is already imported in the imports cell
# at the top of the notebook, so it is not re-imported here.
#
# All three factors build their groups with groupby, so the test always covers
# exactly the levels present in the data. Gender used to be hard-coded to
# 'Male'/'Female'/'Other', which would pass an empty group to the test if one
# of those labels were absent.
levene_resultados = {}
for factor in ('Gender', 'Diet_Type', 'Exercise_Frequency'):
    grupos = [grupo['Cognitive_Score'] for _, grupo in df.groupby(factor)]
    levene_resultados[factor] = levene(*grupos)
    print(f"Levene test para {factor} - p-value: {levene_resultados[factor][1]:.4f}")

# Keep the per-factor names used by the original cell.
stat_gender, p_gender = levene_resultados['Gender']
stat_dieta, p_dieta = levene_resultados['Diet_Type']
stat_ejercicio, p_ejercicio = levene_resultados['Exercise_Frequency']


# ANOVA

In [None]:
# Two-way ANOVA with interaction: the formula `C(Gender) * C(Diet_Type)`
# contains two factors plus their interaction term.
modelo = ols('Cognitive_Score ~ C(Gender) * C(Diet_Type)', data=df_categorical).fit()
anova_tabla = sm.stats.anova_lm(modelo, typ=2)

# Effect size per row: its sum of squares as a share of the summed `sum_sq`
# column (the sum includes every row of the table).
ss_total = anova_tabla['sum_sq'].sum()
eta_squared = anova_tabla['sum_sq'].div(ss_total)

# Attach the effect sizes to the ANOVA table and display it.
anova_tabla = anova_tabla.assign(eta_squared=eta_squared)
anova_tabla

In [None]:
# Exercise_Frequency did not show homogeneous variances, so a Welch ANOVA is
# used for that factor.
welch_anova = pg.welch_anova(dv='Cognitive_Score', between='Exercise_Frequency', data=df_categorical)
print(welch_anova.to_string(float_format='{:.3f}'.format)) # print with three decimal places

In [None]:
# Total of eight hard-coded sum-of-squares terms.
# NOTE(review): these literals are not produced by any cell in this notebook;
# presumably they were copied from an earlier ANOVA table — confirm the source
# and, ideally, compute them from that table instead.
suma_total_sq = (
    5.910541e+02
    + 1.525509e+03
    + 2.602179e+06
    + 5.582019e+02
    + 2.023083e+03
    + 1.347967e+03
    + 3.738810e+03
    + 3.992376e+07
)

In [None]:
# Show the summed sum of squares.
print(suma_total_sq)

In [None]:
# Share of the summed sum of squares attributed to Exercise_Frequency.
# NOTE(review): 2602179 is hard-coded; it equals the third term (2.602179e+06)
# of the hard-coded `suma_total_sq` sum rather than being read from a computed
# ANOVA table — confirm it is still current.
print(f'porcentage de variabilidad explicadoa por la variable que rechazo la hipotesis nula (Exercise_Frequency): {round((2602179/suma_total_sq)*100)} %')

# Coeficientes de regresion

In [None]:
# Display the DataFrame before building the regression features.
df

In [None]:
# Distinct values of Diet_Type (one of the one-hot-encoded features below).
df['Diet_Type'].unique()

In [None]:
    # Feature groups consumed by the preprocessing ColumnTransformer.
    # NOTE(review): this cell is indented at top level; a plain Python script
    # would reject that — consider dedenting it.
    numerical_features = [
        'Age',
        'Sleep_Duration',
        'Daily_Screen_Time',
        'Caffeine_Intake',
        'Reaction_Time',
        'Memory_Test_Score',
        'Stress_Level',
    ]
    categorical_features = ['Gender', 'Diet_Type']
    ordinal_features = ['Exercise_Frequency']  # ordered levels

### Multicolinealidad

In [None]:
# Check the numeric predictors for multicollinearity with the variance
# inflation factor (VIF); a constant column is added to the design matrix first.
X_numerical = add_constant(df[numerical_features])
design_matrix = X_numerical.values

vif_values = [
    variance_inflation_factor(design_matrix, column_index)
    for column_index in range(X_numerical.shape[1])
]

vif_data = pd.DataFrame({
    "feature": X_numerical.columns,
    "VIF": vif_values,
})

print(vif_data)

#### No hay colinealidad entre las variables numéricas

### Splitting data

In [None]:
# Separate the regression target from the predictor columns.
y = df["Cognitive_Score"]
X = df.drop(columns="Cognitive_Score")

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Integer codes assigned to the ordered Exercise_Frequency levels.
exercise_mapping = {
    'Low': 1,
    'Medium': 2,
    'High': 3,
}

In [None]:
# Ordinal encoder that applies `exercise_mapping` to Exercise_Frequency.
exercise_encoder = ce.OrdinalEncoder(
    cols=ordinal_features,
    mapping=[{'col': 'Exercise_Frequency', 'mapping': exercise_mapping}],
)

# Per-group preprocessing: scale numeric columns, one-hot encode the nominal
# ones (dropping the first level) and ordinal-encode Exercise_Frequency.
preprocessor = ColumnTransformer(
    transformers=[
        ('numerical', StandardScaler(), numerical_features),
        ('categorical', OneHotEncoder(drop='first'), categorical_features),
        ('ordinal', exercise_encoder, ordinal_features),
    ]
)

In [None]:
# Chain the preprocessing step with an ordinary least squares regressor.
pipeline = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ]
)

In [None]:
# Display the pipeline structure.
pipeline

In [None]:
# Fit the preprocessing + linear regression pipeline on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Names of the columns produced by the fitted preprocessing step.
feature_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

In [None]:
# Display the transformed feature names.
feature_names

In [None]:
# Coefficients of the fitted linear regression step.
coefficients = pipeline.named_steps['regressor'].coef_

In [None]:
# Pair each transformed feature with its coefficient, largest magnitude first.
coef_df = (
    pd.DataFrame({'features': feature_names, 'coeficientes': coefficients})
    .sort_values(by='coeficientes', key=abs, ascending=False)
)

In [None]:
# Display the coefficient table.
coef_df

In [None]:
# Summary statistics of the target, for reading the coefficients in context.
df['Cognitive_Score'].describe()

## Valor p de la regresión

In [None]:
# Preprocess the training features before fitting the statsmodels regression
# used to obtain p-values.
# NOTE(review): `preprocessor` is the same object that was placed inside
# `pipeline`, so this call appears to re-fit it on X_train — confirm that is
# intended.
X_train_processed = preprocessor.fit_transform(X_train)

In [None]:
# Wrap the transformed matrix in a DataFrame that carries the preprocessor's
# output feature names and y_train's index. With a bare array statsmodels
# labels the coefficients x1..xN, which makes the summary's p-values
# impossible to attribute to a feature.
X_train_named = pd.DataFrame(
    X_train_processed,
    columns=preprocessor.get_feature_names_out(),
    index=y_train.index,
)

# Add the intercept column expected by statsmodels.
X_train_with_const = sm.add_constant(X_train_named)

# Fit the OLS model with statsmodels to obtain the p-values.
model_sm = sm.OLS(y_train, X_train_with_const).fit()

# The summary lists one row per named feature, including its p-value.
print(model_sm.summary())

## Overfitting

In [None]:
# 5-fold cross-validated R² of the full pipeline on all the data.
scores = cross_val_score(pipeline, X, y, cv=5, scoring='r2')

cv_mean = scores.mean()
cv_std = scores.std()

print("R² en CV (5 folds):", scores)
print("Media:", cv_mean, "±", cv_std)