# Preparación de datos

In [1]:
import pandas as pd

# Read each source CSV into its own DataFrame. Files are read in the order
# listed and bound, position by position, to the names on the left.
(
    df_parkinson,
    df_contaminacion,
    df_plomo,
    df_pesticidas,
    df_precipitaciones,
    df_calidad_agua,
) = (
    pd.read_csv(nombre_archivo)
    for nombre_archivo in (
        "Parkinson.csv",  # Parkinson's cases
        "Contaminacion_aire.csv",  # air-pollution rate
        "Plomo.csv",  # disease-burden rate from lead exposure
        "Pepticidas.csv",  # pesticide use, in tonnes
        "Precipitaciones.csv",  # precipitation, in mm
        "Calidad_agua.csv",
    )
)


In [2]:
# Render the Parkinson DataFrame (rich notebook display).
display(df_parkinson)

Unnamed: 0,Año,País,Parkinson
0,1990,Cambodia,21.533830
1,1991,Cambodia,21.483840
2,1992,Cambodia,21.502138
3,1993,Cambodia,21.607151
4,1994,Cambodia,21.767658
...,...,...,...
7291,2017,South America,101.905710
7292,2018,South America,105.467110
7293,2019,South America,108.644905
7294,2020,South America,110.542580


In [5]:
# Left-join every auxiliary table onto the Parkinson table using the
# ('Año', 'País') pair as key, one table at a time.
claves = ['Año', 'País']
df = df_parkinson
for tabla in (df_contaminacion, df_plomo, df_calidad_agua, df_pesticidas):
    df = df.merge(tabla, on=claves, how='left')

# The precipitation table is joined last and yields the final frame.
df_final = df.merge(df_precipitaciones, on=claves, how='left')

# Show the merged result.
display(df_final)

Unnamed: 0,Año,País,Parkinson,Tasa_contaminacion_Aire,Exp_Plomo,Muertes,Pesticidas,Precipitación (mm)
0,1990,Cambodia,21.533830,0.770228,799.84644,5680.9985,258.02,1951.19900
1,1991,Cambodia,21.483840,0.867021,798.64690,5618.9200,653.00,904.05820
2,1992,Cambodia,21.502138,0.876114,799.89874,5418.9050,545.00,1474.90120
3,1993,Cambodia,21.607151,0.992484,804.59204,5273.5493,687.00,640.35815
4,1994,Cambodia,21.767658,0.902714,812.10320,5176.5522,2196.54,977.33185
...,...,...,...,...,...,...,...,...
7291,2017,South America,101.905710,,,5844.3230,,
7292,2018,South America,105.467110,,,5681.7460,,
7293,2019,South America,108.644905,,,5672.4670,,
7294,2020,South America,110.542580,,,5413.5270,,


In [7]:
# Share of missing values in each column, expressed as a percentage.
nan_percentage = df_final.isna().mean().mul(100)

# Print the per-column percentages.
print(nan_percentage)


Año                         0.000000
País                        0.000000
Parkinson                   0.000000
Tasa_contaminacion_Aire     2.631579
Exp_Plomo                   2.631579
Muertes                     0.000000
Pesticidas                 18.078399
Precipitación (mm)         19.736842
dtype: float64


**ELIMINACIÓN DE NULOS**

In [6]:
# Keep only the rows that have a value in every column.
df_final_no_nulos = df_final.dropna(axis=0, how="any")
display(df_final_no_nulos)

Unnamed: 0,Año,País,Parkinson,Tasa_contaminacion_Aire,Exp_Plomo,Muertes,Pesticidas,Precipitación (mm)
0,1990,Cambodia,21.533830,0.770228,799.84644,5680.9985,258.02,1951.19900
1,1991,Cambodia,21.483840,0.867021,798.64690,5618.9200,653.00,904.05820
2,1992,Cambodia,21.502138,0.876114,799.89874,5418.9050,545.00,1474.90120
3,1993,Cambodia,21.607151,0.992484,804.59204,5273.5493,687.00,640.35815
4,1994,Cambodia,21.767658,0.902714,812.10320,5176.5522,2196.54,977.33185
...,...,...,...,...,...,...,...,...
6779,2017,Sudan,33.183840,2.207330,1261.99870,2538.9585,18849.00,1076.95200
6780,2018,Sudan,33.985485,2.483574,1248.51800,1889.6714,18849.00,221.57901
6781,2019,Sudan,34.539875,2.573782,1235.76650,1719.6345,20448.00,878.00850
6782,2020,Sudan,33.731514,2.687866,1224.61080,1554.0481,24062.00,1252.68140


**TABLA FINAL PARA EL MODELO**

In [27]:
# Remove the year and country columns from the table.
# This cell rebinds the same name it reads from, so on a second run the
# columns are already gone; errors='ignore' makes the re-run a no-op instead
# of raising KeyError.
df_final_no_nulos = df_final_no_nulos.drop(columns=['Año', 'País'], errors='ignore')
display(df_final_no_nulos)

Unnamed: 0,Parkinson,Tasa_contaminacion_Aire,Exp_Plomo,Muertes,Pesticidas,Precipitación (mm)
0,21.533830,0.770228,799.84644,5680.9985,258.02,1951.19900
1,21.483840,0.867021,798.64690,5618.9200,653.00,904.05820
2,21.502138,0.876114,799.89874,5418.9050,545.00,1474.90120
3,21.607151,0.992484,804.59204,5273.5493,687.00,640.35815
4,21.767658,0.902714,812.10320,5176.5522,2196.54,977.33185
...,...,...,...,...,...,...
6779,33.183840,2.207330,1261.99870,2538.9585,18849.00,1076.95200
6780,33.985485,2.483574,1248.51800,1889.6714,18849.00,221.57901
6781,34.539875,2.573782,1235.76650,1719.6345,20448.00,878.00850
6782,33.731514,2.687866,1224.61080,1554.0481,24062.00,1252.68140


In [28]:
# Write the table to 'Tabla_modelos.csv' without the index column.
df_final_no_nulos.to_csv('Tabla_modelos.csv', index=False)

**MODELOS GLM**

In [91]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import statsmodels.api as sm
from sklearn.preprocessing import StandardScaler

def entrenar_modelo_glm(df, modelo_familia, variables_independientes, variable_dependiente, test_size=0.2, scaler=False):
    """Fit a statsmodels GLM on a train split and report errors on the test split.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing both the predictor columns and the target column.
    modelo_familia
        Object passed as ``family`` to ``sm.GLM`` (e.g. ``sm.families.Poisson()``).
    variables_independientes : list of str
        Names of the predictor columns selected from ``df``.
    variable_dependiente : str
        Name of the target column selected from ``df``.
    test_size : float, default 0.2
        Forwarded to ``train_test_split``.
    scaler : bool, default False
        When truthy, predictors are standardized with a ``StandardScaler``
        fitted on the training split only.

    Returns
    -------
    tuple
        ``(modelo, rmse, mae)``: the fitted results object, and the root mean
        squared error and mean absolute error on the test split.
    """
    # Select predictors (X) and target (y).
    X = df[variables_independientes]
    y = df[variable_dependiente]

    # Train/test split with a fixed seed so repeated runs give the same split.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Optional standardization. The scaler object gets its own name so the
    # boolean `scaler` argument is not overwritten, and the scaled values are
    # wrapped back into DataFrames so column names and row labels are kept
    # (the model summary then lists the variable names on both code paths).
    if scaler:
        escalador = StandardScaler()
        X_train = pd.DataFrame(
            escalador.fit_transform(X_train),
            columns=X_train.columns,
            index=X_train.index,
        )
        X_test = pd.DataFrame(
            escalador.transform(X_test),
            columns=X_test.columns,
            index=X_test.index,
        )

    # Add the intercept column. has_constant='add' forces the column to be
    # added: with the default ('skip') add_constant returns its input
    # unchanged when it already sees a constant non-zero column, which can
    # happen on a small split and would leave train and test with a
    # different number of columns.
    X_train = sm.add_constant(X_train, has_constant='add')
    X_test = sm.add_constant(X_test, has_constant='add')

    # Build and fit the GLM with the requested family.
    modelo = sm.GLM(y_train, X_train, family=modelo_familia).fit()

    # Print the fitted-model summary.
    print(modelo.summary())

    # Predict on the test split.
    y_pred = modelo.predict(X_test)

    # Root mean squared error on the test split.
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f'Error Cuadrático Medio (RMSE): {rmse}')

    # Mean absolute error on the test split.
    mae = mean_absolute_error(y_test, y_pred)
    print(f'Error Absoluto Medio (MAE): {mae}')

    return modelo, rmse, mae

# Example usage:

# Target column and predictor columns used by the models.
variable_dependiente = 'Parkinson'
variables_independientes = [
    'Tasa_contaminacion_Aire',
    'Exp_Plomo',
    'Muertes',
    'Pesticidas',
    'Precipitación (mm)',
]

# Load the modelling table.
df = pd.read_csv("Tabla_modelos.csv")


**Regresión de Poisson**

In [92]:
# Fit the GLM with a Poisson family and keep the model and its test errors.
print("\nModelo Poisson:")
familia_poisson = sm.families.Poisson()
modelo_poisson, rmse_poisson, mae_poisson = entrenar_modelo_glm(
    df,
    familia_poisson,
    variables_independientes,
    variable_dependiente,
)




Modelo Poisson:
                 Generalized Linear Model Regression Results                  
Dep. Variable:              Parkinson   No. Observations:                 4346
Model:                            GLM   Df Residuals:                     4340
Model Family:                 Poisson   Df Model:                            5
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -68675.
Date:                Sat, 22 Mar 2025   Deviance:                   1.1192e+05
Time:                        13:16:23   Pearson chi2:                 1.60e+05
No. Iterations:                     6                                         
Covariance Type:            nonrobust                                         
                              coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
const    

  x = pd.concat(x[::order], 1)


In [56]:
# Reload the modelling table.
df = pd.read_csv("Tabla_modelos.csv")

# Mean and variance of the dependent variable 'Parkinson'.
serie_parkinson = df['Parkinson']
media = serie_parkinson.mean()
varianza = serie_parkinson.var()

# Report both statistics.
print(f"Media de Parkinson: {media}")
print(f"Varianza de Parkinson: {varianza}")


Media de Parkinson: 82.40091383388582
Varianza de Parkinson: 6340.221275949008
