# UNIFICACIÓN DE LA TABLA FINAL

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.formula.api as snf
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import r2_score,mean_squared_error


In [13]:
# Read the three input tables from the working directory.
# Censo and Emisiones are read with pandas' default encoding; Hospitalizaciones
# is read explicitly as ISO-8859-1.
Censo, Emisiones = (pd.read_csv(archivo) for archivo in ("Censo.csv", "Emisiones.csv"))
Hospitalizaciones = pd.read_csv("Hospitalizaciones.csv", encoding="iso-8859-1")

In [14]:
# Build the unified table with two left joins, so every hospitalisation row is
# kept even when no emissions or census row matches.
claves_emisiones = ['Provincia_ID', 'Provincia', 'Año']

# Step 1: attach emissions by province and year.
hospitalizaciones_emisiones = Hospitalizaciones.merge(
    Emisiones, how='left', on=claves_emisiones
)

# Step 2: attach census figures, which are additionally keyed by sex.
Tabla_Final = hospitalizaciones_emisiones.merge(
    Censo, how='left', on=claves_emisiones + ['Sexo']
)

In [16]:
# Hospitalisations as a percentage of inhabitants, rounded to 2 decimals.
# Computed column-wise instead of looping over rows with iterrows()/.at,
# which is far slower and needed several unused per-row locals.
habitantes = Tabla_Final['Habitantes']
porcentaje_h = (Tabla_Final['Hospitalizaciones'] / habitantes * 100).round(2)

# Keep the computed value only where the population is strictly positive.
# Rows with zero, negative or missing 'Habitantes' get 0.0, exactly as the
# original else-branch did (a NaN comparison is False, so it also falls here).
Tabla_Final['Porcentaje_H'] = porcentaje_h.where(habitantes > 0, 0.0)


## Creación de las columnas Metales Pesados e Índice de Contaminación para el modelo predictivo

In [17]:
# Aggregate the four heavy-metal columns into one feature, rounded to 4 decimals.
# Note: DataFrame.sum skips NaN by default, so a row with no metal readings
# yields 0.0 rather than NaN.
columnas_metales = ['As', 'Cd', 'Ni', 'Pb']
suma_metales = Tabla_Final[columnas_metales].sum(axis='columns')
Tabla_Final['Metales Pesados'] = suma_metales.round(4)

In [18]:
# Weighted pollution index: 50% PM2.5, 30% PM10, 20% BaP, rounded to 4 decimals.
pm25_ponderado = 0.5 * Tabla_Final['PM25']
pm10_ponderado = 0.3 * Tabla_Final['PM10']
bap_ponderado = 0.2 * Tabla_Final['BaP']
Tabla_Final['Indice_Contaminación'] = (pm25_ponderado + pm10_ponderado + bap_ponderado).round(4)

In [20]:
# Persist the unified table as UTF-8 CSV, without writing the row index.
Tabla_Final.to_csv(
    path_or_buf='Tabla_Final.csv',
    encoding='utf-8',
    index=False,
)

In [8]:
# Display the unified table (the rendered output elides the middle rows).
Tabla_Final

Unnamed: 0,Año,Diagnóstico,Diagnóstico_ID,Provincia,Provincia_ID,Sexo,Hospitalizaciones,As,BaP,Cd,Ni,PM10,PM25,Pb,Habitantes,Porcentaje_H,Metales Pesados,Indice_Contaminación
0,2012,"Neoplasia maligna de tráquea, bronquios y pulmón",1,Almería,4,Hombres,2615,0.5523,0.1264,0.0843,4.8142,25.3795,12.9886,0.0045,350979,0.75,5.4553,14.1334
1,2012,"Neoplasia maligna de tráquea, bronquios y pulmón",1,Almería,4,Mujeres,681,0.5523,0.1264,0.0843,4.8142,25.3795,12.9886,0.0045,338341,0.20,5.4553,14.1334
2,2012,"Neoplasia maligna de tráquea, bronquios y pulmón",1,Cádiz,11,Hombres,6505,0.9225,0.1264,0.1749,14.0723,26.0824,16.0152,0.0069,620463,1.05,15.1766,15.8576
3,2012,"Neoplasia maligna de tráquea, bronquios y pulmón",1,Cádiz,11,Mujeres,929,0.9225,0.1264,0.1749,14.0723,26.0824,16.0152,0.0069,624899,0.15,15.1766,15.8576
4,2012,"Neoplasia maligna de tráquea, bronquios y pulmón",1,Córdoba,14,Hombres,3501,0.6451,0.1264,1.1759,2.1564,24.9661,10.9992,0.0233,394505,0.89,4.0007,13.0147
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10553,2021,"Neumoconiosis, enfermedades pulmonares por age...",9,Bizkaia,48,Mujeres,11193,0.5618,0.0940,0.6921,5.0880,18.2621,11.2742,0.0200,596445,1.88,6.3619,11.1345
10554,2021,"Neumoconiosis, enfermedades pulmonares por age...",9,Gipuzkoa,20,Hombres,7564,0.2387,0.0716,0.2239,2.0403,12.5108,8.9434,0.0103,354493,2.13,2.5132,8.2393
10555,2021,"Neumoconiosis, enfermedades pulmonares por age...",9,Gipuzkoa,20,Mujeres,5597,0.2387,0.0716,0.2239,2.0403,12.5108,8.9434,0.0103,371015,1.51,2.5132,8.2393
10556,2021,"Neumoconiosis, enfermedades pulmonares por age...",9,La Rioja,26,Hombres,5575,0.3667,0.1261,0.1327,2.0412,17.8790,5.5776,0.0036,157763,3.53,2.5442,8.1777


In [9]:
# Show the dtype of every column to check which ones are numeric and which are text.
Tabla_Final.dtypes

Año                       int64
Diagnóstico              object
Diagnóstico_ID            int64
Provincia                object
Provincia_ID              int64
Sexo                     object
Hospitalizaciones         int64
As                      float64
BaP                     float64
Cd                      float64
Ni                      float64
PM10                    float64
PM25                    float64
Pb                      float64
Habitantes                int64
Porcentaje_H            float64
Metales Pesados         float64
Indice_Contaminación    float64
dtype: object

# MODELO PREDICTIVO

## Random Forest

In [10]:
import joblib

In [10]:
# Remove identifier columns and the raw pollutant readings; the aggregated
# 'Metales Pesados' and 'Indice_Contaminación' columns stay in the frame.
# Note: the name Tabla_Final is rebound, so running this cell a second time
# raises KeyError because the columns are already gone.
columnas_descartadas = [
    'Año', 'Provincia_ID', 'Diagnóstico_ID',
    'As', 'BaP', 'Cd', 'Ni', 'PM10', 'PM25', 'Pb',
]
Tabla_Final = Tabla_Final.drop(columnas_descartadas, axis=1)

In [13]:
# Preview the first three rows of the current Tabla_Final.
Tabla_Final.head(3)

Unnamed: 0,Diagnóstico,Provincia,Sexo,Hospitalizaciones,Habitantes,Porcentaje_H,Metales Pesados,Indice_Contaminación
0,"Neoplasia maligna de tráquea, bronquios y pulmón",Almería,Hombres,2615,350979,0.75,5.4553,14.1334
1,"Neoplasia maligna de tráquea, bronquios y pulmón",Almería,Mujeres,681,338341,0.2,5.4553,14.1334
2,"Neoplasia maligna de tráquea, bronquios y pulmón",Cádiz,Hombres,6505,620463,1.05,15.1766,15.8576


In [17]:
# Fit one LabelEncoder per categorical column and keep them in a dict keyed by
# column name, so the same text -> integer mapping can be reused later.
columnas_categoricas = ['Diagnóstico', 'Provincia', 'Sexo']
label_encoders = {
    nombre: LabelEncoder().fit(Tabla_Final[nombre])
    for nombre in columnas_categoricas
}

# Replace each categorical column with its integer codes.
# Note: the columns are overwritten in place, so re-running this cell would fit
# the encoders on the integer codes instead of the original text labels.
for nombre, codificador in label_encoders.items():
    Tabla_Final[nombre] = codificador.transform(Tabla_Final[nombre])

In [18]:
# Preview the first three rows of the current Tabla_Final.
Tabla_Final.head(3)

Unnamed: 0,Diagnóstico,Provincia,Sexo,Hospitalizaciones,Habitantes,Porcentaje_H,Metales Pesados,Indice_Contaminación
0,6,3,0,2615,350979,0.75,5.4553,14.1334
1,6,3,1,681,338341,0.2,5.4553,14.1334
2,6,14,0,6505,620463,1.05,15.1766,15.8576


In [19]:
# Target: absolute number of hospitalisations.
y = Tabla_Final['Hospitalizaciones']

# Features: every remaining column except the target itself and 'Porcentaje_H',
# which is computed from the target and would leak it into the model.
X = Tabla_Final.drop(['Porcentaje_H', 'Hospitalizaciones'], axis=1)

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [21]:
# Random forest regressor with a fixed seed for reproducible results.
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=50,
    min_samples_split=5,
    min_samples_leaf=4,
    random_state=42,
)

# Train once. The original cell called fit() twice in a row, which only
# doubled the training time: the second call retrains from scratch on the
# same data with the same seed.
model.fit(X_train, y_train)

In [22]:
# Predict hospitalisations for the held-out test rows.
y_pred = model.predict(X_test)

In [23]:
# Evaluate on the held-out set: R² and RMSE (root of the mean squared error,
# expressed in the same units as the target).
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f'R2:{r2:.2f}', f'RMSE: {rmse:.2f}', sep='\n')

R2:0.57
RMSE: 8736.84


# Visualización

In [35]:
import joblib
import os

# Working directory of the kernel; both artefacts are written there.
ruta_actual = os.getcwd()

# Serialise the trained model and the fitted label encoders side by side.
artefactos = {
    'modelo_random_forest.pkl': model,
    'label_encoders.pkl': label_encoders,
}
for nombre_archivo, objeto in artefactos.items():
    joblib.dump(objeto, os.path.join(ruta_actual, nombre_archivo))

print("Modelo y codificadores guardados exitosamente en:", ruta_actual)

Modelo y codificadores guardados exitosamente en: /Users/josemiguel/Desktop/Proyecto/Analisis
