In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import normalize

Dataset and Preprocesing

In [2]:
# Load the dataset
df = pd.read_csv('dataset_def_2022.csv')
df.head()

Unnamed: 0,anio_clave_geo,clave_geo_loc,clave_geo_mun,ent_regis,mun_regis,ent_resid,mun_resid,grado_marginacion,loc_resid,tloc_resid,...,rel_emba,rel_emba_d,razon_m,maternas,lista_mex_dgis,lista mex_inegi,grupo_cancer,desc_basica,anio_regis,ent_resid_desc
0,202214053,140530001,14053,1,1,14,53,Muy bajo,1,13,...,9,No especificada,0,,COVID-19,COVID-19,,"COVID-19, virus identificado",2022,Jalisco
1,202214116,141160048,14116,1,1,14,116,Muy bajo,48,2,...,8,No aplica,0,,Enfermedades del corazón,Infarto agudo del miocardio,,"INFARTO AGUDO DEL MIOCARDIO, SIN OTRA ESPECIFI...",2022,Jalisco
2,202214035,140350001,14035,1,1,14,35,Muy bajo,1,8,...,8,No aplica,0,,COVID-19,COVID-19,,"COVID-19, virus identificado",2022,Jalisco
3,202214035,140350001,14035,1,1,14,35,Muy bajo,1,8,...,8,No aplica,0,,Síndrome de dependencia del alcohol,Síndrome de dependencia del alcohol,,TRASTORNOS MENTALES Y DEL COMPORTAMIENTO DEBID...,2022,Jalisco
4,202214064,140640019,14064,1,5,14,64,Bajo,19,2,...,9,No especificada,0,,COVID-19,COVID-19,,"COVID-19, virus identificado",2022,Jalisco


In [3]:
# Basic exploration
print("Shape of the dataset:", df.shape)
df.info()
print(df.describe())

# Check for missing values
missing_values = df.isnull().sum()
print("Missing values in each column:\n", missing_values)


Shape of the dataset: (55973, 56)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55973 entries, 0 to 55972
Data columns (total 56 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   anio_clave_geo     55973 non-null  int64 
 1   clave_geo_loc      55973 non-null  int64 
 2   clave_geo_mun      55973 non-null  int64 
 3   ent_regis          55973 non-null  int64 
 4   mun_regis          55973 non-null  int64 
 5   ent_resid          55973 non-null  int64 
 6   mun_resid          55973 non-null  int64 
 7   grado_marginacion  55813 non-null  object
 8   loc_resid          55973 non-null  int64 
 9   tloc_resid         55973 non-null  int64 
 10  tam_loc_resid      55973 non-null  object
 11  ent_ocurr          55973 non-null  int64 
 12  mun_ocurr          55973 non-null  int64 
 13  loc_ocurr          55973 non-null  int64 
 14  tloc_ocurr         55973 non-null  int64 
 15  tam_loc_ocurr      55973 non-null  object
 16  causa_

Preprocesing

In [4]:
# Drop columns
df.drop(columns=[
    'ent_regis', 'ent_resid', 'tam_loc_resid', 'ent_ocurr', 'tloc_ocurr',
    'tam_loc_ocurr', 'sexo_d', 'edad', 'grupo_edad', 'dia_ocurr',
    'dia_nacim', 'mes_nacim', 'anio_nacim', 'escolaridad_d', 'edo_civil_d',
    'ocurr_trab', 'ocurr_trab_d', 'lugar_ocur', 'lugar_ocur_d', 'cond_cert',
    'cond_cert_d', 'derechohab_d', 'embarazo', 'embarazoD', 'rel_emba',
    'rel_emba_d', 'razon_m', 'maternas', 'lista_mex_dgis', 'grupo_cancer',
    'ent_resid_desc'
], inplace=True)

# Map datatypes
df['grado_marginacion'] = df['grado_marginacion'].map({'Muy bajo': 1, 'Bajo': 2, 'Medio': 3, 'Alto': 4, 'Muy alto': 5})
df['tipo_edad'] = df['tipo_edad'].map({'Años': 1, 'Meses': 2, 'Días': 3, 'Horas': 4, 'Minutos': 5})

# Handle Missing Values
df['lista mex_inegi'] = df['lista mex_inegi'].fillna('None')
df['desc_basica'] = df['desc_basica'].fillna('None')
mean_value = df['grado_marginacion'].mean()
df['grado_marginacion'] = df['grado_marginacion'].fillna(mean_value)

In [5]:
df.dtypes

anio_clave_geo         int64
clave_geo_loc          int64
clave_geo_mun          int64
mun_regis              int64
mun_resid              int64
grado_marginacion    float64
loc_resid              int64
tloc_resid             int64
mun_ocurr              int64
loc_ocurr              int64
causa_def             object
lista_mex             object
lista_mex_d           object
sexo                   int64
tipo_edad              int64
edad_a                 int64
mes_ocurr              int64
anio_ocur              int64
ocupacion              int64
escolarida             int64
edo_civil              int64
derechohab             int64
lista mex_inegi       object
desc_basica           object
anio_regis             int64
dtype: object

In [6]:
df.isnull().sum()

anio_clave_geo       0
clave_geo_loc        0
clave_geo_mun        0
mun_regis            0
mun_resid            0
grado_marginacion    0
loc_resid            0
tloc_resid           0
mun_ocurr            0
loc_ocurr            0
causa_def            0
lista_mex            0
lista_mex_d          0
sexo                 0
tipo_edad            0
edad_a               0
mes_ocurr            0
anio_ocur            0
ocupacion            0
escolarida           0
edo_civil            0
derechohab           0
lista mex_inegi      0
desc_basica          0
anio_regis           0
dtype: int64

NLP and Normalization

Normalization

Embedding

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform()

TypeError: TfidfVectorizer.fit_transform() missing 1 required positional argument: 'raw_documents'

Tests