Universidad del Valle de Guatemala

## Proyecto: Defunciones Fetales / Nacimientos

Laura Tamath	19365 |     Andrea Amaya 19357 |
Brandon Hernández	 19376 |		Martin Amado	19020 |   Juan Pablo Pineda 19087

In [None]:
import pandas as pd 
import numpy as np
import pyreadstat
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

import pyclustertend 
import random
import sklearn.cluster as cluster
from sklearn.metrics import silhouette_samples, silhouette_score, confusion_matrix
import sklearn.preprocessing
from sklearn.cluster import Birch, KMeans

pd.options.display.max_columns = None
pd.options.mode.chained_assignment = None 

## DEFUNCIONES

In [None]:
# Shared column schema of the yearly fetal-death files, in file order.
var_names = [
  'dep_reg', 'mun_reg', 'mon_reg', 'year_reg', 'dep_occu', 'mun_occu',
  'area_geog', 'sex_death', 'day_occu', 'month_occu', 'year_occu',
  'part_type', 'birth_class', 'via_part', 'weeks_ges', 'mother_age',
  'mom_country_res', 'mom_dep_res', 'mom_mun_resi', 'mom_group',
  'mom_civil_status', 'mom_nationality', 'mom_scholarship',
  'mom_occupation', 'cause_death', 'assistance_received', 'site_occu',
  'total_children', 'total_dead_children', 'total_living_children',
]

# Quantitative variables; every other column is treated as qualitative.
quan_vars = [
  'weeks_ges', 'mother_age', 'total_children', 'total_dead_children',
  'total_living_children',
]
qual_vars = [var for var in var_names if var not in quan_vars]

In [None]:
def filter_df(data_frame, to_remove, is_quali=True):
  """Rename a yearly frame to the shared schema and add its missing columns.

  The columns of ``data_frame`` are renamed positionally to ``var_names``
  minus the names in ``to_remove``; each name in ``to_remove`` is then added
  as a constant column (NaN when ``is_quali`` is true, 0 otherwise). The
  result is a copy with its columns in alphabetical order.

  Raises ValueError if a name in ``to_remove`` is not in ``var_names`` or if
  the number of columns does not match the remaining names.
  """
  missing = list(to_remove)
  present = list(var_names)
  for name in missing:
    present.remove(name)

  renamed = data_frame.copy()
  renamed.columns = present

  # Placeholder value for the columns this year's file does not provide
  filler = np.nan if is_quali else 0
  for name in missing:
    renamed[name] = np.full(len(renamed.index), filler)

  ordered = sorted(renamed.columns)
  return renamed.reindex(ordered, axis=1).copy()

In [None]:
# Read the twelve yearly fetal-death files (2009-2020)
(df_2009, df_2010, df_2011, df_2012, df_2013, df_2014,
 df_2015, df_2016, df_2017, df_2018, df_2019, df_2020) = [
  pd.read_spss('./data/db_%d.sav' % year) for year in range(2009, 2021)
]

# Filter data: columns that each group of yearly files does not provide
remove_2009 = ['via_part', 'mom_country_res', 'mom_scholarship']
remove_2010_2011 = ['mom_country_res']
remove_2012_2013_2014 = ['year_occu']
remove_2018_2019_2020 = ['area_geog']

# Bring every year to the shared schema
df_2009 = filter_df(df_2009, remove_2009)
df_2010, df_2011 = [
  filter_df(frame, remove_2010_2011) for frame in (df_2010, df_2011)
]
df_2012, df_2013, df_2014 = [
  filter_df(frame, remove_2012_2013_2014) for frame in (df_2012, df_2013, df_2014)
]
df_2015, df_2016, df_2017 = [
  filter_df(frame, []) for frame in (df_2015, df_2016, df_2017)
]
df_2018, df_2019, df_2020 = [
  filter_df(frame, remove_2018_2019_2020) for frame in (df_2018, df_2019, df_2020)
]

# Year columns that are overwritten with the year of the file
df_2009['year_occu'] = '2009'
df_2009['year_reg'] = '2009'
df_2012['year_occu'] = '2012'
df_2013['year_occu'] = '2013'
df_2014['year_occu'] = '2014'

data = pd.concat([
  df_2009, df_2010, df_2011, df_2012, df_2013, df_2014,
  df_2015, df_2016, df_2017, df_2018, df_2019, df_2020,
])

# Normalise the date-like columns (via float first), then store every
# qualitative column as text
data['day_occu'] = data['day_occu'].astype(float).astype(int)
data['year_reg'] = data['year_reg'].astype(float).astype(int)
data['year_occu'] = data['year_occu'].astype(int)
data[qual_vars] = data[qual_vars].astype(str)
data = data.reset_index()

# Encode unknown / missing quantitative values as -1
quan_df = data[quan_vars].replace('Ignorado', -1).fillna(-1)
data = pd.concat([quan_df, data[qual_vars]], axis=1)

# Discard rows whose first two quantitative variables are negative (unknown)
data = data.drop(index=data.loc[data[quan_vars[0]] < 0].index)
data = data.drop(index=data.loc[data[quan_vars[1]] < 0].index)

# Work on a reproducible sample of 380 rows
data = data.sample(n=380, random_state=1)

In [None]:
# Bar chart of the category frequencies of every qualitative variable.
for var in qual_vars:
    print('\n'+ var)
    # Create the wide figure first so the bars are drawn on it; creating it
    # after plotting leaves the chart at the default size and shows an extra
    # empty figure.
    plt.figure(figsize=(20,5))
    data[var].value_counts().plot(kind='bar')
    plt.show()

In [None]:
# frequency table of place of residence and fetal deaths
# Frequency table (with totals): mom_dep_res vs. total_dead_children
pd.crosstab(data['mom_dep_res'], data['total_dead_children'], margins=True)

In [None]:
# Graph of place of residence and fetal deaths
# Row-wise percentage of total_dead_children within each mom_dep_res.
# Margins are left out on purpose: an 'All' column would be counted in each
# row total and halve every percentage.
(
    pd.crosstab(index=data['mom_dep_res'],
                columns=data['total_dead_children'], normalize='index') * 100
).plot(kind='bar')

In [None]:
# frequency table of place of birth and fetal deaths
# Frequency table (with totals): site_occu vs. total_dead_children
pd.crosstab(data['site_occu'], data['total_dead_children'], margins=True)

In [None]:
# Graph of place of birth and fetal deaths
# Row-wise percentage of total_dead_children within each site_occu.
# Margins are left out on purpose: an 'All' column would be counted in each
# row total and halve every percentage.
(
    pd.crosstab(index=data['site_occu'],
                columns=data['total_dead_children'], normalize='index') * 100
).plot(kind='bar')

In [None]:
# frequency table of type of birth and fetal deaths
# Frequency table (with totals): part_type vs. total_dead_children
pd.crosstab(data['part_type'], data['total_dead_children'], margins=True)

In [None]:
# Graph of type of birth and fetal deaths
# Row-wise percentage of total_dead_children within each part_type.
# Margins are left out on purpose: an 'All' column would be counted in each
# row total and halve every percentage.
(
    pd.crosstab(index=data['part_type'],
                columns=data['total_dead_children'], normalize='index') * 100
).plot(kind='bar')

In [None]:
# frequency table of mother's marital status and fetal deaths
# Frequency table (with totals): mom_civil_status vs. total_dead_children
pd.crosstab(data['mom_civil_status'], data['total_dead_children'], margins=True)

In [None]:
# Graph of mother's marital status and fetal deaths
# Row-wise percentage of total_dead_children within each mom_civil_status.
# Margins are left out on purpose: an 'All' column would be counted in each
# row total and halve every percentage.
(
    pd.crosstab(index=data['mom_civil_status'],
                columns=data['total_dead_children'], normalize='index') * 100
).plot(kind='bar')

In [None]:
# frequency table of gestation weeks and fetal deaths
# Frequency table (with totals): weeks_ges vs. total_dead_children
pd.crosstab(data['weeks_ges'], data['total_dead_children'], margins=True)

In [None]:
# Graph  of gestation weeks and fetal deaths
# Row-wise percentage of total_dead_children within each weeks_ges value.
# Margins are left out on purpose: an 'All' column would be counted in each
# row total and halve every percentage.
(
    pd.crosstab(index=data['weeks_ges'],
                columns=data['total_dead_children'], normalize='index') * 100
).plot(kind='bar')

In [None]:
# frequency table of place of birth and total born
# Frequency table (with totals): site_occu vs. total_children
pd.crosstab(data['site_occu'], data['total_children'], margins=True)

### Cleaning of quantitative variables

In [None]:
# Integer copy of the quantitative columns (cast through float first).
# Building it in one chain yields an independent frame instead of writing
# into a slice of `data`, which relied on the chained-assignment warning
# being disabled.
quan_df = data[quan_vars].astype(float).astype(int)

# Axis labels for the 2-D cluster plot: the first two quantitative variables
graph_labels = [quan_vars[0], quan_vars[1]]

In [None]:
# Summary statistics, distribution plot and shape measures per variable.
# The statistics use strictly positive values only; the plot uses all values.
for var in quan_vars:
  serie = quan_df.loc[quan_df[var] > 0, var]
  display(serie.describe())
  sns.displot(quan_df[var], kde=True)
  kurtosis_value = stats.kurtosis(serie)
  skewness_value = stats.skew(serie)
  print('\033[1m' + var + '\033[0m' + ': Kurtosis:', kurtosis_value, 'Skewness:', skewness_value, '\n')

In [None]:
# Correlation heatmap of the quantitative variables.
corrmat = quan_df.corr()
# Plot the correlation matrix itself; passing it through np.corrcoef would
# show the correlation between the rows of the matrix, not between variables.
cm = corrmat.values
sns.set(font_scale=1.25)
# Label the axes from the matrix so labels always match what corr() returned
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=list(corrmat.columns), xticklabels=list(corrmat.columns))
plt.show()

In [None]:
# Pairwise scatter plots of the quantitative variables (default seaborn theme)
sns.set()
sns.pairplot(data=quan_df, height=5)
plt.show()

## NACIMIENTOS

In [None]:
# Shared column schema of the yearly birth files, in file order.
nc_var_names = [
  'dep_reg', 'mun_reg', 'mon_reg', 'year_reg', 'inscirption', 'dep_occu',
  'mun_occu', 'area_geog', 'pounds_weight', 'ounces_weight', 'day_occu',
  'month_occu', 'year_occu', 'genre', 'part_type', 'via_part', 'dad_age',
  'dad_contry', 'dep_dad', 'mun_dad', 'dad_group', 'dad_civil',
  'birth_country_dad', 'birt_dep_dad', 'birth_mun_dad', 'dad_nan',
  'dad_scholar', 'dad_occup', 'mother_age', 'mom_country_res',
  'mom_dep_res', 'mom_mun_resi', 'mom_group', 'mom_civil_status',
  'birth_country_mom', 'birth_dep_mom', 'birth_mun_mom', 'mom_nationality',
  'mom_scholarship', 'mom_occupation', 'assistance_received', 'site_occu',
  'total_children', 'total_dead_children', 'total_living_children',
]

# Columns kept for the births analysis.
# NOTE: this name shadows the builtin `filter`; it is left unchanged because
# other cells refer to it.
filter = [
  'dep_reg', 'mun_reg', 'mon_reg', 'year_reg', 'dep_occu', 'mun_occu',
  'day_occu', 'month_occu', 'part_type', 'via_part', 'mother_age',
  'mom_country_res', 'mom_dep_res', 'mom_mun_resi', 'mom_group',
  'mom_civil_status', 'mom_scholarship', 'mom_occupation',
  'assistance_received', 'site_occu', 'total_children',
  'total_dead_children', 'total_living_children',
]

# Quantitative variables; the remaining kept columns are qualitative.
nc_quan_vars = ['mother_age', 'total_children', 'total_dead_children', 'total_living_children']
nc_qual_vars = [var for var in filter if var not in nc_quan_vars]

In [None]:
def nc_filter_df(data_frame, to_remove, is_quali=True):
  """Rename a yearly births frame to the shared schema and add missing columns.

  The columns of ``data_frame`` are renamed positionally to ``nc_var_names``
  minus the names in ``to_remove``; each name in ``to_remove`` is then added
  as a constant column (NaN when ``is_quali`` is true, 0 otherwise). The
  result is a copy with its columns in alphabetical order.

  Raises ValueError if a name in ``to_remove`` is not in ``nc_var_names`` or
  if the number of columns does not match the remaining names.
  """
  missing = list(to_remove)
  present = list(nc_var_names)
  for name in missing:
    present.remove(name)

  renamed = data_frame.copy()
  renamed.columns = present

  # Placeholder value for the columns this year's file does not provide
  filler = np.nan if is_quali else 0
  for name in missing:
    renamed[name] = np.full(len(renamed.index), filler)

  ordered = sorted(renamed.columns)
  return renamed.reindex(ordered, axis=1).copy()

def drop_df_colums(data_frame, to_remove):
  """Return a copy of ``data_frame`` without the unwanted schema columns.

  Despite its name, ``to_remove`` lists the columns that survive: those names
  are taken out of ``nc_var_names`` and every remaining schema column is
  dropped from the copy.

  Raises ValueError if a name in ``to_remove`` is not in ``nc_var_names`` and
  KeyError if a column to drop is absent from the frame.
  """
  unwanted = list(nc_var_names)
  for kept in list(to_remove):
    unwanted.remove(kept)

  trimmed = data_frame.copy().drop(unwanted, axis=1)
  return trimmed.copy()

In [None]:
# Columns of `nc_var_names` that each yearly births file does not provide.
nc_missing_columns = {
  2009: ['inscirption', 'via_part', 'dad_contry', 'birth_country_dad', 'dad_scholar', 'mom_country_res', 'birth_country_mom', 'mom_scholarship'],
  2010: ['inscirption', 'via_part', 'dad_contry', 'birth_country_dad', 'mom_country_res', 'birth_country_mom'],
  2011: ['inscirption', 'via_part', 'dad_contry', 'birth_country_dad', 'mom_country_res', 'birth_country_mom'],
  2012: ['inscirption', 'area_geog', 'year_occu', 'via_part'],
  2013: ['inscirption', 'area_geog', 'year_occu', 'via_part'],
  2014: ['inscirption', 'area_geog', 'year_occu', 'via_part', 'dad_nan', 'mom_nationality'],
  2015: ['area_geog', 'dad_nan', 'mom_nationality'],
  2016: ['area_geog', 'dad_nan', 'mom_nationality'],
  2017: ['area_geog', 'dad_nan', 'mom_nationality'],
  2018: ['area_geog', 'dad_nan', 'mom_nationality'],
  2019: ['area_geog', 'dad_nan', 'mom_nationality'],
  2020: ['area_geog', 'dad_nan', 'mom_nationality'],
}

# Read each year, bring it to the shared schema, stamp its registration year
# and keep only the columns listed in `filter`.
nc_frames = []
for year, missing in nc_missing_columns.items():
  frame = nc_filter_df(pd.read_spss('./data/nc_%d.sav' % year), missing)
  # Every file used to be stamped '2009' (copy-paste slip); each file now
  # carries its own year.
  # NOTE(review): assumes each file holds a single registration year — confirm.
  frame['year_reg'] = np.full(len(frame.index), str(year))
  nc_frames.append(drop_df_colums(frame, filter))

# Keep the per-year names available for any cell that refers to them
(nc_2009, nc_2010, nc_2011, nc_2012, nc_2013, nc_2014,
 nc_2015, nc_2016, nc_2017, nc_2018, nc_2019, nc_2020) = nc_frames

nc_data = pd.concat(nc_frames)

In [None]:
# Quantitative birth variables with unknown / missing values encoded as -1
nc_quan_df = nc_data[nc_quan_vars].replace('Ignorado', -1).fillna(-1)

In [None]:
# Summary statistics, distribution plot and shape measures for every
# quantitative births variable. The values come from `nc_quan_df` (births),
# not from the fetal-death frame `quan_df`.
# The statistics use strictly positive values only; the plot uses all values.
for var in nc_quan_vars:
  serie = nc_quan_df[nc_quan_df[var] > 0][var]
  display(serie.describe())
  sns.displot(nc_quan_df[var], kde=True)
  print('\033[1m' + var + '\033[0m' + ': Kurtosis:', stats.kurtosis(serie), 'Skewness:', stats.skew(serie), '\n')

In [None]:
# Correlation heatmap of the quantitative births variables.
corrmat = nc_quan_df.corr()
# Plot the correlation matrix itself; passing it through np.corrcoef would
# show the correlation between the rows of the matrix, not between variables.
cm = corrmat.values
sns.set(font_scale=1.25)
# Label the axes from the matrix: the fetal-death list `quan_vars` has a
# different set of variables and does not match this matrix.
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=list(corrmat.columns), xticklabels=list(corrmat.columns))
plt.show()

In [None]:
# Pairwise scatter plots of the quantitative births variables (`nc_quan_df`,
# not the fetal-death frame `quan_df`).
# NOTE(review): `nc_quan_df` is not sampled in this notebook, so this plot may
# be slow on the full data — consider sampling first.
sns.set()
sns.pairplot(nc_quan_df, height= 5)
plt.show()

In [None]:
# Bar chart of the category frequencies of every qualitative births variable,
# read from `nc_data` (births) rather than the fetal-death frame `data`.
for var in nc_qual_vars:
    print('\n'+ var)
    # Create the wide figure first so the bars are drawn on it; creating it
    # after plotting leaves the chart at the default size and shows an extra
    # empty figure.
    plt.figure(figsize=(20,5))
    nc_data[var].value_counts().plot(kind='bar')
    plt.show()

In [None]:
# frequency table of registration department and births
# Frequency table (with totals): dep_reg vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['dep_reg'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of registration township  and births
# Frequency table (with totals): mun_reg vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mun_reg'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of registration month and births
# Frequency table (with totals): mon_reg vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mon_reg'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of registration year and births
# Frequency table (with totals): year_reg vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['year_reg'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of occurrence department and births
# Frequency table (with totals): dep_occu vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['dep_occu'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of occurrence township and births
# Frequency table (with totals): mun_occu vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mun_occu'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of occurrence day and births
# Frequency table (with totals): day_occu vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['day_occu'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of occurrence month and births
# Frequency table (with totals): month_occu vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['month_occu'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of type of birth and births
# Frequency table (with totals): part_type vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['part_type'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of via of birth and births
# Frequency table (with totals): via_part vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['via_part'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of mom country residence and births
# Frequency table (with totals): mom_country_res vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mom_country_res'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of mom departament residence and births
# Frequency table (with totals): mom_dep_res vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mom_dep_res'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of mom township residence and births
# Frequency table (with totals): mom_mun_resi vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mom_mun_resi'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of mom group and births
# Frequency table (with totals): mom_group vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mom_group'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of mom civil status and births
# Frequency table (with totals): mom_civil_status vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mom_civil_status'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of mom scholarship and births
# Frequency table (with totals): mom_scholarship vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mom_scholarship'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of mom occupation and births
# Frequency table (with totals): mom_occupation vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['mom_occupation'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of asistence received and births
# Frequency table (with totals): assistance_received vs.
# total_living_children, computed on the births data (`nc_data`), not the
# fetal-death sample.
pd.crosstab(index=nc_data['assistance_received'],
            columns=nc_data['total_living_children'], margins=True)

In [None]:
# frequency table of site of occurrence and births
# Frequency table (with totals): site_occu vs. total_living_children,
# computed on the births data (`nc_data`), not the fetal-death sample.
pd.crosstab(index=nc_data['site_occu'],
            columns=nc_data['total_living_children'], margins=True)

## Exploración de datos

1. Comience describiendo cuántas variables y observaciones tiene disponibles, y el tipo de cada una de las variables.

2. Haga un resumen de las variables numéricas e investigue si siguen una distribución normal y tablas de frecuencia para las variables categóricas, escriba lo que vaya encontrando. 

3. Cruce las variables que considere que son las más importantes para hallar los elementos clave que lo pueden llevar a comprender lo que está causando el problema encontrado.

4. Haga gráficos exploratorios que le den ideas del estado de los datos.

5. Haga un agrupamiento (clustering) e interprete los resultados. 

## Clustering

In [None]:
# Seed the random generators so the Hopkins statistic is repeatable.
# NOTE(review): pyclustertend appears to sample through NumPy/pandas rather
# than the stdlib `random` module, so the NumPy seed is presumably the one
# that matters — confirm against the installed version.
random.seed(123)
np.random.seed(123)

# Standardise the quantitative variables before measuring cluster tendency
X_scale=sklearn.preprocessing.scale(quan_df)

pyclustertend.hopkins(X_scale, len(X_scale))

El valor de Hopkins es de 0.11 (cercano a 0), por lo que vale la pena hacer el agrupamiento, ya que los datos no están distribuidos de manera uniforme.

In [None]:
# VAT plot of the scaled quantitative data
pyclustertend.vat(X_scale)

Se realiza la gráfica de codo para encontrar la cantidad óptima de clusters

In [None]:
# Candidate numbers of clusters: 1 through 10
numeroClusters = range(1, 11)

# Within-cluster sum of squares (inertia) of a k-means fit per candidate
wcss = []
for i in numeroClusters:
    kmeans = cluster.KMeans(n_clusters=i).fit(X_scale)
    wcss.append(kmeans.inertia_)

# Elbow plot
plt.plot(numeroClusters, wcss)
plt.title("Gráfico de Codo")
plt.xlabel("Cantidad de clusters")
plt.ylabel("WCSS")
plt.show()

Se hará uso de 4 clusters

In [None]:
# Number of clusters used by the BIRCH model and the reports below
number_clusters = 4
# One plot colour per cluster, indexed by cluster label
colors = ['mediumslateblue', 'skyblue', 'pink', 'cornflowerblue']

In [None]:
# Inicializar el modelo
birch_model = Birch(threshold=1.5, n_clusters=number_clusters)
birch_model.fit(X_scale)

# Obtenemos los puntos y los clusters
birch_result = birch_model.predict(X_scale)

for i in range(number_clusters):
    # Graficar los clusters
    plt.scatter(X_scale[birch_result == i, 0], X_scale[birch_result == i, 1], s = 100, c = colors[i], label = "Cluster %d" %i)

plt.title("Metodo de BIRCH")
plt.xlabel(graph_labels[0])
plt.ylabel(graph_labels[1])
plt.legend()
plt.show()

## Silueta

In [None]:
def make_silhouette(clusterer, n_clusters, label):
    """Draw the silhouette plot of ``clusterer`` fitted on ``X_scale``.

    Parameters:
        clusterer: estimator exposing ``fit_predict``; it is (re)fitted on the
            global ``X_scale``.
        n_clusters: number of cluster labels (0 .. n_clusters - 1) to draw.
        label: name used in the printed message and as the plot title.

    Prints the average silhouette score and draws one band per cluster on a
    new figure; the caller is expected to call ``plt.show()``. Returns None.
    """
    fig, ax = plt.subplots(figsize=(18, 7))

    ax.set_xlim([-0.1, 1])
    # Leave a 10-row gap between consecutive cluster bands
    ax.set_ylim([0, len(X_scale) + (n_clusters + 1) * 10])

    cluster_labels = clusterer.fit_predict(X_scale)

    silhouette_avg = silhouette_score(X_scale, cluster_labels)
    print(
        "For n_clusters =",
        n_clusters,
        "The average silhouette_score of",
        label,
        'is:',
        silhouette_avg,
    )

    # Per-sample values must use the same (scaled) data as the average score
    # and the clustering itself; the unscaled frame would give values that
    # are inconsistent with the red average line.
    sample_silhouette_values = silhouette_samples(X_scale, cluster_labels)

    y_lower = 10
    for i in range(n_clusters):
        ith_cluster_silhouette_values = np.sort(
            sample_silhouette_values[cluster_labels == i]
        )

        size_cluster_i = ith_cluster_silhouette_values.shape[0]
        y_upper = y_lower + size_cluster_i

        # Cycle through the palette so more clusters than colours still plot
        color = colors[i % len(colors)]
        ax.fill_betweenx(
            np.arange(y_lower, y_upper),
            0,
            ith_cluster_silhouette_values,
            facecolor=color,
            edgecolor=color,
            alpha=0.7,
        )

        # Cluster number next to the middle of its band
        ax.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

        y_lower = y_upper + 10  # 10 for the 0 samples

    ax.set_title(label)
    ax.set_xlabel("The silhouette coefficient values")
    ax.set_ylabel("Cluster label")

    # Vertical line at the average silhouette score
    ax.axvline(x=silhouette_avg, color="red", linestyle="--")

    ax.set_yticks([])
    ax.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])

# Silhouette plot for the BIRCH model with the chosen number of clusters
make_silhouette(birch_model, number_clusters, 'BIRCH')
plt.show()

## Matriz de confusión

In [None]:
# Qualitative variables that the clusters are compared against
stamps = ['dep_reg', 'assistance_received', 'site_occu', 'cause_death']

Cambiando las variables cualitativas a números

In [None]:
# Print, for every cluster, the category with the highest count in its row of
# the confusion matrix. Reads the globals `stamp`, `tag_to_number` and
# `number_clusters` at call time.
def get_category(confusion_array, label=''):
  separator = '_'*100
  print(separator)
  print('\nCONFUSION DE:', label, 'CON LA VARIABLE CUALITATIVA', stamp)
  keys = list(tag_to_number.keys())
  for cluster_id in range(number_clusters):
    print('\nCLUSTER #', cluster_id)
    counts = list(confusion_array[cluster_id])
    top = counts.index(max(counts))
    print('Se asegura que es el grupo de: %s con %d' %(keys[top], counts[top]))

  print(separator)


for stamp in stamps:
  aux = data.groupby(by=stamp)

  # Distinct lower-cased categories, in first-seen order
  tag_group = list(dict.fromkeys(key.lower() for key in aux.groups.keys()))

  # Lower-cased copy of the column, as a list of one-element rows
  lower_case_col = data[[stamp]]
  lower_case_col[stamp] = lower_case_col[stamp].str.lower()
  tags_list = lower_case_col.values.tolist()

  # Category <-> integer code lookups
  tag_to_number = {tag: code for code, tag in enumerate(tag_group)}
  number_to_taga = dict(enumerate(tag_group))

  # Integer code of every observation
  tags = [tag_to_number[row[0]] for row in tags_list]

  # Cluster labels against category codes; keep only the cluster rows
  confusion_birch = confusion_matrix(birch_result, tags)[0:number_clusters]

  get_category(confusion_birch, 'BIRCH')