# Primera entrega

## Limpieza de datos
En esta entrega se hará una recolección y limpieza de datos solamente. No habrá modelado ni análisis. 
<br>El objetivo es únicamente velar por la integridad, coherencia y cohesión del conjunto de datos. Como primer paso, se transformarán los archivos de origen (archivos HTML disfrazados de .xls) a archivos .csv para facilitar su manejo.

In [None]:
# !pip install -r requirements.txt

In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

In [17]:
def transform_html_to_csv(input_path: str, output_path: "str | None" = None, tables_to_convert: "list[int] | None" = None) -> None:
    '''
    Turns the HTML tables stored in .xls files into .csv files

    Every entry of input_path whose name ends with '.xls' is parsed with
    pandas.read_html. Each requested table is written, without the index
    column, to '<base_name>_table<i>.csv' inside output_path.

    Params:
        input_path: folder where the .xls (HTML) files are stored
        output_path: folder where the .csv files will be stored; it is created
            when missing. When None, the .csv files are written to input_path
        tables_to_convert: zero-based positions, in the list returned by
            pandas.read_html, of the tables to export. When None, only the
            table at position 9 is exported
    Returns:
        out: None
    '''
    if output_path is None:
        output_path = input_path
    else:
        os.makedirs(output_path, exist_ok=True)

    # None stands in for the former default of [9] so that no mutable list is
    # shared between calls. Duplicates are dropped and positions are handled
    # in ascending order, which is the order the tables appear in the file.
    requested = sorted(set([9] if tables_to_convert is None else tables_to_convert))

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # progress log reproducible between runs.
    for filename in tqdm(sorted(os.listdir(input_path)), desc='Converting Files'):
        if not filename.endswith('.xls'):
            continue

        html_path = os.path.join(input_path, filename)
        base_name = os.path.splitext(filename)[0]
        # Best effort per file: a file that cannot be parsed or written is
        # reported and the loop moves on to the next one.
        try:
            tables = pd.read_html(html_path)
            # if no tables, skip
            if not tables:
                print(f"No tables found in '{filename}'")
                continue

            # convert each of the tables selected to .csv
            for i in requested:
                # Report positions the file does not have instead of skipping
                # them silently, so a missing export is visible in the log.
                if not 0 <= i < len(tables):
                    print(f"Table {i} not found in '{filename}' ({len(tables)} tables available)")
                    continue
                csv_filename = f"{base_name}_table{i}.csv"
                csv_path = os.path.join(output_path, csv_filename)
                tables[i].to_csv(csv_path, index=False)
                print(f"Converted '{filename}' to '{csv_filename}'")

        except Exception as e:
            print(f"Error processing file '{filename}' : {e}")

def remove_files(path: str, extension: str) -> None:
    '''
    Removes all files in a folder whose name ends with a certain extension

    Only the direct entries of the folder are examined (no recursion). An
    entry that cannot be removed is reported with a printed message and the
    remaining entries are still processed.

    Input:
        path: folder path where the files to be removed are stored
        extension: filename suffix to match, e.g. '.xls'; must not be empty
    Raises:
        ValueError: if extension is empty
    '''
    # Every filename ends with the empty string, so an empty extension would
    # delete the whole folder content; refuse it explicitly.
    if not extension:
        raise ValueError("extension must be a non-empty string")

    for filename in os.listdir(path):
        if not filename.endswith(extension):
            continue
        target_path = os.path.join(path, filename)
        try:
            os.remove(target_path)
        except OSError as e:
            # os.remove signals its failures (missing file, permissions,
            # entry is a directory) through OSError and its subclasses.
            print(f"Error removing file '{filename}' : {e}")


In [18]:
# Start by converting the raw .xls files (HTML tables saved with an .xls extension)
# to .csv: read from ../Dataset_raw, write to ../Dataset_cleaned, and export only
# the table at position 9 of each file.
transform_html_to_csv(input_path="../Dataset_raw", output_path="../Dataset_cleaned", tables_to_convert=[9])

Converting Files:   7%|▋         | 3/44 [00:00<00:01, 25.11it/s]

Converted 'alta_verapaz_basico.xls' to 'alta_verapaz_basico_table9.csv'
Converted 'alta_verapaz_diversificado.xls' to 'alta_verapaz_diversificado_table9.csv'
Converted 'baja_verapaz_basico.xls' to 'baja_verapaz_basico_table9.csv'
Converted 'baja_verapaz_diversificado.xls' to 'baja_verapaz_diversificado_table9.csv'
Converted 'chimaltenango_basico.xls' to 'chimaltenango_basico_table9.csv'
Converted 'chimaltenango_diversificado.xls' to 'chimaltenango_diversificado_table9.csv'
Converted 'chiquimula_basico.xls' to 'chiquimula_basico_table9.csv'


Converting Files:  25%|██▌       | 11/44 [00:00<00:00, 33.44it/s]

Converted 'chiquimula_diversificado.xls' to 'chiquimula_diversificado_table9.csv'
Converted 'el_progreso_basico.xls' to 'el_progreso_basico_table9.csv'
Converted 'el_progreso_diversificado.xls' to 'el_progreso_diversificado_table9.csv'
Converted 'escuintla_basico.xls' to 'escuintla_basico_table9.csv'
Converted 'escuintla_diversificado.xls' to 'escuintla_diversificado_table9.csv'


Converting Files:  34%|███▍      | 15/44 [00:00<00:01, 18.74it/s]

Converted 'guatemala_basico.xls' to 'guatemala_basico_table9.csv'
Converted 'guatemala_diversificado.xls' to 'guatemala_diversificado_table9.csv'
Converted 'huehuetenango_basico.xls' to 'huehuetenango_basico_table9.csv'
Converted 'huehuetenango_diversificado.xls' to 'huehuetenango_diversificado_table9.csv'


Converting Files:  50%|█████     | 22/44 [00:00<00:00, 24.32it/s]

Converted 'izabal_basico.xls' to 'izabal_basico_table9.csv'
Converted 'izabal_diversificado.xls' to 'izabal_diversificado_table9.csv'
Converted 'jalapa_basico.xls' to 'jalapa_basico_table9.csv'
Converted 'jalapa_diversificado.xls' to 'jalapa_diversificado_table9.csv'
Converted 'jutiapa_basico.xls' to 'jutiapa_basico_table9.csv'
Converted 'jutiapa_diversificado.xls' to 'jutiapa_diversificado_table9.csv'
Converted 'peten_basico.xls' to 'peten_basico_table9.csv'


Converting Files:  64%|██████▎   | 28/44 [00:01<00:00, 24.59it/s]

Converted 'peten_diversificado.xls' to 'peten_diversificado_table9.csv'
Converted 'quetzaltenango_basico.xls' to 'quetzaltenango_basico_table9.csv'
Converted 'quetzaltenango_diversificado.xls' to 'quetzaltenango_diversificado_table9.csv'
Converted 'quiche_basico.xls' to 'quiche_basico_table9.csv'
Converted 'quiche_diversificado.xls' to 'quiche_diversificado_table9.csv'


Converting Files:  70%|███████   | 31/44 [00:01<00:00, 25.36it/s]

Converted 'retalhuleu_basico.xls' to 'retalhuleu_basico_table9.csv'
Converted 'retalhuleu_diversificado.xls' to 'retalhuleu_diversificado_table9.csv'
Converted 'sacatepequez_basico.xls' to 'sacatepequez_basico_table9.csv'
Converted 'sacatepequez_diversificado.xls' to 'sacatepequez_diversificado_table9.csv'
Converted 'santa_rosa_basico.xls' to 'santa_rosa_basico_table9.csv'
Converted 'santa_rosa_diversificado.xls' to 'santa_rosa_diversificado_table9.csv'


Converting Files:  89%|████████▊ | 39/44 [00:01<00:00, 27.69it/s]

Converted 'san_marcos_basico.xls' to 'san_marcos_basico_table9.csv'
Converted 'san_marcos_diversificado.xls' to 'san_marcos_diversificado_table9.csv'
Converted 'solola_basico.xls' to 'solola_basico_table9.csv'
Converted 'solola_diversificado.xls' to 'solola_diversificado_table9.csv'
Converted 'suchitepequez_basico.xls' to 'suchitepequez_basico_table9.csv'
Converted 'suchitepequez_diversificado.xls' to 'suchitepequez_diversificado_table9.csv'
Converted 'totonicapan_basico.xls' to 'totonicapan_basico_table9.csv'
Converted 'totonicapan_diversificado.xls' to 'totonicapan_diversificado_table9.csv'


Converting Files: 100%|██████████| 44/44 [00:01<00:00, 26.50it/s]

Converted 'zacapa_basico.xls' to 'zacapa_basico_table9.csv'
Converted 'zacapa_diversificado.xls' to 'zacapa_diversificado_table9.csv'





In [None]:
# remove the original .xls files from ../Dataset_raw (call left commented out; uncomment to run)
# remove_files(path="../Dataset_raw", extension='.xls')