# Production Code

In [None]:
import os
import re
import numpy as np
import pandas as pd

# Directory containing the plain-text survey files; passed to process_files
# and used to build the path of the single file loaded in the experimental code.
source_directory = "../surveys/plain_txt/"

## Extract
### Helper Functions

In [None]:
def find_between(str_1, str_2, contents):
    """Return the text captured between two regex fragments in ``contents``.

    ``str_1`` and ``str_2`` are spliced into the pattern unescaped, so they
    are interpreted as regular expressions. The capture group between them is
    greedy and also matches newlines, so a match extends to the last
    occurrence of ``str_2`` that follows ``str_1``.

    Returns the list produced by ``re.findall`` (empty when nothing matches).
    """
    pattern = "".join((str_1, r"([\w\W]*)", str_2))
    return re.findall(pattern, contents)

### Parsing Functions

In [None]:
def parse_interviewee(contents):
    """Extract the interviewee name from the survey text.

    Two layouts are tried in order: ``nombre de la persona ...: <name>`` and
    ``nombre de la persona ... (<word> <name>``. A layout is accepted only
    when it matches exactly once in ``contents``.

    Returns the matched name with surrounding whitespace removed, ``"N/A"``
    when the captured value is exactly ``"3"``, or ``"UNENCOUNTERED FORMAT"``
    when neither layout matches exactly once.
    """
    layouts = (
        r"nombre de la persona[^\n:]*:[\s]*([^\n]*)\n",
        r"nombre de la persona[^\n\(]*\(\w* +([^\n]*)\n",
    )

    name = "UNENCOUNTERED FORMAT"
    for layout in layouts:
        matches = re.findall(layout, contents)
        if len(matches) == 1:
            # the "3" check happens before stripping, as a literal comparison
            name = "N/A" if matches[0] == "3" else matches[0]
            break

    return name.strip()


def parse_date_range(contents):
    """Extract the reporting period printed inside the V-1 livestock section.

    Returns the single ``<text><digits> - <text><digits>`` span found in the
    section, ``"UNENCOUNTERED FORMAT"`` when the section itself is not found
    exactly once, or ``"UNENCOUNTERED FORMAT 2"`` when the section does not
    contain exactly one such span.
    """
    sections = find_between("v-1: ventas de ganado", "v-2: ventas de", contents)
    if len(sections) != 1:
        return "UNENCOUNTERED FORMAT"

    # accepts either a hyphen or an en dash between the two dates
    candidates = re.findall(r"([\w ]+\d+ *[-–] *[\w ]+\d+)", sections[0])
    if len(candidates) != 1:
        return "UNENCOUNTERED FORMAT 2"

    return candidates[0]


def parse_section_5_1(contents):
    """Parse the V-1 livestock sales table into one dict per species row.

    The section between ``v-1: ventas de ganado`` and ``v-2: ventas de`` is
    located, then each product row is taken as the text between its own
    label and the label of the next product.

    Returns a list with one dict per product, holding the ``species`` key
    plus every name in ``column_headers``. Rows that cannot be parsed get
    ``"UNENCOUNTERED FORMAT"`` in every column. When the section is not found
    exactly once the result is ``[{}]``.
    """
    unparsed = "UNENCOUNTERED FORMAT"

    section_search = find_between("v-1: ventas de ganado", "v-2: ventas de",
                                  contents)
    if len(section_search) != 1:
        return [{}]

    # re-append a terminator so the last product row has an end marker
    section_string = section_search[0] + "v-2:"

    # Regex fragments for each row label, in table order. Backslashes are
    # doubled so the regex escapes are valid Python string escapes; the
    # resulting patterns are unchanged.
    product_list = [
        "toro para carne \\/\n\nnovillo \\(\\+300 kg\\)\n", "vaca\n",
        "ternero\n", "vaquilla\n",
        "bovino macho\n\nreproductor\n", "llama macho para carne\n",
        "lama hembra\n",
        "llama macho reproductor\n", "borrego \\(macho joven\\)\n",
        "cordero\n", "capón\n",
        "oveja\n", "ovino macho adulto\n\npara carne\n",
        "ovino macho reproductor\n", "chivo \\(macho adulto\\)\n",
        "caprino reproductor macho\n\nadulto \\(castrón\\)\n",
        "cabra \\(hembra\\)\n", "chivo capón \\(castrado\\)\n",
        "cabrito \\/ chivito mamón\n",
        "cabrito \\/ chivito lechón\n", "burro macho\n", "burro hembra\n",
        "equino macho reproductor\n", "equino macho para carne\n",
        "equino hembra \\/ yegua\n", "potrillo \\/ potranca\n",
        "equino manso \\(macho \\/ hembra\\)\n", "mular\n",
        "cerdo macho reproductor \\(padrillo\\)\n",
        "cerda \\(madre\\) hembra\n",
        "maltón \\(cachorro\\)\n",
        "cerdo capón \\(castrado\\) \\/ cerda para\n\ncarne\n",
        "lechón \\/ lechona\n", "cerdo macho adulto para carne\n", "v-2:",
    ]
    # English species keys, positionally aligned with product_list
    translation_list = [
        "bull_or_steer_for_meat_300_+_kg", "cow", "calf",
        "heifer", "male_bovine_breeding",
        "male_llama_for_meat", "female_llama",
        "male_llama_breeding", "male_sheep_young", "lamb",
        "capon", "female_sheep", "male_sheep_adult_for_meat",
        "male_sheep_breeding", "male_goat", "male_goat_breeding",
        "female_goat", "male_goat_castrated", "goat_young_suckling_mamón",
        "goat_young_suckling_lechón", "male_donkey", "female_donkey",
        "male_equine_breeding", "male_equine_for_meat",
        "female_equine", "equine_young", "equine_tame",
        "mule", "male_pig_breeding", "female_pig_breeding",
        "puppy", "male_pig_castrated_or_for_meat", "pig_suckling",
        "male_pig_for_meat",
    ]
    # column headers not including species column
    column_headers = [
        "has_sold", "month_of_sale_1", "number_1",
        "unit_price_1", "month_of_sale_2", "number_2",
        "unit_price_2", "month_of_sale_3", "number_3",
        "unit_price_3", "market_destination",
    ]

    section_5_1_data = []
    for item_1, item_2, term in zip(product_list, product_list[1:],
                                    translation_list):
        item_dict = {"species": term}

        row_search = find_between(item_1, item_2, section_string)
        row_list = row_search[0].split("\n") if len(row_search) == 1 else []

        # A usable row has at least 26 lines; anything shorter (or a row that
        # was not found exactly once) is flagged instead of parsed.
        if len(row_list) >= 26:
            # Values sit on every other line. They start at line 1 when that
            # line is non-empty, otherwise at line 3.
            first = 1 if row_list[1] != "" else 3
            values = row_list[first:first + 2 * len(column_headers):2]
            for col, entry in zip(column_headers, values):
                item_dict[col] = entry.strip()
        else:
            for col in column_headers:
                item_dict[col] = unparsed

        section_5_1_data.append(item_dict)

    return section_5_1_data

### Process the Whole File

In [None]:
def process_files(source_directory):
    """Parse every survey file in ``source_directory`` into one DataFrame.

    Each regular file is read, lower-cased and passed to the parsing
    functions. Every row returned by ``parse_section_5_1`` becomes one output
    row, prefixed with the file name, interviewee and date range. A message
    is printed for any file that does not yield exactly 34 rows.

    Directory entries that are not regular files are skipped.

    Returns a ``pandas.DataFrame`` (empty when no files are found).
    """
    expected_rows = 34
    dataset = []

    for process_file in os.listdir(source_directory):
        file_path = os.path.join(source_directory, process_file)

        # os.listdir also yields sub-directories (e.g. notebook checkpoint
        # folders); open() would raise on those, so skip them
        if not os.path.isfile(file_path):
            continue

        # NOTE(review): no encoding is given, so the platform default is used
        # -- confirm it matches how the survey files were saved
        with open(file_path, "r") as file:
            contents = file.read().lower()

        interviewee = parse_interviewee(contents)
        date_range = parse_date_range(contents)
        section_5_1_data = parse_section_5_1(contents)

        # make sure the file produced the right amount of parsed entries
        if len(section_5_1_data) != expected_rows:
            print(f"Unencountered Format: {process_file}\n"
                  + f"Length of outflow_data: {len(section_5_1_data)}\n")

        for row in section_5_1_data:
            dataset.append({
                "filename": process_file,
                "interviewee": interviewee,
                "date_range": date_range,
                **row,
            })

    # convert list to DataFrame
    return pd.DataFrame(data=dataset)

## Transform

### Helper Functions

In [None]:
def transform_date_range(dataf):
    """Rewrite known Spanish date ranges in ``date_range`` as English labels.

    Only exact matches of the listed spellings are replaced; any other value
    is left as it is. The column is reassigned on ``dataf`` itself, and the
    same frame is returned.
    """
    spanish_to_english = dict([
        ("agosto 2018 - agosto 2019", "august 2018-2019"),
        ("septiembre 2018 - septiembre 2019", "september 2018-2019"),
        ("octubre 2018 – octubre 2019", "october 2018-2019"),
        ("octubre 2018 - octubre 2019", "october 2018-2019"),
        ("octubre 2018 - octubre2019", "october 2018-2019"),
    ])
    normalized = dataf["date_range"].replace(spanish_to_english)
    dataf["date_range"] = normalized

    return dataf

### Pipeline Functions

In [None]:
def start_pipeline(dataf):
    """Return a copy of ``dataf`` for the following pipeline steps to work on."""
    working_copy = dataf.copy()
    return working_copy


def transform_values(dataf):
    """Apply the value-level clean-up steps (currently only the date range)."""
    return transform_date_range(dataf)

### Create Raw and Cleaned DataFrame

In [None]:
# Extract: parse every survey file into raw rows
raw_df = process_files(source_directory)

# Transform: start_pipeline copies the frame first, so the steps that follow
# work on the copy rather than on raw_df
clean_df = raw_df.pipe(start_pipeline).pipe(transform_values)

## Load
### Export to CSV

In [None]:
datasets_directory = "../datasets/"
filename = "section_5_1.csv"
file_path = os.path.join(datasets_directory, filename)

# to_csv does not create missing folders, so make sure the target exists
os.makedirs(datasets_directory, exist_ok=True)

# missing values are written as the literal text "null"
clean_df.to_csv(file_path, index=False, na_rep="null")

## Other
### Functions to Check the Implementation

In [None]:
def get_dropped_df(raw_dataf, clean_dataf):
    """Return a copy of the rows of ``raw_dataf`` missing from ``clean_dataf``.

    Rows are compared by index label only: a row counts as dropped when its
    label does not appear in ``clean_dataf.index``.
    """
    survived = raw_dataf.index.isin(clean_dataf.index)
    missing_labels = raw_dataf.index[~survived]

    return raw_dataf.loc[missing_labels].copy()

### Create Dropped DataFrame

In [None]:
# rows whose index labels are in raw_df but not in clean_df
dropped_df = get_dropped_df(raw_df, clean_df)

***
# Testing Code
### View the Data

In [None]:
# show the first 50 cleaned rows
clean_df.head(50)

### Check for Parsing Errors

In [None]:
# rows whose has_sold column holds the "UNENCOUNTERED FORMAT" marker
clean_df[clean_df["has_sold"] == "UNENCOUNTERED FORMAT"]

In [None]:
# print the value distribution of every column except the identifying ones
cols_to_drop = ["filename", "interviewee"]
cols_to_check = clean_df.columns.drop(cols_to_drop)

banner = "*" * 50
indent = " " * 5

for col in cols_to_check:
    print(banner)
    print(indent + col)
    # dropna=False so missing values get their own count
    print(clean_df[col].value_counts(dropna=False))
    print(banner)
    print("\n")

### Check Dropped Rows
Another way to check is by opening the variable inspector and manually scrolling
through the `dropped_df` DataFrame.

The code below tells us whether or not there are files that contain no information for this section. If raw file count is the same as clean file count, then no files were dropped.

In [None]:
# number of distinct source files present in each frame
raw_file_count = raw_df["filename"].nunique()
clean_file_count = clean_df["filename"].nunique()

print("Raw File Count: {}".format(raw_file_count))
print("Clean File Count: {}".format(clean_file_count))

If there are files that are completely dropped, find those files and look at them to check for parsing errors.

Check the value counts of the dropped rows to see whether any rows are being dropped that should not be.

In [None]:
# print the value distribution of every dropped-row column except the
# identifying ones
cols_to_drop = ["filename", "interviewee"]
cols_to_check = dropped_df.columns.drop(cols_to_drop)

banner = "*" * 50
indent = " " * 5

for col in cols_to_check:
    print(banner)
    print(indent + col)
    # dropna=False so missing values get their own count
    print(dropped_df[col].value_counts(dropna=False))
    print(banner)
    print("\n")

### Duplicate Rows

In [None]:
# compare rows on every column except the file name
df_columns = clean_df.columns[clean_df.columns != "filename"]

# keep=False marks every member of a duplicate group, not just the repeats
duplicate_mask = clean_df.loc[:, df_columns].duplicated(keep=False)
clean_df[duplicate_mask]

## Experimental Code

In [None]:
# -------------------------------------------------------------------------- #
# TESTING CODE
# Load one survey file for interactive experimentation; switch the active
# assignment below to try a different file.
process_file = "-28 Raúl Roberto Guzmán.txt"
# process_file = "--Encuesta Fao- Juan Basilio Carrizo.txt"
# process_file = "67 Encuesta N° 2 - FAO cd.txt"

file_path = os.path.join(source_directory, process_file)

# the with statement closes the handle once the text has been read
with open(file_path, "r") as file:
    contents = file.read().lower()

In [None]:
# Experimental single-file version of the section parser. It uses the short
# "v-1:" / "v-2:" markers and its own translation list.
section_5_1_data = []

section_start = "v-1:"
section_end = "v-2:"
section_search = find_between(section_start, section_end, contents)

if len(section_search) == 1:
    # re-append a terminator so the last product row has an end marker
    section_string = section_search[0] + "v-2:"

    # Regex fragments for each row label, in table order. Backslashes are
    # doubled so the regex escapes are valid Python string escapes; the
    # resulting patterns are unchanged.
    product_list = [
        "toro para carne \\/\n\nnovillo \\(\\+300 kg\\)\n", "vaca\n",
        "ternero\n", "vaquilla\n",
        "bovino macho\n\nreproductor\n", "llama macho para carne\n",
        "lama hembra\n",
        "llama macho reproductor\n", "borrego \\(macho joven\\)\n",
        "cordero\n", "capón\n",
        "oveja\n", "ovino macho adulto\n\npara carne\n",
        "ovino macho reproductor\n", "chivo \\(macho adulto\\)\n",
        "caprino reproductor macho\n\nadulto \\(castrón\\)\n",
        "cabra \\(hembra\\)\n", "chivo capón \\(castrado\\)\n",
        "cabrito \\/ chivito mamón\n",
        "cabrito \\/ chivito lechón\n", "burro macho\n", "burro hembra\n",
        "equino macho reproductor\n", "equino macho para carne\n",
        "equino hembra \\/ yegua\n", "potrillo \\/ potranca\n",
        "equino manso \\(macho \\/ hembra\\)\n", "mular\n",
        "cerdo macho reproductor \\(padrillo\\)\n",
        "cerda \\(madre\\) hembra\n",
        "maltón \\(cachorro\\)\n",
        "cerdo capón \\(castrado\\) \\/ cerda para\n\ncarne\n",
        "lechón \\/ lechona\n", "cerdo macho adulto para carne\n", "v-2:",
    ]
    # English species keys, positionally aligned with product_list
    translation_list = [
        "bull_or_steer_for_meat_300+kg", "cow", "calf",
        "heifer", "male_bovine_breeding",
        "male_llama_for_meat", "female_llama",
        "male_llama_breeding", "male_sheep_young", "lamb",
        "capon", "female_sheep", "male_sheep_adult_for_meat",
        "male_sheep_breeding", "male_goat", "male_goat_breeding",
        "female_goat", "male_goat_castrated", "goat_young_suckling",
        "goat_young_suckling", "male_donkey", "female_donkey",
        "male_equine_breeding", "male_equine_for_meat",
        "female_equine", "equine_young", "equine_tame",
        "mule", "male_pig_breeding", "female_pig_breeding",
        "puppy", "male_pig_castrated_or_for_meat", "pig_suckling",
        "male_pig_for_meat",
    ]
    # column headers not including species column
    column_headers = [
        "has_sold", "month_of_sale_1", "number_1",
        "unit_price_1", "month_of_sale_2", "number_2",
        "unit_price_2", "month_of_sale_3", "number_3",
        "unit_price_3", "market_destination",
    ]

    for item_1, item_2, term in zip(product_list, product_list[1:],
                                    translation_list):
        item_dict = {"species": term}

        row_search = find_between(item_1, item_2, section_string)
        row_list = row_search[0].split("\n") if len(row_search) == 1 else []

        # A usable row has at least 26 lines; anything shorter (or a row that
        # was not found exactly once) is flagged instead of parsed.
        if len(row_list) >= 26:
            # Values sit on every other line. They start at line 1 when that
            # line is non-empty, otherwise at line 3.
            first = 1 if row_list[1] != "" else 3
            values = row_list[first:first + 2 * len(column_headers):2]
            for col, entry in zip(column_headers, values):
                item_dict[col] = entry.strip()
        else:
            for col in column_headers:
                item_dict[col] = "UNENCOUNTERED FORMAT"

        section_5_1_data.append(item_dict)
else:
    section_5_1_data = [{}]

# NOTE: this rebinds raw_df, replacing the frame built by process_files when
# both cells are run in the same session
raw_df = pd.DataFrame(section_5_1_data)

In [None]:
# display the frame built from the single experimental file
raw_df