# Production Code

In [None]:
import os
import re
import numpy as np
import pandas as pd

# Directory (relative path) containing the plain-text survey files; it is
# passed to process_files() and check_dropped_rows() further down.
source_directory = "../surveys/plain_txt/"

## Extract

### Global Variables

In [None]:
# Spanish product labels in the order they are searched for in the survey
# text.  Each label is spliced into a regular expression by the parsing
# functions, so the entry that contains a backslash is written as a raw
# string: a plain "\/" is an invalid string escape and triggers a warning on
# recent Python versions (the resulting value is unchanged).
# "otros" has no translation; it is only used as the closing delimiter of the
# last product row.
product_list = ["leche", "manteca", "quesillo", "queso", "dulce de leche",
                "cuero curtido", "cuero crudo", "lana vellón", "lana hilo",
                "lana descerdada", "pelo mohair", "pelo de caballo",
                "artesanías textiles", "charqui", "chalona",
                r"embutidos \/ chacinados", "carne de ovino",
                "carne de cabra", "carne de bovino", "carne de cerdo",
                "carne de pollo", "carne de equinos", "carne de camélidos",
                "guano", "otros"]
# English labels stored in the "product" column; translation_list[i] is the
# label used for product_list[i].
# NOTE(review): "uncerned wool" is the only label written with a space and
# does not look like an English term for "lana descerdada" -- confirm the
# intended label before renaming, because exported data uses this value.
translation_list = ["milk", "butter", "quesillo", "cheese", "caramel",
                    "tanned_leather", "rawhide", "fleece_wool", "wool_yarn",
                    "uncerned wool", "mohair", "horsehair",
                    "textile_handicrafts", "jerky", "dried_meat", "sausages",
                    "sheep_meat", "goat_meat", "beef", "pork_meat",
                    "chicken_meat", "equine_meat", "camelid_meat", "guano"]
# column headers not including product column
# NOTE(review): "commericial" is misspelled, but the same spelling is used as
# a column name by the transform functions, so it must be renamed everywhere
# at once or not at all.
column_headers = ["has_produced", "quantity", "unit_of_measure",
                  "periodicity_1", "number_of_periods_1",
                  "periodicity_2", "number_of_periods_2",
                  "self_consume", "commericial"]

### Helper Functions

In [None]:
def find_between(str_1, str_2, contents):
    """Capture the text that lies between two regular-expression fragments.

    ``str_1`` and ``str_2`` are inserted unescaped around one greedy group
    that matches any characters, newlines included.  The list produced by
    ``re.findall`` on ``contents`` is returned, so it is empty when the
    pattern does not occur.
    """
    pattern = "".join([str_1, r"([\w\W]*)", str_2])
    return re.findall(pattern, contents)

### Parsing Functions

In [None]:
def parse_interviewee(contents):
    """Extract the interviewee name from the survey text.

    Two layouts of the "nombre de la persona" line are tried in order: the
    label followed by a colon, then the label followed by an opening
    parenthesis, one word and spaces.  A pattern is accepted only when it
    matches exactly once in ``contents``.

    A captured value of "3" is reported as "N/A" (presumably the name was
    left blank and the capture picked up the following line -- TODO confirm).
    When neither pattern matches exactly once, "UNENCOUNTERED FORMAT" is
    returned.  The result is stripped of surrounding whitespace.
    """
    patterns = (
        r"nombre de la persona[^\n:]*:[\s]*([^\n]*)\n",
        r"nombre de la persona[^\n\(]*\(\w* +([^\n]*)\n",
    )

    interviewee = "UNENCOUNTERED FORMAT"
    for pattern in patterns:
        hits = re.findall(pattern, contents)
        if len(hits) == 1:
            # the "3" check is made on the raw capture, before stripping
            interviewee = "N/A" if hits[0] == "3" else hits[0]
            break

    return interviewee.strip()


def parse_date_range(section_search):
    """Extract the survey date range from the section search result.

    ``section_search`` is expected to be a list holding exactly one section
    string.  The range is text of the form "<words><digits> - <words><digits>"
    (hyphen or en dash) and must occur exactly once in that string.  In every
    other case "UNENCOUNTERED FORMAT" is returned.
    """
    unknown = "UNENCOUNTERED FORMAT"

    if len(section_search) != 1:
        return unknown

    candidates = re.findall(r"([\w ]+\d+ *[-–] *[\w ]+\d+)",
                            section_search[0])

    return candidates[0] if len(candidates) == 1 else unknown


def parse_section_4_1(section_search, product_list,
                      translation_list, column_headers):
    """Parse the fixed product rows of the section into a list of dicts.

    For every consecutive pair in ``product_list`` the text between the two
    labels is taken as the row of the first label.  A row must split into
    exactly 21 lines.  Its nine values sit on every second line, starting at
    line index 2, or at index 4 when line 2 is empty and line 4 is not.  The
    stripped values are stored under the first nine ``column_headers``.

    Rows that are not found exactly once, or that do not have 21 lines, get
    "UNENCOUNTERED FORMAT" in every column.  Each dict also carries the
    translated label under "product".  When ``section_search`` does not hold
    exactly one string, ``[{}]`` is returned.
    """
    if len(section_search) != 1:
        return [{}]

    section_string = section_search[0]
    unknown = "UNENCOUNTERED FORMAT"
    parsed_rows = []

    # the last entry of product_list only closes the final row
    for idx in range(len(product_list) - 1):
        record = {"product": translation_list[idx]}

        matches = find_between(product_list[idx], product_list[idx + 1],
                               section_string)
        lines = matches[0].split("\n") if len(matches) == 1 else []

        if len(lines) == 21:
            # values normally start on the third line; some rows are
            # shifted down by two lines
            first = 2
            if lines[2] == "" and lines[4] != "":
                first = 4

            # nine values, one on every second line
            for k in range(9):
                record[column_headers[k]] = lines[first + 2 * k].strip()
        else:
            for header in column_headers:
                record[header] = unknown

        parsed_rows.append(record)

    return parsed_rows


def parse_otros_4_1(otros_search, column_headers):
    """Parse the free-text "otros" rows into a list of dicts.

    ``otros_search`` is expected to hold exactly one string.  Its first line
    is discarded and the rest must contain more than 65 lines; otherwise
    ``[{}]`` is returned.

    The lines are walked with a counter.  The line at counter 0 is the
    product, and the lines at the even counters 2..18 are the values for the
    first nine ``column_headers``.  When the counter reaches 20 the record is
    stored and the counter restarts at -2, so later records are preceded by
    two skipped lines.  A record still being built when the lines run out is
    not stored.

    Every stored value is stripped, matching ``parse_section_4_1``.  Without
    this, whitespace-only cells are not recognised as empty later on, and
    values such as "no " need separate clean-up.
    """
    if len(otros_search) != 1:
        return [{}]

    otros_list = otros_search[0].split("\n")[1:]
    if len(otros_list) <= 65:
        return [{}]

    otros_4_1_data = []

    # skip odd indexes (1, 3, 5, 7)
    count = 0
    count_restart = -2
    count_end = 20

    item_dict = {}
    for item in otros_list:
        if count == count_end:
            # the current record is complete: store it and start a new one
            count = count_restart
            otros_4_1_data.append(item_dict)
            item_dict = {}

        if count == 0:
            item_dict["product"] = item.strip()
        elif count > 0 and count % 2 == 0:
            col = column_headers[(count - 2) // 2]
            item_dict[col] = item.strip()

        count += 1

    return otros_4_1_data

### Process the Whole File

In [None]:
def process_files(source_directory, product_list,
                  translation_list, column_headers):
    """Parse every survey file in a directory into one raw DataFrame.

    Each regular file in ``source_directory`` is read, lower-cased and
    parsed for the interviewee, the date range, the fixed product rows and
    the "otros" rows of the text between "iv-1" and "iv-2".  Every parsed row
    becomes one record that also carries the filename, interviewee and date
    range.

    Files are processed in sorted name order so the row order (and therefore
    the index) is reproducible; ``os.listdir`` alone gives no ordering
    guarantee.  Directory entries that are not regular files are skipped
    instead of making ``open`` fail.

    A message is printed when a file yields an unexpected number of rows:
    anything other than one row per translated product, or anything other
    than 3 or 4 "otros" rows.

    Returns a DataFrame built from all records.
    """
    dataset = []
    expected_rows = len(product_list) - 1

    for process_file in sorted(os.listdir(source_directory)):
        file_path = os.path.join(source_directory, process_file)

        # skip sub-directories (for example a .ipynb_checkpoints folder)
        if not os.path.isfile(file_path):
            continue

        # with statements automatically control the closing of files
        # NOTE(review): no encoding is given, so the platform default is
        # used; confirm the encoding of the survey files.
        with open(file_path, "r") as file:
            contents = file.read().lower()

        interviewee = parse_interviewee(contents)

        section_start = "iv-1"
        section_end = "iv-2"
        section_search = find_between(section_start, section_end, contents)

        otros_start = r"iv-1[\w\W]*otros[^\n]*\n"
        otros_search = find_between(otros_start, section_end, contents)

        date_range = parse_date_range(section_search)
        section_4_1_data = parse_section_4_1(section_search, product_list,
                                             translation_list, column_headers)
        otros_4_1_data = parse_otros_4_1(otros_search, column_headers)

        # check to make sure files contain the right amount of entries
        if len(section_4_1_data) != expected_rows:
            print(f"Unencountered Format: {process_file}\n"
                  + f"Length of section_data: {len(section_4_1_data)}\n")

        if len(otros_4_1_data) not in (3, 4):
            print(f"Unencountered Format: {process_file}\n"
                  + f"Length of otros_data: {len(otros_4_1_data)}\n")

        # fixed product rows first, then the "otros" rows
        for row in section_4_1_data + otros_4_1_data:
            data_dict = {"filename": process_file,
                         "interviewee": interviewee,
                         "date_range": date_range}
            data_dict.update(row)
            dataset.append(data_dict)

    # convert list to DataFrame
    return pd.DataFrame(data=dataset)

## Transform

### Helper Functions

In [None]:
def standardize_has_produced(dataf):
    """Normalise spelling variants of "no" in the ``has_produced`` column.

    The column is overwritten on ``dataf`` itself, which is also returned.
    """
    variants_of_no = ("nos", "no ")
    replacements = {variant: "no" for variant in variants_of_no}

    dataf["has_produced"] = dataf["has_produced"].replace(replacements)

    return dataf


def standardize_unit_of_measure(dataf):
    """Map the spelling variants in ``unit_of_measure`` to canonical units.

    Variants are grouped by the canonical unit they are replaced with; values
    that are not listed are left untouched.  The column is overwritten on
    ``dataf`` itself, which is also returned.
    """
    variants_by_unit = [
        ("kg", ["kgs", "k.", "k", "kilos", "kilogramo", "kilogramos",
                "kig", "kilo"]),
        ("unit", ["unidades", "unit", "unidad", "cueros/unidades",
                  "unudades", "animales", "moldes", "molde", "u"]),
        ("L", ["litros", "litro", "l", "lts", "lts."]),
        ("truck_load", ["camionada", "camionadas"]),
        ("cubic_meters", ["m3"]),
    ]
    # flatten to a variant -> canonical unit lookup
    unit_lookup = {variant: unit
                   for unit, variants in variants_by_unit
                   for variant in variants}

    dataf["unit_of_measure"] = dataf["unit_of_measure"].replace(unit_lookup)

    return dataf


def transform_date_range(dataf):
    """Replace the known Spanish date ranges with short English labels.

    Both the hyphen and the en-dash spelling of the October range are
    covered.  Other values are left as they are.  The ``date_range`` column
    is overwritten on ``dataf`` itself, which is also returned.
    """
    known_ranges = dict([
        ("agosto 2018 - agosto 2019", "august 2018-2019"),
        ("septiembre 2018 - septiembre 2019", "september 2018-2019"),
        ("octubre 2018 – octubre 2019", "october 2018-2019"),
        ("octubre 2018 - octubre 2019", "october 2018-2019"),
    ])

    dataf["date_range"] = dataf["date_range"].replace(known_ranges)

    return dataf


def map_periodicity(dataf):
    """Translate the periodicity codes "1".."4" into period names.

    Applied to both ``periodicity_1`` and ``periodicity_2``; values outside
    the code table are left unchanged.  The columns are overwritten on
    ``dataf`` itself, which is also returned.
    """
    period_names = {"1": "day",
                    "2": "week",
                    "3": "month",
                    "4": "year"}

    for column in ("periodicity_1", "periodicity_2"):
        dataf[column] = dataf[column].replace(period_names)

    return dataf


def map_consumption(dataf):
    """Convert the consumption codes "0".."4" into fractions in quarter steps.

    ``self_consume`` and ``commericial`` are mapped into the new columns
    ``self_consume_fraction`` and ``commericial_fraction``.  The new columns
    are added to ``dataf`` itself; the returned frame is a new one without
    the two coded columns.
    """
    fraction_by_code = {"0": 0,
                        "1": 0.25,
                        "2": 0.5,
                        "3": 0.75,
                        "4": 1}
    coded_columns = ["self_consume", "commericial"]

    for coded in coded_columns:
        dataf[coded + "_fraction"] = dataf[coded].replace(fraction_by_code)

    return dataf.drop(columns=coded_columns)


def map_has_produced(dataf):
    """Correct implausible "no" answers and translate "si" to "yes".

    A row marked "no" that has fewer than 5 null values across all of its
    columns is treated as a "si" (i.e. someone accidentally put no in that
    column).  The threshold counts nulls over every column present in
    ``dataf``.  Afterwards "si" is replaced by "yes".  The ``has_produced``
    column is overwritten on ``dataf`` itself, which is also returned.
    """
    nulls_per_row = dataf.isnull().sum(axis=1)
    mistaken_no = (nulls_per_row < 5) & (dataf["has_produced"] == "no")

    corrected = dataf["has_produced"].mask(mistaken_no, "si")

    # convert values to English
    dataf["has_produced"] = corrected.replace({"si": "yes"})

    return dataf


def clean_quantity(dataf):
    """Remove thousands separators from the ``quantity`` strings.

    decimal separator check: if a quantity contains a period followed by
    three digits, assume that the number entered means thousands
    (e.g 10.000, 20.000).  Every '.' in such a value is removed so that the
    later conversion to float does not read it as a decimal separator.
    Other values, including missing ones, are left untouched.

    The pattern is a raw string and the removal is an explicit literal
    replacement (``regex=False``), so '.' can never be read as the regex
    wildcard whatever the default of the installed pandas version is.

    The column is overwritten on ``dataf`` itself, which is also returned.
    """
    has_thousands_sep = dataf["quantity"].str.contains(r"\.\d{3}", na=False)
    without_periods = dataf["quantity"].str.replace(".", "", regex=False)

    dataf["quantity"] = dataf["quantity"].where(~has_thousands_sep,
                                                without_periods)

    return dataf

### Pipeline Functions

In [None]:
def start_pipeline(dataf):
    """Return a copy of ``dataf`` so later pipeline steps leave it untouched."""
    working_copy = dataf.copy(deep=True)

    return working_copy


def handle_null_data(dataf):
    """Turn empty strings into nulls and drop rows that carry no information.

    A row is dropped when it has 9 or more null values, or when it has
    exactly 8 null values and ``has_produced`` is "no".

    ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    NumPy 2.0.

    Returns a new DataFrame; ``dataf`` is not modified.
    """
    # fill missing values with null
    dataf = dataf.replace("", np.nan)

    # drop rows which contain no information
    isnull_sum = dataf.isnull().sum(axis=1)
    drop_filter = ((isnull_sum >= 9)
                   | ((isnull_sum == 8) & (dataf["has_produced"] == "no")))
    drop_indexes = dataf[drop_filter].index

    dataf = dataf.drop(drop_indexes)

    return dataf


def standardize_columns(dataf):
    """Run the column standardisation helpers, in order, and return the result."""
    for step in (standardize_has_produced, standardize_unit_of_measure):
        dataf = step(dataf)

    return dataf


def transform_values(dataf):
    """Run the value transformation helpers, in order, and return the result."""
    steps = (map_periodicity,
             map_consumption,
             map_has_produced,
             clean_quantity,
             transform_date_range)

    for step in steps:
        dataf = step(dataf)

    return dataf


def set_dtypes(dataf):
    """Cast the numeric columns to float.

    The columns are overwritten on ``dataf`` itself, which is also returned.
    """
    float_columns = ("quantity",
                     "number_of_periods_1",
                     "number_of_periods_2",
                     "self_consume_fraction",
                     "commericial_fraction")

    for column in float_columns:
        dataf[column] = dataf[column].astype(float)

    return dataf

### Create Raw and Cleaned DataFrame

In [None]:
# Extract: parse every survey file into one raw DataFrame.
raw_df = process_files(source_directory, product_list,
                       translation_list, column_headers)

# Transform: run the cleaning steps on a copy so raw_df stays unchanged.
# The columns are standardised before the null handling because the drop
# rule in handle_null_data compares has_produced with "no".
clean_df = (raw_df
            .pipe(start_pipeline)
            .pipe(standardize_columns)
            .pipe(handle_null_data)
            .pipe(transform_values)
            .pipe(set_dtypes))

## Other
### Functions to Check the Implementation

In [None]:
def get_dropped_df(raw_dataf, clean_dataf):
    """Return a copy of the rows of ``raw_dataf`` whose index labels are
    missing from ``clean_dataf``."""
    was_kept = raw_dataf.index.isin(clean_dataf.index)
    removed_labels = raw_dataf.index[~was_kept]

    return raw_dataf.loc[removed_labels].copy()

### Create Dropped DataFrame
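Create `dropped_df` from the rows that were removed during cleaning, then collect in `null_df` the dropped rows whose `has_produced` is not `no`; these are the rows that are expected to contain no non-null values.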

In [None]:
# rows of raw_df that did not survive the cleaning pipeline
dropped_df = get_dropped_df(raw_df, clean_df)

# get rows that were dropped but do not contain a has_produced == "no"
# Rows whose product is "1".."4" are excluded as well (presumably the
# numbered "otros" rows -- TODO confirm).  Only the filename and product are
# kept, which is what check_dropped_rows needs to re-read the row.
null_df = dropped_df.loc[(dropped_df["has_produced"] != "no")
                         & (~dropped_df["product"].isin(["1", "2", "3", "4"])),
                         ["filename", "product"]]

### Check Dropped Row

In [None]:
def get_dropped_row(section_search, product,
                    product_list, translation_list):
    """Re-read the raw lines of one product row for inspection.

    ``product`` is a translated label.  The text between its Spanish label
    and the next label in ``product_list`` is split into lines, and lines
    2..18 are returned, stripped, under the keys "0".."16" together with
    "product".

    All 17 keys are set to "UNENCOUNTERED FORMAT" when the row cannot be
    read: the product is not in ``translation_list`` (for example a
    free-text or missing product), the row is not found exactly once, or it
    has fewer than 21 lines.  An unknown product used to raise ValueError.

    Returns ``{}`` when ``section_search`` does not hold exactly one string.
    """
    if len(section_search) != 1:
        return {}

    dropped_row = {"product": product}
    row_list = None

    try:
        product_index = translation_list.index(product)
    except ValueError:
        # not one of the fixed products, so there are no delimiters to
        # search with
        product_index = None

    if product_index is not None:
        # NOTE(review): "v-3:" is appended to the section text before
        # searching; the reason is not evident from this function.
        section_string = section_search[0] + "v-3:"

        item_1 = product_list[product_index]
        item_2 = product_list[product_index + 1]
        row_search = find_between(item_1, item_2, section_string)

        # if product row found
        if len(row_search) == 1:
            candidate = row_search[0].split("\n")
            if len(candidate) >= 21:
                row_list = candidate

    for j in range(17):
        if row_list is None:
            dropped_row[str(j)] = "UNENCOUNTERED FORMAT"
        else:
            # drop first two rows: key "0" is line index 2
            dropped_row[str(j)] = row_list[j + 2].strip()

    return dropped_row

### Check Dropped Rows for Parsing Errors

In [None]:
def check_dropped_rows(source_directory, product_list, translation_list, s):
    """Re-read the raw text of one dropped row from its source file.

    ``s`` is a row holding "filename" and "product".  The file is read and
    lower-cased, the text between "iv-1" and "iv-2" is searched, and the raw
    lines of the product row are fetched with ``get_dropped_row``.  A message
    is printed when that result holds one entry or fewer.

    Returns a Series with the filename followed by the entries of the row.
    """
    process_file = s["filename"]
    product = s["product"]

    file_path = os.path.join(source_directory, process_file)

    # with statements automatically control the closing of files
    with open(file_path, "r") as file:
        contents = file.read().lower()

    section_search = find_between("iv-1", "iv-2", contents)
    dropped_row = get_dropped_row(section_search, product,
                                  product_list, translation_list)

    # check to make sure files contain the right amount of entries
    if len(dropped_row) <= 1:
        print(f"Unencountered Format: {process_file}\n"
              + f"Length of data: {len(dropped_row)}\n")

    check_dict = {"filename": process_file}
    check_dict.update(dropped_row)

    # convert dict to Series
    return pd.Series(data=check_dict)

In [None]:
# Re-read the raw text of every row in null_df, one row at a time (axis=1);
# each call returns a Series, so the result is a DataFrame with one row per
# checked product.
check_df = null_df.apply(lambda s: check_dropped_rows(source_directory,
                                                      product_list,
                                                      translation_list,
                                                      s),
                         axis=1)

In [None]:
def drop_null_check(dataf):
    """Turn empty strings into nulls and drop rows with 17 or more nulls.

    ``np.nan`` is used because the ``np.NaN`` alias no longer exists in
    NumPy 2.0.

    Returns a new DataFrame; ``dataf`` is not modified.
    """
    # fill missing values with null
    dataf = dataf.replace("", np.nan)

    # drop rows which contain no information
    isnull_sum = dataf.isnull().sum(axis=1)
    drop_filter = isnull_sum >= 17
    drop_indexes = dataf[drop_filter].index

    dataf = dataf.drop(drop_indexes)

    return dataf

In [None]:
# Keep only the re-read rows that still hold data after the empty ones are
# dropped; the export cell below only runs when this frame is empty.
non_null_df = (check_df
               .pipe(start_pipeline)
               .pipe(drop_null_check))

In [None]:
# display the dropped rows that still contain values (ideally none)
non_null_df

## Load
### Export to CSV

In [None]:
# Export the cleaned data only when the dropped-row check found nothing,
# i.e. no dropped row still contains values.
if non_null_df.shape[0] == 0:
    datasets_directory = "../datasets/"
    filename = "section_4_1.csv"
    file_path = os.path.join(datasets_directory, filename)

    # missing values are written as the literal text "null"
    clean_df.to_csv(file_path, index=False, na_rep="null")

    print(f"Exported to {filename}")
else:
    # nothing is written in this case; the message is only printed
    print("Error: Dropped rows contain non-null values")

***
# Testing Code
### View the Data

In [None]:
# preview the first 50 cleaned rows
clean_df.head(50)

### Check for Parsing Errors

In [None]:
# Print the value counts (nulls included) of every cleaned column except the
# per-file identifiers, so unexpected values stand out.
cols_to_check = clean_df.columns
cols_to_drop = ["filename", "interviewee"]

cols_to_check = cols_to_check.drop(cols_to_drop)

for col in cols_to_check:
    print("*" * 50)
    print(" " * 5 + col)
    print(clean_df[col].value_counts(dropna=False))
    print("*" * 50)
    print("\n")

### Check Dropped Rows
Another way to check is by opening the variable inspector and manually scrolling
through the `dropped_df` DataFrame.

The code below tells us whether or not there are files that contain no information for this section. If raw file count is the same as clean file count, then no files were dropped.

In [None]:
# number of distinct filenames before and after cleaning
print("Raw File Count: {}".format(len(raw_df["filename"].value_counts())))
print("Clean File Count: {}".format(len(clean_df["filename"].value_counts())))

If there are files that are completely dropped, find those files and look at them to check for parsing errors.

In [None]:
# filenames that appear in raw_df but have no rows left in clean_df
raw_file_names = raw_df["filename"].value_counts().index
clean_file_names = clean_df["filename"].value_counts().index

dropped_files = raw_file_names[~raw_file_names.isin(clean_file_names)].to_list()
# show at most the first ten
dropped_files[0:10]

Check the value counts of the dropped rows to see whether any rows are being dropped that should not be.

In [None]:
# Print the value counts (nulls included) of every column of the dropped
# rows except the per-file identifiers.
cols_to_check = dropped_df.columns
cols_to_drop = ["filename", "interviewee"]

cols_to_check = cols_to_check.drop(cols_to_drop)

for col in cols_to_check:
    print("*" * 50)
    print(" " * 5 + col)
    print(dropped_df[col].value_counts(dropna=False))
    print("*" * 50)
    print("\n")

### Duplicate Rows

In [None]:
# Show every cleaned row that is identical to another row in all columns
# except the filename (keep=False marks all members of a duplicate group).
df_columns = clean_df.columns[clean_df.columns != "filename"]
clean_df[clean_df.loc[:, df_columns].duplicated(keep=False)]