# Production Code

In [None]:
import os
import re
import numpy as np
import pandas as pd

# Directory whose entries process_files() reads one by one
# (presumably the plain-text survey exports, going by the path - confirm).
source_directory = "../surveys/plain_txt/"

## Extract
### Parsing Functions

In [None]:
def parse_interviewee(contents):
    """Extract the interviewee name from the text of one survey.

    Two layouts are tried in order: the label followed by a colon, then the
    label followed by an opening parenthesis. A layout is accepted only when
    it matches exactly once in ``contents``. The patterns are lowercase, so
    the text is expected to be lowercased already (process_files does this).

    Returns the captured name with surrounding whitespace removed, "N/A"
    when the captured text is exactly "3", or "UNENCOUNTERED FORMAT" when
    neither layout yields a single match.
    """
    layouts = (
        r"nombre de la persona[^\n:]*:[\s]*([^\n]*)\n",
        r"nombre de la persona[^\n\(]*\(\w* +([^\n]*)\n",
    )

    for layout in layouts:
        hits = re.findall(layout, contents)
        if len(hits) != 1:
            # zero or several matches: fall through to the next layout
            continue

        candidate = hits[0]
        # the "3" comparison is made on the raw capture, before stripping
        resolved = "N/A" if candidate == "3" else candidate
        return resolved.strip()

    return "UNENCOUNTERED FORMAT"


def parse_section_3_1(contents):
    """Parse the table that runs from "bovinos machos" up to "iii-2".

    Returns a dict with one key per animal/sex combination (14 keys, always
    in the same order). When the section is found exactly once and exactly
    14 table rows are recognised, each value is the 2-tuple of digit strings
    captured for that row (either may be empty). In every other case each
    value is the string "UNENCOUNTERED FORMAT".
    """
    animals = (
        "bovines_m", "bovines_f",
        "camelids_m", "camelids_f",
        "sheep_m", "sheep_f",
        "goats_m", "goats_f",
        "donkeys_m", "donkeys_f",
        "equines_m", "equines_f",
        "pigs_m", "pigs_f",
    )

    section_hits = re.findall(r"(bovinos machos[\w\W]*)iii-2", contents)

    rows = []
    if len(section_hits) == 1:
        # One row = two bracketed cells on a line; the character classes also
        # accept "i", "l" and "_" where a "|" is expected.
        row_pattern = r"[\|il_][_ \t]*([\d]*)[_ \t]*[\|il][^\n\d]*\|?[il]?[_ \t]*([\d]*)[_ \t]*\|?[il]?[ \t]*\n"
        rows = re.findall(row_pattern, section_hits[0])

    if len(rows) == 14:
        # rows are paired with animals purely by position
        return dict(zip(animals, rows))

    return dict.fromkeys(animals, "UNENCOUNTERED FORMAT")

### Process the Whole File

In [None]:
def process_files(source_directory):
    """Parse every file in ``source_directory`` into one long-format DataFrame.

    Each file contributes 14 rows (one per animal key returned by
    parse_section_3_1) with the columns ``filename``, ``interviewee``,
    ``animal``, ``current`` and ``last_year``.

    Files whose section 3-1 could not be parsed are reported on stdout and
    their ``current`` / ``last_year`` cells hold the full
    "UNENCOUNTERED FORMAT" marker. The marker is not numeric, so a later
    float conversion will reject it; such files have to be fixed or filtered
    before the transform step.

    Parameters
    ----------
    source_directory : str
        Directory whose entries are all opened as text files.

    Returns
    -------
    pandas.DataFrame
        One row per (file, animal) pair.
    """
    # Marker string emitted by parse_section_3_1 for rows it could not parse.
    unparsed_marker = "UNENCOUNTERED FORMAT"
    dataset = []

    for process_file in os.listdir(source_directory):
        file_path = os.path.join(source_directory, process_file)

        # with statements automatically control the closing of files
        with open(file_path, "r") as file:
            # the parsing regexes are written in lowercase
            contents = file.read().lower()

        # parsed once per file rather than once per animal row
        interviewee = parse_interviewee(contents)
        section_3_1_entries = parse_section_3_1(contents)

        # The parser always returns 14 keys, so a failure has to be detected
        # from the marker value rather than from the number of entries.
        if unparsed_marker in section_3_1_entries.values():
            print(f"Unencountered Format: {process_file}")

        for animal, counts in section_3_1_entries.items():
            if counts == unparsed_marker:
                # keep the whole marker; indexing it would store "U" and "N"
                current = last_year = unparsed_marker
            else:
                current, last_year = counts

            dataset.append({
                "filename": process_file,
                "interviewee": interviewee,
                "animal": animal,
                "current": current,
                "last_year": last_year,
            })

    # convert list to DataFrame
    raw_df = pd.DataFrame(data=dataset)

    return raw_df

## Transform
### Helper Function

In [None]:
    def map_values(value):
        """Translate a quantity into its class number (1 to 10).

        Class k covers values up to and including the k-th upper bound below;
        anything above 1000 is class 10. A value for which every comparison
        is False (NaN, for instance) yields None.
        """
        upper_bounds = (10, 50, 100, 150, 200, 300, 500, 700, 1000)

        for class_number, upper_bound in enumerate(upper_bounds, start=1):
            if value <= upper_bound:
                return class_number

        if value > upper_bounds[-1]:
            return 10

        # not comparable with any bound (e.g. NaN)
        return None

### Pipeline Functions

In [None]:
def start_pipeline(dataf):
    """Return a copy of ``dataf`` so later pipeline steps leave the input untouched."""
    working_copy = dataf.copy()
    return working_copy


def handle_null_data(dataf):
    """Mark empty cells as missing and drop rows with more than one missing cell.

    Cells equal to the empty string or to the string "0" become NaN. A row
    is then removed when it holds two or more NaN cells; a row with a single
    NaN is kept.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the replacements applied and the rows removed.
    """
    # np.nan is the supported spelling; the np.NaN alias was removed in NumPy 2.0
    dataf = dataf.replace("", np.nan)
    dataf = dataf.replace("0", np.nan)

    # drop rows in which more than one cell is missing
    isnull_sum = dataf.isnull().sum(axis=1)
    drop_filter = isnull_sum > 1
    drop_indexes = dataf[drop_filter].index

    dataf = dataf.drop(drop_indexes)

    return dataf


def clean_columns(dataf):
    """Split ``current`` / ``last_year`` into numeric and class columns.

    A file is treated as reporting actual quantities when any of its rows
    holds a value above 10 (the highest class) in ``current`` or
    ``last_year``; all other files are treated as reporting classes.

    Four columns are added to ``dataf`` in place:

    * ``current_numeric`` / ``last_year_numeric``: the original value for
      rows of quantity files, NaN elsewhere.
    * ``current_class`` / ``last_year_class``: the original value for rows
      of class files, and the result of map_values() for rows of quantity
      files.

    Both source columns must already be numeric, because they are compared
    with 10.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame with ``filename``, ``current`` and ``last_year`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame holding the added columns but without ``current`` and
        ``last_year``.
    """
    # find the entries that contain numbers larger than 10 (the max class)
    # use this filter as indicator of which interviewees used actual quantity
    # versus those who specified quantity by the class system
    # Then, map actual quantities to the class system
    mapping_filter = (dataf["current"] > 10) | (dataf["last_year"] > 10)
    files_to_map = dataf.loc[mapping_filter, "filename"].value_counts().index
    file_filter = dataf["filename"].isin(files_to_map)

    current_mapped = dataf["current"].apply(map_values)
    last_year_mapped = dataf["last_year"].apply(map_values)

    # create new columns containing just the values reported as quantities
    # (np.nan is the supported spelling; np.NaN was removed in NumPy 2.0)
    dataf["current_numeric"] = dataf["current"].where(file_filter, np.nan)
    dataf["last_year_numeric"] = dataf["last_year"].where(file_filter, np.nan)

    # map all values to their respective classes
    dataf["current_class"] = dataf["current"].where(~file_filter, current_mapped)
    dataf["last_year_class"] = dataf["last_year"].where(~file_filter, last_year_mapped)

    # drop original columns
    dataf = dataf.drop(["current", "last_year"], axis=1)

    return dataf


def set_dtypes(dataf):
    """Convert the ``current`` and ``last_year`` columns to float.

    The columns are reassigned on ``dataf`` itself, and that same frame is
    returned.
    """
    for column in ("current", "last_year"):
        dataf[column] = dataf[column].astype(float)

    return dataf

### Create Raw and Cleaned DataFrame

In [None]:
# Extract: one row per (file, animal) pair, values still stored as strings.
raw_df = process_files(source_directory)

# Transform. The order matters: set_dtypes converts the value columns to
# float, and clean_columns compares those columns with a number.
clean_df = (raw_df
            .pipe(start_pipeline)
            .pipe(handle_null_data)
            .pipe(set_dtypes)
            .pipe(clean_columns))

## Load
### Export to CSV

In [None]:
datasets_directory = "../datasets/"
filename = "section_3_1.csv"
file_path = os.path.join(datasets_directory, filename)

# The index is not written; missing values appear in the file as "null".
clean_df.to_csv(file_path, index=False, na_rep="null")

## Other
### Functions to Check the Implementation

In [None]:
def get_dropped_df(raw_dataf, clean_dataf):
    """Return a copy of the rows of ``raw_dataf`` whose index label is absent from ``clean_dataf``."""
    survived = raw_dataf.index.isin(clean_dataf.index)
    missing_labels = raw_dataf.index[~survived]

    return raw_dataf.loc[missing_labels].copy()

### Create Dropped DataFrame

In [None]:
# Rows of raw_df whose index label does not appear in clean_df.
dropped_df = get_dropped_df(raw_df, clean_df)

***
# Testing Code
### View the Data

In [None]:
# Show the first ten cleaned rows.
clean_df.head(10)

### Check for Parsing Errors

In [None]:
# Print the value distribution of every column of clean_df except the two
# identifying ones.
cols_to_check = clean_df.columns
cols_to_drop = ["filename", "interviewee"]

cols_to_check = cols_to_check.drop(cols_to_drop)

for col in cols_to_check:
    print("*" * 50)
    print(" " * 5 + col)
    # dropna=False lists missing values as their own entry
    print(clean_df[col].value_counts(dropna=False))
    print("*" * 50)
    print("\n")

### Check Dropped Rows
Another way to check is by opening the variable inspector and manually scrolling
through the `dropped_df` DataFrame.

The code below tells us whether or not there are files that contain no information for this section. If raw file count is the same as clean file count, then no files were dropped.

In [None]:
# Number of distinct filenames before and after cleaning.
print("Raw File Count: {}".format(len(raw_df["filename"].value_counts())))
print("Clean File Count: {}".format(len(clean_df["filename"].value_counts())))

If there are files that are completely dropped, find those files and look at them to check for parsing errors.

In [None]:
raw_file_names = raw_df["filename"].value_counts().index
clean_file_names = clean_df["filename"].value_counts().index

# Filenames present in raw_df that have no row left in clean_df.
raw_file_names[~raw_file_names.isin(clean_file_names)]

Check the value counts of the dropped rows to see whether any rows are being dropped that should not be.

In [None]:
# Print the value distribution of every column of dropped_df except the two
# identifying ones.
cols_to_check = dropped_df.columns
cols_to_drop = ["filename", "interviewee"]

cols_to_check = cols_to_check.drop(cols_to_drop)

for col in cols_to_check:
    print("*" * 50)
    print(" " * 5 + col)
    # dropna=False lists missing values as their own entry
    print(dropped_df[col].value_counts(dropna=False))
    print("*" * 50)
    print("\n")

### Duplicate Rows

In [None]:
# Compare rows on every column except "filename"; keep=False flags all
# members of a duplicate group, not only the repeats.
df_columns = clean_df.columns[clean_df.columns != "filename"]
clean_df[clean_df.loc[:, df_columns].duplicated(keep=False)]