# Production Code

In [None]:
import os
import re
import numpy as np
import pandas as pd

# Directory whose entries are read and parsed by process_files().
source_directory = "../surveys/plain_txt/"

## Extract
### Helper Functions

In [None]:
def find_between(str_1, str_2, contents):
    """Return the text found between ``str_1`` and ``str_2`` in ``contents``.

    Both delimiters are inserted into the pattern unescaped, so they are
    interpreted as regular-expression fragments. The capture group is
    greedy and also matches newlines, so it extends up to the last
    reachable occurrence of ``str_2``.

    Returns the list produced by ``re.findall`` (empty when nothing matches).
    """
    pattern = "".join((str_1, r"([\w\W]*)", str_2))
    return re.findall(pattern, contents)

### Parsing Functions

In [None]:
def parse_interviewee(contents):
    """Extract the interviewee name from the survey text.

    Two layouts are tried in order: ``nombre de la persona...: <name>`` and
    ``nombre de la persona... (<word> <name>``. A layout is accepted only
    when it matches exactly once. A captured name of exactly ``"3"`` is
    reported as ``"N/A"``; any other name is returned stripped of
    surrounding whitespace. When neither layout matches exactly once,
    ``"UNENCOUNTERED FORMAT"`` is returned.
    """
    layouts = (
        r"nombre de la persona[^\n:]*:[\s]*([^\n]*)\n",
        r"nombre de la persona[^\n\(]*\(\w* +([^\n]*)\n",
    )

    for layout in layouts:
        matches = re.findall(layout, contents)
        if len(matches) != 1:
            continue
        candidate = matches[0]
        # The "3" check deliberately happens before stripping whitespace.
        return "N/A" if candidate == "3" else candidate.strip()

    return "UNENCOUNTERED FORMAT"


def parse_date_range(contents):
    """Extract the date range written between the ``iii-2`` and ``iii-3`` markers.

    The section is located with ``find_between``; inside it a single
    ``<words><digits> - <words><digits>`` span (hyphen or en dash) is
    expected. Returns that span, or ``"UNENCOUNTERED FORMAT"`` when the
    section or the date span is not found exactly once.
    """
    fallback = "UNENCOUNTERED FORMAT"

    sections = find_between("iii-2", "iii-3", contents)
    if len(sections) != 1:
        return fallback

    candidates = re.findall(r"([\w ]+\d+ *[-–] *[\w ]+\d+)", sections[0])
    if len(candidates) != 1:
        return fallback

    return candidates[0]


def parse_section_3_2_outflow(contents):
    """Parse the stock-outflow table of section III-2 into long-format rows.

    Two file layouts are recognised: one where every table cell is followed
    by a blank line ("double spaced") and one with a single newline between
    cells ("single spaced"). The table text is taken from after the
    ``mortalidad animal`` label up to ``entradas de stocks``.

    Parameters
    ----------
    contents : str
        Lower-cased text of one survey file.

    Returns
    -------
    list of dict
        42 dictionaries (6 outflow categories x 7 animals) with the keys
        ``stock_variation``, ``animal`` and ``value`` (the stripped cell
        text), ordered by category and then by animal. ``[{}]`` is returned
        when the table cannot be located or does not yield exactly six rows.
    """
    entry_fields = ["bovines", "sheep", "goats", "camelids",
                    "donkeys", "equines", "pigs"]
    outflow_row_labels = ["animal_death", "robbery",
                          "slaughter_or_consumption", "loans_given",
                          "sold", "others_outflow"]

    double_spaced = re.findall(
        r"salidas de stocks\n\nmortalidad animal([\w\W]*)entradas de stocks[\w\W]*iii-3",
        contents)
    single_spaced = re.findall(
        r"salidas de stocks\nmortalidad animal([\w\W]*)entradas de stocks[\w\W]*iii-3",
        contents)

    # stride is the distance, in cells, between two consecutive values.
    if len(double_spaced) == 1:
        table, stride = double_spaced[0], 2
    elif len(single_spaced) == 1:
        table, stride = single_spaced[0], 1
    else:
        return [{}]

    if "\n" not in table:
        return [{}]
    cells = [cell.strip() for cell in table.split("\n")]

    if stride == 2:
        count = -2
        # A missing second-row label means the layout is unknown; report it
        # like every other unparseable table instead of raising ValueError.
        if "robos" not in cells:
            return [{}]
        # Some files have two extra newlines at the end of each row, which
        # pushes the "robos" label further down and lengthens the gap
        # between the end of one row and the first value of the next.
        count_restart = -4 if cells.index("robos") < 20 else -6
    else:
        count = -1
        count_restart = -3

    row_width = stride * len(entry_fields)
    entries = []
    entry = {}
    for cell in cells:
        if count == row_width:
            # The cell after a full row closes it; negative counts then skip
            # the cells that sit between two rows of values.
            entries.append(entry)
            entry = {}
            count = count_restart
        # `and`, not `&`: with `&` the expression parsed as the chain
        # `count % 2 == (0 & count) >= 0`, which ignored the sign of count
        # and let skipped cells occupy (and reorder) the last animal keys.
        if count >= 0 and count % stride == 0:
            entry[entry_fields[count // stride]] = cell
        count += 1

    if len(entries) != len(outflow_row_labels):
        return [{}]

    return [{"stock_variation": label, "animal": animal, "value": value}
            for label, parsed in zip(outflow_row_labels, entries)
            for animal, value in parsed.items()]


def parse_section_3_2_inflow(contents):
    """Parse the stock-inflow table of section III-2 into long-format rows.

    Two file layouts are recognised: one where every table cell is followed
    by a blank line ("double spaced") and one with a single newline between
    cells ("single spaced"). The table text is taken from after the
    ``partos (nacimientos)`` label up to ``iii-3``.

    Parameters
    ----------
    contents : str
        Lower-cased text of one survey file.

    Returns
    -------
    list of dict
        28 dictionaries (4 inflow categories x 7 animals) with the keys
        ``stock_variation``, ``animal`` and ``value`` (the stripped cell
        text), ordered by category and then by animal. ``[{}]`` is returned
        when the table cannot be located or does not yield exactly four rows.
    """
    entry_fields = ["bovines", "sheep", "goats", "camelids",
                    "donkeys", "equines", "pigs"]
    inflow_row_labels = ["births", "loans_received",
                         "purchased", "others_inflow"]

    double_spaced = re.findall(
        r"entradas de stocks\n\npartos \(nacimientos\)([\w\W]*)iii-3",
        contents)
    single_spaced = re.findall(
        r"entradas de stocks\npartos \(nacimientos\)([\w\W]*)iii-3",
        contents)

    # stride is the distance, in cells, between two consecutive values.
    if len(double_spaced) == 1:
        table, stride = double_spaced[0], 2
    elif len(single_spaced) == 1:
        table, stride = single_spaced[0], 1
    else:
        return [{}]

    if "\n" not in table:
        return [{}]
    cells = [cell.strip() for cell in table.split("\n")]

    if stride == 2:
        count = -2
        # A missing second-row label means the layout is unknown; report it
        # like every other unparseable table instead of raising ValueError.
        if "préstamos recibidos" not in cells:
            return [{}]
        # Some files have two extra newlines at the end of each row, which
        # pushes the second-row label further down and lengthens the gap
        # between the end of one row and the first value of the next.
        count_restart = -4 if cells.index("préstamos recibidos") < 20 else -6
    else:
        count = -1
        count_restart = -3

    row_width = stride * len(entry_fields)
    entries = []
    entry = {}
    for cell in cells:
        if count == row_width:
            # The cell after a full row closes it; negative counts then skip
            # the cells that sit between two rows of values.
            entries.append(entry)
            entry = {}
            count = count_restart
        # `and`, not `&`: with `&` the expression parsed as the chain
        # `count % 2 == (0 & count) >= 0`, which ignored the sign of count
        # and let skipped cells occupy (and reorder) the last animal keys.
        if count >= 0 and count % stride == 0:
            entry[entry_fields[count // stride]] = cell
        count += 1

    # The inflow table has four rows in both layouts (the single-spaced
    # branch previously compared an undefined name against 6).
    if len(entries) != len(inflow_row_labels):
        return [{}]

    return [{"stock_variation": label, "animal": animal, "value": value}
            for label, parsed in zip(inflow_row_labels, entries)
            for animal, value in parsed.items()]

### Process the Whole File

In [None]:
def process_files(source_directory):
    """Parse every file in ``source_directory`` into one raw DataFrame.

    Each file is read, lower-cased and passed to the section parsers. Every
    parsed outflow/inflow row becomes one DataFrame row carrying the
    ``filename``, ``interviewee`` and ``date_range`` of its file. A parser
    that could not read its table returns ``[{}]``, which yields a single
    row holding only those three identifying columns.

    Parameters
    ----------
    source_directory : str
        Directory containing the survey text files. Entries that are not
        regular files (e.g. sub-directories) are skipped.

    Returns
    -------
    pandas.DataFrame
        One row per parsed table cell; files are processed in sorted name
        order so the result is reproducible across runs.
    """
    dataset = []

    for process_file in sorted(os.listdir(source_directory)):
        file_path = os.path.join(source_directory, process_file)

        # os.listdir also returns directories, which open() cannot read.
        if not os.path.isfile(file_path):
            continue

        # NOTE(review): no encoding is passed, so the platform default is
        # used; the parsers match accented text -- confirm the files' encoding.
        with open(file_path, "r") as file:
            contents = file.read().lower()

        interviewee = parse_interviewee(contents)
        date_range = parse_date_range(contents)

        outflow_data = parse_section_3_2_outflow(contents)
        inflow_data = parse_section_3_2_inflow(contents)

        # 6 outflow categories x 7 animals
        if len(outflow_data) != 42:
            print(f"Unencountered Format: {process_file}, Length of outflow_data: {len(outflow_data)}")

        # 4 inflow categories x 7 animals
        if len(inflow_data) != 28:
            print(f"Unencountered Format: {process_file}, Length of inflow_data: {len(inflow_data)}")

        for row in outflow_data + inflow_data:
            dataset.append({"filename": process_file,
                            "interviewee": interviewee,
                            "date_range": date_range,
                            **row})

    # convert list to DataFrame
    return pd.DataFrame(data=dataset)

## Transform

### Helper Functions

In [None]:
def transform_date_range(dataf):
    """Replace the known Spanish date ranges with English labels.

    The ``date_range`` column of ``dataf`` is overwritten in place; values
    that are not listed below are left untouched. Returns ``dataf``.
    """
    english_labels = dict([
        ("agosto 2018 - agosto 2019", "august 2018-2019"),
        ("septiembre 2018 - septiembre 2019", "september 2018-2019"),
        # the October range occurs with both an en dash and a hyphen
        ("octubre 2018 – octubre 2019", "october 2018-2019"),
        ("octubre 2018 - octubre 2019", "october 2018-2019"),
    ])

    translated = dataf["date_range"].replace(english_labels)
    dataf["date_range"] = translated

    return dataf

### Pipeline Functions

In [None]:
def start_pipeline(dataf):
    """Return a copy of ``dataf``; used as the first step of the ``.pipe`` chain."""
    working_copy = dataf.copy()
    return working_copy


def handle_null_data(dataf):
    """Drop rows that carry no information.

    Empty strings anywhere in the frame are first replaced with NaN. A row
    is then dropped when its ``value`` is missing or the string ``"0"`` and
    its ``other_value`` is missing as well.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Must contain the columns ``value`` and ``other_value``.

    Returns
    -------
    pandas.DataFrame
        A new frame without the uninformative rows; the original index
        labels of the remaining rows are kept.
    """
    # fill missing values with null
    # (np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0)
    dataf = dataf.replace("", np.nan)

    # drop rows which contain no information for value or other_value
    has_no_count = dataf["value"].isnull() | (dataf["value"] == "0")
    drop_filter = has_no_count & dataf["other_value"].isnull()
    drop_indexes = dataf[drop_filter].index

    return dataf.drop(drop_indexes)


def clean_columns(dataf):
    """Split the raw ``value`` column into numeric text and an "x" marker.

    A new ``other_value`` column keeps the ``"x"`` entries, which indicate
    that something occurred in that row without saying how many; all of its
    other cells are NaN. In ``value`` the annotated counts are reduced to
    their number, the letter ``"o"`` becomes ``"0"`` and ``"x"`` becomes
    NaN because it is not numeric.

    ``dataf`` is modified in place and returned.
    """
    # np.nan, not np.NaN: the capitalised alias was removed in NumPy 2.0.
    is_unknown_count = dataf["value"] == "x"
    dataf["other_value"] = dataf["value"].where(is_unknown_count, np.nan)

    # replace values with their corresponding numeric value.
    # drop "x" values, because they are not numeric
    map_dict = {"1 (parto)": "1",
                "2 (parto)": "2",
                "x": np.nan,
                "o": "0"}
    dataf["value"] = dataf["value"].replace(map_dict)

    return dataf


def set_dtypes(dataf):
    """Cast the ``value`` column to float in place and return ``dataf``."""
    numeric_values = dataf["value"].astype(float)
    dataf["value"] = numeric_values
    return dataf


def transform_values(dataf):
    """Give outflow rows a negative sign and translate the date ranges.

    Rows whose ``stock_variation`` is one of the outflow categories get
    their ``value`` multiplied by -1; all other rows keep their value. The
    frame is then passed through ``transform_date_range``. ``dataf`` is
    modified in place and returned.
    """
    outflow_categories = ["animal_death", "robbery", "slaughter_or_consumption",
                          "loans_given", "sold", "others_outflow"]

    is_outflow = dataf["stock_variation"].isin(outflow_categories)
    negated = dataf["value"] * -1

    # keep the original value for non-outflow rows, the negated one otherwise
    dataf["value"] = dataf["value"].where(~is_outflow, negated)

    return transform_date_range(dataf)

### Create Raw and Cleaned DataFrame

In [None]:
# Extract: parse every survey file into one long-format DataFrame.
raw_df = process_files(source_directory)

# Transform: each step receives the DataFrame returned by the previous one.
# The order matters: clean_columns creates "other_value", which
# handle_null_data reads, and set_dtypes casts "value" to float before
# transform_values multiplies the outflow rows by -1.
clean_df = (raw_df
            .pipe(start_pipeline)
            .pipe(clean_columns)
            .pipe(handle_null_data)
            .pipe(set_dtypes)
            .pipe(transform_values))

## Load
### Export to CSV

In [None]:
# Destination of the cleaned dataset.
datasets_directory = "../datasets/"
filename = "section_3_2.csv"
file_path = os.path.join(datasets_directory, filename)

# Write without the index column; missing values are written as "null".
clean_df.to_csv(file_path, index=False, na_rep="null")

## Other
### Functions to Check the Implementation

In [None]:
def get_dropped_df(raw_dataf, clean_dataf):
    """Return a copy of the rows of ``raw_dataf`` that are absent from ``clean_dataf``.

    Rows are matched by index label: a raw row counts as dropped when its
    label does not occur in the index of ``clean_dataf``.
    """
    was_kept = raw_dataf.index.isin(clean_dataf.index)
    dropped_labels = raw_dataf.index[~was_kept]

    return raw_dataf.loc[dropped_labels].copy()

### Create Dropped DataFrame

In [None]:
# Rows of raw_df whose index labels no longer appear in clean_df.
dropped_df = get_dropped_df(raw_df, clean_df)

***
# Testing Code
### View the Data

In [None]:
# Display the first 50 rows of the cleaned data.
clean_df.head(50)

### Check for Parsing Errors

In [None]:
# Print the value counts (missing values included) of every column of
# clean_df except the identifying columns, so unexpected values stand out.
cols_to_drop = ["filename", "interviewee"]
cols_to_check = clean_df.columns.drop(cols_to_drop)

for col in cols_to_check:
    # one print call per column; sep="\n" puts each part on its own line
    print("*" * 50,
          " " * 5 + col,
          clean_df[col].value_counts(dropna=False),
          "*" * 50,
          "\n",
          sep="\n")

### Check Dropped Rows
Another way to check is by opening the variable inspector and manually scrolling
through the `dropped_df` DataFrame.

The code below tells us whether or not there are files that contain no information for this section. If raw file count is the same as clean file count, then no files were dropped.

In [None]:
# Number of distinct filenames present in each DataFrame.
print("Raw File Count: {}".format(len(raw_df["filename"].value_counts())))
print("Clean File Count: {}".format(len(clean_df["filename"].value_counts())))

If there are files that are completely dropped, find those files and look at them to check for parsing errors.

In [None]:
# Distinct filenames in the raw and in the cleaned data.
raw_file_names = raw_df["filename"].value_counts().index
clean_file_names = clean_df["filename"].value_counts().index

# Filenames that occur in raw_df but have no rows left in clean_df.
raw_file_names[~raw_file_names.isin(clean_file_names)]

Check the value counts of the dropped rows to see whether any rows are being dropped that should not be.

In [None]:
# Print the value counts (missing values included) of every column of
# dropped_df except the identifying columns.
cols_to_drop = ["filename", "interviewee"]
cols_to_check = dropped_df.columns.drop(cols_to_drop)

for col in cols_to_check:
    # one print call per column; sep="\n" puts each part on its own line
    print("*" * 50,
          " " * 5 + col,
          dropped_df[col].value_counts(dropna=False),
          "*" * 50,
          "\n",
          sep="\n")

### Duplicate Rows

In [None]:
# Compare rows on every column except "filename" and display all rows that
# have at least one duplicate (keep=False marks every member of a group).
df_columns = clean_df.columns[clean_df.columns != "filename"]
clean_df[clean_df.loc[:, df_columns].duplicated(keep=False)]