# Production Code

In [None]:
import os
import re
import numpy as np
import pandas as pd

# Directory that process_files() lists and reads the survey text files from.
source_directory = "../surveys/plain_txt/"

## Extract
### Parsing Functions

In [None]:
def parse_interviewee(contents):
    """Extract the interviewee's name from the text of one survey file.

    Two layouts are tried in order: ``nombre de la persona ...: <name>`` and
    ``nombre de la persona ... (<word> <name>``. A layout is accepted only
    when it matches exactly once in ``contents``.

    Args:
        contents: Full text of a survey file (the caller lower-cases it).

    Returns:
        The name with surrounding whitespace removed, ``"N/A"`` when the
        captured value is the bare placeholder ``"3"``, or
        ``"UNENCOUNTERED FORMAT"`` when neither layout matches exactly once.
    """
    name_patterns = (
        r"nombre de la persona[^\n:]*:[\s]*([^\n]*)\n",
        r"nombre de la persona[^\n\(]*\(\w* +([^\n]*)\n",
    )

    for pattern in name_patterns:
        matches = re.findall(pattern, contents)
        if len(matches) == 1:
            # strip before comparing so that "3 " is also recognised as the
            # placeholder (presumably a stray form value, not a real name)
            name = matches[0].strip()
            return "N/A" if name == "3" else name

    return "UNENCOUNTERED FORMAT"


def parse_section_2(contents):
    """Parse the section-2 table of a survey file into a list of entries.

    The section is the text between the ``2.no`` header (followed by a blank
    line) and the ``sección iii`` heading. It is split on newlines; values
    sit on every other line, so one entry spans 18 lines (9 fields, each
    followed by a line that is skipped).

    Args:
        contents: Full text of a survey file (the caller lower-cases it).

    Returns:
        A list of dicts keyed by the nine field names. When the section is
        not found exactly once, or contains no newline, a single dict whose
        values are all ``"UNENCOUNTERED FORMAT"`` is returned instead.
    """
    entry_fields = ["order_number", "relationship_with_reference",
                    "sex", "age", "education", "employment_situation",
                    "employment_type", "main_activity",
                    "livestock_owner"]
    lines_per_entry = 2 * len(entry_fields)

    matches = re.findall(r"2\.no\n\n([\w\W]*)sección iii", contents)

    if len(matches) != 1 or "\n" not in matches[0]:
        return [dict.fromkeys(entry_fields, "UNENCOUNTERED FORMAT")]

    entry_list = []
    entry = {}
    count = 0

    for item in matches[0].split("\n"):
        # a full entry has been consumed: store it and start the next one
        if count == lines_per_entry:
            entry_list.append(entry)
            entry = {}
            count = 0
        # values are on even positions; odd positions are skipped
        if count % 2 == 0:
            entry[entry_fields[count // 2]] = item.strip()
        count += 1

    # The loop only stores an entry when a further line follows it, so a
    # complete final entry would otherwise be lost when the section ends
    # right after its last value. Partial trailing entries are still ignored.
    if len(entry) == len(entry_fields):
        entry_list.append(entry)

    return entry_list

### Process the Whole File

In [None]:
def process_files(source_directory):
    """Parse every survey file in a directory into one raw DataFrame.

    Each file is read, lower-cased and passed to ``parse_interviewee`` and
    ``parse_section_2``. One row is produced per section-2 entry, carrying
    the file name and interviewee alongside the entry's fields.

    Args:
        source_directory: Path of the directory containing the survey files.

    Returns:
        A pandas DataFrame with one row per parsed entry; all values are the
        raw strings produced by the parsing functions.
    """
    dataset = []

    # os.listdir returns names in arbitrary order; sorting keeps the row
    # order (and therefore the DataFrame index) reproducible between runs
    for process_file in sorted(os.listdir(source_directory)):
        file_path = os.path.join(source_directory, process_file)

        # skip sub-directories, which open() cannot read
        if not os.path.isfile(file_path):
            continue

        # with statements automatically control the closing of files
        # NOTE(review): the platform default encoding is used here; confirm
        # the survey files' encoding and pass it explicitly if it differs.
        with open(file_path, "r") as file:
            contents = file.read().lower()

        # the interviewee is the same for every entry of the file
        interviewee = parse_interviewee(contents)

        for entry in parse_section_2(contents):
            dataset.append({"filename": process_file,
                            "interviewee": interviewee,
                            **entry})

    # convert list to DataFrame
    return pd.DataFrame(data=dataset)

## Transform
### Data Cleaning Functions

In [None]:
def clean_relationship(dataf):
    """Standardise the ``relationship_with_reference`` column.

    ``"-"`` is treated as missing everywhere in the frame. The relationship
    value is split on whitespace: normally the first token (a numeric code)
    is kept, but when extra text follows it, that text replaces the code.
    Known codes and Spanish terms are then mapped to English descriptions.

    Args:
        dataf: DataFrame containing a string ``relationship_with_reference``
            column.

    Returns:
        A new DataFrame in which the cleaned ``relationship_with_reference``
        column has been re-attached as the last column.
    """
    # handle null values
    dataf = dataf.replace("-", np.nan)

    tokens = dataf["relationship_with_reference"].str.split()

    # default: the first token of each value
    relationship = tokens.str.get(0).rename("relationship_with_reference")

    # values with additional info: everything after the first token wins
    additional_info = (tokens[tokens.str.len() > 1]
                       .str.slice(start=1)
                       .str.join(" ")
                       .dropna())
    if not additional_info.empty:
        relationship.loc[additional_info.index] = additional_info

    # convert relationship number to its corresponding description
    # NOTE(review): "other unfamiliar" may be intended as "other
    # non-relative"; left unchanged because downstream users may rely on it.
    replace_dict = {
        "1": "head/boss",
        "2": "spouse/concubine",
        "3": "child/stepchild",
        "4": "mother/father",
        "5": "mother-in-law/father-in-law",
        "6": "sister/brother",
        "7": "another relative",
        "8": "other unfamiliar",
        "bis nieto": "great-grandchild",
        "nieto": "grandchild",
        "nieta": "grandchild",
        "nuera": "child-in-law",
        "hijastro": "child/stepchild",
    }
    relationship = relationship.replace(replace_dict)

    # swap the original column for the cleaned one (aligned on the index)
    dataf = dataf.drop("relationship_with_reference", axis=1)
    dataf = dataf.merge(relationship, how="left", left_index=True,
                        right_index=True)

    return dataf


def clean_sex(dataf):
    """Map the Spanish words, initials and numeric codes in ``sex`` to
    ``"male"`` / ``"female"``; other values are left untouched.

    The column is overwritten on ``dataf``, which is also returned.
    """
    male_tokens = ("hombre", "m", "1")
    female_tokens = ("mujer", "f", "2")

    sex_labels = {**dict.fromkeys(male_tokens, "male"),
                  **dict.fromkeys(female_tokens, "female")}

    dataf["sex"] = dataf["sex"].replace(sex_labels)
    return dataf


def clean_age(dataf):
    """Replace the age range ``"16-26"`` with NaN in the ``age`` column.

    A range cannot be converted to a single numeric age, so it is treated as
    missing. The column is overwritten on ``dataf``, which is also returned.
    """
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0
    dataf["age"] = dataf["age"].replace("16-26", np.nan)

    return dataf


def clean_education(dataf):
    """Translate the numeric codes ``"1"``-``"9"`` in ``education`` into
    their descriptions; other values are left untouched.

    The column is overwritten on ``dataf``, which is also returned.
    """
    # position in this tuple (starting at 1) is the survey code
    levels = (
        "no school",
        "primary incomplete",
        "primary complete",
        "secondary incomplete",
        "secondary complete",
        "tertiary incomplete",
        "tertiary complete",
        "university incomplete",
        "university complete",
    )
    code_to_level = {str(code): level
                     for code, level in enumerate(levels, start=1)}

    dataf["education"] = dataf["education"].replace(code_to_level)
    return dataf


def clean_employment_situation(dataf):
    """Translate the codes and Spanish abbreviations found in
    ``employment_situation``; other values are left untouched.

    The column is overwritten on ``dataf``, which is also returned.
    """
    # numeric survey codes
    coded = {"1": "formal employment",
             "2": "informal employment",
             "3": "none"}
    # free-text answers written in place of a code
    written = {"indep.": "independent",
               "indepen.": "independent",
               "jubilada": "retired",
               "pension.": "pension"}

    situation_labels = {**coded, **written}
    cleaned = dataf["employment_situation"].replace(situation_labels)
    dataf["employment_situation"] = cleaned

    return dataf


def clean_employment_type(dataf):
    """Standardise the ``employment_type`` column.

    Codes ``"1"`` / ``"2"`` (also when duplicated around a tab) become
    descriptions, and the placeholder values ``"0"``, ``"_"`` and the
    tab-separated dash variants become NaN. Other values are left untouched.
    The column is overwritten on ``dataf``, which is also returned.
    """
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0
    replace_dict = {
        "1": "permanent work",
        "1\t1": "permanent work",
        "2": "temporary work",
        "2\t2": "temporary work",
        "0": np.nan,
        "_": np.nan,
        "--\t-": np.nan,
        "-\t-": np.nan,
    }

    dataf["employment_type"] = dataf["employment_type"].replace(replace_dict)

    return dataf


def clean_main_activity(dataf):
    """Translate the ``main_activity`` column into English descriptions.

    Handles the numeric codes ``"1"``-``"13"``, combinations of codes written
    with various separators, and code 13 ("other") followed by a free-text
    occupation. Values not listed are left untouched. The column is
    overwritten on ``dataf``, which is also returned.
    """
    replace_dict = {
        # single survey codes
        "1": "none",
        "2": "agriculture",
        "3": "livestock",
        "4": "horticulture",
        "5": "fishing/hunting",
        "6": "forestry",
        "7": "beekeeping",
        "8": "trade",
        "9": "craftsmanship",
        "10": "civil servant",
        "11": "student",
        "12": "none",
        "13": "other",
        # code 13 ("other") with a written occupation
        "3 y 13: turismo)": "livestock and tourism",
        "3,13- personal de service en establecimiento educativo": "livestock and service personnel",
        "13 cineasta": "filmmaker",
        "13 (jubilada)": "retired",
        "13 (veterin.)": "veterin.",
        "13(enfermera)": "nurse",
        "13 médica": "doctor",
        "13 personal de servicio": "service personnel",
        "13 bioquimico": "biochemical",
        "13 albañil": "mason",
        "13 mecanico": "mechanic",
        "13 (adminis.)": "administrator",
        "13 turismo": "tourism",
        "13 ama de casa": "housewife",
        "13ama d casa": "housewife",
        "13 (sicólog)": "psychologist",
        # combinations of codes
        "3/2": "agriculture and livestock",
        "2-3": "agriculture and livestock",
        "2/3": "agriculture and livestock",
        "2, 3": "agriculture and livestock",
        # 9 is craftsmanship (8 is trade), consistent with "9-3" below
        "3-9": "livestock and craftsmanship",
        "2, 3, 7": "agriculture, livestock, and beekeeping",
        "9-3": "livestock and craftsmanship",
        "3_2": "agriculture and livestock",
        "2-9": "agriculture and craftsmanship",
        "3-8": "livestock and trade",
        "3-": "livestock",
        "2-3-": "agriculture and livestock",
        "discapacitado": "disabled",
    }

    dataf["main_activity"] = dataf["main_activity"].replace(replace_dict)

    return dataf


def clean_livestock_keeper(dataf):
    """Standardise the ``livestock_owner`` column.

    ``"1"`` / ``"si"`` become ``"yes"``, ``"2"`` becomes ``"no"``, and the
    placeholder values ``"_"`` and the tab-separated dash variants become
    NaN. Other values are left untouched. The column is overwritten on
    ``dataf``, which is also returned.
    """
    # np.nan is used because the np.NaN alias was removed in NumPy 2.0
    replace_dict = {
        "1": "yes",
        "2": "no",
        "si": "yes",
        "_": np.nan,
        "--\t-": np.nan,
        "-\t-": np.nan,
    }

    dataf["livestock_owner"] = dataf["livestock_owner"].replace(replace_dict)

    return dataf

### Pipeline Functions

In [None]:
def start_pipeline(dataf):
    """Return a copy of ``dataf`` so that later pipeline steps, which
    overwrite columns, do not modify the frame passed in."""
    working_copy = dataf.copy()
    return working_copy


def handle_null_data(dataf, max_null_fields=5):
    """Drop ``order_number``, mark blanks as NaN and remove near-empty rows.

    Args:
        dataf: Raw DataFrame containing an ``order_number`` column and
            string values, where an empty string means "no value".
        max_null_fields: Largest number of null fields a row may have and
            still be kept. Defaults to 5, the previously hard-coded value.

    Returns:
        A new DataFrame without ``order_number`` and without the rows that
        have more than ``max_null_fields`` null values. Remaining rows keep
        their original index labels.
    """
    # drop order number, since it does not contain useful information
    dataf = dataf.drop("order_number", axis=1)

    # fill missing values with null
    # (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
    dataf = dataf.replace("", np.nan)

    # drop rows which contain no information
    null_counts = dataf.isnull().sum(axis=1)
    drop_indexes = dataf.index[null_counts > max_null_fields]

    return dataf.drop(drop_indexes)


def clean_columns(dataf):
    """Run every column-cleaning function over ``dataf``, in order, and
    return the result of the last one."""
    cleaning_steps = (
        clean_relationship,
        clean_sex,
        clean_age,
        clean_education,
        clean_employment_situation,
        clean_employment_type,
        clean_main_activity,
        clean_livestock_keeper,
    )

    cleaned = dataf
    for step in cleaning_steps:
        # each step receives the output of the previous one
        cleaned = cleaned.pipe(step)

    return cleaned


def set_dtypes(dataf):
    """Convert the ``age`` column to float.

    The conversion raises if a value cannot be parsed as a number. The
    column is overwritten on ``dataf``, which is also returned.
    """
    age_as_float = dataf["age"].astype(float)
    dataf["age"] = age_as_float
    return dataf

### Create Raw and Cleaned DataFrame

In [None]:
# Extract: parse every survey file into one row per section-2 entry.
raw_df = process_files(source_directory)

# Transform: work on a copy of the raw data, then drop empty rows, clean
# each column and finally set the column dtypes.
clean_df = (
    start_pipeline(raw_df)
    .pipe(handle_null_data)
    .pipe(clean_columns)
    .pipe(set_dtypes)
)

## Load
### Export to CSV

In [None]:
# Write the cleaned data to <datasets_directory>/<filename>; the index is
# omitted and missing values are written as the literal text "null".
filename = "section_2.csv"
datasets_directory = "../datasets/"
file_path = os.path.join(datasets_directory, filename)

clean_df.to_csv(path_or_buf=file_path, na_rep="null", index=False)

## Other
### Functions to Check the Implementation

In [None]:
def get_dropped_df(raw_dataf, clean_dataf):
    """Return a copy of the rows of ``raw_dataf`` whose index labels do not
    appear in the index of ``clean_dataf``."""
    was_dropped = ~raw_dataf.index.isin(clean_dataf.index)
    dropped_labels = raw_dataf.index[was_dropped]

    return raw_dataf.loc[dropped_labels].copy()

### Create Dropped DataFrame

In [None]:
# Raw rows that did not survive the cleaning pipeline.
dropped_df = get_dropped_df(raw_dataf=raw_df, clean_dataf=clean_df)

***
# Testing Code
### View the Data

In [None]:
# Display the first ten cleaned rows.
clean_df.head(n=10)

### Check Each Column for Parsing Errors and Standardize Values

In [None]:
# Display the cleaned rows that have no relationship value.
clean_df.loc[clean_df["relationship_with_reference"].isna()]

In [None]:
# Print the value distribution (NaN included) of every cleaned column except
# the identifying ones, to spot values the cleaning functions did not map.
cols_to_drop = ["filename", "interviewee"]
cols_to_check = clean_df.columns.drop(cols_to_drop)

for col in cols_to_check:
    print("*" * 50,
          " " * 5 + col,
          clean_df[col].value_counts(dropna=False),
          "*" * 50,
          "\n",
          sep="\n")

### Check Dropped Rows
Check the value counts of the dropped rows to see whether any rows are being dropped that should not be.
Another way to check is to open the variable inspector and manually scroll
through the `dropped_df` DataFrame.

The code below shows that every file contains at least some information for this section: the number of distinct files is the same before and after cleaning.

In [None]:
# Number of distinct files before and after cleaning.
print("Raw File Count: {}".format(raw_df["filename"].nunique()))
print("Clean File Count: {}".format(clean_df["filename"].nunique()))

Check to see which rows were dropped based on null values in each row

In [None]:
# dropped_df is taken from raw_df, where a missing field is an empty string
# rather than NaN. Blanks are converted first; without that every row counts
# zero nulls and the filter below would select all dropped rows.
dropped_df[dropped_df.replace("", np.nan).isnull().sum(axis=1) < 8]

Print the value counts for each column to see if there are any anomalous (non-null) values that may have been incorrectly extracted

In [None]:
# Print the value distribution (NaN included) of every column of the dropped
# rows except the identifying ones.
cols_to_drop = ["filename", "interviewee"]
cols_to_check = dropped_df.columns.drop(cols_to_drop)

for col in cols_to_check:
    print("*" * 50,
          " " * 5 + col,
          dropped_df[col].value_counts(dropna=False),
          "*" * 50,
          "\n",
          sep="\n")

### Duplicate Rows

In [None]:
# Display every cleaned row that has at least one other row with identical
# values in all columns except "filename".
df_columns = clean_df.columns[clean_df.columns != "filename"]
clean_df[clean_df.duplicated(subset=list(df_columns), keep=False)]