# Production Code

In [None]:
import os
import re
import numpy as np
import pandas as pd

# Directory whose files are read by process_files() in the Extract step.
source_directory = "../surveys/plain_txt/"

## Extract
### Helper Functions

In [None]:
def month_str_to_num(month_str):
    """Translate a lower-case Spanish month name into its month number.

    Args:
        month_str (str): Month name spelled exactly as in the table below
            (e.g. ``"enero"``).

    Returns:
        str: The unpadded month number, ``"1"`` through ``"12"``.

    Raises:
        KeyError: If ``month_str`` is not one of the twelve known names.
    """
    # Calendar order matters: the position in this tuple is the month number.
    month_names = (
        'enero',
        'febrero',
        'marzo',
        'abril',
        'mayo',
        'junio',
        'julio',
        'agosto',
        'septiembre',
        'octubre',
        'noviembre',
        'diciembre',
    )
    number_by_name = {
        name: str(position)
        for position, name in enumerate(month_names, start=1)
    }

    return number_by_name[month_str]

### Parsing Functions

In [None]:
def parse_country(contents):
    """Extract the country answer from a survey's text.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The text following the country label, stripped of surrounding
        whitespace, or ``"UNENCOUNTERED FORMAT"`` unless the label is found
        exactly once.
    """
    # \w stands in for the accented vowel of the label.
    matches = re.findall(r"pa\ws[^\n:]*:[\s]*([^\n]*)\n", contents)
    country = matches[0] if len(matches) == 1 else "UNENCOUNTERED FORMAT"

    return country.strip()


def parse_interviewee(contents):
    """Extract the interviewee's name from a survey's text.

    Two label layouts are tried in order: ``label: name`` and
    ``label (word name``. The first layout that occurs exactly once wins.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The stripped name, ``"N/A"`` when the captured value is the
        placeholder ``"3"``, or ``"UNENCOUNTERED FORMAT"`` when neither
        layout matches exactly once.
    """
    name_patterns = (
        r"nombre de la persona[^\n:]*:[\s]*([^\n]*)\n",
        r"nombre de la persona[^\n\(]*\(\w* +([^\n]*)\n",
    )

    for pattern in name_patterns:
        matches = re.findall(pattern, contents)
        if len(matches) == 1:
            # Strip BEFORE the placeholder test so a padded "3 " is also
            # recognised (parse_ethnicity follows the same order).
            name = matches[0].strip()
            # NOTE(review): "3" is presumably the next item's number, captured
            # when the answer line is blank -- confirm against the template.
            return "N/A" if name == "3" else name

    return "UNENCOUNTERED FORMAT"


def parse_sex(contents):
    """Determine which gender option is marked with an ``x``.

    The answer line is expected to hold two ``/``-separated options; the
    first maps to ``"male"`` and the second to ``"female"``.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: ``"male"`` or ``"female"`` for a marked option, ``""`` when both
        options are present but neither is marked, or
        ``"UNENCOUNTERED FORMAT"`` when the label is not found exactly once
        or the answer has no second option to inspect.
    """
    matches = re.findall(
        r"g\wnero de la persona[^\n:]*:[\s]*([^\n]*)\n", contents)
    if len(matches) != 1:
        return "UNENCOUNTERED FORMAT"

    options = matches[0].split("/")
    if "x" in options[0]:
        return "male"
    # Without a "/" there is no second option; indexing it used to raise
    # IndexError and abort the whole extraction.
    if len(options) < 2:
        return "UNENCOUNTERED FORMAT"
    if "x" in options[1]:
        return "female"

    return ""


def parse_ethnicity(contents):
    """Extract the interviewee's ethnicity answer.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The stripped answer, ``"N/A"`` when the answer is the
        placeholder ``"5"``, or ``"UNENCOUNTERED FORMAT"`` unless the label
        is found exactly once.
    """
    found = re.findall(r"etnia de la persona[^\n:]*:[\s]*([^\n]*)\n", contents)
    if len(found) != 1:
        return "UNENCOUNTERED FORMAT"

    answer = found[0].strip()
    # NOTE(review): "5" is presumably the next item's number captured when
    # the answer is blank -- confirm against the survey template.
    return "N/A" if answer == "5" else answer


def _parse_written_date(contents):
    """Parse a date written in words (e.g. ``31 de octubre de 2019``).

    Returns ``day-month-year`` with the month converted to its number, or
    ``"UNENCOUNTERED FORMAT"`` when the line cannot be interpreted.
    """
    # Unlike the main pattern, only spaces are skipped after the label, so a
    # blank answer is not replaced by the following line.
    written = re.findall(r"fecha de la entre[^\n: ]*:?[ ]*([^\n]*)\n",
                         contents)
    if len(written) != 1:
        return "UNENCOUNTERED FORMAT"

    pieces = re.findall(r"([\d]+) *[\w]+ *([\w]+) *[\w]* *([\d]*)",
                        written[0].strip())
    if len(pieces) != 1:
        return "UNENCOUNTERED FORMAT"

    day, month_name, year = pieces[0]
    try:
        month = month_str_to_num(month_name)
    except KeyError:
        # An unknown month word (a misspelling, or a date without the
        # connecting word) must not abort the whole extraction.
        return "UNENCOUNTERED FORMAT"

    return day + "-" + month + "-" + year


def parse_survey_date(contents):
    """Extract the survey date and normalise its separators to ``-``.

    Handled layouts: ``/``-separated, ``-``-separated, and a date written in
    words with a Spanish month name.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The date with ``-`` separators and all spaces and ``¿``
        characters removed, ``"N/A"`` when the answer is the placeholder
        ``"6"``, or ``"UNENCOUNTERED FORMAT"`` when the label is not found
        exactly once or the answer cannot be interpreted.
    """
    matches = re.findall(r"fecha de la entre[^\n: ]*:?[\s]*([^\n]*)\n",
                         contents)
    if len(matches) != 1:
        return "UNENCOUNTERED FORMAT"

    date_str = matches[0]
    if "/" in date_str:
        date_str = "-".join(date_str.split("/"))
    elif "-" in date_str:
        pass  # already uses the target separator
    elif date_str == "6":
        # NOTE(review): presumably the next item's number captured when the
        # answer is blank -- confirm against the survey template.
        date_str = "N/A"
    else:
        # try date in string format (i.e. 31 DE OCTUBRE)
        date_str = _parse_written_date(contents)

    if date_str != "UNENCOUNTERED FORMAT":
        date_str = date_str.replace(" ", "")
        date_str = date_str.replace("¿", "")

    return date_str


def _search_group(pattern, text):
    """Return group 1 of the first match of ``pattern`` in ``text``, else None."""
    match = re.search(pattern, text)
    return match.group(1) if match else None


def _normalise_hour(raw):
    """Convert one raw hour answer to ``H:MM`` form where a layout is recognised.

    Layouts are tried in a fixed order; an answer matching none of them is
    returned unchanged.
    """
    # handles time fields that contain no entry
    if re.search(r"i-1", raw):
        return "N/A"

    # four consecutive digits (i.e. 1430)
    digits = _search_group(r"([\d]{4})", raw)
    if digits is not None:
        return digits[0:2] + ":" + digits[2:4]

    # times separated by period (i.e. 12.00, 8.00); a period without any
    # digits (i.e. "n.a.") falls through instead of raising IndexError
    if "." in raw:
        value = _search_group(r"([\d]+[\.]?[\d]*)[\w]*", raw)
        if value is not None:
            return ":".join(value.split("."))

    # times separated by : (i.e. 12:00, 8:30); same digit guard as above
    if ":" in raw:
        value = _search_group(r"([\d]+[ ]?[:]?[ ]?[\d]*)[\w]*", raw)
        if value is not None:
            return value

    # times separated by spaces (i.e. 12 00 hr)
    value = _search_group(r"([\d]+\s[\d]+)[\w]*", raw)
    if value is not None:
        return ":".join(value.split(" "))

    # times with no minutes field (i.e. 12hr, 8hrs, 8 h)
    value = _search_group(r"([\d]+)[\s]*[\w]*", raw)
    if value is not None:
        return value + ":" + "00"

    return raw


def parse_survey_hour(contents):
    """Extract the survey hour and normalise it to ``H:MM`` where possible.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The normalised hour with spaces removed, ``"N/A"`` for an
        answer containing ``i-1``, the raw answer (spaces removed) when no
        known layout applies, or ``"UNENCOUNTERED FORMAT"`` unless the label
        is found exactly once.
    """
    matches = re.findall(r"hora de la entre[^\n:]*:[\s]*([^\n]*)\n",
                         contents)
    if len(matches) != 1:
        return "UNENCOUNTERED FORMAT"

    hour_str = _normalise_hour(matches[0])

    if hour_str != "UNENCOUNTERED FORMAT":
        hour_str = hour_str.replace(" ", "")

    return hour_str


def parse_latitude(contents):
    """Extract the latitude from the geographic-coordinates line.

    The answer is split on ``/`` and the second part is read as the
    latitude, either as a negative decimal or as three degree / arc-minute /
    arc-second numbers (converted to a negative decimal). Without a single
    ``/`` the second negative decimal found on the line is used.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The latitude as text, ``""`` when the first part is an unfilled
        ``|___|`` template, or ``"UNENCOUNTERED FORMAT"`` for anything else.

    Raises:
        ValueError: If a degree or arc-minute value contains a comma (only
            the arc-second value has its comma converted before ``float``).
    """
    unknown = "UNENCOUNTERED FORMAT"
    lines = re.findall(r"coordenadas geo[^\n:]*:[\s]*([^\n]*)\n", contents)
    if len(lines) != 1:
        return unknown

    # The unescaped "." lets the separator be a period, a comma or any
    # other single character.
    dec_regex = r"-\d+.\d+"
    arc_regex = r"(\d+,?\d*)\W"
    empty_regex = r"\|_+\|"

    answer = lines[0]
    parts = answer.split("/")
    if len(parts) != 2:
        # No usable separator: fall back to the second decimal on the line.
        decimals = re.findall(dec_regex, answer)
        return decimals[1] if len(decimals) == 2 else unknown

    lat_part = parts[1]

    # decimal coordinates
    decimals = re.findall(dec_regex, lat_part)
    if decimals:
        if len(decimals) == 1:
            return decimals[0].replace(",", ".")
        return unknown

    # latitude (deg, arcmin, arcsec)
    arc_values = re.findall(arc_regex, lat_part)
    if arc_values:
        if len(arc_values) != 3:
            return unknown
        lat_deg = float(arc_values[0])
        lat_arcmin = float(arc_values[1])
        lat_arcsec = float(arc_values[2].replace(",", "."))
        return str(-1 * (lat_deg
                         + (lat_arcmin / 60)
                         + (lat_arcsec / 3600)))

    # Note: the blank-template test looks at the FIRST part of the answer.
    if re.search(empty_regex, parts[0]):
        return ""

    return unknown


def _extract_longitude(contents):
    """Return the raw longitude text found on the coordinates line.

    Mirrors the latitude extraction but reads the FIRST ``/``-separated
    part; see ``parse_longitude`` for the meaning of the return values.
    """
    unknown = "UNENCOUNTERED FORMAT"
    lines = re.findall(r"coordenadas geo[^\n:]*:[\s]*([^\n]*)\n", contents)
    if len(lines) != 1:
        return unknown

    # The unescaped "." lets the separator be a period, a comma or any
    # other single character (including "-", repaired by the caller).
    dec_regex = r"-\d+.\d+"
    arc_regex = r"(\d+,?\d*)\W"
    empty_regex = r"\|_+\|"

    answer = lines[0]
    parts = answer.split("/")
    if len(parts) != 2:
        # No usable separator: fall back to the first decimal on the line.
        decimals = re.findall(dec_regex, answer)
        return decimals[0] if len(decimals) == 2 else unknown

    lon_part = parts[0]

    # decimal coordinates (the first one wins when several are present)
    decimals = re.findall(dec_regex, lon_part)
    if decimals:
        return decimals[0].replace(",", ".")

    # longitude (deg, arcmin, arcsec)
    arc_values = re.findall(arc_regex, lon_part)
    if arc_values:
        if len(arc_values) != 3:
            return unknown
        lon_deg = float(arc_values[0])
        lon_arcmin = float(arc_values[1])
        lon_arcsec = float(arc_values[2].replace(",", "."))
        return str(-1 * (lon_deg
                         + (lon_arcmin / 60)
                         + (lon_arcsec / 3600)))

    # Note: the blank-template test looks at the SECOND part of the answer.
    if re.search(empty_regex, parts[1]):
        return ""

    return unknown


def parse_longitude(contents):
    """Extract the longitude from the geographic-coordinates line.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The longitude as text (a value typed as ``-64-123`` is repaired
        to ``-64.123``), ``""`` when the second part of the answer is an
        unfilled ``|___|`` template, or ``"UNENCOUNTERED FORMAT"`` for
        anything else.

    Raises:
        ValueError: If a degree or arc-minute value contains a comma (only
            the arc-second value has its comma converted before ``float``).
    """
    longitude = _extract_longitude(contents).strip()

    # A dash used as the decimal separator: keep the leading sign and turn
    # the remaining dashes into periods.
    if re.search(r"-[\d]*-[\d]+", longitude):
        longitude = "-" + ".".join(longitude[1:].split("-"))

    return longitude


def parse_region(contents):
    """Extract the region answer from a survey's text.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The answer stripped of surrounding whitespace, ``"N/A"`` when
        the answer is the placeholder ``"3"``, or ``"UNENCOUNTERED FORMAT"``
        unless the label is found exactly once.
    """
    # \w stands in for the accented vowel of the label.
    matches = re.findall(r"regi\wn[^\n:]*:[\s]*([^\n]*)\n", contents)
    if len(matches) != 1:
        return "UNENCOUNTERED FORMAT"

    # Strip like the sibling parsers (country, province, department) so that
    # trailing whitespace neither leaks into the dataset nor hides the
    # placeholder.
    region = matches[0].strip()

    # NOTE(review): "3" is presumably an item number captured when the
    # answer is blank -- confirm against the survey template.
    return "N/A" if region == "3" else region


def parse_province(contents):
    """Extract the province answer from a survey's text.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The stripped answer, or ``"UNENCOUNTERED FORMAT"`` unless the
        label is found exactly once.
    """
    answers = re.findall(r"provincia[^\n:]*:[\s]*([^\n]*)\n", contents)
    province = answers[0] if len(answers) == 1 else "UNENCOUNTERED FORMAT"

    return province.strip()


def parse_department(contents):
    """Extract the department answer from a survey's text.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The stripped answer, or ``"UNENCOUNTERED FORMAT"`` unless the
        label is found exactly once.
    """
    hits = re.findall(r"departa[^\n:]*:[\s]*([^\n]*)\n", contents)

    # Anything other than exactly one labelled line is an unknown layout.
    if len(hits) != 1:
        return "UNENCOUNTERED FORMAT"

    return hits[0].strip()


def parse_loc_type(contents):
    """Determine the locality type recorded in a survey.

    Three answer layouts are handled:

    * five ``/``-separated options, one of them marked with ``x``;
    * a single bare word, returned as is;
    * a word followed by free text, of which only the first word is kept.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The locality type, ``"N/A"`` when none of five options is
        marked, or ``"UNENCOUNTERED FORMAT"`` when the label is not found
        exactly once or the answer fits none of the layouts.
    """
    matches = re.findall(r"tipo de localidad[^\n:]*:[\s]*([^\n]*)\n",
                         contents)
    if len(matches) != 1:
        return "UNENCOUNTERED FORMAT"

    answer = matches[0]
    options = answer.split("/")

    if len(options) == 5:
        labels = ("comunidad", "barrio", "campamiento", "paraje", "otro")
        # The first option carrying an "x" wins.
        for label, option in zip(labels, options):
            if "x" in option:
                return label
        return "N/A"

    worded = re.findall(r"(\w+)[ ]+([^\n]+)", answer)
    if len(options) == 1 and not worded:
        return answer.strip()
    if len(worded) == 1:
        return worded[0][0].strip()

    # Previously the (empty) findall list itself was assigned here and the
    # final .strip() raised AttributeError.
    return "UNENCOUNTERED FORMAT"


def _loc_specific_from_type_line(contents):
    """Fallback: read the locality detail off the locality-type line."""
    type_lines = re.findall(r"tipo de localidad[^\n:]*:[\s]*([^\n]*)\n",
                            contents)
    if len(type_lines) != 1:
        return "UNENCOUNTERED FORMAT"

    answer = type_lines[0]

    # "<type> <detail>": keep what follows the first word.
    worded = re.findall(r"(\w+)[ ]+([^\n/]+)", answer)
    if len(worded) == 1:
        return worded[0][1]

    # Otherwise take the text that follows a closing bracket.
    bracketed = re.findall(r"\][ _]*([\w\d \t,]*[_]*[\w\d \t,]*)", answer)
    if len(bracketed) == 1:
        return bracketed[0]

    return "N/A"


def parse_loc_specific(contents):
    """Extract the free-text detail that accompanies the locality type.

    The text after a single ``] especificar`` marker is preferred; when that
    marker does not occur exactly once, the locality-type line is inspected
    instead.

    Args:
        contents (str): Lower-cased survey text.

    Returns:
        str: The detail with underscores turned into spaces and surrounding
        whitespace removed, ``"N/A"`` when nothing was written, or
        ``"UNENCOUNTERED FORMAT"`` when the fallback line is not found
        exactly once.
    """
    # location_type_specific
    explicit = re.findall(r"\]\s*especificar:?[ _]*([\w\d \t,\.]*)", contents)
    if len(explicit) == 1:
        detail = explicit[0]
    else:
        detail = _loc_specific_from_type_line(contents)

    if detail == "":
        detail = "N/A"

    return detail.replace("_", " ").strip()

### Process All Files in the Source Directory

In [None]:
def _parse_survey(contents):
    """Run every section-1 field parser over one survey's lower-cased text."""
    parsers = {
        "country": parse_country,
        "interviewee": parse_interviewee,
        "sex": parse_sex,
        "ethnicity": parse_ethnicity,
        "survey_date": parse_survey_date,
        "survey_hour": parse_survey_hour,
        "latitude": parse_latitude,
        "longitude": parse_longitude,
        "region": parse_region,
        "province": parse_province,
        "department": parse_department,
        "loc_type": parse_loc_type,
        "loc_specific": parse_loc_specific,
    }

    return {key: parser(contents) for key, parser in parsers.items()}


def process_files(source_directory):
    """Parse every survey file in a directory into one raw DataFrame.

    Args:
        source_directory (str): Directory holding one text file per survey.

    Returns:
        pandas.DataFrame: One row per file, with a ``filename`` column
        followed by one column per parsed field. Rows follow sorted file
        name order so repeated runs produce the same output.
    """
    dataset = []

    # os.listdir order is arbitrary; sorting keeps the dataset reproducible.
    for process_file in sorted(os.listdir(source_directory)):
        file_path = os.path.join(source_directory, process_file)

        # Sub-directories (e.g. Jupyter's .ipynb_checkpoints) cannot be
        # opened as text and used to abort the whole run.
        if not os.path.isfile(file_path):
            continue

        # with statements automatically control the closing of files
        # NOTE(review): the platform default encoding is used; pass an
        # explicit encoding once the encoding of the survey files is
        # confirmed.
        with open(file_path, "r") as file:
            # The parsers' patterns are written in lower case.
            contents = file.read().lower()

        data_dict = {"filename": process_file}
        data_dict.update(_parse_survey(contents))
        dataset.append(data_dict)

    # convert list to DataFrame
    return pd.DataFrame(data=dataset)

## Transform
### Pipeline Functions

In [None]:
def start_pipeline(dataf):
    """Return a copy of ``dataf`` for the following pipeline steps to work on.

    Later steps assign columns in place; starting from a copy keeps the
    frame passed in here unchanged.
    """
    working_copy = dataf.copy()

    return working_copy


def handle_null_data(dataf):
    """Replace empty-string cells with NaN.

    Args:
        dataf (pandas.DataFrame): Frame whose blank answers are stored
            as ``""``.

    Returns:
        pandas.DataFrame: A new frame with every ``""`` cell set to NaN.
    """
    # fill missing values with null
    # np.nan is the canonical spelling; the np.NaN alias was removed in
    # NumPy 2.0.
    return dataf.replace("", np.nan)


def clean_columns(dataf):
    """Swap latitude and longitude on rows where they look transposed.

    A row is kept as is only when its longitude is below -50; otherwise the
    two values are exchanged. A missing longitude compares as "not below
    -50", so such rows are exchanged as well.

    Args:
        dataf (pandas.DataFrame): Frame with ``latitude`` and ``longitude``
            columns convertible to float. Modified in place.

    Returns:
        pandas.DataFrame: The same frame, with both columns as floats.
    """
    latitudes = dataf["latitude"].astype(float)
    longitudes = dataf["longitude"].astype(float)

    # NOTE(review): -50 presumably separates plausible longitudes from
    # latitudes for the surveyed area -- confirm.
    needs_swap = ~(longitudes < -50)

    dataf["latitude"] = latitudes.mask(needs_swap, longitudes)
    dataf["longitude"] = longitudes.mask(needs_swap, latitudes)

    return dataf


def set_dtypes(dataf):
    """Cast the coordinate columns to float.

    Args:
        dataf (pandas.DataFrame): Frame with ``latitude`` and ``longitude``
            columns. Modified in place.

    Returns:
        pandas.DataFrame: The same frame with both columns as floats.
    """
    for column in ("latitude", "longitude"):
        dataf[column] = dataf[column].astype(float)

    return dataf

### Create Raw and Cleaned DataFrame

In [None]:
# Extract: parse every file under source_directory into one raw DataFrame.
raw_df = process_files(source_directory)

# Transform: each step takes and returns a DataFrame, so the stages chain
# through .pipe(). start_pipeline copies first, which leaves raw_df untouched
# by the in-place column assignments of the later steps.
clean_df = (raw_df
            .pipe(start_pipeline)
            .pipe(handle_null_data)
            .pipe(clean_columns)
            .pipe(set_dtypes))

## Load
### Export to CSV

In [None]:
# Destination of the cleaned dataset.
datasets_directory = "../datasets/"
filename = "section_1.csv"
file_path = os.path.join(datasets_directory, filename)

# Write without the DataFrame index; missing values are written as the
# literal text "null".
clean_df.to_csv(file_path, index=False, na_rep="null")

***
# Testing Code
### View the Data

In [None]:
# Preview the first ten rows of the cleaned dataset.
clean_df.head(10)

### Check Each Column for Parsing Errors and Standardize Values

In [None]:
# Print the value counts (NaN included) of every column except the two
# listed in cols_to_drop, so parsing failures such as
# "UNENCOUNTERED FORMAT" stand out.
cols_to_drop = ["filename", "interviewee"]
cols_to_check = clean_df.columns.drop(cols_to_drop)

for col in cols_to_check:
    separator = "*" * 50
    heading = " " * 5 + col

    print(separator)
    print(heading)
    print(clean_df[col].value_counts(dropna=False))
    print(separator)
    print("\n")

### Duplicate Rows

In [None]:
# Show every row whose (interviewee, sex) pair occurs more than once;
# keep=False marks all members of each duplicate group, not only the repeats.
clean_df[clean_df.duplicated(subset=["interviewee", "sex"], keep=False)]

### Plotting Coordinates

In [None]:
import cartopy.crs as ccrs
import matplotlib.pyplot as plt

In [None]:
# Report the smallest and largest value of each coordinate column.
longitude_column = clean_df["longitude"]
latitude_column = clean_df["latitude"]

longitude_range = (longitude_column.min(), longitude_column.max())
latitude_range = (latitude_column.min(), latitude_column.max())

print("Longitude Range: {}".format(longitude_range))
print("Latitude Range: {}".format(latitude_range))

In [None]:
# Plot the parsed coordinates on a plate carree (longitude/latitude) map.
ax = plt.axes(projection=ccrs.PlateCarree())

# Background image, graticule and coastlines for orientation.
ax.stock_img()
ax.gridlines()
ax.coastlines()
# Extent is [lon_min, lon_max, lat_min, lat_max] in degrees.
ax.set_extent([-170, -20, -80, 10], crs=ccrs.PlateCarree())

# One red marker per survey; the transform states that x/y are given as
# longitude/latitude.
plt.scatter(x=clean_df["longitude"], y=clean_df["latitude"], 
            transform=ccrs.PlateCarree(), color="red")

plt.show()