In [60]:
import csv
import re
import pandas as pd

# (display name, ISO 3166-2:CL code) for every region. A region's position in
# this table is what the transform cell uses to look it up: it indexes both
# derived lists with `int(row['region']) - 1`.
# NOTE(review): the order after "Araucanía" (RM, Los Ríos, Los Lagos, Aysén,
# Magallanes) does not follow Chile's official region numbering; presumably it
# mirrors the coding used in the input CSV - confirm before reordering.
_REGIONES = [
    ("Tarapacá", 'CL-TA'),
    ("Antofagasta", 'CL-AN'),
    ("Atacama", 'CL-AT'),
    ("Coquimbo", 'CL-CO'),
    ("Valparaíso", 'CL-VS'),
    ("O’Higgins", 'CL-LI'),   # ISO code is CL-LI (Libertador ...), not CL-OH
    ("Maule", 'CL-ML'),       # CL-MA belongs to Magallanes, Maule is CL-ML
    ("Biobío", 'CL-BI'),
    ("Araucanía", 'CL-AR'),
    ("RM", 'CL-RM'),
    ("Los Ríos", 'CL-LR'),
    ("Los Lagos", 'CL-LL'),
    ("Aysén", 'CL-AI'),
    ("Magallanes", 'CL-MA'),  # CL-MG is not an ISO 3166-2 code
    ("Arica y Parinacota", 'CL-AP'),
    ("Ñuble", 'CL-NB'),
]

# Parallel lists kept for the cells that index them by position; deriving both
# from one table guarantees a name and its code can never drift apart.
regiones_name = [nombre for nombre, _ in _REGIONES]
regiones_iso = [codigo for _, codigo in _REGIONES]

# Column layout of the input file.
columnas = [
    'id_user', 'id_file',
    'age', 'sex',
    'comuna', 'region',
    'education', 'group',
    'date', 'init_time', 'end_time',
    'priority', 'emotion',
    'age_range', 'code_comuna',
    'emotion_verified', 'emotion_verified2',
]

# Column layout of the output file: identical to the input layout, with
# 'region_iso' placed immediately after 'region'.
new_columnas = list(columnas)
new_columnas.insert(new_columnas.index('region') + 1, 'region_iso')

# Values accepted as-is in the 'sex' column.
sex_options = ['F', 'H', 'OTRO']

# Education labels that the transform cell treats as misplaced when they show
# up in the 'sex' column (it then swaps the two columns).
# NOTE(review): only six labels are listed; other education levels found in
# the 'sex' column are simply replaced by 'NR' - confirm this list is complete.
sex_malo = ['Básica_Incompleta', 'Básica_Completa', 'Media_Incompleta', 'Universitaria_Completa', 'Técnico_Incompleto', 'Técnico_Completo']

# Raw education code (spaces removed) -> label written to the output file.
education_name = {
    'basica_incompleta' : 'Básica Incompleta',
    'basica_completa' : 'Básica Completa',
    'tecnico_completo' : 'Técnico Completo',
    'universitaria_completa' : 'Universitaria Completa',
    'universitaria_incompleta' : 'Universitaria Incompleta',
    'media_incompleta' : 'Media Incompleta',
    'media_completa' : 'Media Completa',
    'postgrado' : 'Postgrado',
    'sin_edu_formal' : 'Sin Educación Formal',
    'indeterminado' : 'Indeterminado',
    'educacion_especial' : 'Educación Especial',
    'tecnico_incompleto' : 'Técnico Incompleto',
    'nr' : 'NR',
}

# Sex codes that the transform cell treats as misplaced when they show up in
# the 'education' column.
education_malo = ['h', 'f']

# Recognised education codes: every key of `education_name` except the
# no-response marker 'nr'. Derived so the two can never get out of sync.
education_options = [codigo for codigo in education_name if codigo != 'nr']

# CSV read by the transform cell and CSV it writes.
input_file = 'emo_per_user_v4.csv'
output_file = 'emo_per_user_v5.csv'


In [61]:
def _parse_age(raw_age):
    """Return the first run of digits in `raw_age` as an int.

    Returns None (written as an empty CSV field) when the value contains no
    digits, instead of failing on the missing regex match.
    """
    match = re.search(r'\d+', raw_age)
    return int(match.group()) if match else None


def _clean_sex_and_education(raw_sex, raw_education):
    """Normalise the 'sex' and 'education' fields of one input row.

    Spaces are stripped from both values first. Then:

    * If the sex field holds one of the education labels in `sex_malo`, the
      two columns are treated as swapped: the label (underscores turned into
      spaces) becomes the education and the education field, upper-cased,
      becomes the sex.
    * Otherwise, if the sex field is not a valid code and the education field
      holds one of the sex codes in `education_malo`, that code (upper-cased)
      becomes the sex.
    * A sex that is still not in `sex_options` becomes 'NR'.
    * An education that was not recovered from a swap is translated through
      `education_name`; anything unknown becomes 'NR'.

    Returns:
        (sex, education) as they should be written to the output file.
    """
    sex = raw_sex.replace(' ', '')
    education = raw_education.replace(' ', '')

    recovered_education = None
    if sex in sex_malo:
        # Keep the recovered label aside: it is already in output form and
        # would not pass the lookup against the raw snake_case codes below.
        recovered_education = sex.replace('_', ' ')
        sex = education.upper()
    elif sex not in sex_options and education in education_malo:
        # A valid sex code is never overwritten by a stray code found in the
        # education column.
        sex = education.upper()

    if sex not in sex_options:
        sex = 'NR'

    if recovered_education is not None:
        education = recovered_education
    else:
        education = education_name.get(education, 'NR')

    return sex, education


new_csv = list()
with open(input_file, newline='') as csv_file:
    # DictReader consumes the header line itself, so every row yielded by the
    # loop is a data record and none may be skipped.
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    for row in csv_reader:
        # Region numbers in the input are 1-based positions in regiones_name.
        region_code = int(row['region']) - 1
        # NOTE(review): an input value of -99 is redirected to index 11
        # ("Los Lagos" in regiones_name); presumably -99 is a sentinel in the
        # source data - confirm this mapping is intended.
        if region_code == -100:
            region_code = 11

        sex, education = _clean_sex_and_education(row['sex'], row['education'])

        new_csv.append({
            "id_file": row['id_file'],
            "id_user": row['id_user'],
            "comuna": row['comuna'],
            "code_comuna": row['code_comuna'],
            "region": regiones_name[region_code],
            "region_iso": regiones_iso[region_code],
            "group": row['group'],
            "age": _parse_age(row['age']),
            "age_range": row['age_range'],
            "education" : education,
            "sex" : sex,
            "priority": row['priority'],
            "emotion": row['emotion'],
            "emotion_verified": row['emotion_verified'],
            "emotion_verified2" : row['emotion_verified2'],
            "date" : row['date'],
            "init_time" : row['init_time'],
            "end_time" : row['end_time'],
        })

# The input file is closed before the output is written.
try:
    # newline='' is what the csv module expects for files it writes; without
    # it, platforms that translate '\n' end up with blank lines between rows.
    with open(output_file, 'w', newline='') as new_csv_file:
        writer = csv.DictWriter(new_csv_file, fieldnames = new_columnas)
        writer.writeheader()
        writer.writerows(new_csv)
except IOError as err:
    # Best-effort write, as before, but report what actually went wrong.
    print("I/O error:", err)

In [62]:
# Sanity check: collect the distinct 'sex' and 'education' values that ended
# up in the output file, in order of first appearance.
sexos = dict()
educaciones = dict()
with open(output_file, newline='') as csv_file:
    # DictReader consumes the header line itself, so every row from the loop
    # is a data record and all of them are inspected.
    csv_reader = csv.DictReader(csv_file, delimiter=',')
    for row in csv_reader:
        educaciones[row['education']] = 1
        sexos[row['sex']] = 1
print(sexos)
print(educaciones)

{'F': 1, 'H': 1, 'NR': 1, 'OTRO': 1}
{'Básica Incompleta': 1, 'Básica Completa': 1, 'Técnico Completo': 1, 'NR': 1, 'Universitaria Completa': 1, 'Universitaria Incompleta': 1, 'Media Incompleta': 1, 'Media Completa': 1, 'Postgrado': 1, 'Sin Educatión Formal': 1, 'Indeterminado': 1, 'Educación Especial': 1, 'Técnico Incompleto': 1}
