In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Aug  1 10:43:34 2021

@author: IgVinçon
"""

############################ IMPORT LIBRARIES ################################
import unicodedata
import json
import re
from ast import literal_eval

import numpy as np
import pandas as pd

############################# SUB-ROUTINES ###################################

In [2]:
########################### CLEANING DATAFRAMES ##############################
# Read the four CSV sources. Each one uses its first column as the index and
# marks missing values with the literal string 'n/a'.
anii_df, csic_df, ei_df, sni_df = (
    pd.read_csv(path, index_col = 0, na_values = 'n/a')
    for path in ('anii_corregido.csv', 'csic.csv', 'ei.csv', 'sni.csv')
)
# Read the manually compiled data and the workbook with researchers' names
# (which includes alternative spellings); 'n/a' is a missing value here too.
manual_df, researchers_df = (
    pd.read_excel(path, na_values = 'n/a')
    for path in ('manual.xlsx', 'docentes_ifymp.xlsx')
)

############################# ANII DATAFRAME ##################################
# Kept for reference: 'anio' appears to have been extracted from 'codigo' with
# the snippet below and the result written to 'anii_corregido.csv'.
#anii_df['anio'] = anii_df['codigo'].replace({'(.*)((?<=\_)\d{4}(?=\_))(.*)': '\\2'}, regex = True)
#anii_df.to_csv('anii_corregido.csv')

# 'beneficiario' packs two facts into one string, separated by a colon: the
# responsible person(s) before it and the institution after it. Split them.
anii_df['institucion'] = (
    anii_df['beneficiario'].replace(r'^.*\:', '', regex = True).str.strip()
)
anii_df['beneficiario'] = (
    anii_df['beneficiario'].replace(r'\:.*$', '', regex = True).str.strip()
)
# Empty or whitespace-only cells count as missing, in every column.
anii_df = anii_df.replace(r'^\s*$', np.nan, regex = True)

# Work out the finishing date of each project.
# '00.00.0000' is a placeholder rather than a date, so it becomes missing
# before the column is parsed as day.month.year.
anii_df['fecha de inicio'] = pd.to_datetime(
    anii_df['fecha de inicio'].replace('00.00.0000', np.nan),
    format = '%d.%m.%Y'
)
# Drop the trailing word of 'duracion' and read what is left as a number of
# months; values that cannot be parsed count as 0. Int64 is used instead of
# Int8 so that durations above 127 months are representable.
duration_months = (
    pd.to_numeric(anii_df['duracion']
                  .replace(r'\s\w*$', '', regex = True).str.strip(),
                  errors = 'coerce').fillna(0).astype('Int64')
)
# 'fecha fin' = 'fecha de inicio' + duration. A missing start date stays
# missing (NaT) after the offset is added.
anii_df['fecha fin'] = [
    start + pd.DateOffset(months = int(months))
    for start, months in zip(anii_df['fecha de inicio'], duration_months)
]
# Collapse runs of spaces in 'beneficiario' into a single space. The pattern
# must be treated as a regex: with regex = False only the literal two
# characters " +" would be replaced and repeated spaces would survive.
anii_df['beneficiario'] = (
    anii_df['beneficiario'].str.replace(' +', ' ', regex = True)
)

# One project can list several people separated by commas: keep them as a
# list so the column can be exploded later.
anii_df['beneficiario'] = anii_df['beneficiario'].str.split(',')
# Change dtype for 'anio' to nullable integer (Int32).
anii_df['anio'] = anii_df['anio'].astype('float').astype('Int32')

############################# SNI DATAFRAME ##################################
# 'nombre' comes in "Last Name, First Name" format. To join with the other
# datasets it has to be in "First Name Last Name" (full name) format.
# TODO: also clean accents with unicodedata (not done yet).
# First names: everything after the last comma.
sni_df['nombres'] = (
    sni_df['nombre'].replace(r'^.*\,', '', regex = True).str.strip()
)
# Collapse runs of spaces. The pattern must be a regex: with regex = False
# only the literal two characters " +" would be replaced.
sni_df['nombres'] = sni_df['nombres'].str.replace(' +', ' ', regex = True)
# Last names: everything before the first comma.
sni_df['apellidos'] = (
    sni_df['nombre'].replace(r'\,.*$', '', regex = True).str.strip()
)
sni_df['apellidos'] = sni_df['apellidos'].str.replace(' +', ' ', regex = True)
# Overwrite 'nombre' with the full name in "First Name Last Name" order.
sni_df['nombre'] = sni_df['nombres'] + ' ' + sni_df['apellidos']

############################## EI DATAFRAME ###################################
# Split 'periodo' into its two ends and parse both as year-month-day dates:
# the start is what precedes the first "- ", the end is what follows the
# last " -".
ei_df['fecha de inicio'] = pd.to_datetime(
    ei_df['periodo'].replace(r'\-\s.*$', '', regex = True).str.strip(),
    format = '%Y-%m-%d'
)
ei_df['fecha fin'] = pd.to_datetime(
    ei_df['periodo'].replace(r'^.*\s\-', '', regex = True).str.strip(),
    format = '%Y-%m-%d'
)
# Derive 'anio' from 'convocatoria' (before that column is cleaned below):
# strip the leading non-digits, a trailing parenthesised part and anything
# after " - ", then store the result as a nullable integer (Int32).
ei_df['anio'] = (
    ei_df['convocatoria']
    .replace(r'^\D*|\s\(.*\)$|\s\-\s.*$', '', regex = True)
    .str.strip()
    .astype('float')
    .astype('Int32')
)
# Remove the redundant leading word 'convocatoria' from every row.
ei_df['convocatoria'] = (
    ei_df['convocatoria']
    .replace(r'^convocatoria\s\-\s|^convocatoria\sal|^convocatoria\s',
             '', regex = True)
    .str.strip()
)
# 'enlace' holds, per row, the list ['url_grupo' value, 'url_convo' value].
ei_df['enlace'] = ei_df[['url_grupo', 'url_convo']].to_numpy().tolist()
# 'responsables' is read from the CSV as the text of a list; parse it back
# into real lists (missing values become empty lists) for later .explode.
ei_df['responsables'] = ei_df['responsables'].fillna('[]').map(literal_eval)
# Correct known mistakes in 'responsables':
#   * groups whose name contains 'cicea': "alejandro vazquez" should be
#     "alejandro vasquez" (the corrected name is appended at the end);
#   * the first group whose name contains 'cicada' is missing
#     "alvaro cabana fajardo".
# Rows are addressed by position through boolean masks, so the result does not
# depend on what the index labels are. A row without the misspelling, or the
# absence of any 'cicada' group, is skipped instead of raising.
is_cicea = (
    ei_df['grupo'].str.contains('cicea', na = False).to_numpy(dtype = bool)
)
cicada_rows = np.flatnonzero(
    ei_df['grupo'].str.contains('cicada', na = False).to_numpy(dtype = bool)
)
first_cicada = cicada_rows[0] if len(cicada_rows) else None

corrected = []
for position, names in enumerate(ei_df['responsables']):
    if isinstance(names, list):
        # Work on a copy so the original list objects are never mutated.
        names = list(names)
        if is_cicea[position] and 'alejandro vazquez' in names:
            names.remove('alejandro vazquez')
            names.append('alejandro vasquez')
        if first_cicada is not None and position == first_cicada:
            names.append('alvaro cabana fajardo')
    corrected.append(names)
# Build an object Series on the same index so every cell keeps its list.
ei_df['responsables'] = pd.Series(corrected, index = ei_df.index,
                                  dtype = object)

############################ CSIC DATAFRAME ###################################
# 'responsables' is read from the CSV as the text of a list. Parse it back
# into real lists, with missing values becoming empty lists, so the column
# can be passed to .explode later.
csic_df['responsables'] = (
    csic_df['responsables'].fillna('[]').map(literal_eval)
)
# Store the year column (named 'ano' in this dataset) as a nullable integer
# (Int32); the intermediate float cast is kept from the original code.
csic_df['ano'] = csic_df['ano'].astype('float').astype('Int32')

############################ MANUAL DATAFRAME ###################################
# Parse 'responsables' from the text of a list into a real list. Missing cells
# are turned into empty lists first, as is done for the CSIC and EI data:
# literal_eval raises on NaN, so one blank cell would otherwise stop the run.
manual_df['responsables'] = (
    manual_df['responsables'].fillna('[]').apply(literal_eval)
)
# Change dtype for 'anio' to nullable integer (Int32).
manual_df['anio'] = manual_df['anio'].astype('float').astype('Int32')

############################ RESEARCHERS DF ###################################
# Normalize every column except the last two: lowercase, strip accents (NFKD
# decomposition, then an ASCII encode that ignores what cannot be encoded)
# and trim surrounding whitespace.
for col in researchers_df.columns[:-2]:
    researchers_df[col] = (researchers_df[col]
                           .str.lower()
                           .str.normalize('NFKD')
                           .str.encode('ascii', 'ignore')
                           .str.decode('utf8')
                           .str.strip())

# Replace common names, which can be matched erroneously, with NaN.
to_repl = ['diego gonzalez', # previously "gonzalez garcia"
           'cecilia gomez', # empty-string problem, should be solved
           'ignacio rodriguez', # previously "monica da silva"
           'nicolas lopez', # empty string, should be solved
           'monica perez',
           'juan rodriguez',
           'sebastian morales',
           'daniel perez'] # added "daniel perez", empty-string situation
researchers_df = researchers_df.replace(to_repl, np.nan)

# Build one regex per researcher out of the name-variant columns in 'cols',
# joined with '|', so that any variant matches when "merging" dataframes
# (e.g. both 'alejandro maiche' and 'alejandro maiche marini').
# Each variant goes through re.escape so that a character such as '.' or '('
# inside a name is matched literally instead of being read as regex syntax.
# NOTE(review): patterns are not anchored, so a variant can also match inside
# a longer name -- confirm that this is acceptable.
cols = list(researchers_df.columns[5:-2])
researchers_df = researchers_df.fillna('')
researchers_df['regex'] = [
    '|'.join(re.escape(variant) for variant in variants if variant)
    for variants in researchers_df[cols].to_numpy()
]
# Turn empty strings back into NaN and drop the researchers left without any
# name variation (e.g.: 'cecilia gomez'); an empty pattern would otherwise
# match every string.
researchers_df = researchers_df.replace('', np.nan)
researchers_df = researchers_df.dropna(how = 'all', subset = cols)
# Parallel lists used later by .replace: pattern -> researcher id.
to_repl = researchers_df['regex'].to_list()
vals = researchers_df['id'].to_list()
# Drop columns that are not needed in the merge.
cols.extend(['grado', 'primer nombre', 'segundo nombre',
            'primer apellido', 'segundo apellido', 'regex'])
researchers_df = researchers_df.drop(columns = cols)

########################## REARRANGING COLUMNS ################################
def _lead_with(frame, first):
    """Return `frame` with the columns named in `first` placed first.

    The remaining columns follow in their existing order. Selecting a name
    that is not a column of `frame` raises KeyError.
    """
    trailing = [name for name in frame.columns if name not in first]
    return frame[first + trailing]


# Give the datasets a shared vocabulary of column names, then move the most
# relevant columns of each one to the front.
anii_renames = {'beneficiario': 'responsables',
                'instrumento': 'convocatoria',
                'codigo': 'id',
                'fase_estado': 'fase estado anii',
                'subsidio': 'monto',
                'proyecto': 'nombre de proyecto / grupo',
                'institucion': 'instituciones',
                'area': 'area anii',
                'sector': 'area'}
anii_df = anii_df.rename(columns = anii_renames)
cols = ['id', 'responsables', 'instituciones', 'convocatoria', 'anio',
        'fecha de inicio', 'fecha fin', 'duracion', 'area', 'monto']
anii_df = _lead_with(anii_df, cols)

csic_renames = {'ano': 'anio',
                'monto total': 'monto',
                'proyecto': 'nombre de proyecto / grupo',
                'programa': 'convocatoria',
                'area proyecto': 'area'}
csic_df = csic_df.rename(columns = csic_renames)
cols = ['responsables', 'convocatoria', 'anio', 'monto']
csic_df = _lead_with(csic_df, cols)

ei_renames = {'servicios involucrados': 'instituciones',
              'grupo': 'nombre de proyecto / grupo'}
ei_df = ei_df.rename(columns = ei_renames)
cols = ['responsables', 'instituciones', 'convocatoria', 'anio',
        'fecha de inicio', 'fecha fin', 'periodo']
ei_df = _lead_with(ei_df, cols)

sni_df = sni_df.rename(columns = {'nombre': 'nombre completo'})
cols = ['nombre completo', 'nombres', 'apellidos', 'nivel']
sni_df = _lead_with(sni_df, cols)

In [3]:
########################## PROJECTS MERGING ################################
# Add a column that identifies the funding party once the DFs are joined.
anii_df['institucion financiadora'] = 'anii'
csic_df['institucion financiadora'] = 'csic'
ei_df['institucion financiadora'] = 'ei'
# These two columns come from the ANII data; add them empty to the others.
for col in ['fase estado anii', 'area anii']:
    ei_df[col] = np.nan
    csic_df[col] = np.nan

cols =  ['responsables',
         'instituciones',
         'institucion financiadora',
         'convocatoria',
         'anio',
         'fecha de inicio',
         'fecha fin',
         'area',
         'monto',
         'nombre de proyecto / grupo',
         'enlace',
         'fase estado anii',
         'area anii'] # add anii id? and anii department?

# Drop the columns that will not be used in the joined DF.
anii_df = anii_df.drop(columns = [c for c in anii_df.columns if c not in cols])
csic_df = csic_df.drop(columns = [c for c in csic_df.columns if c not in cols])
ei_df = ei_df.drop(columns = [c for c in ei_df.columns if c not in cols])
# Stack all the projects (manual_df is concatenated with all of its columns).
projects_df = pd.concat([anii_df, csic_df, ei_df, manual_df],
                        ignore_index = True)

# Final "merge" that identifies researchers from IFyMP and assigns them an ID.
# TODO: review this step.
# One row per (project, responsible person).
projects_df = (
    projects_df[projects_df['responsables'].notna()]
    .explode('responsables', ignore_index = True)
)
# Substitute each matched name pattern with the researcher's id, then keep
# only the id token if any text is left around it. The pattern is a raw
# string: '\S' inside a normal string literal is an invalid escape sequence.
projects_df['id'] = projects_df['responsables'].replace(to_repl, vals,
                                                        regex = True)
projects_df['id'] = projects_df['id'].replace({r'(.*)(psi_ifmp\S*)(.*)': '\\2'},
                                              regex = True)
projects_df = pd.merge(projects_df, researchers_df, how = 'left', on = 'id')
# Filter by researchers from IFMP and save to Excel.
projects_df = projects_df[projects_df['id'].str.contains('psi', na = False)]
projects_df = projects_df.reset_index(drop = True)
projects_df.to_excel('investigacion.xlsx')

# Filter by most recent (and probably ongoing) research projects only.
ONGOING_AFTER = pd.Timestamp('2022-01-01')  # end date must be later than this
MIN_CALL_YEAR = 2017                        # 'anio' must be greater than this
# Compare real datetimes: if the concatenated 'fecha fin' ended up as an
# object column, comparing it with a string would fail. Values that cannot
# be parsed become NaT and are handled like a missing end date.
end_dates = pd.to_datetime(projects_df['fecha fin'], errors = 'coerce')
flter = (
    ((end_dates > ONGOING_AFTER) | end_dates.isna()) &
    (projects_df['anio'] > MIN_CALL_YEAR) &
    ((projects_df['area anii'] == 'investigacion') |
     projects_df['area anii'].isna()) &
    (projects_df['convocatoria'] != 'becas posgrado')
)
research_df = projects_df[flter].reset_index(drop = True)
research_df.to_excel('proyectos.xlsx')

############################# SNI MERGING #####################################
# "Merge" that identifies researchers from IFyMP in SNI (assigning an ID):
# substitute each matched name pattern with the researcher's id, then keep
# only the id token if any text is left around it.
# NOTE(review): 'nombre completo' is not lowercased or accent-stripped in the
# visible code, while the researchers' names are -- confirm that the SNI
# source is already normalised.
sni_df['id'] = sni_df['nombre completo'].replace(to_repl, vals, regex = True)
# Raw string: '\S' inside a normal string literal is an invalid escape.
sni_df['id'] = sni_df['id'].replace({r'(.*)(psi_ifmp\S*)(.*)': '\\2'},
                                    regex = True)
sni_df = pd.merge(sni_df, researchers_df, how = 'left', on = 'id')
# Filter by IFyMP researchers, then export to Excel.
ifmp_sni_df = sni_df[sni_df['id'].str.contains('psi', na = False)]
ifmp_sni_df = ifmp_sni_df.reset_index(drop = True)
ifmp_sni_df.to_excel('sni_ifmp.xlsx')