In [None]:
import pandas as pd
import altair as alt
from glob import glob
#
import seaborn as sns
import matplotlib.pyplot as plt
#
from ipywidgets import interact

# INAPI source files. glob() returns paths in arbitrary order, so they are
# sorted to make the concatenation (and the preview below) reproducible.
APPLICATIONS = sorted(glob("../../data/INAPI/Applications/*"))
REGISTERS = sorted(glob("../../data/INAPI/Registers/*"))

# Regions kept for the macrozone analysis.
MACROZONE_REGIONS = ["Maule", "Bío Bío", "Ñuble", "Libertador B. O'Higgins"]


def load_macrozone(paths):
    """Read every Excel file in ``paths`` and keep only macrozone rows.

    Parameters
    ----------
    paths : list of str
        Excel files to read with ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        All files concatenated (fresh 0..n-1 index before filtering), with an
        added ``Year`` column, restricted to rows whose ``ApplicantRegion`` is
        in ``MACROZONE_REGIONS``.

    Raises
    ------
    FileNotFoundError
        If ``paths`` is empty, instead of pandas' less helpful
        "No objects to concatenate".
    """
    if not paths:
        raise FileNotFoundError("No INAPI files found; check the data directory")

    frames = [pd.read_excel(path) for path in paths]
    # ignore_index avoids duplicated index labels coming from the single files.
    data = pd.concat(frames, ignore_index=True)
    # Year = first four characters of the application number
    # (assumes ApplicationNumber starts with the filing year -- TODO confirm).
    data["Year"] = data["ApplicationNumber"].astype(str).str[0:4]
    return data[data["ApplicantRegion"].isin(MACROZONE_REGIONS)].copy()


df = load_macrozone(APPLICATIONS)
df2 = load_macrozone(REGISTERS)

df.head(2)

In [None]:
def normalize_applicants(names):
    """Return applicant names upper-cased, ASCII-folded and without dots.

    ``names`` is a string Series. The transformation is idempotent, so
    applying it again in a later cell does not change the result.
    """
    return (
        names.str.upper()
        .str.normalize('NFKD')
        .str.encode('ascii', errors='ignore')
        .str.decode('utf-8')
        # regex=False: the dot is a literal character, not "any character".
        .str.replace(".", "", regex=False)
    )


# Working copies: df_macrozone holds applications, df_macrozone2 holds registers.
df_macrozone = df.copy()
df_macrozone2 = df2.copy()

# Both frames are later filtered with the same upper-case ASCII institution
# names, so both need the same normalisation of the "Applicants" column.
df_macrozone["Applicants"] = normalize_applicants(df_macrozone["Applicants"])
df_macrozone2["Applicants"] = normalize_applicants(df_macrozone2["Applicants"])

In [None]:
# Normalise applicant names: upper case, strip accents (NFKD + ASCII), drop dots.
# regex=False makes the "." a literal dot regardless of the pandas version's
# default for `regex`.
df_macrozone["Applicants"] = (
    df_macrozone["Applicants"]
    .str.upper()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace(".", "", regex=False)
)

# Distinct application numbers per region and year.
stats = (
    df_macrozone.groupby(["ApplicantRegion", "Year"])
    .agg({"ApplicationNumber": "nunique"})
    .reset_index()
)

# Inhabitants per region.
# NOTE(review): figures look like the 2017 census -- confirm the source.
population = [
    {"ApplicantRegion": "Libertador B. O'Higgins", "Population": 914555},
    {"ApplicantRegion": "Maule", "Population": 1044950},
    {"ApplicantRegion": "Ñuble", "Population": 480609},
    {"ApplicantRegion": "Bío Bío", "Population": 1556805}
]

df_population = pd.DataFrame(population)

# Mean yearly number of applications per region over 2017-2019.
df_agg = (
    stats[stats["Year"].isin(["2017", "2018", "2019"])]
    .groupby(["ApplicantRegion"])
    .agg({"ApplicationNumber": "mean"})
    .reset_index()
)

# Applications per 100,000 inhabitants.
df_agg = df_agg.merge(df_population, on="ApplicantRegion")
df_agg["Per Capita"] = 100000 * df_agg["ApplicationNumber"] / df_agg["Population"]

df_agg

In [None]:
# Bar chart of applications per 100,000 inhabitants, one bar per region.
sns.set(style="whitegrid", rc={"figure.figsize": (11.7, 8.27)})
ax = sns.barplot(data=df_agg, x="ApplicantRegion", y="Per Capita")
ax.set_ylabel("Número de Aplicaciones c/100.000 habitantes")
ax.set_xlabel("Región")

In [None]:
# Applications are grouped by applicant and year; then the 20 applicant/year
# pairs with the most applications are kept.

# Work on a copy so this cell does not mutate df_macrozone.
df_macrozone_top = df_macrozone.copy()

# Same name normalisation as used for filtering (upper case, no accents, no dots).
# regex=False keeps "." a literal dot regardless of the pandas default.
df_macrozone_top['Applicants'] = (
    df_macrozone_top["Applicants"]
    .str.upper()
    .str.normalize('NFKD')
    .str.encode('ascii', errors='ignore')
    .str.decode('utf-8')
    .str.replace(".", "", regex=False)
)

stats2 = (
    df_macrozone_top.groupby(["Applicants", "Year"])
    .agg({"ApplicationNumber": "nunique"})
    .reset_index()
)

df_top = stats2.sort_values('ApplicationNumber', ascending=False).head(20)

df_top

# Nuevo bloque de Luis: creación de funciones que retornan un DataFrame de solicitudes o de registros


In [None]:
# To analyse collaborators instead, replace the groupby inside
# _yearly_unique_applications with:
# data = data.groupby(["Applicants", "Year"]).agg({"ApplicationNumber": "nunique"}).reset_index()
# This applies to both registers and applications.

# The institutions with the most applications are
# "UNIVERSIDAD DE CONCEPCION","UNIVERSIDAD DEL BIO BIO","CONTENEDORES SAN FERNANDO SPA", "MARIO GUERRERO MENDEZ", "VICTOR MANUEL ROSALES ITURRA"

def _yearly_unique_applications(frame, inst):
    """Count distinct application numbers per year for applicants matching ``inst``.

    ``inst`` is used as a regular expression against the "Applicants" column;
    rows with a missing applicant never match.
    """
    data = frame[frame["Applicants"].str.contains(inst, regex=True, na=False)]
    data = data.groupby(["Year"]).agg({"ApplicationNumber": "nunique"}).reset_index()
    return data.sort_values('Year', ascending=True)


def data_applicants(inst):
    """Yearly number of distinct applications filed by ``inst``.

    Reads ``df_macrozone``, the frame built from the Applications files.
    (It previously read the Registers frame, so the two series were swapped.)
    """
    return _yearly_unique_applications(df_macrozone, inst)


def data_registers(inst):
    """Yearly number of distinct registers granted to ``inst``.

    Reads ``df_macrozone2``, the frame built from the Registers files.
    """
    return _yearly_unique_applications(df_macrozone2, inst)
    
def merge_data(inst):
    """Build a per-year table of application and register counts for ``inst``.

    Parameters
    ----------
    inst : str
        Pattern passed to ``data_applicants`` and ``data_registers``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Year``, ``Applicant`` (the ``inst`` value), ``Applicants``
        (count from ``data_applicants``) and ``Registers`` (count from
        ``data_registers``), one row per year present in either source,
        sorted chronologically.
    """
    applications = data_applicants(inst)
    registers = data_registers(inst)

    # Pad each side with zero rows for the years only the other side has,
    # so the inner merge below does not lose any year.
    applications = AddYears0(registers, applications)
    registers = AddYears0(applications, registers)

    merged = pd.merge(applications, registers, on=['Year'])
    merged = merged.rename(columns={'ApplicationNumber_x': 'Applicants',
                                    'ApplicationNumber_y': 'Registers'})
    # AddYears0 prepends the padded years, so restore chronological order.
    merged = merged.sort_values('Year').reset_index(drop=True)
    merged.insert(1, 'Applicant', inst, allow_duplicates=False)

    return merged
    
# Improve merge: pad dataframeB with a zero-count row for every Year that exists in dataframeA but not in dataframeB.
# Warning: use only once per dataframe -- or -- call it only via the merge_data function
def AddYears0(dataframeA , dataframeB):
    """Pad ``dataframeB`` with zero-count rows for years only ``dataframeA`` has.

    Parameters
    ----------
    dataframeA : pandas.DataFrame
        Frame with a ``Year`` column; supplies the reference years.
    dataframeB : pandas.DataFrame
        Frame with ``Year`` and ``ApplicationNumber`` columns to be padded.

    Returns
    -------
    pandas.DataFrame
        The padding rows (``ApplicationNumber`` = 0, in ``dataframeA`` order)
        followed by the rows of ``dataframeB``, with a fresh 0..n-1 index.
        Neither input is modified.
    """
    # Vectorised membership test; works for any index on dataframeA, unlike
    # mixing index labels with positional list access.
    is_missing = ~dataframeA['Year'].isin(dataframeB['Year'])
    missing_years = dataframeA.loc[is_missing, 'Year'].tolist()

    if not missing_years:
        return dataframeB.reset_index(drop=True)

    padding = pd.DataFrame({'Year': missing_years, 'ApplicationNumber': 0})
    return pd.concat([padding, dataframeB]).reset_index(drop=True)

# Punto 1: Corregido ✅ ✅
#### Las dos siguientes celdas muestran los datos separados por región

In [None]:
# Regions used to split df_macrozone and df_macrozone2 (one entry per region).
zone_list = [
    'Bío Bío',
    'Maule',
    'Ñuble',
    "Libertador B. O'Higgins",
]

def transform(aux):
    """Return the Spanish label for a series name.

    'Registers' becomes 'Registro'; any other value becomes 'Solicitud'.
    """
    return 'Registro' if aux == 'Registers' else 'Solicitud'

def _yearly_counts(frame, zone_name, inst):
    """Distinct application numbers per year for one region and applicant pattern."""
    in_zone = frame[frame["ApplicantRegion"] == zone_name]
    matching = in_zone[in_zone["Applicants"].str.contains(inst, regex=True, na=False)]
    counts = matching.groupby(["Year"]).agg({"ApplicationNumber": "nunique"}).reset_index()
    return counts.sort_values('Year', ascending=True)


def get_zone(zone_name, inst):
    """Build the applications-vs-registers bar chart of ``inst`` for one region.

    Parameters
    ----------
    zone_name : str
        Value of ``ApplicantRegion`` to keep.
    inst : str
        Regular expression matched against the ``Applicants`` column.

    Returns
    -------
    altair.Chart or None
        Chart with one column per year and one bar per type
        ('Solicitud' / 'Registro'); ``None`` when neither frame has a
        matching row for this region.
    """
    zone_applications = _yearly_counts(df_macrozone, zone_name, inst)
    zone_registers = _yearly_counts(df_macrozone2, zone_name, inst)

    # Outer merge: a year that has applications but no registers (or the
    # other way round) must still be shown, with a zero for the missing side.
    # An inner merge would silently drop those years.
    merged = pd.merge(zone_applications, zone_registers, on=['Year'], how='outer')
    merged = merged.rename(columns={'ApplicationNumber_x': 'Applicants',
                                    'ApplicationNumber_y': 'Registers'})

    if merged.empty:
        # No data for this region.
        return None

    count_columns = ['Applicants', 'Registers']
    merged[count_columns] = merged[count_columns].fillna(0).astype(int)
    merged = merged.sort_values('Year').reset_index(drop=True)
    merged.insert(1, 'Applicant', inst, allow_duplicates=False)

    # Long ("tidy") format: one row per (year, type), as the chart expects.
    tidy = pd.melt(frame=merged, id_vars=['Applicant', 'Year'],
                   value_vars=['Applicants', 'Registers'],
                   var_name='Type', value_name='ApplicationNumber')
    tidy['Type'] = tidy['Type'].apply(transform)

    grafico = alt.Chart(tidy).mark_bar().encode(
        x=alt.X('Type:O', title=None, axis=None),
        y=alt.Y('ApplicationNumber:Q', title=None),
        color=alt.Color('Type:N', legend=alt.Legend(title="Tipo"), scale=alt.Scale(scheme='set1')),
        column=alt.Column('Year:N', header=alt.Header(title=None))
    ).properties(
        title=[zone_name, ""]
    )

    return grafico

def getZonesChartsByInstitution(institucion):
    """Combine the per-region charts of one institution into a single figure.

    Parameters
    ----------
    institucion : str
        Institution pattern forwarded to ``get_zone`` for every entry of
        ``zone_list``.

    Returns
    -------
    altair chart or None
        Region charts laid out two per row (rows stacked vertically), with
        shared y scale inside each row. ``None`` (after printing a message)
        when no region has data, instead of failing on an unbound variable.
    """
    charts = []
    for zone_name in zone_list:
        zone_chart = get_zone(zone_name, institucion)
        if zone_chart is not None:
            charts.append(zone_chart)

    if not charts:
        print("No hay datos de solicitudes ni registros para " + institucion)
        return None

    # Two charts per row; works for any number of regions.
    charts_per_row = 2
    rows = [
        alt.hconcat(*charts[start:start + charts_per_row]).resolve_scale(y='shared')
        for start in range(0, len(charts), charts_per_row)
    ]
    deploy = rows[0] if len(rows) == 1 else alt.vconcat(*rows)

    return deploy.configure_legend(
        titleFontSize=18,
        labelFontSize=14
    ).configure_title(
        fontSize=20,
        font='Courier',
        anchor='middle',
        color='gray'
    ).properties(
        title=(["Registros vs Solicitudes de Patentes", institucion, "   "])
    )

In [None]:
# Institutions offered in the dropdown. Add entries manually, or look them up
# in the toConcatenate DataFrame of collaborators.
lista_instituciones = [
    'UNIVERSIDAD DE CONCEPCION',
    'UNIVERSIDAD DEL BIO BIO',
    'UNIVERSIDAD DE TALCA',
    'UNIVERSIDAD DE LA SERENA',
    'UNIVERSIDAD CATOLICA DE TEMUCO',
]

# Re-renders the per-region charts whenever another institution is selected.
interact(getZonesChartsByInstitution, institucion=lista_instituciones)

# Punto 2 a corregir
## Corregido ✅
#### Gráficos de conteo por Región
#### Conteo para Applicants 

In [None]:
# Bar chart of row counts per year, coloured by region, with a hover tooltip.
chart = (
    alt.Chart(df_macrozone[["ApplicantRegion", "Year"]])
    .mark_bar()
    .encode(
        x=alt.X('Year:O', title=None),
        y=alt.Y('count()', title='Solicitudes'),
        color=alt.Color(
            'ApplicantRegion:N',
            legend=alt.Legend(title="Región"),
            scale=alt.Scale(scheme='dark2'),
        ),
        tooltip=[
            alt.Tooltip('Year'),
            alt.Tooltip('ApplicantRegion', title='Región'),
            alt.Tooltip('count()', title='N° de Applicants'),
        ],
    )
    .configure_legend(titleFontSize=18, labelFontSize=14)
    .configure_axis(labelFontSize=14, titleFontSize=16)
    .configure_title(fontSize=20, font='Courier', anchor='middle', color='gray')
    .properties(title=(["Solicitudes por Región", "   "]))
)

chart