# Adding variables to model

In [1]:
import pandas as pd
import ETL as etl
import numpy as np

# Parameters
SECOPI_PROCESS_API = 'f789-7hwg'
SECOPII_PROCESS_API = 'jbjy-vk9h'
SECOPI_ADDITIONS_API = '7fix-nd37'
SECOPI_PUNISHMENT_API = '4n4q-k399'

pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

In [3]:
# General functions
import os

# Get query files function
def get_path(folder, file_path):
    current_directory = os.getcwd()
    return os.path.join(current_directory, '..', folder, file_path)


def get_query(folder, file_path):
    path = get_path(folder, file_path)
    # try to get the query
    with open(path, "r", encoding="utf8") as query_file:
        query = query_file.read()
    
    return query

def parse_to_list(ls):
    ids = [ str(i) for i in ls]
    ids = "'" + "','".join(ids) + "'"
    return ids

## Set Up

In [27]:
procesos_data = pd.read_csv(get_path('data', 'collected_obra_data.csv'),
                 dtype = {'ID_ADJUDICACION': str},
                 parse_dates=['START_DATE','CONTRACT_DATE'])

procesos_secop_i_data = procesos_data[-procesos_data['GROUP'].str.contains('SECOP II')].copy()

procesos_secop_i_data['HAVE_DEVIATION'] = (procesos_secop_i_data['COST_DEVIATION_NORM']  > 0) | (procesos_secop_i_data['TIME_DEVIATION'] > 0)
procesos_secop_i_data['HAVE_DEVIATION_COST'] = procesos_secop_i_data['COST_DEVIATION_NORM']  > 0
procesos_secop_i_data['HAVE_DEVIATION_TIME'] = procesos_secop_i_data['TIME_DEVIATION']  > 0

#Summary
def summary(x):
    data = {}

    data['unique contracts'] = x['CONTRACT_ID'].size
    data['value average'] = x['CONTRACT_VALUE_NORM'].mean()
    data['cost deviation'] = x['COST_DEVIATION_NORM'].mean()
    data['time duration avg'] = x['ORIGINAL_DEADLINE'].mean()
    data['time deviation'] = x['TIME_DEVIATION'].mean()

    return pd.Series(data)

# Impression
print("Project with Time/Cost Adition")
print("Both Adition 235 (47%)")
print("Cost Adition 144 (29%)")
print("Time Adition 203 (40%)")
variables = ['HAVE_DEVIATION','HAVE_DEVIATION_COST','HAVE_DEVIATION_TIME']
procesos_secop_i_data.groupby(variables)[variables].size()
procesos_secop_i_data.groupby(variables).apply(summary)

Project with Time/Cost Adition
Both Adition 235 (47%)
Cost Adition 144 (29%)
Time Adition 203 (40%)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,unique contracts,value average,cost deviation,time duration avg,time deviation
HAVE_DEVIATION,HAVE_DEVIATION_COST,HAVE_DEVIATION_TIME,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,False,False,232.0,2508.694979,0.0,116.590517,0.0
True,False,True,68.0,5407.885222,0.0,120.735294,0.903239
True,True,False,26.0,2908.961126,0.24639,139.076923,0.0
True,True,True,94.0,8895.008364,0.236668,194.148936,0.748937


## Variables before project

### Municipality Type breakdown

* The 5 & 6 type are the unique categories where the adition projects is under 50%
* The other category has the highest value average project, but seems a missing data because the principal departments are incluided there (Bogotá, Antioquia, etc)

In [5]:
procesos_secop_i_data.groupby(['MUNICIPALITY_TYPE', 'HAVE_DEVIATION']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
MUNICIPALITY_TYPE,HAVE_DEVIATION,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No Definido,False,1.0,50.0,0.0,0.0
OTHER,False,39.0,7339.387236,0.0,0.0
OTHER,True,48.0,18329.171949,0.149612,0.541861
TYPE_1,False,4.0,1823.422385,0.0,0.0
TYPE_1,True,10.0,9313.920153,0.151729,0.783472
TYPE_2,False,18.0,2669.198125,0.0,0.0
TYPE_2,True,11.0,5820.107434,0.165242,0.557143
TYPE_3,False,2.0,1681.319263,0.0,0.0
TYPE_3,True,4.0,1762.694557,0.171994,0.368056
TYPE_4,False,4.0,1665.05128,0.0,0.0


#### Detail for each adition type

In [6]:
procesos_secop_i_data.groupby(['MUNICIPALITY_TYPE', 'HAVE_DEVIATION_COST']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
MUNICIPALITY_TYPE,HAVE_DEVIATION_COST,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No Definido,False,1.0,50.0,0.0,0.0
OTHER,False,60.0,8208.420169,0.0,0.213438
OTHER,True,27.0,24945.597986,0.265977,0.489001
TYPE_1,False,10.0,6769.935226,0.0,0.503305
TYPE_1,True,4.0,8183.384702,0.379322,0.700417
TYPE_2,False,23.0,2610.003579,0.0,0.170911
TYPE_2,True,6.0,8672.777618,0.302943,0.36627
TYPE_3,False,2.0,1681.319263,0.0,0.0
TYPE_3,True,4.0,1762.694557,0.171994,0.368056
TYPE_4,False,6.0,1369.598505,0.0,0.211111


In [7]:
procesos_secop_i_data.groupby(['MUNICIPALITY_TYPE', 'HAVE_DEVIATION_TIME']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
MUNICIPALITY_TYPE,HAVE_DEVIATION_TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
No Definido,False,1.0,50.0,0.0,0.0
OTHER,False,43.0,7861.90895,0.025814,0.0
OTHER,True,44.0,18817.597067,0.137986,0.591121
TYPE_1,False,4.0,1823.422385,0.0,0.0
TYPE_1,True,10.0,9313.920153,0.151729,0.783472
TYPE_2,False,19.0,2599.435835,0.013634,0.0
TYPE_2,True,10.0,6267.746715,0.155861,0.612857
TYPE_3,False,3.0,2085.795504,0.010926,0.0
TYPE_3,True,3.0,1385.343414,0.218399,0.490741
TYPE_4,False,4.0,1665.05128,0.0,0.0


In [29]:
# procesos_secop_i_data[procesos_secop_i_data['MUNICIPALITY_TYPE'] == 'OTHER'].groupby(['DEPARTMENT']).size()
procesos_secop_i_data[procesos_secop_i_data['MUNICIPALITY_TYPE'] == 'OTHER']

Unnamed: 0,CONTRACT_ID,ENTITY_NAME,DEPARTMENT,MUNICIPALITY_TYPE,PROCESS_TYPE,CONTRACT_OBJECT,OBJETC_DETAIL,ESTIMATED_COST_ORIG,CONTRACT_VALUE_ORIG,ADDITIONAL_COST_ORIG,FINAL_COST_ORIG,YEAR,CONTRACT_DATE,START_DATE,ORIGINAL_DEADLINE,END_DATE,ID_ADJUDICACION,URLPROCESO,NIT_ENTIDAD,ID_CONTRATISTA,dpto_y_muni_contratista,MUNICIPALITY,ESTIMATED_COST_NORM,CONTRACT_VALUE_NORM,ADDITIONAL_COST_NORM,FINAL_COST_NORM,ADDITIONAL_TIME,FINAL_DEADLINE,PROJECT_INTENSITY_NORM,PROJECT_INTENSITY_ORIG,AWARD_GROWTH_NORM,AWARD_GROWTH_ORIG,COST_DEVIATION_ORIG,COST_DEVIATION_NORM,TIME_DEVIATION,OWNER,REGION,GROUP,COST_GROUP_QUANTILE,HAVE_DEVIATION,HAVE_DEVIATION_COST,HAVE_DEVIATION_TIME
8,17-1-169835-6445803,CALDAS - GOBERNACION,CALDAS,OTHER,LICITACIÓN PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",MEJORAMIENTO MEDIANTE LA CONSTRUCCION DE HUELLAS EN CONCRETO; RED VIAL DEPARTAMENTO DE CALDAS,3.189708e+09,5.248444e+08,0.000000e+00,5.248444e+08,2017,2017-06-23,2017-09-18,60,2017-11-18,6445803,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=17-1-169835'},890801052,75056638,Caldas,Manizales,4323.755187,711.444078,0.000000,711.444078,21,81,11.857401,8.747407e+06,-83.545690,-83.545690,0.000000,0.000000,0.350000,DEPARTMENT_GOVERNMENT,ANDINA,Vias Rurales hasta 2020,2,True,False,True
16,17-1-184716-7238984,TOLIMA - GOBERNACION,TOLIMA,OTHER,LICITACIÓN PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",PAVIMENTACION DE LA VIA HERVEO - DEL GADITAS DEL DEPARTAMENTO DEL TOLIMA,4.778678e+09,4.778678e+09,4.488455e+08,5.227523e+09,2018,2018-03-15,2018-03-15,120,2018-10-11,7238984,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=17-1-184716'},800113672,10545813,Tolima,Ibagué,6116.770405,6116.770405,574.528064,6691.298470,90,210,50.973087,3.982232e+07,0.000000,0.000000,0.093927,0.093927,0.750000,DEPARTMENT_GOVERNMENT,ANDINA,Vias Rurales hasta 2020,4,True,True,True
17,17-1-179303-6641358,TOLIMA - GOBERNACION,TOLIMA,OTHER,LICITACIÓN PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",MEJORAMIENTO Y PAVIMENTACION DE LA VIA SALADO  SAN BERNARDO EN ZONA RURAL DE IBAGUE EN EL DEPARTAMENTO DEL TOLIMA,1.287531e+10,1.284154e+10,5.652561e+09,1.849410e+10,2017,2017-11-16,2017-11-16,300,2019-07-24,6641358,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=17-1-179303'},800113672,901127921,Tolima,Ibagué,17452.913093,17407.138922,7662.234941,25069.373863,315,615,58.023796,4.280514e+07,-0.262272,-0.262272,0.440178,0.440178,1.050000,DEPARTMENT_GOVERNMENT,ANDINA,Vias Rurales hasta 2020,4,True,True,True
20,17-1-180555-6881863,CUNDINAMARCA - INSTITUTO DE INFRAESTRUCTURA Y CONCESIONES DE CUNDINAMARCA - ICCU,BOGOTÁ D.C.,OTHER,LICITACIÓN PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",REHABILITACION DE LA VIA CLUB EL BOSQUE - TIBACUY - CUMACA DEL DEPARTAMENTO DE CUNDINAMARCA,2.803724e+09,2.542991e+09,0.000000e+00,2.542991e+09,2018,2018-01-18,2018-01-31,180,2018-07-31,6881863,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=17-1-180555'},900258711,901145885,Cundinamarca,Bogotá D.C.,3588.803318,3255.061548,0.000000,3255.061548,0,180,18.083675,1.412773e+07,-9.299528,-9.299528,0.000000,0.000000,0.000000,DEPARTMENT_GOVERNMENT,ANDINA,Vias Rurales hasta 2020,4,False,False,False
29,18-21-3973-7834103,BOGOTA D.C. - ALCALDIA LOCAL DE SUMAPAZ,BOGOTÁ D.C.,OTHER,LICITACIÓN OBRA PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",CONTRATAR LAS OBRAS PARA LA CONSERVACION DE LA MALLA VIAL LOCAL DE SUMAPAZ; POR EL SISTEMA DE PRECIOS UNITARIOS FIJOS; SIN FORMULA DE REAJUSTE Y A MONTO AGOTABLE,1.462851e+10,1.462851e+10,2.220000e+09,1.684851e+10,2018,2018-10-24,2018-12-18,360,2020-06-18,7834103,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=18-21-3973'},No Definido,901224330,Bogotá D.C.,Bogotá D.C.,18724.690224,18724.690224,2841.629098,21566.319322,180,540,52.013028,4.063476e+07,0.000000,0.000000,0.151758,0.151758,0.500000,OTHER,ANDINA,Vias Rurales hasta 2020,4,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
389,21-21-22652-12281220,ANTIOQUIA - ALCALDIA MUNICIPIO DE ANDES,ANTIOQUIA,OTHER,LICITACIÓN OBRA PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",PAVIMENTACION DE VIAS RURALES MEDIANTE EL PROCESO CONSTRUCTIVO DE PLACA HUELLA EN EL SECTOR QUEBRADA ARRIBA ETAPA 1.,1.712466e+09,1.658997e+09,0.000000e+00,1.658997e+09,2021,2021-07-15,2022-08-05,120,2022-12-05,12281220,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=21-21-22652'},890.980.342-7,900439169,Antioquia,Andes,1884.883412,1826.031025,0.000000,1826.031025,0,120,15.216925,1.382497e+07,-3.122336,-3.122336,0.000000,0.000000,0.000000,DEPARTMENT_GOVERNMENT,ANDINA,Vias Rurales 2021-2023,3,False,False,False
403,21-21-24351-11519746,GUAVIARE - GOBERNACION,GUAVIARE,OTHER,LICITACIÓN OBRA PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",PAVIMENTACION DE LA VIA 7506 QUE COMUNICA A LOS MUNICIPIOS: SAN JOSE DEL GUAVIARE Y EL RETORNO  ETAPA - 2; GUAVIARE,5.721321e+10,5.711796e+10,0.000000e+00,5.711796e+10,2021,2021-10-25,2021-11-12,390,2023-02-12,11519746,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=21-21-24351'},800103196,901529899,Guaviare,San José del Guaviare,62973.658245,62868.817592,0.000000,62868.817592,60,450,161.202096,1.464563e+08,-0.166483,-0.166483,0.000000,0.000000,0.153846,DEPARTMENT_GOVERNMENT,AMAZONIA,Vias Rurales 2021-2023,4,True,False,True
405,21-21-23295-11466400,NORTE DE SANTANDER - AREA METROPOLITANA DE CUCUTA,NORTE DE SANTANDER,OTHER,LICITACIÓN OBRA PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",MANTENIMIENTO DE TRAMOS VIALES AFECTADOS AL CONTRATO DE CONCESION N 006-2007 SUSCRITO ENTRE EL INSTITUTO NACIONAL DE CONCESIONES - INCO - HOY AGENCIA NACIONAL DE INFRAESTRUCTURA - ANI - REVERTIDAS AL AREA METROPOLITANA DE CUCUTA CON LOS RECURSOS NO EJECUTADOS A LA FECHA DE LIQUIDACION DE COMUN ACUERDO ENTRE LS PARTES DEL CONTRATO DE CONCESION.,5.095300e+08,4.557373e+08,0.000000e+00,4.557373e+08,2021,2021-08-31,2021-09-21,120,2022-01-21,11466400,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=21-21-23295'},800153197,13478936,Norte De Santander,San José de Cúcuta,560.831516,501.622736,0.000000,501.622736,0,120,4.180189,3.797811e+06,-10.557320,-10.557320,0.000000,0.000000,0.000000,DEPARTMENT_GOVERNMENT,ANDINA,Vias Rurales 2021-2023,1,False,False,False
407,21-21-26789-11869730,CAUCA - ALCALDIA MUNICIPIO DE SUAREZ,CAUCA,OTHER,LICITACIÓN OBRA PÚBLICA,"TERRENOS, EDIFICIOS, ESTRUCTURAS Y VÍAS",MANTENIMIENTO Y MEJORAMIENTO DE VIAS RURALES DEL PROGRAMA COLOMBIA RURAL EN EL MUNICIPIO DE SUAREZ CAUCA,1.000000e+09,1.000000e+09,0.000000e+00,1.000000e+09,2021,2021-12-23,2022-01-04,7,2022-01-11,11869730,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=21-21-26789'},800117687-5,901550716,Cauca,Suárez,1100.683965,1100.683965,0.000000,1100.683965,0,7,157.240566,1.428571e+08,0.000000,0.000000,0.000000,0.000000,0.000000,DEPARTMENT_GOVERNMENT,PACIFICA,Vias Rurales 2021-2023,3,False,False,False


### Region breakdown
* The Andina region shows the most differents in the value of contrats and has the major number of projects
* The orinoquia have the inverse case of the region above

In [9]:
procesos_secop_i_data.groupby(['REGION', 'HAVE_DEVIATION']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
REGION,HAVE_DEVIATION,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AMAZONIA,False,8.0,1550.61898,0.0,0.0
AMAZONIA,True,5.0,16820.77584,0.079214,0.547436
ANDINA,False,153.0,1475.585004,0.0,0.0
ANDINA,True,136.0,6217.067733,0.150025,0.756031
CARIBE,False,23.0,2430.130826,0.0,0.0
CARIBE,True,11.0,6698.183718,0.215392,0.630664
ORINOQUIA,False,20.0,11645.81297,0.0,0.0
ORINOQUIA,True,22.0,10076.416089,0.154148,0.563961
OTRA,False,1.0,50.0,0.0,0.0
PACIFICA,False,27.0,2036.611449,0.0,0.0


#### Detail for each adition type

In [10]:
procesos_secop_i_data.groupby(['REGION', 'HAVE_DEVIATION_COST']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
REGION,HAVE_DEVIATION_COST,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AMAZONIA,False,9.0,8363.75216,0.0,0.017094
AMAZONIA,True,4.0,5308.765402,0.099018,0.645833
ANDINA,False,202.0,1689.398521,0.0,0.236824
ANDINA,True,87.0,8391.117427,0.234522,0.631975
CARIBE,False,26.0,3506.31705,0.0,0.158654
CARIBE,True,8.0,4801.098325,0.296164,0.351538
ORINOQUIA,False,27.0,12519.680547,0.0,0.098721
ORINOQUIA,True,15.0,7771.069239,0.226084,0.649444
OTRA,False,1.0,50.0,0.0,0.0
PACIFICA,False,35.0,2970.729754,0.0,0.189643


In [11]:
procesos_secop_i_data.groupby(['REGION', 'HAVE_DEVIATION_TIME']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
REGION,HAVE_DEVIATION_TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AMAZONIA,False,8.0,1550.61898,0.0,0.0
AMAZONIA,True,5.0,16820.77584,0.079214,0.547436
ANDINA,False,170.0,1662.746801,0.025083,0.0
ANDINA,True,119.0,6627.048413,0.135625,0.864036
CARIBE,False,25.0,2580.456216,0.016029,0.0
CARIBE,True,9.0,7229.069388,0.218733,0.770811
ORINOQUIA,False,23.0,10376.82815,0.03243,0.0
ORINOQUIA,True,19.0,11364.7561,0.13923,0.653008
OTRA,False,1.0,50.0,0.0,0.0
PACIFICA,False,31.0,1914.511722,0.032113,0.0


### Started Year breakdown

In [12]:
procesos_secop_i_data.groupby(['YEAR', 'HAVE_DEVIATION']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
YEAR,HAVE_DEVIATION,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,False,1.0,324.675323,0.0,0.0
2015,False,78.0,4068.234311,0.0,0.0
2015,True,59.0,13727.12588,0.146919,0.626928
2016,False,22.0,2044.900515,0.0,0.0
2016,True,25.0,4779.259903,0.251128,0.510058
2017,False,40.0,921.284578,0.0,0.0
2017,True,30.0,2093.159967,0.101,0.582368
2018,False,31.0,1326.660744,0.0,0.0
2018,True,24.0,4774.634391,0.227545,0.609422
2019,False,35.0,1771.777626,0.0,0.0


#### Detail for each adition type

In [13]:
procesos_secop_i_data.groupby(['YEAR', 'HAVE_DEVIATION_COST']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
YEAR,HAVE_DEVIATION_COST,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,False,1.0,324.675323,0.0,0.0
2015,False,99.0,5165.078819,0.0,0.189191
2015,True,38.0,16207.365792,0.228111,0.480495
2016,False,30.0,2472.802843,0.0,0.151852
2016,True,17.0,5310.895507,0.369306,0.482112
2017,False,56.0,1073.802454,0.0,0.17736
2017,True,14.0,2822.374621,0.216429,0.538492
2018,False,33.0,1284.471139,0.0,0.025253
2018,True,22.0,5151.370949,0.248231,0.626946
2019,False,45.0,1851.089842,0.0,0.112778


In [14]:
procesos_secop_i_data.groupby(['YEAR', 'HAVE_DEVIATION_TIME']).apply(summary)

Unnamed: 0_level_0,Unnamed: 1_level_0,unique contracts,value average,cost deviation,time deviation
YEAR,HAVE_DEVIATION_TIME,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2014,False,1.0,324.675323,0.0,0.0
2015,False,86.0,4053.907725,0.020642,0.0
2015,True,51.0,15266.404683,0.135157,0.72527
2016,False,26.0,1913.463326,0.051969,0.0
2016,True,21.0,5462.822022,0.23462,0.607212
2017,False,46.0,958.597484,0.023043,0.0
2017,True,24.0,2314.612411,0.082085,0.72796
2018,False,37.0,1933.350271,0.033342,0.0
2018,True,18.0,4676.874913,0.234858,0.812563
2019,False,35.0,1771.777626,0.0,0.0


### Punishment last year

In [15]:
ids_entidad = list(set(procesos_secop_i_data['NIT_ENTIDAD']))
ids_contratistas = list(set(procesos_secop_i_data['ID_CONTRATISTA']))

ids_entidad = parse_to_list(ids_entidad)
ids_contratistas = parse_to_list(ids_contratistas)

# Get data
query = get_query('bin/queries', 'request_punishment.sql')
query = query.format(NIT_ENTIDAD_LIST = ids_entidad, NIT_CONTRATISTAS_LIST = ids_contratistas)


temp_punishment_data = etl.extract_data(query, id_data=SECOPI_PUNISHMENT_API, api_key=None)
temp_punishment_data['PENALTY_DATE'] = pd.to_datetime(temp_punishment_data['PENALTY_DATE'])

temp_punishment_data.head()



El numero de contratos extraidos: 646


Unnamed: 0,PENALTY_ID,NIT_ENTIDAD,ID_CONTRATISTA,PENALTY_VALUE,PENALTY_DATE
0,147-2020,800100532-8,900.659.669-0,1658082,2021-05-06
1,SIN NUMERO,800100134-1,9001209218,74000000,2016-07-13
2,06-334-2015,890204646-3,900260321,999055,2017-12-18
3,06-045-2019,890204646-3,890935513,15903790,2019-06-26
4,ACTA CONJUNTA 088 DE 2015,890204646-3,800183770,1564734671,2020-12-11


In [16]:
# entidad
temp_punishment_entidad_data = pd.merge(procesos_secop_i_data[['CONTRACT_ID','NIT_ENTIDAD','CONTRACT_DATE','END_DATE']], temp_punishment_data, 
                                    how='inner', on='NIT_ENTIDAD')

def penalty_summary(x,pref):
    data = {}

    data['NUM_PENALTIES' + pref] = x['PENALTY_ID'].size
    #data['VAL_PENALTIES'] = x['PENALTY_VALUE'].sum()
    return pd.Series(data)

temp_punishment_entidad_last = temp_punishment_entidad_data[(temp_punishment_entidad_data['CONTRACT_DATE'] > temp_punishment_entidad_data['PENALTY_DATE']) &
                                                            (temp_punishment_entidad_data['PENALTY_DATE'] > (temp_punishment_entidad_data['CONTRACT_DATE'] - pd.DateOffset(years=1)))]\
                                                                .drop_duplicates().groupby('CONTRACT_ID').apply(penalty_summary, '_ENTIDAD_LAST_Y')

procesos_secop_i_data.reset_index(drop = True, inplace = True)
temp_punishment_entidad_last.reset_index(inplace = True)

procesos_secop_i_data = pd.merge(procesos_secop_i_data, 
                                 temp_punishment_entidad_last, 
                                 how='left', on='CONTRACT_ID')

In [17]:
def summary_punishment(x):
    data = {}
    data['unique contracts'] = x['CONTRACT_ID'].size
    data['value average'] = x['CONTRACT_VALUE_NORM'].mean()
    data['cost deviation'] = x['COST_DEVIATION_NORM'].mean()
    data['time deviation'] = x['TIME_DEVIATION'].mean()
    data['avg penalties to entity last year'] = x['NUM_PENALTIES_ENTIDAD_LAST_Y'].mean()
    data['var penalties to entity last year'] = x['NUM_PENALTIES_ENTIDAD_LAST_Y'].std()

    return pd.Series(data)

procesos_secop_i_data.groupby('HAVE_DEVIATION').apply(summary_punishment)

Unnamed: 0_level_0,unique contracts,value average,cost deviation,time deviation,avg penalties to entity last year,var penalties to entity last year
HAVE_DEVIATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,232.0,2508.694979,0.0,0.0,41.5,30.60112
True,188.0,6805.850907,0.152409,0.701172,48.653846,24.48337


### Time or cost deviation in the past year

In [28]:
# Get data
query = get_query('bin/queries', 'request_cost_and_time_deviation_historical.sql')
query = query.format(NIT_ENTIDAD_LIST = ids_entidad)

temp_adition_historical = etl.extract_data(query, id_data=SECOPI_PROCESS_API, api_key=None)

temp_adition_historical.head(10)



El numero de contratos extraidos: 35518


Unnamed: 0,NIT_ENTIDAD,MONTH_AT,PROJECTS,PROJECTS_W_COST_ADITION,PROJECTS_W_TIME_ADITION
0,591500887-4,2007-08-01T00:00:00.000,1,0,0
1,591500887-4,2008-01-01T00:00:00.000,1,0,0
2,591500887-4,2008-02-01T00:00:00.000,32,0,0
3,591500887-4,2008-03-01T00:00:00.000,23,0,0
4,591500887-4,2008-04-01T00:00:00.000,24,0,2
5,591500887-4,2008-05-01T00:00:00.000,25,0,4
6,591500887-4,2008-06-01T00:00:00.000,18,1,0
7,591500887-4,2008-07-01T00:00:00.000,11,0,0
8,591500887-4,2008-08-01T00:00:00.000,2,0,0
9,591500887-4,2008-09-01T00:00:00.000,2,1,0


In [None]:
temp_adition_historical['MONTH_AT'] = pd.to_datetime(temp_adition_historical['MONTH_AT'])
procesos_secop_i_data['CONTRACT_DATE'] = pd.to_datetime(procesos_secop_i_data['CONTRACT_DATE'])

merged_df = pd.merge(procesos_secop_i_data, temp_adition_historical, how='inner', on='NIT_ENTIDAD',
                     left_index=False, right_index=False, sort=False)

# Filter the merged DataFrame to include only rows where the date falls within the range of stated_at and ended_at columns
merged_df = merged_df[(merged_df['MONTH_AT'] >= (merged_df['CONTRACT_DATE'] - pd.DateOffset(years=1))) & 
                      (merged_df['MONTH_AT'] < merged_df['CONTRACT_DATE'])]

def proportion_cost_adition(x):
    data = {}

    data['cost'] = x['PROJECTS_W_COST_ADITION'].mean()
    data['time'] = x['PROJECTS_W_TIME_ADITION'].mean()
    data['total'] = x['PROJECTS'].mean()

    return pd.Series(data)

merged_df.groupby('CONTRACT_ID').apply(proportion_cost_adition)

## Variables during project

### First addition time

In [19]:
ids = list(set(procesos_secop_i_data['ID_ADJUDICACION']))

ids = parse_to_list(ids)

query = get_query('bin/queries', 'request_first_adding_date.sql').format(LIST_UID = ids)

temp_additions_data = etl.extract_data(query, id_data=SECOPI_ADDITIONS_API, api_key=None)

temp_additions_data.head()



El numero de contratos extraidos: 190


Unnamed: 0,ID_ADJUDICACION,NUM_ADDITION,NUM_ADDITION_VALUE,NUM_ADDITION_TIME,FIRST_ADDITION_AT,FIRST_ADDITION_TIME_AT,FIRST_ADDITION_VALUE_AT
0,10159407,4,0,4,2020-12-30T00:00:00.000,2020-12-30T00:00:00.000,
1,10285913,1,0,1,2021-08-25T00:00:00.000,2021-08-25T00:00:00.000,
2,10296834,2,0,2,2020-12-23T00:00:00.000,2020-12-23T00:00:00.000,
3,10326352,1,1,1,2021-10-15T00:00:00.000,2021-10-15T00:00:00.000,2021-10-15T00:00:00.000
4,10502654,3,1,3,2020-07-17T00:00:00.000,2020-07-17T00:00:00.000,2020-10-09T00:00:00.000


In [20]:
temp_additions_data['FIRST_ADDITION_AT'] = pd.to_datetime(temp_additions_data['FIRST_ADDITION_AT'])
procesos_secop_i_data = pd.merge(procesos_secop_i_data, temp_additions_data, 
                                    how='left', on='ID_ADJUDICACION')

procesos_secop_i_data['DAY_DIFF_FIRST_ADDITION'] = (procesos_secop_i_data['FIRST_ADDITION_AT'] - procesos_secop_i_data['START_DATE']) / np.timedelta64(1, 'D')
procesos_secop_i_data['RATIO_FIRST_ADDITION'] = procesos_secop_i_data['DAY_DIFF_FIRST_ADDITION'] / procesos_secop_i_data['ORIGINAL_DEADLINE']

bins = [0, 0.25, 0.5, 0.75, 1, float('inf')]  # Define your bin edges
labels = ['very early', 'early', 'medium', 'late', 'very late']  # Assign labels to bins

# Create a new categorical column based on the bins
procesos_secop_i_data['FIRST_ADITION_TIME_GROUP'] = pd.cut(procesos_secop_i_data['RATIO_FIRST_ADDITION'], bins=bins, labels=labels, right=False)

procesos_secop_i_data['FIRST_ADITION_TIME_GROUP'] = procesos_secop_i_data['FIRST_ADITION_TIME_GROUP'].cat.add_categories('NA').fillna('NA')

procesos_secop_i_data[['URLPROCESO','START_DATE', 'ORIGINAL_DEADLINE', 'FIRST_ADDITION_AT', 'DAY_DIFF_FIRST_ADDITION', 'FIRST_ADITION_TIME_GROUP']].head(5)

Unnamed: 0,URLPROCESO,START_DATE,ORIGINAL_DEADLINE,FIRST_ADDITION_AT,DAY_DIFF_FIRST_ADDITION,FIRST_ADITION_TIME_GROUP
0,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=17-1-182595'},2017-12-27,90,NaT,,
1,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=15-1-138089'},2015-07-06,120,2015-10-27,113.0,late
2,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=18-1-190669'},2018-08-27,90,2018-11-26,91.0,very late
3,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=17-1-173247'},2017-06-12,90,NaT,,
4,{'url': 'https://www.contratos.gov.co/consultas/detalleProceso.do?numConstancia=15-1-145573'},2015-10-01,90,NaT,,


In [21]:
procesos_secop_i_data[procesos_secop_i_data['HAVE_DEVIATION']].groupby(['FIRST_ADITION_TIME_GROUP']).apply(summary)

  procesos_secop_i_data[procesos_secop_i_data['HAVE_DEVIATION']].groupby(['FIRST_ADITION_TIME_GROUP']).apply(summary)


Unnamed: 0_level_0,unique contracts,value average,cost deviation,time deviation
FIRST_ADITION_TIME_GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
very early,5.0,1353.123036,0.258177,0.817778
early,10.0,3189.214315,0.235009,0.289722
medium,9.0,7484.867789,0.298815,0.302957
late,39.0,9753.318722,0.13701,0.716851
very late,123.0,6415.561352,0.135167,0.760031
,2.0,1992.46285,0.176871,0.333333


In [22]:
#procesos_secop_i_data['NUM_ADDITION'] = procesos_secop_i_data['NUM_ADDITION'].astype(int)
procesos_secop_i_data.groupby('NUM_ADDITION').apply(summary).sort_values(by='NUM_ADDITION', ascending=True)

Unnamed: 0_level_0,unique contracts,value average,cost deviation,time deviation
NUM_ADDITION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,107.0,3734.463812,0.131945,0.462906
12,1.0,29387.520087,0.531379,0.729825
2,38.0,7750.654848,0.156367,0.700186
3,24.0,3577.652793,0.112784,1.015005
4,10.0,6357.629727,0.278701,1.678333
5,4.0,35933.073845,0.24143,1.601322
6,3.0,38609.527421,0.129275,1.457826
7,1.0,48006.69303,0.29095,0.451613
9,2.0,50584.616279,0.461626,1.289345


### Punishment during the project

In [23]:
# entidad
temp_punishment_entidad_active = temp_punishment_entidad_data[(temp_punishment_entidad_data['END_DATE'] > temp_punishment_entidad_data['PENALTY_DATE']) &
                                                            (temp_punishment_entidad_data['PENALTY_DATE'] > temp_punishment_entidad_data['CONTRACT_DATE'])]\
                                                                .drop_duplicates().groupby('CONTRACT_ID').apply(penalty_summary, '_ENTIDAD_ACTIVE')

#union new rows
procesos_secop_i_data.reset_index(drop = True, inplace = True)
temp_punishment_entidad_active.reset_index(inplace = True)

procesos_secop_i_data = pd.merge(procesos_secop_i_data, 
                                 temp_punishment_entidad_active, 
                                 how='left', on='CONTRACT_ID')

def summary_punishment_v2(x):
    data = {}
    data['unique contracts'] = x['CONTRACT_ID'].size
    data['value average'] = x['CONTRACT_VALUE_NORM'].mean()
    data['cost deviation'] = x['COST_DEVIATION_NORM'].mean()
    data['time deviation'] = x['TIME_DEVIATION'].mean()
    data['avg penalties to entity during project'] = x['NUM_PENALTIES_ENTIDAD_ACTIVE'].mean()
    data['var penalties to entity during project'] = x['NUM_PENALTIES_ENTIDAD_ACTIVE'].std()

    return pd.Series(data)

procesos_secop_i_data.groupby('HAVE_DEVIATION').apply(summary_punishment_v2)

Unnamed: 0_level_0,unique contracts,value average,cost deviation,time deviation,avg penalties to entity during project,var penalties to entity during project
HAVE_DEVIATION,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
False,232.0,2508.694979,0.0,0.0,17.666667,12.627808
True,188.0,6805.850907,0.152409,0.701172,17.7,19.381725
