# Adding new samples to the main table

Procurement report data from SECOP I is extracted by sending requests to the Socrata API, so we can use several queries to get different sets of reports.

The query we have been working with returns the reports for tertiary roads; running the same code, we got 857 records.

In [1]:
# Parameters
# Dataset identifiers passed as `id_data` when requesting SECOP I / SECOP II records.
SECOPI_PROCESS_API = 'f789-7hwg'
SECOPII_PROCESS_API = 'jbjy-vk9h'

# Folder that holds the .sql query files.
# A forward slash is used on purpose: '\q' is an invalid escape sequence (it raises a
# SyntaxWarning on recent Python versions) and a backslash is a path separator only on
# Windows, whereas '/' is accepted on every platform.
QUERIES_FOLDER = 'bin/queries'

In [2]:
# General functions
import os
import pandas as pd
import numpy as np
import ETL as etl

# Get query files function
def get_path(folder, file_path):
    """Build the path of a project file relative to the working directory.

    The result is ``<cwd>/../<folder>/<file_path>``; the ``..`` segment is kept
    as-is (the path is not normalised).

    Parameters
    ----------
    folder : str
        Folder, relative to the parent of the current working directory.
    file_path : str
        File name (or relative path) inside ``folder``.

    Returns
    -------
    str
        The joined path.
    """
    segments = (os.getcwd(), '..', folder, file_path)
    return os.path.join(*segments)


def get_query(folder, file_path):
    """Return the full text of a query file.

    The file is located with ``get_path(folder, file_path)`` and read as UTF-8.

    Parameters
    ----------
    folder : str
        Folder that contains the query file (see ``get_path``).
    file_path : str
        Name of the query file.

    Returns
    -------
    str
        The complete contents of the file.
    """
    query_path = get_path(folder, file_path)
    with open(query_path, mode="r", encoding="utf8") as handle:
        return handle.read()


In [3]:
# Baseline dataset: "vias rurales" procurements requested from the SECOP I dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_rurales.sql')

# Extract the records and pass them directly through the cleaning process.
vias_rurales_data = etl.process_data(
    etl.extract_data(query, id_data=SECOPI_PROCESS_API, api_key=None)
)



El numero de contratos extraidos: 147


## Getting samples from several scenarios
Using the same process as above, we are going to get more records based on the different scenarios listed here:
* "Vias secundarias" projects
* Extend the timeline until end of 2023
* Records from SECOP II
* Other methods to capture the records of interest

### "Vias secundarias" projects
We ran a query with a new set of conditions to get this kind of record. As a result we got 41 extra records.
Note that a new step was added to remove the records that overlap between the first dataset and this one; it found 580 overlapping records.

In [4]:
# "Vias secundarias" procurements requested from the SECOP I dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_secundarias.sql')

vias_secundarias_data = etl.extract_data(query, id_data=SECOPI_PROCESS_API, api_key=None)

# apply the cleaning process
vias_secundarias_data = etl.process_data(vias_secundarias_data)

# Remove the records that are already in the first ("vias rurales") dataset.
# The ID collections stay plain lists because later cells concatenate them.
unique_contracts_vr = list(set(vias_rurales_data['CONTRACT_ID']))

# Membership is checked against a set: `x in some_list` rescans the whole list
# for every contract ID, a set lookup does not.
_known_ids = set(unique_contracts_vr)
unique_contracts_vs = [x for x in set(vias_secundarias_data['CONTRACT_ID'])
                       if x not in _known_ids]

vias_secundarias_data = vias_secundarias_data[vias_secundarias_data["CONTRACT_ID"].isin(unique_contracts_vs)]

print("Numero de contratos luego de remover overlap: {records}".format(records = len(vias_secundarias_data)))



El numero de contratos extraidos: 678
Numero de contratos luego de remover overlap: 678


## Extend the timeline until the end of 2023
Until now the queries have had a condition that only returns records from before Jan 2021. If we extend the timeline to Jan 2024, 90 more records are found for "vias rurales" and 7 for "vias secundarias".

In [5]:
# "Vias rurales" procurements from the second (t2) query file, SECOP I dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_rurales_t2.sql')

vias_ruralest2_data = etl.extract_data(query, id_data=SECOPI_PROCESS_API, api_key=None)

# apply the cleaning process
vias_ruralest2_data = etl.process_data(vias_ruralest2_data)

# Remove the records that are already in the two earlier datasets.
unique_contracts = unique_contracts_vr + unique_contracts_vs

# Membership is checked against a set: `x in some_list` rescans the whole list
# for every contract ID, a set lookup does not.
_known_ids = set(unique_contracts)
unique_contracts_vrt2 = [x for x in set(vias_ruralest2_data['CONTRACT_ID'])
                         if x not in _known_ids]

vias_ruralest2_data = vias_ruralest2_data[vias_ruralest2_data["CONTRACT_ID"].isin(unique_contracts_vrt2)]

print("Numero de contratos luego de remover overlap: {records}".format(records = len(vias_ruralest2_data)))



El numero de contratos extraidos: 20
Numero de contratos luego de remover overlap: 20


In [6]:
# "Vias secundarias" procurements from the second (t2) query file, SECOP I dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_secundarias_t2.sql')

vias_secundariast2_data = etl.extract_data(query, id_data=SECOPI_PROCESS_API, api_key=None)

# apply the cleaning process
vias_secundariast2_data = etl.process_data(vias_secundariast2_data)

# Remove the records that are already in the three earlier datasets.
# The list is rebuilt from its sources (instead of `unique_contracts + ...`) so that
# re-running this cell does not keep appending the same IDs to it.
unique_contracts = unique_contracts_vr + unique_contracts_vs + unique_contracts_vrt2

# Membership is checked against a set: `x in some_list` rescans the whole list
# for every contract ID, a set lookup does not.
_known_ids = set(unique_contracts)
unique_contracts_vst2 = [x for x in set(vias_secundariast2_data['CONTRACT_ID'])
                         if x not in _known_ids]

vias_secundariast2_data = vias_secundariast2_data[vias_secundariast2_data["CONTRACT_ID"].isin(unique_contracts_vst2)]

print("Numero de contratos luego de remover overlap: {records}".format(records = len(vias_secundariast2_data)))



El numero de contratos extraidos: 23
Numero de contratos luego de remover overlap: 23


## Records from SECOP II

In [7]:
# "Vias rurales" procurements requested from the SECOP II dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_rurales_sii.sql')
vias_ruralessii_data = etl.extract_data(query, id_data=SECOPII_PROCESS_API, api_key=None)

# The cleaning process only runs when the request returned at least one record;
# an empty result is kept exactly as extracted.
vias_ruralessii_data = (
    etl.process_data(vias_ruralessii_data)
    if len(vias_ruralessii_data) > 0
    else vias_ruralessii_data
)



El numero de contratos extraidos: 0


In [8]:
# "Vias rurales" procurements from the second (t2) query file, SECOP II dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_rurales_sii_t2.sql')
vias_ruralessiit2_data = etl.extract_data(query, id_data=SECOPII_PROCESS_API, api_key=None)

# The cleaning process only runs when the request returned at least one record;
# an empty result is kept exactly as extracted.
vias_ruralessiit2_data = (
    etl.process_data(vias_ruralessiit2_data)
    if len(vias_ruralessiit2_data) > 0
    else vias_ruralessiit2_data
)



El numero de contratos extraidos: 0


In [9]:
# "Vias secundarias" procurements requested from the SECOP II dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_secundarias_sii.sql')

vias_secundariassii_data = etl.extract_data(query, id_data=SECOPII_PROCESS_API, api_key=None)

if len(vias_secundariassii_data) > 0:

    # apply the cleaning process
    vias_secundariassii_data = etl.process_data(vias_secundariassii_data)

    # Collect the contracts already captured by the SECOP II "vias rurales" extracts.
    # Those extracts skip the cleaning process when they are empty, so one of them may
    # not expose a CONTRACT_ID column (assumption: verify against etl.extract_data);
    # such a frame contributes no IDs instead of raising KeyError.
    _rural_sii_ids = set()
    for _frame in (vias_ruralessii_data, vias_ruralessiit2_data):
        if 'CONTRACT_ID' in getattr(_frame, 'columns', ()):
            _rural_sii_ids.update(_frame['CONTRACT_ID'])
    unique_contracts = list(_rural_sii_ids)

    # Keep only the contracts that are not already in those extracts.
    unique_contracts_siivs = [x for x in set(vias_secundariassii_data['CONTRACT_ID'])
                              if x not in _rural_sii_ids]

    vias_secundariassii_data = vias_secundariassii_data[vias_secundariassii_data["CONTRACT_ID"].isin(unique_contracts_siivs)]

    print("Numero de contratos luego de remover overlap: {records}".format(records = len(vias_secundariassii_data)))



El numero de contratos extraidos: 0


In [10]:
# "Vias secundarias" procurements from the second (t2) query file, SECOP II dataset.
query = get_query(QUERIES_FOLDER, 'request_data_vias_secundarias_sii_t2.sql')

vias_secundariassiit2_data = etl.extract_data(query, id_data=SECOPII_PROCESS_API, api_key=None)

if len(vias_secundariassiit2_data) > 0:
    # apply the cleaning process
    vias_secundariassiit2_data = etl.process_data(vias_secundariassiit2_data)

    # Build the exclusion set explicitly. `unique_contracts` holds different IDs
    # depending on whether the previous cell's `if` branch ran, and it never contains
    # the SECOP II "vias secundarias" IDs, so the IDs of every earlier SECOP II extract
    # are added here. Extracts that were empty (and therefore not cleaned) may lack a
    # CONTRACT_ID column and are skipped.
    _known_ids = set(unique_contracts)
    for _frame in (vias_ruralessii_data, vias_ruralessiit2_data, vias_secundariassii_data):
        if 'CONTRACT_ID' in getattr(_frame, 'columns', ()):
            _known_ids.update(_frame['CONTRACT_ID'])

    # Keep only the contracts that have not been collected yet.
    unique_contracts_siivst2 = [x for x in set(vias_secundariassiit2_data['CONTRACT_ID'])
                                if x not in _known_ids]

    vias_secundariassiit2_data = vias_secundariassiit2_data[vias_secundariassiit2_data["CONTRACT_ID"].isin(unique_contracts_siivst2)]

    print("Numero de contratos luego de remover overlap: {records}".format(records = len(vias_secundariassiit2_data)))



El numero de contratos extraidos: 0


## Summary


No data from SECOP II matched the queries proposed for SECOP I. For the "vias rurales" category, 167 records were matched:

In [11]:
# Group data vias rurales
# assign() returns a new frame, so the label is never written into the filtered slice
# produced by the overlap removal (column assignment on such a slice raises pandas'
# SettingWithCopyWarning). Each name is rebound, so it still carries the GROUP column.
vias_rurales_data = vias_rurales_data.assign(GROUP='Vias Rurales hasta 2020')
vias_ruralest2_data = vias_ruralest2_data.assign(GROUP='Vias Rurales 2021-2023')

# The SECOP II extracts are not included: in the recorded run those requests
# returned 0 contracts.
collected_data_vias_rurales = pd.concat([vias_rurales_data, vias_ruralest2_data])

# Two exports of the same table: the "_excel" file uses ';' as separator and ',' as
# decimal mark, the other one uses the to_csv defaults.
collected_data_vias_rurales.to_csv(get_path('data', 'collected_vias_rurales_data_excel.csv'), index=False, sep=';', decimal=',')
collected_data_vias_rurales.to_csv(get_path('data', 'collected_vias_rurales_data.csv'), index=False)

def summary(x):
    """Aggregate the rows of one group into a single row of indicators.

    Parameters
    ----------
    x : pandas.DataFrame
        Rows of one group. Must contain the columns 'CONTRACT_ID',
        'CONTRACT_VALUE_NORM' and 'COST_DEVIATION_NORM'.

    Returns
    -------
    pandas.Series
        Indexed by 'unique contracts', 'value total', 'value average',
        'cost deviation total' and 'cost deviation average'.
    """
    contract_value = x['CONTRACT_VALUE_NORM']
    cost_deviation = x['COST_DEVIATION_NORM']

    return pd.Series({
        # nunique() counts each contract once even if it appears on several rows,
        # which is what the label states; .size counted rows.
        'unique contracts': x['CONTRACT_ID'].nunique(),
        'value total': contract_value.sum(),
        'value average': contract_value.mean(),
        'cost deviation total': cost_deviation.sum(),
        'cost deviation average': cost_deviation.mean(),
    })

# Summary by GROUP; left as the last expression of the cell so the table is displayed.
(
    collected_data_vias_rurales
    .groupby('GROUP')
    .apply(summary)
)

Unnamed: 0_level_0,unique contracts,value total,value average,cost deviation total,cost deviation average
GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Vias Rurales 2021-2023,20.0,74264.341723,3713.217086,0.681213,0.034061
Vias Rurales hasta 2020,147.0,246159.895237,1674.55711,10.159198,0.06911


In [12]:
# Group data vias secundarias
# assign() returns a new frame, so the label is never written into the filtered slice
# produced by the overlap removal (column assignment on such a slice raises pandas'
# SettingWithCopyWarning). Each name is rebound, so it still carries the GROUP column.
vias_secundarias_data = vias_secundarias_data.assign(GROUP='Vias secundarias hasta 2020')
vias_secundariast2_data = vias_secundariast2_data.assign(GROUP='Vias secundarias 2021-2023')

# The SECOP II extracts are not included: in the recorded run those requests
# returned 0 contracts.
collected_data_vias_secundarias = pd.concat([vias_secundarias_data, vias_secundariast2_data])

# Two exports of the same table: the "_excel" file uses ';' as separator and ',' as
# decimal mark, the other one uses the to_csv defaults.
collected_data_vias_secundarias.to_csv(get_path('data', 'collected_vias_secundarias_data_excel.csv'), index=False, sep=';', decimal=',')
collected_data_vias_secundarias.to_csv(get_path('data', 'collected_vias_secundarias_data.csv'), index=False)

# Summary by GROUP
collected_data_vias_secundarias.groupby('GROUP').apply(summary)

Unnamed: 0_level_0,unique contracts,value total,value average,cost deviation total,cost deviation average
GROUP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Vias secundarias 2021-2023,23.0,95691.32,4160.492081,3.793872,0.164951
Vias secundarias hasta 2020,678.0,4279713.0,6312.261549,49.583648,0.073132
