# Hackathon Oil&Gas: Data Warehouse

## Participants
- Cindy Ortega - cindy.or03@gmail.com
- Nicolás Bueno - nbuenoz@unal.edu.co
- Alejandro Uribe - jduriber@unal.edu.co

## Libraries

In [1]:
import sqlite3
import numpy as np
import pandas as pd
from utils import dataProc
from utils import downloadData
import matplotlib.pyplot as plt
from pathlib import Path
from IPython.display import display_html

## Downloading, Loading & Cleanning Data

In [2]:
# Scrappes and downloads the data from ANH website
download = downloadData.downloadData()
download.getData()

#Load Blind test data and Data
#data = dataProc.dataProc()
#df = data.loadData()
#df_dict_blind = data.loadBlindData()
#df_dict = data.cleanData(df)

Scrapping started
Scrapping Failed. Trying again
Scrapping started
Scrapping finished
['/Operaciones-Regalías-y-Participaciones/Sistema-Integrado-de-Operaciones/Documentos%20compartidos/Producción%20Fiscalizada%20Crudo%202020%20Agosto.xlsx', '/Operaciones-Regalías-y-Participaciones/Sistema-Integrado-de-Operaciones/Documentos%20compartidos/Producción%20Fiscalizada%20Crudo%202020%20Julio.xlsx', '/Operaciones-Regalías-y-Participaciones/Sistema-Integrado-de-Operaciones/Documents/Producción%20Fiscalizada%20Crudo%202020%20Junio.xlsx', '/Operaciones-Regalías-y-Participaciones/Sistema-Integrado-de-Operaciones/Documents/Producción%20Fiscalizada%20Crudo%202020%20Mayo.xlsx', '/Operaciones-Regalías-y-Participaciones/Sistema-Integrado-de-Operaciones/Documents/Producción%20Fiscalizada%20Crudo%202020%20Abril.xlsx', '/Operaciones-Regalías-y-Participaciones/Sistema-Integrado-de-Operaciones/Documents/Producción%20Fiscalizada%20Crudo%202020%20Marzo.xlsx', '/Operaciones-Regalías-y-Participaciones/Sistema-

TypeError: 'NoneType' object is not iterable

## Database
The data can be also read from a SQLite Database as follows.

In [None]:
db_dir=Path('./database/anh_data.db')
conn = sqlite3.connect(db_dir)
c = conn.cursor()

The tables' names stored in the database can be retrieved as follows:

In [4]:
c.execute('''SELECT name FROM sqlite_master WHERE type ='table' AND name NOT LIKE 'sqlite_%';''').fetchall()

[('crude_2017',),
 ('crude_2016',),
 ('crude_2013',),
 ('crude_2019',),
 ('crude_2015',),
 ('crude_2014',),
 ('crude_2018',),
 ('crude_2020',)]

The `crude_2018` table's headers ca be retrieved as follows:

In [5]:
c.execute('''PRAGMA table_info(crude_2019)''').fetchall()

[(0, 'departamento', 'TEXT', 0, None, 0),
 (1, 'municipio', 'TEXT', 0, None, 0),
 (2, 'operadora', 'TEXT', 0, None, 0),
 (3, 'contrato', 'TEXT', 0, None, 0),
 (4, 'campo', 'TEXT', 0, None, 0),
 (5, 'enero', 'TEXT', 0, None, 0),
 (6, 'febrero', 'TEXT', 0, None, 0),
 (7, 'marzo', 'TEXT', 0, None, 0),
 (8, 'abril', 'TEXT', 0, None, 0),
 (9, 'mayo', 'TEXT', 0, None, 0),
 (10, 'junio', 'TEXT', 0, None, 0),
 (11, 'julio', 'TEXT', 0, None, 0),
 (12, 'agosto', 'TEXT', 0, None, 0),
 (13, 'septiembre', 'TEXT', 0, None, 0),
 (14, 'octubre', 'TEXT', 0, None, 0),
 (15, 'noviembre', 'TEXT', 0, None, 0),
 (16, 'diciembre', 'TEXT', 0, None, 0)]

Other queries are shown below:

In [6]:
c.execute('''SELECT * FROM crude_2018 LIMIT 1''').fetchall()

[('antioquia',
  'PUERTO NARE',
  'ecopetrol s.a.',
  'operacion-directa ecopetrol',
  'area teca-cocorna',
  '1290.88838709677',
  '1232.14785714286',
  '1146.54709677419',
  '1183.47233333333',
  '1264.54193548387',
  '1201.59633333333',
  '1276.36161290323',
  '1326.87032258065',
  '1404.01266666667',
  '1218.92483870968',
  '1333.641',
  '1373.87677419355')]

For example, the top 5 departamentos which produced crude in January 2018.

In [7]:
c.execute('''SELECT departamento, SUM(enero) AS total FROM crude_2018 GROUP BY crude_2018.departamento 
                ORDER BY total DESC LIMIT 5''').fetchall()

[('meta', 425024.3983870972),
 ('casanare', 162805.02645161285),
 ('santander', 62680.61967741938),
 ('arauca', 46571.40838709677),
 ('putumayo', 36501.45064516129)]

## Blind test

In [11]:
df_dict_blind['2017'].head()

Unnamed: 0,DEPARTAMENTO,MUNICIPIO,OPERADORA,CONTRATO,CAMPO,ENERO,FEBRERO,MARZO,ABRIL,MAYO,JUNIO,JULIO,AGOSTO,SEPTIEMBRE,OCTUBRE,NOVIEMBRE,DICIEMBRE,CUENCA,EMPRESA
0,cf33cb8a,cf33cb8a,d5580f74,76a16657,1f d2689f,12371.083713,14146.300572,3994.040924,13955.411987,12916.569159,12134.962767,11616.805382,12365.190366,12010.883245,10913.532571,9918.024556,9141.248604,,
1,cf33cb8a,cf33cb8a,d5580f74,76a16657,9ac1420f,5.367156,0.0,58.617761,435.456412,322.485383,253.958847,233.769454,252.992992,284.516555,274.742251,41.468587,246.1525,,
2,cf33cb8a,cf33cb8a,d5580f74,76a16657,9b395bc9,120.462831,569.442204,226.753564,478.519945,548.853073,564.864503,458.804125,494.690402,432.01278,380.752349,493.309441,442.52726,,
3,cf33cb8a,cf33cb8a,d5580f74,29ded6f4,2f614c0b,23208.879752,22340.062006,1600.464821,21219.954457,22985.774451,23339.99972,19351.403126,22098.474846,22122.077469,21993.692529,20269.221852,22206.168757,,
4,cf33cb8a,cf33cb8a,d5580f74,29ded6f4,043b305e,4386.088856,4038.953896,186.762991,3757.329474,4340.204936,4265.247167,2957.583498,3792.334088,4064.682751,3756.061936,3999.688716,4103.523887,,


## Data Analysis

In [None]:
#Function to display datasets side by side to save vertical space
#Input: dataframes and tables(dfs) titles

def dpy_side_by_side(dfs:list, titles:list):
    
    output = ""
    combined = dict(zip(titles, dfs))
    for title, df in combined.items():
        output += df.style.set_table_attributes("style='display:inline'").set_caption(title)._repr_html_()
        output += "\xa0\xa0\xa0"
    display(HTML(output))

In [None]:
#Identify null values in the datasets
dpy_side_by_side([df_dict['2018'].isnull().sum().to_frame(), df_dict['2019'].isnull().sum().to_frame(), df_dict['2020'].isnull().sum().to_frame()],["2018","2019","2020"])

In [None]:
#Delete duplicates if there's any
df_dict['2018']=df_dict['2018'].drop_duplicates()
df_dict['2019']=df_dict['2019'].drop_duplicates()
df_dict['2020']=df_dict['2020'].drop_duplicates()

#### Top 5 fields with the highest production in 2020

In [11]:
#get location of january in the dataset 
January= df_dict['2020'].columns.get_loc("enero")
#Get total production by row
df_dict['2020']['Total_Prod']=df_dict['2020'].iloc[:,January::].fillna(0).sum(axis=1)

In [15]:
#Production 2020 by field
Prod_2020=df_dict['2020'].groupby('campo')['Total_Prod'].sum().reset_index(name ='Anual_Prod')
#Top 5 anual production by field
Highest_Prod=Prod_2020.sort_values(by=['Anual_Prod'], ascending=False).iloc[0:5,:].reset_index(drop=True)

#Visualization
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,4))  # 1 row, 2 columns
bbox=[0, 0, 1, 1]

#table visualization
mpl_table = ax1.table(cellText = Highest_Prod.values, rowLabels = Highest_Prod.index, bbox=bbox, colLabels=Highest_Prod.columns)
mpl_table.auto_set_font_size(False)
mpl_table.set_fontsize(font_size)

#bar plot visualization
ax1.axis('off')
font_size=14
Highest_Prod.plot(x='campo', y='Anual_Prod', kind="bar",figsize=(15,5), ax=ax2)
fig.tight_layout()  


Unnamed: 0,campo,Anual_Prod
313,rubiales,857001.07
73,castilla,537068.14
93,chichimene,388023.81
75,castilla norte,380005.77
298,quifa,304783.67


#### Companies who have reported production in more than 5 fields in Casanare in 2018

In [16]:
#Total production in 2018
Jan_2018= df_dict['2018'].columns.get_loc("enero")
df_dict['2018']['Total_Prod']=df_dict['2018'].iloc[:,Jan_2018::].fillna(0).sum(axis=1)

#Get fields with production different from 0 reported
Op_Field_2018=df_dict['2018'][df_dict['2018']['Total_Prod']!=0]

#Number of productive fields by companies
Op_Field_2018=df_dict['2018'].groupby(['operadora','departamento'])['campo'].count().reset_index(name ='#Prod_Fields')
Op_casanare_2018=Op_Field_2018[(Op_Field_2018['departamento']=='CASANARE')&(Op_Field_2018['#Prod_Fields']>5)]
casanare_2018=Op_casanare_2018.sort_values(by=['#Prod_Fields'], ascending=False).reset_index(drop=True)

#Visualization
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(15,8))  # 1 row, 2 columns
bbox=[0, 0, 1, 1]

#table visualization
font_size=10
mpl_table = ax1.table(cellText = casanare_2018.values, rowLabels = casanare_2018.index, bbox=bbox, colLabels=casanare_2018.columns)
mpl_table.auto_set_font_size(False)
mpl_table.set_fontsize(font_size)

#bar plot visualization
ax1.axis('off')
font_size=10
casanare_2018.plot(x='operadora', y='#Prod_Fields', kind="bar",figsize=(15,10), ax=ax2)
fig.tight_layout()  


operadora       0
departamento    0
#Prod_Fields    0
dtype: int64

Unnamed: 0,operadora,departamento,#Prod_Fields


#### Top 5 contracts with the highest production in 2018

In [17]:
#Get production by contracts 
Cont_Highest_Prod= df_dict['2018'].groupby('contrato')['Total_Prod'].sum().reset_index(name ='Anual_Prod')
#Get top 5 contracts with the highest production
Five_Hg_prod=Cont_Highest_Prod.sort_values(by=['Anual_Prod'], ascending=False).iloc[0:5].reset_index(drop=True)

#Visualization
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,4))  # 1 row, 2 columns
bbox=[0, 0, 1, 1]

#table visualization
font_size=11
mpl_table = ax1.table(cellText = Five_Hg_prod.values, rowLabels = Five_Hg_prod.index, bbox=bbox, colLabels=Five_Hg_prod.columns)
mpl_table.auto_set_font_size(False)
mpl_table.set_fontsize(font_size)

#bar plot visualization
ax1.axis('off')
font_size=11
Five_Hg_prod.plot(x='contrato', y='Anual_Prod', kind="bar",figsize=(15,7), ax=ax2)
fig.tight_layout()  

Unnamed: 0,contrato,Anual_Prod
44,cubarral,2177189.0
138,rubiales,1433612.0
91,lla 34,725579.6
130,quifa,554438.3
75,la cira infantas,542415.1


#### Top 10 companies with the highest production in August 2019

In [18]:
#Get production by company
Op_High_Prod_2019= df_dict['2019'].groupby('operadora')['agosto'].sum().reset_index(name ='Prod_Aug')
#Get top 10 companies with the highest production in Aug. 
Op_Hg_Pd_2019=Op_High_Prod_2019.sort_values(by=['Prod_Aug'], ascending=False).iloc[0:10].reset_index(drop=True)

#Visualization
fig, (ax1,ax2) = plt.subplots(1,2, figsize=(10,10))  # 1 row, 2 columns
bbox=[0, 0, 1, 1]

#table visualization
font_size=11
mpl_table = ax1.table(cellText = Op_Hg_Pd_2019.values, rowLabels = Op_Hg_Pd_2019.index, bbox=bbox, colLabels=Op_Hg_Pd_2019.columns)
mpl_table.auto_set_font_size(False)
mpl_table.set_fontsize(font_size)

#bar plot visualization
ax1.axis('off')
font_size=11
Op_Hg_Pd_2019.plot(x='operadora', y='Prod_Aug', kind="bar",figsize=(15,8), ax=ax2)
fig.tight_layout() 

Unnamed: 0,operadora,Prod_Aug
7,ecopetrol s.a.,473986.41
10,frontera energy colombia corp sucursal colombia,81441.68
11,geopark colombia s.a.s.,69154.42
24,occidental de colombia llc,53299.12
9,equion energía limited,34950.01
13,gran tierra energy colombia ltd,29555.2
21,mansarovar energy colombia ltd,25256.75
14,hocol s.a.,19614.02
28,parex resources colombia ltd. sucursal,15727.97
3,cepsa colombia s.a.,12702.09


### <center> Comparative Analysis </center> 

In [None]:
#Get first quarter information
Tri_one_2019= df_dict['2019'].iloc[:,0:8]
Tri_one_2019= Tri_one_2019.copy()
Tri_one_2020= df_dict['2020'].iloc[:,0:8]
Tri_one_2020=Tri_one_2020.copy()
#total production by field 1st quarter
Tri_one_2019['total']= Tri_one_2019.iloc[:,5::].sum(axis=1)
Tri_one_2020['total']= Tri_one_2020.iloc[:,5::].sum(axis=1)

#Get second quarter information
Tri_two_2019= pd.concat([df_dict['2019'].iloc[:,0:5], df_dict['2019'].iloc[:,8:11]], axis=1)
Tri_two_2019=Tri_two_2019.copy()
Tri_two_2020= pd.concat([df_dict['2020'].iloc[:,0:5], df_dict['2020'].iloc[:,8:11]], axis=1)
Tri_two_2020=Tri_two_2020.copy()
#total production by field 2nd quarter
Tri_two_2019['total']= Tri_two_2019.iloc[:,5::].sum(axis=1)
Tri_two_2020['total']= Tri_two_2020.iloc[:,5::].sum(axis=1)

In [None]:
#Display the null values in the datasets
dpy_side_by_side([Tri_one_2019.isnull().sum().to_frame(),Tri_one_2020.isnull().sum().to_frame(),Tri_two_2019.isnull().sum().to_frame(),Tri_two_2020.isnull().sum().to_frame() ],
                 ['Null 1st Tri 2019','Null 1st Tri 2020','Null 2nd Tri 2019','Null 2nd Tri 2020'])

In [None]:
#Display the max, min, mean, std, Q1, Q2 an Q3 of the total production 
dpy_side_by_side([Tri_one_2019.describe(), Tri_two_2019.describe(), Tri_one_2020.describe(),  
                  Tri_two_2020.describe()],['Prod 1st Tri 2019','Prod 2nd Tri 2019','Prod 1st Tri 2020','Prod 2nd Tri 2020'])

In [None]:
#Display the number of fields with zero production in the quarter
P0_2019_1=Tri_one_2019[Tri_one_2019['total']==0].count().to_frame()
P0_2020_1=Tri_one_2020[Tri_one_2020['total']==0].count().to_frame()
P0_2019_2=Tri_two_2019[Tri_two_2019['total']==0].count().to_frame()
P0_2020_2=Tri_two_2020[Tri_two_2019['total']==0].count().to_frame()

dpy_side_by_side([P0_2019_1,P0_2019_2,P0_2020_1,P0_2020_2],
                 ['# Fields Prod=0 1st Tri 2019','# Fields Prod=0 2nd Tri 2019','# Fields Prod=0 1st Tri 2020',
                  '# Fields Prod=0 2nd Tri 2020'])

Based on the description results and without further analysis, we could hypothesize that the current COVID-19 crisis affected the production since the maximum production went from approximately 361662 in 2019 to 303165 in 2020, as well as the mean production (from 5869 to 5468). The previous results have shown that even though in the description we can see a reduction in the fields from 2019 to 2020, those fields in 2019 were actually non-productive ones. As we can visualize, in 2019 there were 42 to 44 non-productive fields in the first and second quarters. On the other hand, you can see a recovery in the first quarter of 2020, and then relapse in the second quarter of the same year.

In [None]:
#Display Top 10 fields with the highest production in the quarters 
#The iloc position can be switched to [-50:-1] to check the lowest values during the first two quarters of 2019 and 2020
Top10field_2019_1= Tri_one_2019.groupby('campo')['total'].sum().reset_index(name ='Prod_field').sort_values(by=['Prod_field'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10field_2019_2=Tri_one_2020.groupby('campo')['total'].sum().reset_index(name ='Prod_field').sort_values(by=['Prod_field'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10field_2020_1=Tri_two_2019.groupby('campo')['total'].sum().reset_index(name ='Prod_field').sort_values(by=['Prod_field'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10field_2020_2=Tri_two_2020.groupby('campo')['total'].sum().reset_index(name ='Prod_field').sort_values(by=['Prod_field'], ascending=False).iloc[0:10].reset_index(drop=True)

#call function to display results 
dpy_side_by_side([Top10field_2019_1,Top10field_2019_2,Top10field_2020_1,Top10field_2020_2],
                 ['Top Fields 1st Tri 2019','Top Fields 2nd Tri 2019','Top Fields 1st Tri 2020',
                  'Top Fields 2nd Tri 2020'])

#visualization
fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2, figsize=(10,10))  # 1 row, 2 columns
font_size=11
Top10field_2019_1.plot(x='campo', y='Prod_field', kind="bar",figsize=(15,8), ax=ax1)
Top10field_2019_2.plot(x='campo', y='Prod_field', kind="bar",figsize=(15,8), ax=ax2)
Top10field_2020_1.plot(x='campo', y='Prod_field', kind="bar",figsize=(15,8), ax=ax3)
Top10field_2020_2.plot(x='campo', y='Prod_field', kind="bar",figsize=(15,8), ax=ax4)
fig.tight_layout() 

In [None]:
#Display Top 10 regions with the with the highest production in the quarters 
#The iloc position can be switched to [-50:-1] to check the lowest values during the first two quarters of 2019 and 2020
Top10dto_2019_1= Tri_one_2019.groupby('departamento')['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10dto_2019_2=Tri_one_2020.groupby('departamento')['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10dto_2020_1=Tri_two_2019.groupby('departamento')['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10dto_2020_2=Tri_two_2020.groupby('departamento')['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)

#Call funcion to display results
dpy_side_by_side([Top10dto_2019_1,Top10dto_2019_2,Top10dto_2020_1,Top10dto_2020_2],
                 ['Top dpto prod 1st Tri 2019','Top dpto prod 2nd Tri 2019','Top dpto prod 1st Tri 2020',
                  'Top dpto prod 2nd Tri 2020'])

#Visualization
fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2, figsize=(10,10))  # 1 row, 2 columns
font_size=11
Top10dto_2019_1.plot(x='departamento', y='Prod_dpto', kind="bar",figsize=(15,8), ax=ax1)
Top10dto_2019_2.plot(x='departamento', y='Prod_dpto', kind="bar",figsize=(15,8), ax=ax2)
Top10dto_2020_1.plot(x='departamento', y='Prod_dpto', kind="bar",figsize=(15,8), ax=ax3)
Top10dto_2020_2.plot(x='departamento', y='Prod_dpto', kind="bar",figsize=(15,8), ax=ax4)
fig.tight_layout() 

In [None]:
#Display Top 10 fields by region with the highest production in the quarters 
#The iloc position can be switched to [-50:-1] to check the lowest production during the first two quarters of 2019 and 2020
Top_DPtofield_2019_1= Tri_one_2019.groupby(['departamento','campo'])['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top_DPtofield_2019_2=Tri_one_2020.groupby(['departamento','campo'])['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top_DPtofield_2020_1=Tri_two_2019.groupby(['departamento','campo'])['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top_DPtofield_2020_2=Tri_two_2020.groupby(['departamento','campo'])['total'].sum().reset_index(name ='Prod_dpto').sort_values(by=['Prod_dpto'], ascending=False).iloc[0:10].reset_index(drop=True)

#Call function to display results 
dpy_side_by_side([Top_DPtofield_2019_1,Top_DPtofield_2019_2,Top_DPtofield_2020_1,Top_DPtofield_2020_2],
                 ['Top fields by dpto prod 1st Tri 2019','Top fields by dpto prod 2nd Tri 2019','Top fields by dpto prod 1st Tri 2020',
                  'Top fields by dpto prod 2nd Tri 2020']) 

In [None]:
#Display Top 10 towns with the highest production in the quarters 
#The iloc position can be switched to [-50:-1] to check the lowest values during the first two quarters of 2019 and 2020
Top10mpio_2019_1= Tri_one_2019.groupby('municipio')['total'].sum().reset_index(name ='Prod_mpio').sort_values(by=['Prod_mpio'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10mpio_2019_2=Tri_one_2020.groupby('municipio')['total'].sum().reset_index(name ='Prod_mpio').sort_values(by=['Prod_mpio'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10mpio_2020_1=Tri_two_2019.groupby('municipio')['total'].sum().reset_index(name ='Prod_mpio').sort_values(by=['Prod_mpio'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10mpio_2020_2=Tri_two_2020.groupby('municipio')['total'].sum().reset_index(name ='Prod_mpio').sort_values(by=['Prod_mpio'], ascending=False).iloc[0:10].reset_index(drop=True)

#Call function to display results 
dpy_side_by_side([Top10mpio_2019_1,Top10mpio_2019_2,Top10mpio_2020_1,Top10mpio_2020_2],
                 ['Top town prod 1st Tri 2019','Top town prod 2nd Tri 2019','Top town prod 1st Tri 2020',
                  'Top town prod 2nd Tri 2020'])

The department of Meta is one of the largest oil producer regions since it has the Rubiales, Castilla, Chichimene fields, among others, which are the fields with the highest production recorded in 2019 and 2020. The decrease in production in these fields is remarkable through the quarters, but it is not crucial for the survival of the fields, unlike fields like Volcanera in the department of Casanare, whose reduction in production was low from the second quarter of 2019 to 2020, and this reduction was crucial for the survival of the field as it finally reached 0% production in the second quarter of 2020.

The same situation persists with the following results: 

In [None]:
#Display Top 10 companies with the highest production in the quarters 
#The iloc position can be switched to [-50:-1] to check the lowest values during the first two quarters of 2019 and 2020
Top10op_2019_1= Tri_one_2019.groupby('operadora')['total'].sum().reset_index(name ='Prod_Op').sort_values(by=['Prod_Op'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10op_2019_2=Tri_one_2020.groupby('operadora')['total'].sum().reset_index(name ='Prod_Op').sort_values(by=['Prod_Op'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10op_2020_1=Tri_two_2019.groupby('operadora')['total'].sum().reset_index(name ='Prod_Op').sort_values(by=['Prod_Op'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10op_2020_2=Tri_two_2020.groupby('operadora')['total'].sum().reset_index(name ='Prod_Op').sort_values(by=['Prod_Op'], ascending=False).iloc[0:10].reset_index(drop=True)

#Call function to display results 
dpy_side_by_side([Top10op_2019_1,Top10op_2019_2,Top10op_2020_1,Top10op_2020_2],
                 ['Top companies 1st Tri 2019','Top companies 2nd Tri 2019','Top companies 1st Tri 2020',
                  'Top companies 2nd Tri 2020'])

In [None]:
#Display Top 10 contracts with the highest production in the quarters 
#The iloc position can be switched to [-50:-1] to check the lowest values during the first two quarters of 2019 and 2020
Top10ctto_2019_1=Tri_one_2019.groupby('contrato')['total'].sum().reset_index(name ='Prod_ctto').sort_values(by=['Prod_ctto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10ctto_2019_2=Tri_one_2020.groupby('contrato')['total'].sum().reset_index(name ='Prod_ctto').sort_values(by=['Prod_ctto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10ctto_2020_1=Tri_two_2019.groupby('contrato')['total'].sum().reset_index(name ='Prod_ctto').sort_values(by=['Prod_ctto'], ascending=False).iloc[0:10].reset_index(drop=True)
Top10ctto_2020_2=Tri_two_2020.groupby('contrato')['total'].sum().reset_index(name ='Prod_ctto').sort_values(by=['Prod_ctto'], ascending=False).iloc[0:10].reset_index(drop=True)

dpy_side_by_side([Top10ctto_2019_1,Top10ctto_2019_2,Top10ctto_2020_1,Top10ctto_2020_2],
                 ['Top contracts 1st Tri 2019','Top contracts 2nd Tri 2019','Top contracts 1st Tri 2020',
                  'Top contracts 2nd Tri 2020'])

#Visualization
fig, ((ax1,ax2),(ax3,ax4)) = plt.subplots(2,2, figsize=(10,10))  # 1 row, 2 columns
font_size=11
Top10ctto_2019_1.plot(x='contrato', y='Prod_ctto', kind="bar",figsize=(15,8), ax=ax1)
Top10ctto_2019_2.plot(x='contrato', y='Prod_ctto', kind="bar",figsize=(15,8), ax=ax2)
Top10ctto_2020_1.plot(x='contrato', y='Prod_ctto', kind="bar",figsize=(15,8), ax=ax3)
Top10ctto_2020_2.plot(x='contrato', y='Prod_ctto', kind="bar",figsize=(15,8), ax=ax4)
fig.tight_layout() 