## Este notebook explora los datos de producción de Oil & Gas en Colombia en 2019.

Este notebook hace un análisis descriptivo, explorando los datos de producción de Oil & Gas de 2019 recopilados de https://datos.gov.co/. El notebook resuelve las siguientes preguntas básicas para conocer más del sector de hidrocarburos en Colombia.

- ¿Cuáles departamentos producen más crudo y gas?

- ¿Cuál fue el promedio de producción de crudo y gas de los últimos 5 años?

This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported.

In [0]:
# ------------------- ¿Cuáles son los departamentos 'petroleros' y 'gasíferos' por excelencia en Colombia? ----------------------------------------------#

In [0]:
# Analizamos la producción de gas total y promedio en 2019

In [0]:
# Path of the 2019 gas production file and the reader format to use for it.
file_location = "/FileStore/tables/Producci_n_Fiscalizada_de_Gas_2019.csv"
file_type = "csv"

# Reader settings: infer column types, treat the first row as the header,
# and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These settings apply to CSV input; for other file types they are ignored.
df = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df)

Departamento,Municipio,Latitud,Longitud,Geolocalizacion,Operadora,Contrato,Campo,Enero,Febrero,Marzo,Abril,Mayo,Junio,Julio,Agosto,Septiembre,Octubre,Noviembre,Diciembre
ANTIOQUIA,PUERTO NARE,6.126539139,-74.70317722,POINT (-74.70317722 6.126539139),ECOPETROL S.A.,TECA COCORNA,AREA TECA-COCORNA,0.21,0.2,0.2,0.19,0.21,0.2,0.2,0.2,0.22,0.21,0.23,0.23
ANTIOQUIA,PUERTO NARE,6.126539139,-74.70317722,POINT (-74.70317722 6.126539139),MANSAROVAR ENERGY COLOMBIA LTD,NARE,NARE SUR,0.03,0.03,0.04,0.04,0.05,0.05,0.04,0.04,0.04,0.04,0.05,0.04
ANTIOQUIA,PUERTO NARE,6.126539139,-74.70317722,POINT (-74.70317722 6.126539139),MANSAROVAR ENERGY COLOMBIA LTD,NARE,UNDERRIVER,0.14,0.19,0.17,0.2,0.22,0.21,0.18,0.17,0.18,0.19,0.19,0.17
ANTIOQUIA,PUERTO TRIUNFO,5.954830151,-74.6861918,POINT (-74.6861918 5.954830151),ECOPETROL S.A.,TECA COCORNA,AREA TECA-COCORNA,0.07,0.07,0.07,0.07,0.06,0.06,0.05,0.05,0.04,0.04,0.04,0.03
ANTIOQUIA,RIO NEGRO,6.150785807,-75.41076052,POINT (-75.41076052 6.150785807),PAREX RESOURCES COLOMBIA LTD. SUCURSAL,BORANDA,BORANDA,0.0,0.0,0.0,0.0,0.0,0.01,0.01,0.03,0.04,0.0,0.0,0.0
ANTIOQUIA,YONDO,6.925159318,-74.15824041,POINT (-74.15824041 6.925159318),ECOPETROL S.A.,MAGDALENA MEDIO,CASABE,0.57,0.57,0.59,0.56,0.51,0.52,0.5,0.42,0.38,0.43,0.52,0.54
ANTIOQUIA,YONDO,6.925159318,-74.15824041,POINT (-74.15824041 6.925159318),ECOPETROL S.A.,MAGDALENA MEDIO,CASABE SUR,0.59,0.8,0.86,0.86,0.7,0.5,0.57,0.34,0.65,0.79,0.77,0.78
ANTIOQUIA,YONDO,6.925159318,-74.15824041,POINT (-74.15824041 6.925159318),ECOPETROL S.A.,MAGDALENA MEDIO,PEÑAS BLANCAS,0.21,0.21,0.18,0.19,0.2,0.19,0.19,0.1,0.17,0.18,0.17,0.19
ARAUCA,ARAUCA,6.796280825,-70.50921153,POINT (-70.50921153 6.796280825),OCCIDENTAL DE COLOMBIA LLC,CHIPIRÓN,CHIPIRÓN,0.38,0.35,0.36,0.39,0.31,0.34,0.2,0.35,0.32,0.36,0.38,0.4
ARAUCA,ARAUCA,6.796280825,-70.50921153,POINT (-70.50921153 6.796280825),OCCIDENTAL DE COLOMBIA LLC,CHIPIRÓN,GALEMBO,0.01,0.01,0.01,0.01,0.01,0.01,0.0,0.01,0.01,0.01,0.01,0.0


In [0]:
# Print the column names and inferred types of the gas DataFrame.
df.printSchema()

root
 |-- Departamento: string (nullable = true)
 |-- Municipio: string (nullable = true)
 |-- Latitud: double (nullable = true)
 |-- Longitud: double (nullable = true)
 |-- Geolocalizacion: string (nullable = true)
 |-- Operadora: string (nullable = true)
 |-- Contrato: string (nullable = true)
 |-- Campo: string (nullable = true)
 |-- Enero: double (nullable = true)
 |-- Febrero: double (nullable = true)
 |-- Marzo: double (nullable = true)
 |-- Abril: double (nullable = true)
 |-- Mayo: double (nullable = true)
 |-- Junio: double (nullable = true)
 |-- Julio: double (nullable = true)
 |-- Agosto: double (nullable = true)
 |-- Septiembre: double (nullable = true)
 |-- Octubre: double (nullable = true)
 |-- Noviembre: double (nullable = true)
 |-- Diciembre: double (nullable = true)



In [0]:
from pyspark.sql.functions import *
from pyspark.sql import functions as F
# Add the twelve monthly columns row by row, then sum that value over every
# row of `df` to obtain a single grand total.
# The result column is named with .alias() instead of withColumnRenamed():
# the rename depended on reproducing Spark's auto-generated column name
# character for character and silently does nothing when that text differs.
totalGasProduction = df.agg(
    F.sum(
        df.Enero + df.Febrero + df.Marzo + df.Abril + df.Mayo + df.Junio
        + df.Julio + df.Agosto + df.Septiembre + df.Octubre + df.Noviembre
        + df.Diciembre
    ).alias("GasTotals")
)

In [0]:
# Scale the grand total by 30 and then by 1000.
# NOTE(review): 30 is presumably an approximate days-per-month factor and 1000
# a unit conversion - TODO confirm against the dataset's units.
# This cell reassigns its own input, so running it twice scales the value twice.
totalGasProduction = totalGasProduction.withColumn(
    "GasTotals",
    F.col("GasTotals") * 30 * 1000,
)

In [0]:
# Print the single-row result holding the scaled gas grand total.
totalGasProduction.show()

+-------------------+
|          GasTotals|
+-------------------+
|7.760814000000005E8|
+-------------------+



In [0]:
# Per-department total: add the twelve monthly columns for each row, sum that
# value within each "Departamento", and sort from largest to smallest.
# The aggregate is named with .alias() rather than withColumnRenamed(), which
# relied on matching Spark's auto-generated column name exactly and is a
# silent no-op when it does not match (the orderBy would then fail).
totalGasProductionperDept = (
    df.groupBy("Departamento")
    .agg(
        F.sum(
            df.Enero + df.Febrero + df.Marzo + df.Abril + df.Mayo + df.Junio
            + df.Julio + df.Agosto + df.Septiembre + df.Octubre + df.Noviembre
            + df.Diciembre
        ).alias("GasTotalsperDept")
    )
    .orderBy(F.col("GasTotalsperDept").desc())
)

In [0]:
# Apply the same 30 * 1000 scaling to every department total.
# NOTE(review): the meaning of the factors 30 and 1000 is not stated in the
# notebook - presumably days per month and a unit conversion; TODO confirm.
# This cell reassigns its own input, so re-running it scales the values again.
totalGasProductionperDept = totalGasProductionperDept.withColumn(
    "GasTotalsperDept",
    F.col("GasTotalsperDept") * 30 * 1000,
)

In [0]:
# Print the first 5 rows; the frame was sorted by total in descending order,
# so these are the five departments with the largest gas totals.
totalGasProductionperDept.show(5)

+------------------+--------------------+
|      Departamento|    GasTotalsperDept|
+------------------+--------------------+
|          CASANARE|          5.597055E8|
|           GUAJIRA| 6.707339999999999E7|
|             SUCRE|             3.555E7|
|           CORDOBA|3.3362399999999993E7|
|         SANTANDER|1.4264999999999998E7|
|   DEPARTAMENTO NN|           1.27998E7|
|             CESAR|           8056800.0|
|            BOYACA|   6444599.999999998|
|              META|           6157800.0|
|         ATLANTICO|   6148200.000000001|
|         MAGDALENA|   5154900.000000001|
|             HUILA|   5017500.000000001|
|          PUTUMAYO|  3843899.9999999995|
|            TOLIMA|           3598500.0|
|            ARAUCA|  3257700.0000000005|
|           BOLIVAR|           2289300.0|
|NORTE DE SANTANDER|  1584000.0000000002|
|         ANTIOQUIA|   673199.9999999999|
|            NARIÑO|            630000.0|
|      CUNDINAMARCA|  468900.00000000006|
+------------------+--------------------+

In [0]:
# Mean, over all rows of `df`, of the row-wise sum of the twelve monthly columns.
# The aggregate is named with .alias() instead of withColumnRenamed(): the
# rename only worked if Spark's auto-generated column name was reproduced
# exactly, and it silently does nothing otherwise.
AvgGasProductionRate = df.agg(
    F.avg(
        df.Enero + df.Febrero + df.Marzo + df.Abril + df.Mayo + df.Junio
        + df.Julio + df.Agosto + df.Septiembre + df.Octubre + df.Noviembre
        + df.Diciembre
    ).alias("GasavgRate")
)

In [0]:
# Multiply the mean by 338 * 1000 and divide by 12 (the number of monthly columns).
# NOTE(review): 338 is presumably the number of rows in `df`, which would turn
# the mean back into a sum - TODO confirm and derive it from the data instead
# of hard-coding it. The factor 1000 is presumably a unit conversion.
# This cell reassigns its own input, so re-running it rescales the value again.
AvgGasProductionRate = AvgGasProductionRate.withColumn(
    "GasavgRate",
    F.col("GasavgRate") * (338 * 1000) / 12,
)

In [0]:
# Print the single-row result holding the rescaled gas average.
AvgGasProductionRate.show()

+-----------------+
|       GasavgRate|
+-----------------+
|2155781.666666668|
+-----------------+



In [0]:
# Per-department mean of the row-wise sum of the twelve monthly columns,
# sorted from largest to smallest.
# .alias() names the aggregate directly; the previous withColumnRenamed()
# depended on Spark's auto-generated column name and silently did nothing
# when that text did not match (the orderBy would then fail).
AvgGasProductionRatePerDept = (
    df.groupBy("Departamento")
    .agg(
        F.avg(
            df.Enero + df.Febrero + df.Marzo + df.Abril + df.Mayo + df.Junio
            + df.Julio + df.Agosto + df.Septiembre + df.Octubre + df.Noviembre
            + df.Diciembre
        ).alias("GasavgRatePerDept")
    )
    .orderBy(F.col("GasavgRatePerDept").desc())
)

In [0]:
# Divide each department's mean yearly row sum by 12, the number of monthly
# columns that were added together.
# NOTE(review): unlike the overall "GasavgRate" value, no 1000 or row-count
# factor is applied here - confirm the two figures are meant to differ in scale.
# This cell reassigns its own input, so re-running it divides by 12 again.
AvgGasProductionRatePerDept = AvgGasProductionRatePerDept.withColumn(
    "GasavgRatePerDept",
    F.col("GasavgRatePerDept") / 12,
)

In [0]:
# Print the first 5 rows; the frame was sorted in descending order, so these
# are the five departments with the highest per-row gas average.
AvgGasProductionRatePerDept.show(5)

+---------------+------------------+
|   Departamento| GasavgRatePerDept|
+---------------+------------------+
|        GUAJIRA| 93.15749999999998|
|DEPARTAMENTO NN|            35.555|
|       CASANARE| 17.08502747252747|
|        CORDOBA|10.297037037037036|
|          SUCRE|             9.875|
+---------------+------------------+
only showing top 5 rows



In [0]:
# Render the per-department gas totals with the notebook's rich display.
display(totalGasProductionperDept)

Departamento,GasTotalsperDept
CASANARE,559705500.0
GUAJIRA,67073399.99999999
SUCRE,35550000.0
CORDOBA,33362399.999999996
SANTANDER,14264999.999999998
DEPARTAMENTO NN,12799800.0
CESAR,8056800.0
BOYACA,6444599.999999998
META,6157800.0
ATLANTICO,6148200.000000001


In [0]:
# Podemos hacer el mismo análisis para el petróleo en 2019.

In [0]:
# Path of the 2019 oil production file and the reader format to use for it.
file_location = "/FileStore/tables/Producci_n_Fiscalizada_de_Petr_leo_2019.csv"
file_type = "csv"

# Reader settings: infer column types, treat the first row as the header,
# and split fields on commas.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# These settings apply to CSV input; for other file types they are ignored.
df1 = (
    spark.read.format(file_type)
    .options(inferSchema=infer_schema, header=first_row_is_header, sep=delimiter)
    .load(file_location)
)

display(df1)

Departamento,Municipio,Latitud,Longitud,Geolocalizacion,Operadora,Contrato,Campo,Enero,Febrero,Marzo,Abril,Mayo,Junio,Julio,Agosto,Septiembre,Octubre,Noviembre,Diciembre
ANTIOQUIA,PUERTO NARE,6.126539139,-74.70317722,POINT (-74.70317722 6.126539139),ECOPETROL S.A.,TECA COCORNA,AREA TECA-COCORNA,1291.74,1285.1,1281.02,1254.71,1323.26,1268.1,1198.19,1212.39,1356.37,1286.82,1546.81,1558.79
ANTIOQUIA,PUERTO NARE,6.126539139,-74.70317722,POINT (-74.70317722 6.126539139),MANSAROVAR ENERGY COLOMBIA LTD,NARE,NARE SUR,179.33,209.05,241.55,244.42,264.89,260.38,210.67,218.99,187.88,189.81,200.45,167.47
ANTIOQUIA,PUERTO NARE,6.126539139,-74.70317722,POINT (-74.70317722 6.126539139),MANSAROVAR ENERGY COLOMBIA LTD,NARE,UNDERRIVER,798.3,745.97,736.98,673.28,664.03,1154.56,639.45,710.29,662.89,698.13,657.42,629.09
ANTIOQUIA,PUERTO TRIUNFO,5.954830151,-74.6861918,POINT (-74.6861918 5.954830151),ECOPETROL S.A.,TECA COCORNA,AREA TECA-COCORNA,205.95,213.24,199.52,183.49,164.47,164.93,150.85,125.04,131.28,122.17,140.35,60.32
ANTIOQUIA,YONDO,6.925159318,-74.15824041,POINT (-74.15824041 6.925159318),ECOPETROL S.A.,MAGDALENA MEDIO,CASABE,10383.34,10307.74,10208.22,10161.94,10251.03,10108.62,10636.26,9707.98,9795.5,10265.62,10958.9,11769.92
ANTIOQUIA,YONDO,6.925159318,-74.15824041,POINT (-74.15824041 6.925159318),ECOPETROL S.A.,MAGDALENA MEDIO,CASABE SUR,1957.22,1923.78,1974.96,1952.15,1845.97,1739.7,1763.54,867.7,1450.86,1710.93,1815.17,1870.6
ANTIOQUIA,YONDO,6.925159318,-74.15824041,POINT (-74.15824041 6.925159318),ECOPETROL S.A.,MAGDALENA MEDIO,PEÑAS BLANCAS,1060.57,1063.98,1099.95,1186.04,1203.57,1130.16,1164.56,563.93,987.91,1061.45,1084.61,1125.26
ARAUCA,ARAUCA,6.796280825,-70.50921153,POINT (-70.50921153 6.796280825),OCCIDENTAL DE COLOMBIA LLC,CHIPIRÓN,CHIPIRÓN,7262.77,6149.36,5679.87,5515.9,5369.35,5876.67,4382.9,5399.29,4762.73,5451.39,5736.2,5725.81
ARAUCA,ARAUCA,6.796280825,-70.50921153,POINT (-70.50921153 6.796280825),OCCIDENTAL DE COLOMBIA LLC,CHIPIRÓN,GALEMBO,123.42,115.79,113.23,120.77,129.94,128.4,80.58,122.71,99.2,121.0,105.27,105.74
ARAUCA,ARAUCA,6.796280825,-70.50921153,POINT (-70.50921153 6.796280825),OCCIDENTAL DE COLOMBIA LLC,CHIPIRÓN,MACANA,296.68,238.29,240.06,211.73,137.45,182.1,226.55,281.84,227.77,202.61,178.03,160.94


In [0]:
# Print the column names and inferred types of the oil DataFrame.
df1.printSchema()

root
 |-- Departamento: string (nullable = true)
 |-- Municipio: string (nullable = true)
 |-- Latitud: double (nullable = true)
 |-- Longitud: double (nullable = true)
 |-- Geolocalizacion: string (nullable = true)
 |-- Operadora: string (nullable = true)
 |-- Contrato: string (nullable = true)
 |-- Campo: string (nullable = true)
 |-- Enero: double (nullable = true)
 |-- Febrero: double (nullable = true)
 |-- Marzo: double (nullable = true)
 |-- Abril: double (nullable = true)
 |-- Mayo: double (nullable = true)
 |-- Junio: double (nullable = true)
 |-- Julio: double (nullable = true)
 |-- Agosto: double (nullable = true)
 |-- Septiembre: double (nullable = true)
 |-- Octubre: double (nullable = true)
 |-- Noviembre: double (nullable = true)
 |-- Diciembre: double (nullable = true)



In [0]:
# Add the twelve monthly columns row by row, then sum that value over every
# row of `df1` to obtain a single grand total.
# The result column is named with .alias() instead of withColumnRenamed():
# the rename depended on reproducing Spark's auto-generated column name
# character for character and silently does nothing when that text differs.
totalOilProduction = df1.agg(
    F.sum(
        df1.Enero + df1.Febrero + df1.Marzo + df1.Abril + df1.Mayo + df1.Junio
        + df1.Julio + df1.Agosto + df1.Septiembre + df1.Octubre + df1.Noviembre
        + df1.Diciembre
    ).alias("OilTotals")
)

In [0]:
# Scale the grand total by 30.
# NOTE(review): 30 is presumably an approximate days-per-month factor - TODO
# confirm against the dataset's units.
# This cell reassigns its own input, so running it twice scales the value twice.
totalOilProduction = totalOilProduction.withColumn(
    "OilTotals",
    F.col("OilTotals") * 30,
)

In [0]:
# Print the single-row result holding the scaled oil grand total.
totalOilProduction.show()

+-------------+
|    OilTotals|
+-------------+
|3.189182601E8|
+-------------+



In [0]:
# Per-department total: add the twelve monthly columns for each row, sum that
# value within each "Departamento", and sort from largest to smallest.
# The aggregate is named with .alias() rather than withColumnRenamed(), which
# relied on matching Spark's auto-generated column name exactly and is a
# silent no-op when it does not match (the orderBy would then fail).
totalOilProductionPerDept = (
    df1.groupBy("Departamento")
    .agg(
        F.sum(
            df1.Enero + df1.Febrero + df1.Marzo + df1.Abril + df1.Mayo + df1.Junio
            + df1.Julio + df1.Agosto + df1.Septiembre + df1.Octubre + df1.Noviembre
            + df1.Diciembre
        ).alias("OilTotalsPerDept")
    )
    .orderBy(F.col("OilTotalsPerDept").desc())
)

In [0]:
# Apply the same factor of 30 to every department total.
# NOTE(review): 30 is presumably an approximate days-per-month factor - TODO confirm.
# This cell reassigns its own input, so re-running it scales the values again.
totalOilProductionPerDept = totalOilProductionPerDept.withColumn(
    "OilTotalsPerDept",
    F.col("OilTotalsPerDept") * 30,
)

In [0]:
# Print the first 5 rows; the frame was sorted by total in descending order,
# so these are the five departments with the largest oil totals.
totalOilProductionPerDept.show(5)

+------------+--------------------+
|Departamento|    OilTotalsPerDept|
+------------+--------------------+
|        META|1.6008495300000006E8|
|    CASANARE| 6.129776879999999E7|
|   SANTANDER|2.1486554100000005E7|
|      ARAUCA|        2.01461136E7|
|      BOYACA|1.1526151500000002E7|
+------------+--------------------+
only showing top 5 rows



In [0]:
# Mean, over all rows of `df1`, of the row-wise sum of the twelve monthly columns.
# The aggregate is named with .alias() instead of withColumnRenamed(): the
# rename only worked if Spark's auto-generated column name was reproduced
# exactly, and it silently does nothing otherwise.
AvgOilProductionRate = df1.agg(
    F.avg(
        df1.Enero + df1.Febrero + df1.Marzo + df1.Abril + df1.Mayo + df1.Junio
        + df1.Julio + df1.Agosto + df1.Septiembre + df1.Octubre + df1.Noviembre
        + df1.Diciembre
    ).alias("OilavgRate")
)

In [0]:
# Multiply the mean by 456 and divide by 12 (the number of monthly columns).
# NOTE(review): 456 is presumably the number of rows in `df1`, which would turn
# the mean back into a sum - TODO confirm and derive it from the data instead
# of hard-coding it.
# This cell reassigns its own input, so re-running it rescales the value again.
AvgOilProductionRate = AvgOilProductionRate.withColumn(
    "OilavgRate",
    F.col("OilavgRate") * 456 / 12,
)

In [0]:
# Print the single-row result holding the rescaled oil average.
AvgOilProductionRate.show()

+-----------------+
|       OilavgRate|
+-----------------+
|885884.0558333333|
+-----------------+



In [0]:
# Per-department mean of the row-wise sum of the twelve monthly columns,
# sorted from largest to smallest.
# .alias() names the aggregate directly; the previous withColumnRenamed()
# depended on Spark's auto-generated column name and silently did nothing
# when that text did not match (the orderBy would then fail).
AvgOilProductionRatePerDept = (
    df1.groupBy("Departamento")
    .agg(
        F.avg(
            df1.Enero + df1.Febrero + df1.Marzo + df1.Abril + df1.Mayo + df1.Junio
            + df1.Julio + df1.Agosto + df1.Septiembre + df1.Octubre + df1.Noviembre
            + df1.Diciembre
        ).alias("OilavgRatePerDept")
    )
    .orderBy(F.col("OilavgRatePerDept").desc())
)

In [0]:
# Divide each department's mean yearly row sum by 12, the number of monthly
# columns that were added together.
# NOTE(review): unlike the overall "OilavgRate" value, no row-count factor is
# applied here - confirm the two figures are meant to differ in scale.
# This cell reassigns its own input, so re-running it divides by 12 again.
AvgOilProductionRatePerDept = AvgOilProductionRatePerDept.withColumn(
    "OilavgRatePerDept",
    F.col("OilavgRatePerDept") / 12,
)

In [0]:
# Print the first 5 rows; the frame was sorted in descending order, so these
# are the five departments with the highest per-row oil average.
AvgOilProductionRatePerDept.show(5)

+---------------+-----------------+
|   Departamento|OilavgRatePerDept|
+---------------+-----------------+
|           META|5775.070454545456|
|        BOLIVAR|3673.161666666667|
|DEPARTAMENTO NN|        2547.7925|
|         BOYACA|2462.852884615385|
|      ANTIOQUIA|2228.926547619047|
+---------------+-----------------+
only showing top 5 rows



In [0]:
# Render the per-department oil totals with the notebook's rich display.
display(totalOilProductionPerDept)

Departamento,OilTotalsPerDept
META,160084953.00000006
CASANARE,61297768.79999999
SANTANDER,21486554.100000005
ARAUCA,20146113.6
BOYACA,11526151.500000002
PUTUMAYO,10382262.9
HUILA,8490293.399999997
CESAR,7835883.9
ANTIOQUIA,5616894.899999999
BOLIVAR,5289352.800000001
