In [85]:
%cd practicas/

[Errno 2] No such file or directory: 'practicas/'
/workspace/practicas


  bkms = self.shell.db.get('bookmarks', {})


In [86]:
from pyspark.sql import SparkSession

# Get (or reuse) the session named "pr503" against the master at spark-master:7077.
spark = SparkSession.builder.master("spark://spark-master:7077").appName("pr503").getOrCreate()

# Keep the session's SparkContext available under the short name `sc`.
sc = spark.sparkContext

In [87]:
from pyspark.sql.types import StructType, StructField, BooleanType, IntegerType, StringType, DoubleType, LongType, TimestampType
from pyspark.sql.functions import col, lit, split, upper, concat_ws, lpad, log, round, bround, greatest, to_date, date_add, month


# Explicit schema for the crop-yield CSV: every column is nullable and is either
# a string (categorical) or a double (measurement).
schema_crop = StructType([
    StructField(field_name, field_type(), True)
    for field_name, field_type in (
        ("Crop", StringType),
        ("Region", StringType),
        ("Soil_Type", StringType),
        ("Soil_ph", DoubleType),
        ("Rainfall_mm", DoubleType),
        ("Temperature_c", DoubleType),
        ("Humidity_pct", DoubleType),
        ("Fertilizer_Used_kg", DoubleType),
        ("Irrigation", StringType),
        ("Pesticides_Used_kg", DoubleType),
        ("Planting_Density", DoubleType),
        ("Previous_Crop", StringType),
        ("Yield_ton_per_ha", DoubleType),
    )
])

# Read the CSV with the schema above; the file's first line is treated as a header.
df_crop = spark.read.load(
    "./data/crop_yield_dataset.csv",
    format="csv",
    schema=schema_crop,
    header="True",
)

# Sanity check: column types and the first few rows.
df_crop.printSchema()
df_crop.show(5)

root
 |-- Crop: string (nullable = true)
 |-- Region: string (nullable = true)
 |-- Soil_Type: string (nullable = true)
 |-- Soil_ph: double (nullable = true)
 |-- Rainfall_mm: double (nullable = true)
 |-- Temperature_c: double (nullable = true)
 |-- Humidity_pct: double (nullable = true)
 |-- Fertilizer_Used_kg: double (nullable = true)
 |-- Irrigation: string (nullable = true)
 |-- Pesticides_Used_kg: double (nullable = true)
 |-- Planting_Density: double (nullable = true)
 |-- Previous_Crop: string (nullable = true)
 |-- Yield_ton_per_ha: double (nullable = true)

+------+--------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+
|  Crop|  Region|Soil_Type|Soil_ph|Rainfall_mm|Temperature_c|Humidity_pct|Fertilizer_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|
+------+--------+---------+-------+-----------+-------------+------------+-------

# 1. Creación de un ID único

### Limpieza

In [88]:
# Split Region on "_" and keep the second token (index 1) as the new Region value.
df_eng = df_crop.withColumn("Region", split(df_crop["Region"], "_").getItem(1))
df_eng.show(5)

+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+
|  Crop|Region|Soil_Type|Soil_ph|Rainfall_mm|Temperature_c|Humidity_pct|Fertilizer_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|
+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+
| Maize|     C|    Sandy|   7.01|     1485.4|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|          101.48|
|Barley|     D|     Loam|   5.79|      399.4|         29.1|        55.4|             221.8| Sprinkler|              35.5|             7.4|       Barley|          127.39|
|  Rice|     C|     Clay|   7.24|      980.9|         30.5|        74.4|              61.2| Sprinkler|              40.0|             5.1|        Whea

### Formato

In [89]:
# Normalise crop names to upper case (overwrites the Crop column in place).
df_eng = df_eng.withColumn("Crop", upper("Crop"))
df_eng.show(5)

+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+
|  Crop|Region|Soil_Type|Soil_ph|Rainfall_mm|Temperature_c|Humidity_pct|Fertilizer_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|
+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+
| MAIZE|     C|    Sandy|   7.01|     1485.4|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|          101.48|
|BARLEY|     D|     Loam|   5.79|      399.4|         29.1|        55.4|             221.8| Sprinkler|              35.5|             7.4|       Barley|          127.39|
|  RICE|     C|     Clay|   7.24|      980.9|         30.5|        74.4|              61.2| Sprinkler|              40.0|             5.1|        Whea

### Concatenación y Relleno

In [90]:
# Build the row code "CODIGO_<Region left-padded to 3 chars with X>-<Crop>",
# e.g. "CODIGO_XXC-MAIZE".
# The column is referenced by its real name "Region" (the original used "region"),
# so the cell does not depend on Spark's case-insensitive column resolution.
df_eng = df_eng.withColumn(
    "concat",
    concat_ws(
        "_",
        lit("CODIGO"),
        concat_ws("-", lpad(col("Region"), 3, "X"), col("Crop")),
    ),
)
df_eng.show(5)

+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+
|  Crop|Region|Soil_Type|Soil_ph|Rainfall_mm|Temperature_c|Humidity_pct|Fertilizer_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|           concat|
+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+
| MAIZE|     C|    Sandy|   7.01|     1485.4|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|          101.48| CODIGO_XXC-MAIZE|
|BARLEY|     D|     Loam|   5.79|      399.4|         29.1|        55.4|             221.8| Sprinkler|              35.5|             7.4|       Barley|          127.39|CODIGO_XXD-BARLEY|
|  RICE|     C|     Clay|   7.24|      980.9|         30.5| 

# 2. Transformación matemática

In [91]:
# Rainfall_mm is overwritten with its own log transform, so running this cell twice
# would take the log of an already-logged value. The column added at the end of the
# chain doubles as an "already applied" marker, which makes a re-run a no-op.
# `round` here is pyspark.sql.functions.round (imported above), not the builtin.
if "Rendimiento_Bancario" not in df_eng.columns:
    df_eng = (
        df_eng
        # log(rainfall) + 1, kept to 2 decimals.
        # NOTE(review): this is log(x) + 1, not log(x + 1) — confirm which is intended.
        .withColumn("Rainfall_mm", round(log(col("Rainfall_mm")) + 1, 2))
        # Yield rounded to 1 decimal.
        .withColumn("Yield_ton_per_ha", round(col("Yield_ton_per_ha"), 1))
        # bround (half-even rounding) of the already-rounded yield, to 0 decimals.
        .withColumn("Rendimiento_Bancario", bround(col("Yield_ton_per_ha"), 0))
    )
df_eng.show(5)

+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+--------------------+
|  Crop|Region|Soil_Type|Soil_ph|Rainfall_mm|Temperature_c|Humidity_pct|Fertilizer_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|           concat|Rendimiento_Bancario|
+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+--------------------+
| MAIZE|     C|    Sandy|   7.01|        8.3|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|           101.5| CODIGO_XXC-MAIZE|               102.0|
|BARLEY|     D|     Loam|   5.79|       6.99|         29.1|        55.4|             221.8| Sprinkler|              35.5|             7.4|       Barley|           1

# 3. Comparación de insumos

In [92]:
# Per row, the larger of the fertilizer and pesticide amounts.
df_eng = df_eng.withColumn(
    "Max_Quimico_kg",
    greatest("Fertilizer_Used_kg", "Pesticides_Used_kg"),
)
df_eng.show(5)

+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+--------------------+--------------+
|  Crop|Region|Soil_Type|Soil_ph|Rainfall_mm|Temperature_c|Humidity_pct|Fertilizer_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|           concat|Rendimiento_Bancario|Max_Quimico_kg|
+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+--------------------+--------------+
| MAIZE|     C|    Sandy|   7.01|        8.3|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|           101.5| CODIGO_XXC-MAIZE|               102.0|         105.1|
|BARLEY|     D|     Loam|   5.79|       6.99|         29.1|        55.4|             221.8| Sprinkler|  

# 4. Simulación de fechas

In [93]:
# Simulated calendar: one fixed sowing date for every row, a harvest date
# 150 days later, and the month number of that harvest date.
df_eng = (
    df_eng
    .withColumn("Fecha_Siembra", to_date(lit("2023-04-01")))
    .withColumn("Fecha_Estimada_Cosecha", date_add("Fecha_Siembra", 150))
    .withColumn("Mes_Cosecha", month("Fecha_Estimada_Cosecha"))
)
df_eng.show(5)

+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+--------------------+--------------+-------------+----------------------+-----------+
|  Crop|Region|Soil_Type|Soil_ph|Rainfall_mm|Temperature_c|Humidity_pct|Fertilizer_Used_kg|Irrigation|Pesticides_Used_kg|Planting_Density|Previous_Crop|Yield_ton_per_ha|           concat|Rendimiento_Bancario|Max_Quimico_kg|Fecha_Siembra|Fecha_Estimada_Cosecha|Mes_Cosecha|
+------+------+---------+-------+-----------+-------------+------------+------------------+----------+------------------+----------------+-------------+----------------+-----------------+--------------------+--------------+-------------+----------------------+-----------+
| MAIZE|     C|    Sandy|   7.01|        8.3|         19.7|        40.3|             105.1|      Drip|              10.2|            23.2|         Rice|           101.5| CODIGO_XXC-