In [1]:
# Install the Python dependencies used by this notebook into the runtime.
# NOTE(review): versions are not pinned, so a re-run may pick up newer releases.
!pip install pyspark
!pip install -U -q PyDrive
# Refresh the apt package index, then install a headless Java 8 JDK.
!apt update
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the Java 8 install directory so later cells can locate the JVM.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m39.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for PyDrive (setup.py) ... [?25l[?25hdone
Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:11 https

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd

# Creamos el Spark Context

In [4]:
# Obtain the notebook-wide SparkSession (reuses the active one if it already exists).
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

# Handle to the SparkContext that backs the session; used by later cells.
sc = spark.sparkContext

In [5]:
from google.colab import drive

# Mount Google Drive so the CSV files under /content/drive are readable by later cells.
drive.mount("/content/drive")

Mounted at /content/drive


In [6]:
# Directory (inside the mounted Drive) that holds the CSV inputs.
DATA_PATH = "/content/drive/MyDrive/CienciaDeDatos/TP1/data/"

# Entry point used to read the CSV files below.
sqlContext = SQLContext(sc)


def _read_csv(file_name):
    """Read a CSV file located under DATA_PATH into a DataFrame.

    The first line is treated as the header and column types are inferred
    from the data.

    Args:
        file_name: Name of the file relative to DATA_PATH, e.g. "products.csv".

    Returns:
        The DataFrame produced by ``sqlContext.read.csv`` for that file.
    """
    return sqlContext.read.csv(DATA_PATH + file_name, header=True, inferSchema=True)


# Read products.csv once and reuse the DataFrame for both the preview and the
# RDD: with inferSchema=True every separate read pays for its own inference pass.
df_products = _read_csv("products.csv")

# Quick sanity check of the first rows.
print(df_products.head(5))

# Keep only the columns the analysis needs and drop down to the RDD API.
rdd_products = (
    df_products
    .select("product_id", "price", "brand")
    .rdd
)

rdd_inventory_logs = (
    _read_csv("inventory_logs.csv")
    .select("product_id", "quantity_change")
    .rdd
)




# 8) Obtener el precio total de todo el stock en inventario por marca, de las 5 marcas cuyo valor es mayor.


In [7]:
def normalize_category(value):
    """Normalize a categorical label, mapping missing-value markers to None.

    The value is converted to a string, stripped of surrounding whitespace and
    lower-cased. If the result is one of the placeholders "nan", "na",
    "undefined", "none" or the empty string, None is returned; otherwise the
    cleaned text is returned in title case.

    Args:
        value: Raw label of any type, or None.

    Returns:
        The title-cased label, or None when the value is missing.
    """
    if value is None:
        return None

    cleaned = str(value).strip().lower()
    missing_markers = ("nan", "na", "undefined", "none", "")
    return None if cleaned in missing_markers else cleaned.title()

def _product_to_pair(product):
    """Convert a products row into (product_id, (price, normalized brand))."""
    product_id = int(product["product_id"])
    price = float(product["price"])
    brand = normalize_category(product["brand"])
    return product_id, (price, brand)


# Products keyed by id, keeping only rows that have both a price and a usable brand.
rdd_products_clean = (
    rdd_products
    .filter(lambda product: product["price"] is not None)  # drop rows without a price
    .map(_product_to_pair)
    .filter(lambda pair: pair[1][1] is not None)  # drop rows whose brand normalized to None
)

# Net quantity per product: sum of every non-null quantity_change in the logs.
_logs_with_quantity = rdd_inventory_logs.filter(
    lambda log: log["quantity_change"] is not None
)

rdd_inventory_amount = (
    _logs_with_quantity
    .map(lambda log: (int(log["product_id"]), int(log["quantity_change"])))
    .reduceByKey(lambda running_total, change: running_total + change)
)

# Inner join on product_id: (product_id, ((price, brand), quantity)).
rdd_products_inventory = rdd_products_clean.join(rdd_inventory_amount)


def _brand_stock_value(joined):
    """Map a joined record to (brand, price * quantity)."""
    _, ((price, brand), quantity) = joined
    return brand, price * quantity


# Total inventory value accumulated per brand.
value_per_brand = (
    rdd_products_inventory
    .map(_brand_stock_value)
    .reduceByKey(lambda subtotal, value: subtotal + value)
)

# Five brands with the largest total value (ordering by the negated value yields descending order).
top_value_brands = value_per_brand.takeOrdered(5, key=lambda brand_total: -brand_total[1])

# Render the report: one "brand: $amount" line per brand between two rulers.
report_lines = [
    f"{brand}: ${total_value:,.2f}"
    for brand, total_value in top_value_brands
]

print("----------------------- Valor total del inventario -----------------------")
for report_line in report_lines:
    print(report_line)
print("------------------------------------------------------------------------")

----------------------- Valor total del inventario -----------------------
Sony: $20,109,228.45
Tiffany & Co.: $12,944,969.23
Viator: $12,054,744.93
Canon: $11,833,037.65
Weber: $11,378,595.65
------------------------------------------------------------------------


# Conclusiones:

Los resultados muestran valores altos para el valor total del inventario actual por marca. Sony queda claramente en primer lugar, con un valor de aproximadamente 20,1M, mientras que el resto está bastante más abajo. Tiene sentido, ya que es una compañía de electrónica que vende productos caros en todo el mundo. No deja de sorprender la diferencia de magnitud respecto de las otras 4 marcas, que se ubican varios escalones más abajo, entre 11,4M y 12,9M. Aun tratándose de datos aleatorizados, se logró encontrar una correlación con la realidad.
