In [1]:
!pip install pyspark
!pip install -U -q PyDrive
!apt update
!apt install openjdk-8-jdk-headless -qq
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m987.4/987.4 kB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for PyDrive (setup.py) ... [?25l[?25hdone
Hit:1 https://cli.github.com/packages stable InRelease
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,632 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 htt

In [2]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd

# Creamos el Spark Context

In [4]:
# create the Spark Session
# (getOrCreate hands back the session built by SparkSession.builder; as the
# name suggests, an already-active session is reused instead of recreated)
spark = SparkSession.builder.getOrCreate()

# create the Spark Context
# (the SparkContext attached to the session; it is passed to SQLContext below)
sc = spark.sparkContext

In [5]:
# Colab helper module used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive under /content/drive; DATA_PATH (defined below) points inside it.
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Folder on the mounted Google Drive that holds the TP1 CSV files.
DATA_PATH = "/content/drive/MyDrive/CienciaDeDatos/TP1/data/"

# SQL entry point built on top of the existing SparkContext.
sqlContext = SQLContext(sc)

# Read products.csv (first line is the header, column types are inferred),
# keep only the three columns used downstream and expose them as an RDD of Rows.
rdd_products = (
    sqlContext.read
    .option("header", True)
    .option("inferSchema", True)
    .csv(DATA_PATH + "products.csv")
    .select("product_id", "brand", "stock_quantity")
    .rdd
)




# 5) Porcentaje de productos cuyo stock es al menos 20% más alto que el stock promedio de su marca

In [27]:
import math

# Multiplier applied to a brand's average stock: a product qualifies when its
# stock is at least 1.2x that average (i.e. 20% above it).
STOCK_THRESHOLD = 1.2

def clean_quantity(value):
    """Convert a raw stock value into an integer quantity.

    Missing values are mapped to 0. A value counts as missing when it is
    ``None``, a float NaN, or a string that, ignoring case and surrounding
    whitespace, is one of ``"nan"``, ``"na"``, ``"undefined"``, ``"none"``
    or the empty string.

    Args:
        value: Raw cell value; may be ``None``, a number, or a string.

    Returns:
        int: The quantity, or 0 when the value is missing. Fractional values
        are truncated towards zero by ``int``.

    Raises:
        ValueError: If ``value`` is a non-missing string that is not numeric.
    """
    if value is None:
        return 0

    if isinstance(value, str):
        text = value.strip()
        # Same case-insensitive missing tokens that normalize_category accepts.
        if text.lower() in {"nan", "na", "undefined", "none", ""}:
            return 0
        try:
            return int(text)
        except ValueError:
            # Numeric strings with a decimal part (e.g. "12.0") are not accepted
            # by int(); go through float first. Non-numeric text still raises.
            number = float(text)
    else:
        number = value

    # A float NaN would make int() raise; treat it as missing.
    if isinstance(number, float) and math.isnan(number):
        return 0
    return int(number)

def normalize_category(value):
    """Return a canonical, title-cased label for a categorical value.

    The value is converted to text, stripped and lower-cased. ``None`` and the
    placeholder tokens ``"nan"``, ``"na"``, ``"undefined"``, ``"none"`` and
    ``""`` are treated as missing.

    Args:
        value: Raw category value (any type accepted by ``str``) or ``None``.

    Returns:
        str | None: The title-cased label, or ``None`` when the value is missing.
    """
    missing_tokens = ("nan", "na", "undefined", "none", "")
    if value is not None:
        cleaned = str(value).strip().lower()
        if cleaned not in missing_tokens:
            return cleaned.title()
    return None

# (brand, stock) pair for every product whose brand is usable. Built once and
# cached because it feeds the per-brand averages, the join and the final count.
rdd_products_by_brand = (
    rdd_products
    .map(lambda row: (normalize_category(row["brand"]),
                      clean_quantity(row["stock_quantity"])))
    .filter(lambda x: x[0] is not None)
    .cache()
)

# (brand, average stock): accumulate (sum, count) per brand, then divide.
rdd_avg_brand_stocks = (
    rdd_products_by_brand
    .mapValues(lambda stock: (stock, 1))
    .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1]))
    .mapValues(lambda acc: acc[0] / acc[1])
)

# Number of products whose stock reaches STOCK_THRESHOLD times their brand's
# average. filter + count is used instead of map + reduce because reduce()
# raises on an empty RDD, whereas count() simply returns 0.
products_above_threshold = (
    rdd_products_by_brand.join(rdd_avg_brand_stocks)
    .filter(lambda x: x[1][0] >= STOCK_THRESHOLD * x[1][1])
    .count()
)

# The denominator only includes products with a usable brand.
total_branded_products = rdd_products_by_brand.count()
percentage_above_threshold = (products_above_threshold / total_branded_products) * 100 if total_branded_products else 0

# Truncate (not round) to two decimals for display.
truncated_percentage = math.floor(percentage_above_threshold * 100) / 100

print(f"Porcentaje de productos con stock ≥ {STOCK_THRESHOLD * 100:.0f}% del promedio de su marca: {truncated_percentage}%")

Porcentaje de productos con stock ≥ 120% del promedio de su marca: 41.62%


# Conclusiones:
El 41,62% de los productos con marca válida tiene un stock igual o superior al 120% del promedio de su marca, lo que sugiere una distribución de inventario sesgada, con posible sobrestock en un subconjunto relevante de productos. Analizado más a fondo, el resultado es coherente: muchas tiendas concentran la demanda en una parte de su catálogo y, como consecuencia, el stock de esos productos es considerablemente mayor que el del resto. Aun así, llama la atención que el valor sea tan alto; hay que tener en cuenta que los productos sin stock informado se contabilizan con stock 0, lo que reduce el promedio de cada marca y puede elevar este porcentaje.