In [None]:
# Install the Python dependencies (PySpark and PyDrive) into the notebook runtime.
!pip install pyspark
!pip install -U -q PyDrive
# Refresh the apt package index, then install a headless Java 8 JDK.
!apt update
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the expected install location of the JDK package above,
# so that the JVM started by PySpark can be located.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:11 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
42 packages can be upgraded. Run 'apt list --upgradable' to see them.
[1;33mW: [0mSkipping acquire of configured file 'main/source/Sou

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext
from pyspark.sql import SQLContext
import pandas as pd

# Creamos el Spark Context

In [None]:
# Create the Spark session, or reuse the one already active in this runtime.
spark = SparkSession.builder.getOrCreate()

# Keep a handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [None]:
# Sanity check: display the type of the object bound to `sc`.
type(sc)

In [None]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
DATA_PATH = "/content/drive/MyDrive/CienciaDeDatos/TP1/data/orders.csv"

# NOTE(review): SQLContext is deprecated since Spark 3.0 in favour of the
# SparkSession API; the name is kept here in case later cells still use it.
sqlContext = SQLContext(sc)
df = sqlContext.read.csv(DATA_PATH, header=True, inferSchema=True)

# Keep only the columns needed to compute discounts per state.
selected_columns_df = df.select("shipping_address", "discount_amount", "total_amount", "currency")

# Drop rows lacking an address or a discount while still in the DataFrame API,
# so discarded rows are never serialized to Python, and `rdd` is bound exactly
# once (the cell stays idempotent on re-run).
rdd = (
    selected_columns_df
    .filter(
        selected_columns_df["shipping_address"].isNotNull()
        & selected_columns_df["discount_amount"].isNotNull()
    )
    .rdd
)



In [None]:
# Number of rows remaining in `rdd` after the null filter.
rdd.count()

3755451

# 1) ¿Cuál es el estado que más descuentos tiene en total? ¿Y en promedio? Supongan que, en una dirección del estilo “3123 Alan Extension Port Andrea, MA 26926”, “MA” es el estado.

### Hipótesis tomadas:

Se consideró al estado como la unidad representada por las 2 letras ubicadas en la anteúltima posición del shipping address (separando por espacios), como por ejemplo “AP” en “USNV Morrison FPO AP 90901”.

In [None]:
# Exchange rates, expressed as units of each currency per 1 USD
# (to_usd divides a local amount by its rate to obtain the USD value).
rates = {
    "USD": 1.0,
    "GBP": 0.7391,
    "CAD": 1.3869,
    "EUR": 0.8547
}

# Funciones de limpieza y normalización:
def to_float(x):
    """Convert ``x`` to ``float``, falling back to ``0.0`` when it cannot be converted.

    Args:
        x: Any value; typically a number, a numeric string or ``None``.

    Returns:
        ``float(x)`` when the conversion succeeds, otherwise ``0.0``.
    """
    try:
        return float(x)
    # Only conversion failures are treated as "no value"; a bare ``except``
    # would also swallow KeyboardInterrupt/SystemExit and hide real bugs.
    except (TypeError, ValueError, OverflowError):
        return 0.0

def to_usd(amount, currency):
    """Convert ``amount`` from ``currency`` into USD using the ``rates`` table.

    Args:
        amount: Numeric amount expressed in ``currency``.
        currency: Currency code; ``None`` is treated as ``"USD"``. The code is
            stringified, stripped and upper-cased before the lookup.

    Returns:
        ``amount`` divided by the rate of the currency. Codes missing from
        ``rates`` use a rate of ``1.0``, i.e. the amount is returned as-is.
    """
    if currency is None:
        code = "USD"
    else:
        code = str(currency).strip().upper()
    return amount / rates.get(code, 1.0)

def get_state(address):
    """Extract the state code from a shipping address.

    The state is taken to be the second-to-last whitespace-separated token,
    upper-cased (e.g. ``"MA"`` in ``"3123 Alan Extension Port Andrea, MA 26926"``).

    Args:
        address: Shipping address string.

    Returns:
        The upper-cased token, or ``None`` when ``address`` is not a string
        or has fewer than two tokens.
    """
    try:
        # split() with no arguments already discards surrounding whitespace,
        # so the token needs no further stripping.
        return address.split()[-2].upper()
    # AttributeError: non-string input (e.g. None); IndexError: fewer than two
    # tokens. Anything else is unexpected and is allowed to propagate.
    except (AttributeError, IndexError):
        return None

def discount_value_to_usd(row):
    """Compute the discounted value of an order, expressed in USD.

    ``discount_amount`` is treated as a percentage of ``total_amount``; both
    are parsed with ``to_float`` and the resulting local value is converted
    with ``to_usd``.

    Args:
        row: Mapping-like record exposing ``discount_amount``,
            ``total_amount`` and (optionally) ``currency``.

    Returns:
        The discount value in USD.
    """
    discount_pct = to_float(row["discount_amount"])
    order_total = to_float(row["total_amount"])

    # A record without a readable currency field is assumed to be in USD.
    try:
        currency_code = row["currency"]
    except Exception:
        currency_code = "USD"

    return to_usd((discount_pct / 100.0) * order_total, currency_code)


In [None]:
# Build (state, discount value in USD) pairs, keeping only rows with a
# recognisable state and a strictly positive discount.
# The RDD is cached because it feeds two separate aggregations (total and
# average per state); without caching, each action re-reads the CSV and
# re-runs the Python mapping from scratch.
discounted_orders_usd = (
    rdd
    .map(lambda row: (get_state(row["shipping_address"]), discount_value_to_usd(row)))
    .filter(lambda x: x[0] is not None and x[1] > 0.0)
    .cache()
)

# Total discounted amount (USD) per state.
discount_sum_by_state = discounted_orders_usd.reduceByKey(lambda a, b: a + b)

# Negating the key turns takeOrdered (ascending) into a "largest first" pick.
top_sum_state = discount_sum_by_state.takeOrdered(1, key=lambda x: -x[1])
print("Top suma descuento (USD):", top_sum_state)


Top suma descuento (USD): [('AP', 361103.07038235094)]


In [None]:
# Pair every discount with a count of 1 so that a single reduce yields both
# the running sum and the number of orders per state.
discount_with_count = discounted_orders_usd.mapValues(lambda amount: (amount, 1))
sum_count_by_state = discount_with_count.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
)

# Average discount per state = accumulated sum / number of orders.
avg_discount_by_state = sum_count_by_state.mapValues(lambda total_and_n: total_and_n[0] / total_and_n[1])

# Negated key: takeOrdered sorts ascending, so this picks the highest average.
top_avg_state = avg_discount_by_state.takeOrdered(1, key=lambda entry: -entry[1])
print("Top promedio descuento (USD):", top_avg_state)


Top promedio descuento (USD): [('AP', 13.19146161987108)]


# Conclusiones:

El resultado mostró a AP como el estado con mayor monto descontado, en total y en promedio, con USD 361103.07 en total y USD 13.19 por orden.
Respecto al procesamiento realizado sin Spark, se notó que aquel contenía un menor monto descontado total, de USD 348050.53 (contra USD 361103.07 en esta consulta). Esto puede deberse a que en esta consulta fuimos menos estrictos con los filtros de valores nulos. El promedio de descuento por estado fue exactamente el mismo valor, así que se valida que el procesamiento se dio exitosamente.