# Working with the purchases.txt file

## Load data

Load the data in `purchases.txt` into an RDD (each element is one tab-separated line of the file):

In [5]:
# Location of the input file; each line is one tab-separated purchase record.
PURCHASES_PATH = 'purchases.txt'

# Build an RDD of text lines from the file.
rdd = sc.textFile(PURCHASES_PATH)

# Preview the first five records (last expression, so the notebook displays it).
rdd.take(5)

[u"2012-01-01\t09:00\tSan Jose\tMen's Clothing\t214.05\tAmex",
 u"2012-01-01\t09:00\tFort Worth\tWomen's Clothing\t153.57\tVisa",
 u'2012-01-01\t09:00\tSan Diego\tMusic\t66.08\tCash',
 u'2012-01-01\t09:00\tPittsburgh\tPet Supplies\t493.51\tDiscover',
 u"2012-01-01\t09:00\tOmaha\tChildren's Clothing\t235.63\tMasterCard"]

False


## Filter 'San Jose' data

Filter data from the RDD keeping only "San Jose" lines.

In [8]:
# Keep only the records whose text contains "San Jose".
# filter() is a lazy transformation: nothing is read until an action runs.
san_jose_lines = rdd.filter(lambda line: "San Jose" in line)

# Sanity check: take(5) fetches just the first five matches, so the full
# filtered dataset is never copied to the driver only to display a preview.
# san_jose_results holds this five-record preview, not every match.
san_jose_results = san_jose_lines.take(5)
print(san_jose_results)


[u"2012-01-01\t09:00\tSan Jose\tMen's Clothing\t214.05\tAmex", u"2012-01-01\t09:00\tSan Jose\tWomen's Clothing\t215.82\tCash", u'2012-01-01\t09:09\tSan Jose\tToys\t337.71\tCash', u'2012-01-01\t09:17\tSan Jose\tGarden\t192.82\tCash', u'2012-01-01\t09:19\tSan Jose\tCameras\t95.81\tCash']


## Count the number of purchases in San Jose

In [9]:
# Select the purchase records that mention "San Jose".
san_jose_lines = rdd.filter(lambda purchase: "San Jose" in purchase)

# count() is the action that runs the job and returns how many records matched.
num_purchases_san_jose = san_jose_lines.count()

# Report the total.
print("Number of purchases in San Jose:", num_purchases_san_jose)


('Number of purchases in San Jose:', 39898)


## Find the maximum cost

Extract the column with the cost strings:

In [10]:
# Split each record on tabs and keep the fifth field (index 4), which holds
# the purchase cost as text.
cost_strings = san_jose_lines.map(lambda line: line.split("\t")[4])

# Sanity check: take(5) fetches only the first five values instead of
# collecting every cost string to the driver just to show five of them.
# cost_results holds this five-value preview only.
cost_results = cost_strings.take(5)
print(cost_results)


[u'214.05', u'215.82', u'337.71', u'192.82', u'95.81']


And now we can convert them to floats:

In [11]:
# Fifth tab-separated field (index 4) of each San Jose record: the cost as text.
cost_strings = san_jose_lines.map(lambda line: line.split("\t")[4])

# Parse every cost string into a float. float() is passed directly as the
# mapper; a value that is not a valid number raises ValueError when an
# action runs.
cost_floats = cost_strings.map(float)

# Sanity check: fetch only the first five converted values with take(5)
# rather than collecting the whole RDD to the driver.
# cost_results holds this five-value preview only.
cost_results = cost_floats.take(5)
print(cost_results)


[214.05, 215.82, 337.71, 192.82, 95.81]


Finally we can calculate the maximum cost:

In [12]:
# Requires cost_floats (the RDD of float costs) built in the previous cell.

# RDD.max() is an action: it runs the job and returns the largest element.
max_cost = cost_floats.max()

# Print the result (a Python 2 kernel shows the two arguments as a tuple).
print("Maximum cost in San Jose purchases:", max_cost)


('Maximum cost in San Jose purchases:', 499.99)


Or directly with the **max** function:

In [13]:
# Fold the RDD with Python's built-in max() as the reducer. Unlike calling
# collect() and then max() on the resulting list, this never materialises
# every cost on the driver; only the single largest value is returned.
# An empty RDD raises ValueError, as max() on an empty list would.
max_cost = cost_floats.reduce(max)

# Print the result
print("Maximum cost in San Jose purchases:", max_cost)


('Maximum cost in San Jose purchases:', 499.99)


## Find the minimum cost

In [15]:
# Keep only strictly positive costs so that 0.0 cannot be reported as the minimum.
filtered_costs = cost_floats.filter(lambda cost: cost > 0)

# Smallest of the remaining costs.
min_cost = filtered_costs.min()

# Print the result (a Python 2 kernel shows the two arguments as a tuple,
# with the accented character escaped).
print("Costo mínimo en compras de San Jose (sin tener en cuenta 0.0):", min_cost)



('Costo m\xc3\xadnimo en compras de San Jose (sin tener en cuenta 0.0):', 0.03)
