# Aplicações de Map e Reduce

# Contagem de Patentes citadas (Map / Reduce Clássico)

In [1]:
!wget https://raw.githubusercontent.com/daoleen/HadoopLearning/master/cite75_99.txt

--2024-10-22 00:15:34--  https://raw.githubusercontent.com/daoleen/HadoopLearning/master/cite75_99.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 15956 (16K) [text/plain]
Saving to: ‘cite75_99.txt’


2024-10-22 00:15:35 (152 MB/s) - ‘cite75_99.txt’ saved [15956/15956]



Ler o arquivo

In [2]:
citacoes = []
with open("cite75_99.txt") as f:
    lines = f.readlines()
    lines.pop(0) # remove a linha do cabeçalho
    for line in lines:
        citacoes.append( line.strip() )

In [7]:
citacoes[0]

'3858241,956203'

In [8]:
citacoes[0].split(",")

['3858241', '956203']

## MAP

'3858241,956203' --> (956203, 1)

In [14]:
citacoes_map = map(lambda x: (int(x.split(",")[1]), 1), citacoes)

In [15]:
citacoes_map = map(lambda c: ((c.split(",")[1],1)), citacoes)

In [16]:
citacoes_map

<map at 0x7f8934389ab0>

## SHUFFLE

In [17]:
group_by_patente = {}

In [18]:
for patente, valor in citacoes_map:
    try:      
        group_by_patente[patente].append(valor)
    except:
        group_by_patente[patente] = [valor]

## REDUCE

In [26]:
from functools import reduce

In [30]:
patente_count = {}
for k, v in group_by_patente.items():
    patente_count[k] = reduce (lambda x,y: x+y, v) 

In [31]:
import pandas as pd

In [32]:
df = pd.DataFrame(list(patente_count.items()),columns = ["Patente","Contagem"])
df.nlargest(5,"Contagem")

Unnamed: 0,Patente,Contagem
26,3621837,2
28,3755824,2
260,3310865,2
0,956203,1
1,1324234,1


## NO DASK

In [33]:
import dask.dataframe as dd

In [37]:
ddf = pd.DataFrame(dd,columns=["PATENTE","CITED"])

ValueError: DataFrame constructor not properly called!

In [None]:
resultado = ddf.groupby("CITED").size().compute()

In [None]:
resultado.sort_values(ascending=False)

# Configurando Spark localmente

Instalando o Spark na versão 3.1.3 no Colab. Eventualmente você precisará reiniciar seu ambiente.

In [38]:
!pip install pyspark==3.1.3

Collecting pyspark==3.1.3
  Downloading pyspark-3.1.3.tar.gz (214.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.0/214.0 MB[0m [31m44.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py4j==0.10.9 (from pyspark==3.1.3)
  Downloading py4j-0.10.9-py2.py3-none-any.whl.metadata (1.3 kB)
Downloading py4j-0.10.9-py2.py3-none-any.whl (198 kB)
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25ldone
[?25h  Created wheel for pyspark: filename=pyspark-3.1.3-py2.py3-none-any.whl size=214463458 sha256=48f098ccd97088b7e5e033b6247121ba3c2200a82bdb06e27cd6c88a2b995b10
  Stored in directory: /home/jovyan/.cache/pip/wheels/25/da/89/3c1760252397d50554c2b3a66ab0ea57e1460fdab21d0aa968
Successfully built pyspark
Installing collected packages: py4j, pyspark
  Attempting uninstall: py4j
    Found existing installation: py4j 0.10.9.7
    Uninstalling py4j-0.10.9.7:
  

# SETUP

Criar uma sessão do Spark

In [43]:
from pyspark.sql import SparkSession

In [46]:
spark = SparkSession \
        .builder \
        .master("local[*]")\
        .appName("Patente_RDD")\
        .getOrCreate()

In [47]:
spark

Crie um contexto (onde executamos comandos para os RDDs)

* [SparkContext](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.SparkContext.html)

In [48]:
sc = spark.sparkContext

# Passo 1: Ler o arquivo em um RDD

* [textFile](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.SparkContext.textFile.html)

In [49]:
rdd1 = sc.textFile("cite75_99.txt")

## Uma primeira ação (take) ....

In [50]:
rdd1.take(10)

['"CITING","CITED"',
 '3858241,956203',
 '3858241,1324234',
 '3858241,3398406',
 '3858241,3557384',
 '3858241,3634889',
 '3858242,1515701',
 '3858242,3319261',
 '3858242,3668705',
 '3858242,3707004']

## Precisamos remover o cabeçalho

In [51]:
header = rdd1.take(1)

In [52]:
header

['"CITING","CITED"']

## Uma transformação: filter 

In [53]:
rdd2 = rdd1.filter(lambda row: row!=header[0])

In [54]:
rdd2.take(5)

['3858241,956203',
 '3858241,1324234',
 '3858241,3398406',
 '3858241,3557384',
 '3858241,3634889']

## Agora é o MAP --> '3858241, 956203' => (956203, 1)

In [55]:
rdd3 = rdd2.map(lambda r: (r.split(",")[1],1))

In [56]:
rdd3.take(5)

[('956203', 1), ('1324234', 1), ('3398406', 1), ('3557384', 1), ('3634889', 1)]

## Agora é o Reduce, obtendo então a quantidade de citações de cada patente

* [reduceByKey](https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.RDD.reduceByKey.html)

In [57]:
%%time
rdd4 = rdd3.reduceByKey(lambda x, y: x+y)

CPU times: user 6.09 ms, sys: 0 ns, total: 6.09 ms
Wall time: 17.8 ms


## Agora é executar e obter o resultado

In [58]:
rdd4.take(20)

[('956203', 1),
 ('3634889', 1),
 ('3319261', 1),
 ('2949611', 1),
 ('3156927', 1),
 ('3221341', 1),
 ('3574238', 1),
 ('3684611', 1),
 ('14040', 1),
 ('17445', 1),
 ('2635670', 1),
 ('2912700', 1),
 ('3608095', 1),
 ('3621837', 2),
 ('3755824', 2),
 ('3451067', 1),
 ('3503079', 1),
 ('1600859', 1),
 ('3694819', 1),
 ('3706104', 1)]

## Obtendo as top-10 mais citadas ...uma nova ação: takeOrdered

* [takeOrdered](https://spark.apache.org/docs/3.1.1/api/python/reference/api/pyspark.RDD.takeOrdered.html)

In [65]:
%%time
rdd4.takeOrdered(10,key=lambda r: -r[1])

CPU times: user 4.3 ms, sys: 19 μs, total: 4.32 ms
Wall time: 93.6 ms


[('3621837', 2),
 ('3755824', 2),
 ('3310865', 2),
 ('956203', 1),
 ('3634889', 1),
 ('3319261', 1),
 ('2949611', 1),
 ('3156927', 1),
 ('3221341', 1),
 ('3574238', 1)]