In [None]:
# Install mrjob, the library the job scripts written by later cells import.
# NOTE(review): the version is not pinned; consider pinning it for reproducibility.
!pip install mrjob



In [None]:
%%file wordcount_unstructured.py

import re
from mrjob.job import MRJob

class WordCount(MRJob):
  """Count how many times each word occurs in the input text.

  A word is a maximal run of the letters a-z, taken after lower-casing
  the text.
  """

  # Both the mapper and the reducer receive (key, value) as input.
  def mapper(self, _, value):
    """Emit (word, 1) for every word found in `value`.

    The input key is ignored; `value` is the text handed in by mrjob
    (one input line under mrjob's default input protocol).
    """
    for word in re.findall("[a-z]+", value.lower()):
      yield word, 1

  def combiner(self, key, value):
    """Emit (word, partial_count) to shrink the data sent to the reducer."""
    yield key, sum(value)

  def reducer(self, key, value):
    """Emit (word, total_count).

    The values are summed rather than tallied one by one, so the total
    stays correct when a value is a partial count greater than 1 (as
    produced by the combiner). It also avoids shadowing the builtin `sum`.
    """
    yield key, sum(value)

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
  WordCount.run()

Overwriting wordcount_unstructured.py


In [None]:
# Remove the results of any previous run, then run the word-count job on
# frankenstein.txt and write its output into the ./output directory.
!rm -rf output
!python wordcount_unstructured.py frankenstein.txt --output-dir=output

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/wordcount_unstructured.root.20250829.135455.534902
job output is in output
Removing temp directory /tmp/wordcount_unstructured.root.20250829.135455.534902...


In [None]:
%%file wordlen_unstructured.py

import re
from mrjob.job import MRJob

class WordLen(MRJob):
  """Count how many words of each length occur in the input text.

  A word is a maximal run of the letters a-z, taken after lower-casing
  the text.
  """

  # Both the mapper and the reducer receive (key, value) as input.
  def mapper(self, _, value):
    """Emit (word_length, 1) for every word found in `value`.

    The input key is ignored; `value` is the text handed in by mrjob
    (one input line under mrjob's default input protocol).
    """
    for word in re.findall("[a-z]+", value.lower()):
      yield len(word), 1

  def combiner(self, key, value):
    """Emit (word_length, partial_count) to shrink the data sent to the reducer."""
    yield key, sum(value)

  def reducer(self, key, value):
    """Emit (word_length, total_count).

    The values are summed rather than tallied one by one, so the total
    stays correct when a value is a partial count greater than 1 (as
    produced by the combiner). It also avoids shadowing the builtin `sum`.
    """
    yield key, sum(value)

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
  WordLen.run()

Overwriting wordlen_unstructured.py


In [None]:
# Remove the results of any previous run, then run the word-length job on
# frankenstein.txt and write its output into the ./output2 directory.
!rm -rf output2
!python wordlen_unstructured.py frankenstein.txt --output-dir=output2

# Concatenate every part-* file into a single result file.
!cat output2/part-* > output2/wordlen_final

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/wordlen_unstructured.root.20250829.140516.146597
job output is in output2
Removing temp directory /tmp/wordlen_unstructured.root.20250829.140516.146597...


# **Dados semi-estruturados**

In [None]:
!gzip -d reviews_Amazon_Instant_Video_5.json.gz

In [None]:
# Preview the first 5 lines to inspect the layout of the review records.
!head -5 reviews_Amazon_Instant_Video_5.json

{"reviewerID": "A11N155CW1UV02", "asin": "B000H00VBQ", "reviewerName": "AdrianaM", "helpful": [0, 0], "reviewText": "I had big expectations because I love English TV, in particular Investigative and detective stuff but this guy is really boring. It didn't appeal to me at all.", "overall": 2.0, "summary": "A little bit boring for me", "unixReviewTime": 1399075200, "reviewTime": "05 3, 2014"}
{"reviewerID": "A3BC8O2KCL29V2", "asin": "B000H00VBQ", "reviewerName": "Carol T", "helpful": [0, 0], "reviewText": "I highly recommend this series. It is a must for anyone who is yearning to watch \"grown up\" television. Complex characters and plots to keep one totally involved. Thank you Amazin Prime.", "overall": 5.0, "summary": "Excellent Grown Up TV", "unixReviewTime": 1346630400, "reviewTime": "09 3, 2012"}
{"reviewerID": "A60D5HQFOTSOM", "asin": "B000H00VBQ", "reviewerName": "Daniel Cooper \"dancoopermedia\"", "helpful": [0, 1], "reviewText": "This one is a real snoozer. Don't believe anythin

In [None]:
%%file wordcount_semistructured.py

from mrjob.job import MRJob
import re
import json

class WordCountAmazon(MRJob):
  """Count word occurrences in the `reviewText` field of JSON review records.

  A word is a maximal run of the letters a-z, taken after lower-casing
  the review text.
  """

  def mapper(self, _, value):
    """Emit (word, 1) for every word in one review's text.

    `value` is parsed as a JSON object. A record whose `reviewText` is
    missing or null contributes no words instead of aborting the whole job.
    """
    line = json.loads(value)
    # Semi-structured input: the field is not guaranteed on every record.
    review_text = line.get('reviewText') or ''
    for word in re.findall("[a-z]+", review_text.lower()):
      yield word, 1

  def combiner(self, key, value):
    """Emit (word, partial_count) to shrink the data sent to the reducer."""
    yield key, sum(value)

  def reducer(self, key, value):
    """Emit (word, total_count) by summing every count received for the word."""
    yield key, sum(value)

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
  WordCountAmazon.run()

Overwriting wordcount_semistructured.py


In [None]:
# Remove the results of any previous run, then run the review word-count job
# and write its output into the ./output_amazon directory.
!rm -rf output_amazon
!python wordcount_semistructured.py reviews_Amazon_Instant_Video_5.json --output-dir=output_amazon

# Concatenate every part-* file into a single result file.
!cat output_amazon/part-* > output_amazon/amazon_final

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/wordcount_semistructured.root.20250829.142342.028215
job output is in output_amazon
Removing temp directory /tmp/wordcount_semistructured.root.20250829.142342.028215...


In [None]:
%%file overall_semistructured.py

from mrjob.job import MRJob
import json

class OverAllAmazon(MRJob):
  """Split reviews into two buckets by their `overall` rating and count each."""

  def mapper(self, _, value):
    """Emit (bucket_label, 1) for one JSON review record."""
    record = json.loads(value)
    rating = float(record['overall'])
    # A rating of exactly 2.5 lands in the "less" bucket (the test is <=).
    bucket = "overall_less_2.5" if rating <= 2.5 else "overall_greater_2.5"
    yield bucket, 1

  def reducer(self, key, value):
    """Emit (bucket_label, total) where total is the sum of the values received."""
    total = sum(value)
    yield key, total

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
  OverAllAmazon.run()

Overwriting overall_semistructured.py


In [None]:
# Remove the results of any previous run, then run the rating-bucket job
# and write its output into the ./output_amazonOverall directory.
!rm -rf output_amazonOverall
!python overall_semistructured.py reviews_Amazon_Instant_Video_5.json --output-dir=output_amazonOverall

# Concatenate every part-* file into a single result file.
!cat output_amazonOverall/part-* > output_amazonOverall/amazon_finalOverall

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/overall_semistructured.root.20250829.143752.164609
job output is in output_amazonOverall
Removing temp directory /tmp/overall_semistructured.root.20250829.143752.164609...


In [None]:
# Preview the first 5 lines to inspect the header and the column layout.
!head -5 california_housing.csv

"longitude","latitude","housing_median_age","total_rooms","total_bedrooms","population","households","median_income","median_house_value"
-114.310000,34.190000,15.000000,5612.000000,1283.000000,1015.000000,472.000000,1.493600,66900.000000
-114.470000,34.400000,19.000000,7650.000000,1901.000000,1129.000000,463.000000,1.820000,80100.000000
-114.560000,33.690000,17.000000,720.000000,174.000000,333.000000,117.000000,1.650900,85700.000000
-114.570000,33.640000,14.000000,1501.000000,337.000000,515.000000,226.000000,3.191700,73400.000000


In [None]:
%%file maxpopulation_structured.py

from mrjob.job import MRJob

class maxpopulation_structured(MRJob):
  """Find the largest value in the sixth comma-separated column of the input."""

  def mapper(self, _, value):
    """Emit ("population", number) for each well-formed row.

    A row is skipped when it does not split into exactly 9 fields or when
    its sixth field cannot be parsed as a float (e.g. a quoted header name).
    """
    columns = value.split(',')
    if len(columns) != 9:
      return  # the row does not have all the columns
    try:
      yield "population", float(columns[5])
    except ValueError:
      return  # not a float: ignore this row

  def reducer(self, key, value):
    """Emit ("max_population", m) where m is the maximum of the values received."""
    largest = max(value)
    yield "max_population", largest

# Launch the job only when this file is executed as a script, not on import.
if __name__ == '__main__':
  maxpopulation_structured.run()

Overwriting maxpopulation_structured.py


In [None]:
# Remove the results of any previous run, then run the max-population job on
# california_housing.csv and write its output into ./output_California.
!rm -rf output_California
!python maxpopulation_structured.py california_housing.csv --output-dir=output_California

# Concatenate every part-* file into a single result file.
# NOTE(review): the file is named amazon_finalPopulation although it holds the
# California housing result; looks like a copy-paste leftover, confirm before renaming.
!cat output_California/part-* > output_California/amazon_finalPopulation

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/maxpopulation_structured.root.20250829.150247.223803
job output is in output_California
Removing temp directory /tmp/maxpopulation_structured.root.20250829.150247.223803...
