In [40]:
# Install mrjob with the %pip magic so the package lands in the environment
# of the running kernel rather than whichever `pip` is first on PATH.
%pip install -q mrjob



In [41]:
%%file avg_temp_month_combiner.py
#temp média por mes usando combiner

from mrjob.job import MRJob
from mrjob.step import MRStep

class ForestFireAvgTempCombiner(MRJob):
  """MapReduce job computing the mean temperature for each month.

  Input lines are split on commas; the month is read from column 2 and the
  temperature from column 8 (0-based). A combiner forwards partial sums and
  counts so that the reducer can still compute the exact mean.
  """

  def mapper(self, _, value):
    """Emit ``(month, [temp, 1])`` for each 13-column line with a numeric temperature.

    Lines with a different column count, or whose temperature field cannot be
    parsed as a float, produce no output.
    """
    columns = value.split(',')
    if len(columns) != 13:
      return
    try:
      month = columns[2]
      reading = float(columns[8])
      yield month, [reading, 1]
    except ValueError:
      # Unparseable temperature: drop the line silently.
      pass

  def combiner(self, key, value):
    """Fold the ``(sum, count)`` pairs seen for ``key`` into one pair.

    The combiner returns the partial sum and count instead of an average,
    because averaging partial averages would give the wrong result.
    """
    temp_total, n_readings = 0, 0
    for part_sum, part_count in value:
      temp_total += part_sum
      n_readings += part_count
    yield key, (temp_total, n_readings)

  def reducer(self, key, value):
    """Yield ``(month, mean temperature)`` from the accumulated sums and counts."""
    temp_total, n_readings = 0, 0
    for part_sum, part_count in value:
      temp_total += part_sum
      n_readings += part_count
    yield key, temp_total/n_readings

  def steps(self):
    """Describe the job as a single mapper -> combiner -> reducer step."""
    only_step = MRStep(mapper=self.mapper,
                       combiner=self.combiner,
                       reducer=self.reducer)
    return [only_step]

# Script entry point: launch the job only when the file is executed directly.
if __name__ == '__main__':
  ForestFireAvgTempCombiner.run()

Overwriting avg_temp_month_combiner.py


In [42]:
# Remove the output directory left by a previous run, then run the job.
!rm -rf output
!python avg_temp_month_combiner.py forestfireinput.csv --output-dir=output
# Concatenate the part files into a single result file. The path is relative,
# matching --output-dir, so the cell does not depend on the notebook running
# from /content.
!cat output/part-* > output/avg_temp

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/avg_temp_month_combiner.root.20250912.143507.870774
job output is in output
Removing temp directory /tmp/avg_temp_month_combiner.root.20250912.143507.870774...


## WordCount using COMBINER

In [43]:
%%file wordcount_combined.py

from mrjob.job import MRJob
from mrjob.job import MRStep
import re

class WordCountCombined(MRJob):
  """Word-frequency job in which the reducer is also used as the combiner."""

  def mapper(self, _, value):
    """Emit ``(word, 1)`` for every run of a-z letters in the lower-cased line."""
    lowered = value.lower()
    for match in re.finditer("[a-z]+", lowered):
      yield match.group(), 1

  def reducer(self, key, value):
    """Yield the word together with the sum of its counts."""
    occurrences = sum(value)
    yield key, occurrences

  def steps(self):
    """Describe the job as one step.

    The combiner is exactly the same code as the reducer, so the reducer
    method is registered for both roles.
    """
    counting_step = MRStep(mapper=self.mapper,
                           combiner=self.reducer,
                           reducer=self.reducer)
    return [counting_step]

# Script entry point: launch the job only when the file is executed directly.
if __name__ == '__main__':
  WordCountCombined.run()


Overwriting wordcount_combined.py


In [44]:
# Remove the output directory left by a previous run, then run the word-count
# job on frankenstein.txt, writing its results to output2.
!rm -rf output2
!python wordcount_combined.py frankenstein.txt --output-dir=output2

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 1...
Creating temp directory /tmp/wordcount_combined.root.20250912.143508.694967
job output is in output2
Removing temp directory /tmp/wordcount_combined.root.20250912.143508.694967...


## Make it sorted

In [45]:
%%file avg_temp_month_sorted.py
# Average temperature per month, sorted by temperature (uses a combiner)

from mrjob.job import MRJob
from mrjob.step import MRStep

class ForestFireAvgTempSorted(MRJob):
  """Two-step job: mean temperature per month, emitted in ascending temperature order.

  Step 1 computes each month's mean and sends every result to one shared key;
  step 2 receives them all in a single reducer call and sorts them.
  """

  def mapper(self, _, value):
    """Emit ``(month, [temp, 1])`` for each 13-column line with a numeric temperature."""
    columns = value.split(',')
    if len(columns) != 13:
      return
    try:
      month = columns[2]
      reading = float(columns[8])
      yield month, [reading, 1]
    except ValueError:
      # Unparseable temperature: drop the line silently.
      pass

  def combiner(self, key, value):
    """Fold the ``(sum, count)`` pairs seen for ``key`` into one pair.

    The combiner returns the partial sum and count instead of an average.
    """
    temp_total, n_readings = 0, 0
    for part_sum, part_count in value:
      temp_total += part_sum
      n_readings += part_count
    yield key, (temp_total, n_readings)

  def reducer(self, key, value):
    """Compute the month's mean and emit it under the shared key ``"Avarage"``."""
    temp_total, n_readings = 0, 0
    for part_sum, part_count in value:
      temp_total += part_sum
      n_readings += part_count
    # The key is the same for every month so step 2 sees all of them together.
    yield "Avarage", [key, temp_total/n_readings]

  def reducer_sort(self, _, value):
    """Yield ``(month, mean temperature)`` ordered by ascending temperature.

    The incoming ``(month, temp)`` pairs are flipped to ``(temp, month)`` so
    that tuple ordering sorts by temperature first; the output is flipped back
    to the ``(key, value)`` form ``(month, temperature)``.
    """
    by_temperature = sorted((avg_temp, month) for month, avg_temp in value)
    for avg_temp, month in by_temperature:
      yield month, avg_temp

  def steps(self):
    """Describe the job: an averaging step followed by a sorting step."""
    averaging = MRStep(mapper=self.mapper,
                       combiner=self.combiner,
                       reducer=self.reducer)
    sorting = MRStep(reducer=self.reducer_sort)
    return [averaging, sorting]

# Script entry point: launch the job only when the file is executed directly.
if __name__ == '__main__':
  ForestFireAvgTempSorted.run()

Overwriting avg_temp_month_sorted.py


In [46]:
# Remove the output directory left by a previous run, then run the job.
!rm -rf output3/
!python avg_temp_month_sorted.py forestfireinput.csv --output-dir=output3
# Concatenate the part files into a single result file. The path is relative,
# matching --output-dir, so the cell does not depend on the notebook running
# from /content.
!cat output3/part-* > output3/avg_temp_sorted

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/avg_temp_month_sorted.root.20250912.143510.023076
Running step 2 of 2...
job output is in output3
Removing temp directory /tmp/avg_temp_month_sorted.root.20250912.143510.023076...


## Get the highest avg temp

In [47]:
%%file highest_avg_temp_month.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class ForestFireAvgTempHighest(MRJob):
  """Two-step job that reports the month with the highest mean temperature.

  Step 1 computes each month's mean and sends every result to one shared key;
  step 2 receives them all in a single reducer call and picks the maximum.
  """

  def mapper(self, _, value):
    """Emit ``(month, [temp, 1])`` for each 13-column line with a numeric temperature.

    Lines with a different column count, or whose temperature field cannot be
    parsed as a float, produce no output.
    """
    fields = value.split(',')
    if len(fields) == 13:
      try:
        month = fields[2]
        temp = float(fields[8])
        yield month, [temp, 1]
      except ValueError:
        # Unparseable temperature: drop the line silently.
        pass

  def combiner(self, key, value):
    """Fold the ``(sum, count)`` pairs seen for ``key`` into one pair.

    The combiner returns the partial sum and count instead of an average,
    because averaging partial averages would give the wrong result.
    """
    partial_sum = 0
    partial_count = 0
    for v in value:
      partial_sum += v[0]
      partial_count += v[1]
    yield key, (partial_sum, partial_count)

  def reducer(self, key, value):
    """Compute the month's mean and emit it under the shared key ``"Avarage"``."""
    sum_values = 0
    count = 0
    for v in value:
      sum_values += v[0]
      count += v[1]
    avg_temp = sum_values/count
    # The key is the same for every month so step 2 sees all of them together.
    yield "Avarage", [key, avg_temp]

  def reducer_max(self, _, value):
    """Yield ``("Highest", [month, mean temperature])`` for the warmest month.

    The maximum is taken over the actual values rather than starting from a
    sentinel of -1, so months whose mean is at or below -1 are handled
    correctly. On ties the first month encountered wins. Nothing is yielded
    if no values arrive.
    """
    hottest = max(value, key=lambda pair: pair[1], default=None)
    if hottest is None:
      return
    yield "Highest", [hottest[0], hottest[1]]

  def steps(self):
    """Describe the job: an averaging step followed by a max-selection step."""
    return [MRStep(mapper=self.mapper,
                   combiner=self.combiner,
                   reducer=self.reducer),
            MRStep(reducer=self.reducer_max)]

# Script entry point: launch the job only when the file is executed directly.
if __name__ == '__main__':
  ForestFireAvgTempHighest.run()

Overwriting highest_avg_temp_month.py


In [50]:
# Remove the output directory left by a previous run, then run the
# highest-average job, writing its results to output4.
!rm -rf output4/
!python highest_avg_temp_month.py forestfireinput.csv --output-dir=output4

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/highest_avg_temp_month.root.20250912.143619.362196
Running step 2 of 2...
job output is in output4
Removing temp directory /tmp/highest_avg_temp_month.root.20250912.143619.362196...


## Diferença entre a média de cada mês e a média anual

In [60]:
%%file temp_diffs.py

from mrjob.job import MRJob
from mrjob.step import MRStep

class ForestFireTempDiffs(MRJob):
  """Two-step job: difference between each month's mean temperature and the overall mean.

  Every reading is emitted twice in step 1, once under its month and once
  under the key ``"Yearly"``, so the same averaging logic produces both the
  monthly means and the overall mean. Step 2 subtracts one from the other.
  """

  def mapper(self, _, value):
    """Emit the reading under its month and under ``"Yearly"``.

    Lines that do not have 13 comma-separated columns, or whose temperature
    field cannot be parsed as a float, produce no output.
    """
    columns = value.split(',')
    if len(columns) != 13:
      return
    try:
      month = columns[2]
      reading = float(columns[8])
      yield month, [reading, 1]
      yield "Yearly", [reading, 1]
    except ValueError:
      # Unparseable temperature: drop the line silently.
      pass

  def combiner(self, key, value):
    """Fold the ``(sum, count)`` pairs seen for ``key`` into one pair.

    The combiner returns the partial sum and count instead of an average.
    """
    temp_total, n_readings = 0, 0
    for part_sum, part_count in value:
      temp_total += part_sum
      n_readings += part_count
    yield key, (temp_total, n_readings)

  def reducer(self, key, value):
    """Compute the mean for ``key`` and emit it under the shared key ``"Avarage"``."""
    temp_total, n_readings = 0, 0
    for part_sum, part_count in value:
      temp_total += part_sum
      n_readings += part_count
    # The key is the same for every entry so step 2 sees all of them together.
    yield "Avarage", [key, temp_total/n_readings]

  def reducer_diff(self, _, value):
    """Yield ``(month, monthly mean - overall mean)`` for every month.

    The ``"Yearly"`` entry supplies the overall mean and is not emitted
    itself. If no such entry is present the overall mean stays at 0.
    """
    overall_mean = 0
    monthly_means = []
    for label, mean_temp in value:
      if label != "Yearly":
        monthly_means.append((label, mean_temp))
      else:
        overall_mean = mean_temp

    # Second pass: the overall mean is only known once all entries were read.
    for label, mean_temp in monthly_means:
      yield label, mean_temp - overall_mean

  def steps(self):
    """Describe the job: an averaging step followed by a differencing step."""
    averaging = MRStep(mapper=self.mapper,
                       combiner=self.combiner,
                       reducer=self.reducer)
    differencing = MRStep(reducer=self.reducer_diff)
    return [averaging, differencing]

# Script entry point: launch the job only when the file is executed directly.
if __name__ == '__main__':
  ForestFireTempDiffs.run()

Overwriting temp_diffs.py


In [61]:
# Remove the output directory left by a previous run, then run the job.
!rm -rf output5/
!python temp_diffs.py forestfireinput.csv --output-dir=output5
# Concatenate the part files into a single result file. The path is relative,
# matching --output-dir, so the cell does not depend on the notebook running
# from /content.
!cat output5/part-* > output5/temp_diffs

No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/temp_diffs.root.20250912.150123.182672
Running step 2 of 2...
job output is in output5
Removing temp directory /tmp/temp_diffs.root.20250912.150123.182672...


## FASTA ENTROPY

In [62]:
%%file entropy_fasta.py

from mrjob.job import MRJob
from mrjob.step import MRStep
import math

class EntropyFasta(MRJob):
  """Two-step job yielding each symbol's entropy term for a FASTA file.

  Step 1 counts every symbol of the sequence lines, plus a grand total under
  the key ``"total"``. Step 2 turns each count into a probability ``p`` and
  yields ``-p * log2(p)`` per symbol. The terms are not summed by the job.
  """

  def mapper(self, _, value):
    """Emit ``(symbol, 1)`` and ``("total", 1)`` for every symbol of a sequence line.

    Surrounding whitespace (including a stray carriage return) is stripped so
    it is not counted as a symbol. Blank lines and header lines starting with
    ``>`` produce no output.
    """
    sequence = value.strip()
    # Guard against empty lines before looking at the first character.
    if not sequence or sequence.startswith(">"):
      return
    for symbol in sequence:
      yield symbol, 1
      yield "total", 1

  def reducerA(self, key, value):
    """Sum the counts for ``key`` and emit them under the shared key ``"entropy"``."""
    # The key is the same for every entry so step 2 sees all counts together.
    yield "entropy", [key, sum(value)]

  def reducerB(self, key, value):
    """Yield ``(symbol, -p * log2(p))`` for every counted symbol.

    The ``"total"`` entry supplies the denominator for the probabilities and
    is not emitted itself.
    """
    # Find the "total" count, needed to compute each symbol's probability.
    total = 0
    char_freq = []

    for k, v in value:
      if k == "total":
        total = v
      else:
        char_freq.append((k, v))

    # Compute the entropy term of each symbol.
    for k, v in char_freq:
      prob = v/total
      entropy = -prob*math.log(prob, 2)
      yield k, entropy

  def steps(self):
    """Describe the job: a counting step followed by the entropy step."""
    return [
      MRStep(mapper=self.mapper, reducer=self.reducerA),
      MRStep(reducer=self.reducerB)
    ]

# Script entry point: launch the job only when the file is executed directly.
if __name__ == '__main__':
  EntropyFasta.run()

Writing entropy_fasta.py


In [64]:
# Remove the output directory left by a previous run, then run the entropy
# job on Sars_cov_2.fasta, writing its results to output6.
!rm -rf output6/
!python entropy_fasta.py Sars_cov_2.fasta --output-dir=output6


No configs found; falling back on auto-configuration
No configs specified for inline runner
Running step 1 of 2...
Creating temp directory /tmp/entropy_fasta.root.20250912.153209.125825
Running step 2 of 2...
job output is in output6
Removing temp directory /tmp/entropy_fasta.root.20250912.153209.125825...
