y24wk03

IcaroBernardes · Feb 24, 2024 · 1829810 · 1829810
1 parent c725b61
commit 1829810
Show file tree

Hide file tree

Showing 99 changed files with 545 additions and 60 deletions.
diff --git a/2024/week03/data.xlsx b/2024/week03/data.xlsx
diff --git a/2024/week03/week03.R b/2024/week03/week03.R
@@ -0,0 +1,137 @@
+# 0. Initial setup ##########
+## Loads packages
+library(dplyr)
+library(forcats)
+library(ggplot2)
+library(ggtext)
+library(ggview)
+library(glue)
+library(junebug)
+library(readxl)
+library(scales)
+library(systemfonts)
+
+## Defines colors
+palette <- c("#000000", "#654321", "#d2b48c", "#ffd700", "#ffc0cb", "#dc143c", "#00aa00", "#4682b4", "#7e6583")
+black <- palette[1]
+brown <- palette[2]
+tan <- palette[3]
+gold <- palette[4]
+pink <- palette[5]
+red <- palette[6]
+green <- palette[7]
+blue <- palette[8]
+
+## Makes special styled fonts available to R (e.g.: Medium, Solid, etc)
+### Lists fonts visible to {systemfonts}
+fonts_list <- systemfonts::system_fonts()
+
+### Takes all font styles that share that exact family name and
+### registers them (makes them visible to {systemfonts})
+junebug::font_hoist("Font Awesome 6 Brands")
+
+### Gets the info used to register the font families
+fonts_register <- systemfonts::registry_fonts()
+
+## Defines the fonts
+font_brands_glyphs <- "Font Awesome 6 Brands Regular"
+
+## Loads the data. Data downloaded and picked from Table 2.17 of the
+## "Padrão de vida e distribuição de rendimentos" section of this IBGE page:
+## https://www.ibge.gov.br/estatisticas/sociais/populacao/9221-sintese-de-indicadores-sociais.html?=&t=resultados
+rawData <- readxl::read_xlsx("2024/week03/data.xlsx")
+
+# 1. Data handling ##########
+## Keeps only data on extreme poverty among Blacks
+workData <- rawData |> 
+  dplyr::filter(race == "blacks",
+                vulnerability == "extreme poverty")
+
+## Converts the years to factor and reverses the order
+workData <- workData |> 
+  dplyr::mutate(year = factor(year),
+                year = forcats::fct_rev(year))
+
+## Highlights the first and last values of the series
+highlightData <- workData |> 
+  dplyr::slice(c(1, n()))
+
+# 2. Plot production ##########
+## Creates the title
+title <- "
+<span style='font-size:120px;'>PERCENTAGE OF BLACKS IN EXTREME POVERTY.</span>
+<br><br>
+<span style='font-size:70px;'>VALUES ARE CONVERTED FROM BRL TO USD PPP (PURCHASING POWER PARITY) AS OF 2017.</span><br>
+<span style='font-size:70px;'>DATA SHOWS THE PERCENTAGE OF PEOPLE WITH INCOME PER CAPITA BELOW USD 2.15 PPP 2017.</span>
+<br><br>
+<span style='font-size:60px;'>INSPIRED BY: W.E.B. DU BOIS | DATA FROM: IBGE | GRAPHIC BY: ÍCARO BERNARDES<br>
+<span style='font-family:\"Font Awesome 6 Brands Regular\";font-size:40px;'>\uf099 \uf16d \uf08c </span>@IcaroBSC | 
+<span style='font-family:\"Font Awesome 6 Brands Regular\";font-size:40px;'>\uf09b </span>@IcaroBernardes 
+</span>
+"
+
+## Creates the message
+maxWht <- rawData |> 
+  dplyr::filter(race == "whites",
+                vulnerability == "extreme poverty") |> 
+  dplyr::slice_max(pct) |> 
+  dplyr::mutate(pct = scales::label_percent(accuracy = 0.01, scale = 1)(pct))
+message <- glue::glue("
+<span style='font-size:230px;'>{maxWht$pct}</span>
+<span style='font-size:80px;'>OF THE WHITES WERE IN EXTREME POVERTY IN {maxWht$year}.</span>
+")
+
+## Creates the plot
+p <- workData |> 
+  ggplot() +
+
+  ### Places the bars
+  geom_col(aes(x = pct, y = year),
+           fill = red, color = black, width = 0.5) +
+
+  ### Places the highlighted labels
+  geom_text(
+    aes(
+      x = pct/2, y = year,
+      label = scales::label_percent(accuracy = 0.01, scale = 1)(pct)
+    ),
+    family = "Teko", fontface = "bold",
+    size = 70, size.unit = "pt", data = highlightData
+  ) +
+
+  ### Annotates the message
+  annotate(
+    "TextBox", x = I(1), y = I(7.8), label = message,
+    hjust = 1, vjust = 0, width = unit(0.27, "npc"), size = 15,
+     family = "Teko", fill = NA, box.colour = NA, color = brown
+  ) +
+
+  ### Controls x-axis expansion beyond limits
+  scale_x_continuous(expand = expansion(0,0)) +
+
+  ### Places the title
+  labs(title = title) +
+
+  ### Eliminates and customizes plot elements
+  theme_void() +
+  theme(
+    plot.title = ggtext::element_markdown(
+      hjust = 0.5, vjust = 0, size = 20, lineheight = 2.5,
+      margin = margin(0, 120, 40, 0)
+    ),
+    plot.background = element_rect(fill = tan, color = tan),
+    plot.margin = margin(80, 60, 60, 60),
+
+    legend.position = "none",
+    text = element_text(family = "Teko"),
+
+    axis.text.y = element_text(size = 70, margin = margin(0, 20, 0, 0))
+  )
+
+## Shows an accurate preview of the plot
+ggview::ggview(p, device = "png", dpi = 320,
+               units = "in", width = 22, height = 28)
+
+## Saves the plot
+ggsave("2024/week03/week03.png", plot = p, device = "png", dpi = 320,
+       units = "in", width = 22, height = 28)
diff --git a/2024/week03/week03.png b/2024/week03/week03.png
diff --git a/2024/weekXX/NER.R b/2024/weekXX/NER.R
@@ -0,0 +1,188 @@
+# 0. Initial setup ##########
+## Loads packages
+library(dplyr)
+library(huggingfaceR)
+library(purrr)
+library(glue)
+library(stringr)
+library(cli)
+library(carnaval)
+library(readxl)
+library(tidyr)
+library(tidytext)
+library(tm)
+library(stringi)
+
+## Loads the data
+rawData <- readxl::read_xlsx("2024/week03/data.xlsx") |> 
+  dplyr::distinct() |> 
+  dplyr::filter(between(ano, 1993, 2018))
+
+## Gets the scores on the Sambas' lyric and melody
+sambaScore <- get_scores(1993:2018, criterions = "SAMBA DE ENREDO")
+
+## Downloads the pipeline from
+## https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english
+modelRoberta <- huggingfaceR::hf_load_pipeline(
+  model_id = "FacebookAI/xlm-roberta-large-finetuned-conll03-english", 
+  task = "token-classification"
+)
+
+# 1. Data cleaning ##########
+## Filters out schools that weren't evaluated
+workData <- rawData |> 
+  dplyr::filter(stringr::str_detect(escola, "não incluída no julgamento", negate = TRUE))
+
+## Gets a list of all schools by year
+paradeList <- sambaScore |> 
+  dplyr::summarise(
+    names = glue::glue_collapse(school, sep = "|"),
+    .by = year
+  )
+
+## Separates the name of the school from the "Enredo" (theme of the parade)
+workData <- workData |> 
+  dplyr::rename(year = ano,
+                lyrics = letra) |> 
+  dplyr::left_join(paradeList) |> 
+  dplyr::mutate(school = stringr::str_extract(escola, names),
+                theme = stringr::str_remove(escola, school),
+                theme = stringr::str_remove(theme, "(( L)?( A)?( S)?)$"),
+                theme = stringr::str_trim(theme)) |> 
+  dplyr::select(year, school, theme, lyrics)
+
+# 2. NER process ##########
+## Adds periods to the end of the sentences
+workData <- workData |> 
+  dplyr::mutate(lyrics = str_replace_all(lyrics, "\r\n", " \\.\r\n"))
+
+## Creates a function that submits the lyrics
+## through the pipeline and returns a tibble
+extractor <- function(lyric) {
+  lyric |> 
+    modelRoberta() |> 
+    purrr::compact() |> 
+    purrr::map(dplyr::as_tibble) |> 
+    purrr::list_rbind()
+}
+
+## Creates a function that extracts the entities
+## with score above 0.7. Also binds together entities
+## that are next to each other on the lyrics
+binder <- function(tbl) {
+
+  ### Verifies which rows have their end close to start of the next row
+  tbl <- tbl |> 
+    dplyr::mutate(together = (lead(start, default = 10000) - end <= 1),
+                  row = 1:n())
+
+  ### Lists the row number of lines that break/end the sequence of closeness
+  vec <- which(!tbl$together)
+
+  ### Groups the rows and collapses them
+  tbl <- tbl |> 
+    dplyr::mutate(
+      together = santoku::chop(row, vec, seq_along(vec),
+                               left = FALSE, close_end = FALSE)
+    ) |> 
+    dplyr::summarise(
+      word = glue::glue_collapse(word),
+      .by = c(together, entity)
+    )
+
+  ### Cleans the entities
+  tbl |> 
+    dplyr::mutate(word = stringr::str_replace_all(word, "▁", " "),
+                  word = stringr::str_trim(word)) |> 
+    dplyr::select(-together)
+
+}
+
+## Wraps all in a single function
+## with a failsafe in case of error
+safePipe <- purrr::possibly(
+  \(el) {
+    el |> 
+      extractor() |> 
+      binder()
+  }
+)
+
+## Process all data through the pipeline
+result <- workData$lyrics |> 
+  purrr::imap(\(el, row, total = nrow(workData)) {
+
+    ### Comunicates progress
+    cli::cli_inform("Processing row {row}/{total}...")
+
+    ### Operates the pipeline
+    safePipe(el)
+
+  })
+
+## Merges the data
+workData <- workData |> 
+  dplyr::mutate(entities = result) |> 
+  tidyr::unnest(cols = entities)
+
+## Gets the year and lyrics to count
+## the frequency of the terms (without NER)
+textData <- workData |> 
+  dplyr::select(year, lyrics)
+
+## Keeps entities that have three or more characters
+workData <- workData |> 
+  dplyr::filter(stringr::str_length(word) >= 3)
+
+## Counts the occurrence by type
+countData <- workData |> 
+  dplyr::count(entity, word, sort = TRUE)
+
+## Separates the data by type
+countData <- countData |> 
+  tidyr::nest(.by = entity)
+
+## Shows the most frequent entities
+### Location
+countData |> 
+  dplyr::filter(entity == "I-LOC") |> 
+  dplyr::pull(data) |> 
+  purrr::pluck(1) |> 
+  print(n = 15)
+
+### Person
+countData |> 
+  dplyr::filter(entity == "I-PER") |> 
+  dplyr::pull(data) |> 
+  purrr::pluck(1) |> 
+  print(n = 15)
+
+### Organization
+countData |> 
+  dplyr::filter(entity == "I-ORG") |> 
+  dplyr::pull(data) |> 
+  purrr::pluck(1) |> 
+  print(n = 15)
+
+### Miscellanea
+countData |> 
+  dplyr::filter(entity == "I-MISC") |> 
+  dplyr::pull(data) |> 
+  purrr::pluck(1) |> 
+  print(n = 15)
+
+# 3. Counting terms frequency ##########
+## Standardizes the lyrics 
+textData <- textData |> 
+  dplyr::mutate(lyrics = stringi::stri_trans_general(lyrics, "lower; latin-ascii"))
+
+## Filters out stopwords
+stops <- tm::stopwords(kind = "portuguese")
+stops <- c(stops, "vem", "pra", "vou", "nao", "bis", "vai", "faz", "la")
+textData <- textData |> 
+  dplyr::mutate(lyrics = removeWords(lyrics, stops))
+
+## Counting terms
+textData <- textData |> 
+  tidytext::unnest_tokens(output = "word", input = "lyrics") |> 
+  dplyr::count(word, sort = TRUE)
diff --git a/2024/weekXX/data.xlsx b/2024/weekXX/data.xlsx