-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
c725b61
commit 1829810
Showing
99 changed files
with
545 additions
and
60 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
# 0. Initial setup ########## | ||
## Loads packages | ||
library(dplyr) | ||
library(forcats) | ||
library(ggplot2) | ||
library(ggtext) | ||
library(ggview) | ||
library(glue) | ||
library(junebug) | ||
library(readxl) | ||
library(scales) | ||
library(systemfonts) | ||
|
||
## Defines colors | ||
palette <- c("#000000", "#654321", "#d2b48c", "#ffd700", "#ffc0cb", "#dc143c", "#00aa00", "#4682b4", "#7e6583") | ||
black <- palette[1] | ||
brown <- palette[2] | ||
tan <- palette[3] | ||
gold <- palette[4] | ||
pink <- palette[5] | ||
red <- palette[6] | ||
green <- palette[7] | ||
blue <- palette[8] | ||
|
||
## Makes special styled fonts available to R (e.g.: Medium, Solid, etc) | ||
### Lists fonts visible to {systemfonts} | ||
fonts_list <- systemfonts::system_fonts() | ||
|
||
### Takes all font styles that share that exact family name and | ||
### registers them (makes them visible to {systemfonts}) | ||
junebug::font_hoist("Font Awesome 6 Brands") | ||
|
||
### Gets the info used to register the font families | ||
fonts_register <- systemfonts::registry_fonts() | ||
|
||
## Defines the fonts | ||
font_brands_glyphs <- "Font Awesome 6 Brands Regular" | ||
|
||
## Loads the data. Data downloaded and picked from Table 2.17 of the | ||
## "Padrão de vida e distribuição de rendimentos" section of this IBGE page: | ||
## https://www.ibge.gov.br/estatisticas/sociais/populacao/9221-sintese-de-indicadores-sociais.html?=&t=resultados | ||
rawData <- readxl::read_xlsx("2024/week03/data.xlsx") | ||
|
||
# 1. Data handling ########## | ||
## Keeps only data on extreme poverty among Blacks | ||
workData <- rawData |> | ||
dplyr::filter(race == "blacks", | ||
vulnerability == "extreme poverty") | ||
|
||
## Converts the years to factor and reverses the order | ||
workData <- workData |> | ||
dplyr::mutate(year = factor(year), | ||
year = forcats::fct_rev(year)) | ||
|
||
## Highlights the first and last values of the series | ||
highlightData <- workData |> | ||
dplyr::slice(c(1, n())) | ||
|
||
# 2. Plot production ########## | ||
## Creates the title | ||
title <- " | ||
<span style='font-size:120px;'>PERCENTAGE OF BLACKS IN EXTREME POVERTY.</span> | ||
<br><br> | ||
<span style='font-size:70px;'>VALUES ARE CONVERTED FROM BRL TO USD PPP (PURCHASING POWER PARITY) AS OF 2017.</span><br> | ||
<span style='font-size:70px;'>DATA SHOWS THE PERCENTAGE OF PEOPLE WITH INCOME PER CAPITA BELOW USD 2.15 PPP 2017.</span> | ||
<br><br> | ||
<span style='font-size:60px;'>INSPIRED BY: W.E.B. DU BOIS | DATA FROM: IBGE | GRAPHIC BY: ÍCARO BERNARDES<br> | ||
<span style='font-family:\"Font Awesome 6 Brands Regular\";font-size:40px;'>\uf099 \uf16d \uf08c </span>@IcaroBSC | | ||
<span style='font-family:\"Font Awesome 6 Brands Regular\";font-size:40px;'>\uf09b </span>@IcaroBernardes | ||
</span> | ||
" | ||
|
||
## Creates the message | ||
maxWht <- rawData |> | ||
dplyr::filter(race == "whites", | ||
vulnerability == "extreme poverty") |> | ||
dplyr::slice_max(pct) |> | ||
dplyr::mutate(pct = scales::label_percent(accuracy = 0.01, scale = 1)(pct)) | ||
message <- glue::glue(" | ||
<span style='font-size:230px;'>{maxWht$pct}</span> | ||
<span style='font-size:80px;'>OF THE WHITES WERE IN EXTREME POVERTY IN {maxWht$year}.</span> | ||
") | ||
|
||
## Creates the plot | ||
p <- workData |> | ||
ggplot() + | ||
|
||
### Places the bars | ||
geom_col(aes(x = pct, y = year), | ||
fill = red, color = black, width = 0.5) + | ||
|
||
### Places the highlighted labels | ||
geom_text( | ||
aes( | ||
x = pct/2, y = year, | ||
label = scales::label_percent(accuracy = 0.01, scale = 1)(pct) | ||
), | ||
family = "Teko", fontface = "bold", | ||
size = 70, size.unit = "pt", data = highlightData | ||
) + | ||
|
||
### Annotates the message | ||
annotate( | ||
"TextBox", x = I(1), y = I(7.8), label = message, | ||
hjust = 1, vjust = 0, width = unit(0.27, "npc"), size = 15, | ||
family = "Teko", fill = NA, box.colour = NA, color = brown | ||
) + | ||
|
||
### Controls x-axis expansion beyond limits | ||
scale_x_continuous(expand = expansion(0,0)) + | ||
|
||
### Places the title | ||
labs(title = title) + | ||
|
||
### Eliminates and customizes plot elements | ||
theme_void() + | ||
theme( | ||
plot.title = ggtext::element_markdown( | ||
hjust = 0.5, vjust = 0, size = 20, lineheight = 2.5, | ||
margin = margin(0, 120, 40, 0) | ||
), | ||
plot.background = element_rect(fill = tan, color = tan), | ||
plot.margin = margin(80, 60, 60, 60), | ||
|
||
legend.position = "none", | ||
text = element_text(family = "Teko"), | ||
|
||
axis.text.y = element_text(size = 70, margin = margin(0, 20, 0, 0)) | ||
) | ||
|
||
## Shows an accurate preview of the plot | ||
ggview::ggview(p, device = "png", dpi = 320, | ||
units = "in", width = 22, height = 28) | ||
|
||
## Saves the plot | ||
ggsave("2024/week03/week03.png", plot = p, device = "png", dpi = 320, | ||
units = "in", width = 22, height = 28) |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,188 @@ | ||
# 0. Initial setup ########## | ||
## Loads packages | ||
library(dplyr) | ||
library(huggingfaceR) | ||
library(purrr) | ||
library(glue) | ||
library(stringr) | ||
library(cli) | ||
library(carnaval) | ||
library(readxl) | ||
library(tidyr) | ||
library(tidytext) | ||
library(tm) | ||
library(stringi) | ||
|
||
## Loads the data | ||
rawData <- readxl::read_xlsx("2024/week03/data.xlsx") |> | ||
dplyr::distinct() |> | ||
dplyr::filter(between(ano, 1993, 2018)) | ||
|
||
## Gets the scores on the Sambas' lyric and melody | ||
sambaScore <- get_scores(1993:2018, criterions = "SAMBA DE ENREDO") | ||
|
||
## Downloads the pipeline from | ||
## https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english | ||
modelRoberta <- huggingfaceR::hf_load_pipeline( | ||
model_id = "FacebookAI/xlm-roberta-large-finetuned-conll03-english", | ||
task = "token-classification" | ||
) | ||
|
||
# 1. Data cleaning ########## | ||
## Filters out schools that weren't evaluated | ||
workData <- rawData |> | ||
dplyr::filter(stringr::str_detect(escola, "não incluída no julgamento", negate = TRUE)) | ||
|
||
## Gets a list of all schools by year | ||
paradeList <- sambaScore |> | ||
dplyr::summarise( | ||
names = glue::glue_collapse(school, sep = "|"), | ||
.by = year | ||
) | ||
|
||
## Separates the name of the school from the "Enredo" (theme of the parade) | ||
workData <- workData |> | ||
dplyr::rename(year = ano, | ||
lyrics = letra) |> | ||
dplyr::left_join(paradeList) |> | ||
dplyr::mutate(school = stringr::str_extract(escola, names), | ||
theme = stringr::str_remove(escola, school), | ||
theme = stringr::str_remove(theme, "(( L)?( A)?( S)?)$"), | ||
theme = stringr::str_trim(theme)) |> | ||
dplyr::select(year, school, theme, lyrics) | ||
|
||
# 2. NER process ########## | ||
## Adds periods to the end of the sentences | ||
workData <- workData |> | ||
dplyr::mutate(lyrics = str_replace_all(lyrics, "\r\n", " \\.\r\n")) | ||
|
||
## Creates a function that submits the lyrics | ||
## through the pipeline and returns a tibble | ||
extractor <- function(lyric) { | ||
lyric |> | ||
modelRoberta() |> | ||
purrr::compact() |> | ||
purrr::map(dplyr::as_tibble) |> | ||
purrr::list_rbind() | ||
} | ||
|
||
## Creates a function that extracts the entities | ||
## with score above 0.7. Also binds together entities | ||
## that are next to each other on the lyrics | ||
binder <- function(tbl) { | ||
|
||
### Verifies which rows have their end close to start of the next row | ||
tbl <- tbl |> | ||
dplyr::mutate(together = (lead(start, default = 10000) - end <= 1), | ||
row = 1:n()) | ||
|
||
### Lists the row number of lines that break/end the sequence of closeness | ||
vec <- which(!tbl$together) | ||
|
||
### Groups the rows and collapses them | ||
tbl <- tbl |> | ||
dplyr::mutate( | ||
together = santoku::chop(row, vec, seq_along(vec), | ||
left = FALSE, close_end = FALSE) | ||
) |> | ||
dplyr::summarise( | ||
word = glue::glue_collapse(word), | ||
.by = c(together, entity) | ||
) | ||
|
||
### Cleans the entities | ||
tbl |> | ||
dplyr::mutate(word = stringr::str_replace_all(word, "▁", " "), | ||
word = stringr::str_trim(word)) |> | ||
dplyr::select(-together) | ||
|
||
} | ||
|
||
## Wraps all in a single function | ||
## with a failsafe in case of error | ||
safePipe <- purrr::possibly( | ||
\(el) { | ||
el |> | ||
extractor() |> | ||
binder() | ||
} | ||
) | ||
|
||
## Process all data through the pipeline | ||
result <- workData$lyrics |> | ||
purrr::imap(\(el, row, total = nrow(workData)) { | ||
|
||
### Comunicates progress | ||
cli::cli_inform("Processing row {row}/{total}...") | ||
|
||
### Operates the pipeline | ||
safePipe(el) | ||
|
||
}) | ||
|
||
## Merges the data | ||
workData <- workData |> | ||
dplyr::mutate(entities = result) |> | ||
tidyr::unnest(cols = entities) | ||
|
||
## Gets the year and lyrics to count | ||
## the frequency of the terms (without NER) | ||
textData <- workData |> | ||
dplyr::select(year, lyrics) | ||
|
||
## Keeps entities that have three or more characters | ||
workData <- workData |> | ||
dplyr::filter(stringr::str_length(word) >= 3) | ||
|
||
## Counts the occurrence by type | ||
countData <- workData |> | ||
dplyr::count(entity, word, sort = TRUE) | ||
|
||
## Separates the data by type | ||
countData <- countData |> | ||
tidyr::nest(.by = entity) | ||
|
||
## Shows the most frequent entities | ||
### Location | ||
countData |> | ||
dplyr::filter(entity == "I-LOC") |> | ||
dplyr::pull(data) |> | ||
purrr::pluck(1) |> | ||
print(n = 15) | ||
|
||
### Person | ||
countData |> | ||
dplyr::filter(entity == "I-PER") |> | ||
dplyr::pull(data) |> | ||
purrr::pluck(1) |> | ||
print(n = 15) | ||
|
||
### Organization | ||
countData |> | ||
dplyr::filter(entity == "I-ORG") |> | ||
dplyr::pull(data) |> | ||
purrr::pluck(1) |> | ||
print(n = 15) | ||
|
||
### Miscellanea | ||
countData |> | ||
dplyr::filter(entity == "I-MISC") |> | ||
dplyr::pull(data) |> | ||
purrr::pluck(1) |> | ||
print(n = 15) | ||
|
||
# 3. Counting terms frequency ########## | ||
## Standardizes the lyrics | ||
textData <- textData |> | ||
dplyr::mutate(lyrics = stringi::stri_trans_general(lyrics, "lower; latin-ascii")) | ||
|
||
## Filters out stopwords | ||
stops <- tm::stopwords(kind = "portuguese") | ||
stops <- c(stops, "vem", "pra", "vou", "nao", "bis", "vai", "faz", "la") | ||
textData <- textData |> | ||
dplyr::mutate(lyrics = removeWords(lyrics, stops)) | ||
|
||
## Counting terms | ||
textData <- textData |> | ||
tidytext::unnest_tokens(output = "word", input = "lyrics") |> | ||
dplyr::count(word, sort = TRUE) |
Binary file not shown.
Oops, something went wrong.