Skip to content

Commit

Permalink
y24wk03
Browse files Browse the repository at this point in the history
  • Loading branch information
IcaroBernardes committed Feb 24, 2024
1 parent c725b61 commit 1829810
Show file tree
Hide file tree
Showing 99 changed files with 545 additions and 60 deletions.
Binary file added 2024/week03/data.xlsx
Binary file not shown.
137 changes: 137 additions & 0 deletions 2024/week03/week03.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
# 0. Initial setup ##########
## Loads packages
library(dplyr)
library(forcats)
library(ggplot2)
library(ggtext)
library(ggview)
library(glue)
library(junebug)
library(readxl)
library(scales)
library(systemfonts)

## Defines colors
palette <- c("#000000", "#654321", "#d2b48c", "#ffd700", "#ffc0cb", "#dc143c", "#00aa00", "#4682b4", "#7e6583")
black <- palette[1]
brown <- palette[2]
tan <- palette[3]
gold <- palette[4]
pink <- palette[5]
red <- palette[6]
green <- palette[7]
blue <- palette[8]

## Makes special styled fonts available to R (e.g.: Medium, Solid, etc)
### Lists fonts visible to {systemfonts}
fonts_list <- systemfonts::system_fonts()

### Takes all font styles that share that exact family name and
### registers them (makes them visible to {systemfonts})
junebug::font_hoist("Font Awesome 6 Brands")

### Gets the info used to register the font families
fonts_register <- systemfonts::registry_fonts()

## Defines the fonts
font_brands_glyphs <- "Font Awesome 6 Brands Regular"

## Loads the data. Data downloaded and picked from Table 2.17 of the
## "Padrão de vida e distribuição de rendimentos" section of this IBGE page:
## https://www.ibge.gov.br/estatisticas/sociais/populacao/9221-sintese-de-indicadores-sociais.html?=&t=resultados
rawData <- readxl::read_xlsx("2024/week03/data.xlsx")

# 1. Data handling ##########
## Keeps only data on extreme poverty among Blacks
workData <- rawData |>
dplyr::filter(race == "blacks",
vulnerability == "extreme poverty")

## Converts the years to factor and reverses the order
workData <- workData |>
dplyr::mutate(year = factor(year),
year = forcats::fct_rev(year))

## Highlights the first and last values of the series
highlightData <- workData |>
dplyr::slice(c(1, n()))

# 2. Plot production ##########
## Creates the title
title <- "
<span style='font-size:120px;'>PERCENTAGE OF BLACKS IN EXTREME POVERTY.</span>
<br><br>
<span style='font-size:70px;'>VALUES ARE CONVERTED FROM BRL TO USD PPP (PURCHASING POWER PARITY) AS OF 2017.</span><br>
<span style='font-size:70px;'>DATA SHOWS THE PERCENTAGE OF PEOPLE WITH INCOME PER CAPITA BELOW USD 2.15 PPP 2017.</span>
<br><br>
<span style='font-size:60px;'>INSPIRED BY: W.E.B. DU BOIS | DATA FROM: IBGE | GRAPHIC BY: ÍCARO BERNARDES<br>
<span style='font-family:\"Font Awesome 6 Brands Regular\";font-size:40px;'>\uf099 \uf16d \uf08c </span>@IcaroBSC |
<span style='font-family:\"Font Awesome 6 Brands Regular\";font-size:40px;'>\uf09b </span>@IcaroBernardes
</span>
"

## Creates the message
maxWht <- rawData |>
dplyr::filter(race == "whites",
vulnerability == "extreme poverty") |>
dplyr::slice_max(pct) |>
dplyr::mutate(pct = scales::label_percent(accuracy = 0.01, scale = 1)(pct))
message <- glue::glue("
<span style='font-size:230px;'>{maxWht$pct}</span>
<span style='font-size:80px;'>OF THE WHITES WERE IN EXTREME POVERTY IN {maxWht$year}.</span>
")

## Creates the plot
p <- workData |>
ggplot() +

### Places the bars
geom_col(aes(x = pct, y = year),
fill = red, color = black, width = 0.5) +

### Places the highlighted labels
geom_text(
aes(
x = pct/2, y = year,
label = scales::label_percent(accuracy = 0.01, scale = 1)(pct)
),
family = "Teko", fontface = "bold",
size = 70, size.unit = "pt", data = highlightData
) +

### Annotates the message
annotate(
"TextBox", x = I(1), y = I(7.8), label = message,
hjust = 1, vjust = 0, width = unit(0.27, "npc"), size = 15,
family = "Teko", fill = NA, box.colour = NA, color = brown
) +

### Controls x-axis expansion beyond limits
scale_x_continuous(expand = expansion(0,0)) +

### Places the title
labs(title = title) +

### Eliminates and customizes plot elements
theme_void() +
theme(
plot.title = ggtext::element_markdown(
hjust = 0.5, vjust = 0, size = 20, lineheight = 2.5,
margin = margin(0, 120, 40, 0)
),
plot.background = element_rect(fill = tan, color = tan),
plot.margin = margin(80, 60, 60, 60),

legend.position = "none",
text = element_text(family = "Teko"),

axis.text.y = element_text(size = 70, margin = margin(0, 20, 0, 0))
)

## Shows an accurate preview of the plot
ggview::ggview(p, device = "png", dpi = 320,
units = "in", width = 22, height = 28)

## Saves the plot
ggsave("2024/week03/week03.png", plot = p, device = "png", dpi = 320,
units = "in", width = 22, height = 28)
Binary file added 2024/week03/week03.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
188 changes: 188 additions & 0 deletions 2024/weekXX/NER.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,188 @@
# 0. Initial setup ##########
## Loads packages
library(dplyr)
library(huggingfaceR)
library(purrr)
library(glue)
library(stringr)
library(cli)
library(carnaval)
library(readxl)
library(tidyr)
library(tidytext)
library(tm)
library(stringi)

## Loads the data
rawData <- readxl::read_xlsx("2024/week03/data.xlsx") |>
dplyr::distinct() |>
dplyr::filter(between(ano, 1993, 2018))

## Gets the scores on the Sambas' lyric and melody
sambaScore <- get_scores(1993:2018, criterions = "SAMBA DE ENREDO")

## Downloads the pipeline from
## https://huggingface.co/FacebookAI/xlm-roberta-large-finetuned-conll03-english
modelRoberta <- huggingfaceR::hf_load_pipeline(
model_id = "FacebookAI/xlm-roberta-large-finetuned-conll03-english",
task = "token-classification"
)

# 1. Data cleaning ##########
## Filters out schools that weren't evaluated
workData <- rawData |>
dplyr::filter(stringr::str_detect(escola, "não incluída no julgamento", negate = TRUE))

## Gets a list of all schools by year
paradeList <- sambaScore |>
dplyr::summarise(
names = glue::glue_collapse(school, sep = "|"),
.by = year
)

## Separates the name of the school from the "Enredo" (theme of the parade)
workData <- workData |>
dplyr::rename(year = ano,
lyrics = letra) |>
dplyr::left_join(paradeList) |>
dplyr::mutate(school = stringr::str_extract(escola, names),
theme = stringr::str_remove(escola, school),
theme = stringr::str_remove(theme, "(( L)?( A)?( S)?)$"),
theme = stringr::str_trim(theme)) |>
dplyr::select(year, school, theme, lyrics)

# 2. NER process ##########
## Adds periods to the end of the sentences
workData <- workData |>
dplyr::mutate(lyrics = str_replace_all(lyrics, "\r\n", " \\.\r\n"))

## Creates a function that submits the lyrics
## through the pipeline and returns a tibble
extractor <- function(lyric) {
lyric |>
modelRoberta() |>
purrr::compact() |>
purrr::map(dplyr::as_tibble) |>
purrr::list_rbind()
}

## Creates a function that extracts the entities
## with score above 0.7. Also binds together entities
## that are next to each other on the lyrics
binder <- function(tbl) {

### Verifies which rows have their end close to start of the next row
tbl <- tbl |>
dplyr::mutate(together = (lead(start, default = 10000) - end <= 1),
row = 1:n())

### Lists the row number of lines that break/end the sequence of closeness
vec <- which(!tbl$together)

### Groups the rows and collapses them
tbl <- tbl |>
dplyr::mutate(
together = santoku::chop(row, vec, seq_along(vec),
left = FALSE, close_end = FALSE)
) |>
dplyr::summarise(
word = glue::glue_collapse(word),
.by = c(together, entity)
)

### Cleans the entities
tbl |>
dplyr::mutate(word = stringr::str_replace_all(word, "", " "),
word = stringr::str_trim(word)) |>
dplyr::select(-together)

}

## Wraps all in a single function
## with a failsafe in case of error
safePipe <- purrr::possibly(
\(el) {
el |>
extractor() |>
binder()
}
)

## Process all data through the pipeline
result <- workData$lyrics |>
purrr::imap(\(el, row, total = nrow(workData)) {

### Comunicates progress
cli::cli_inform("Processing row {row}/{total}...")

### Operates the pipeline
safePipe(el)

})

## Merges the data
workData <- workData |>
dplyr::mutate(entities = result) |>
tidyr::unnest(cols = entities)

## Gets the year and lyrics to count
## the frequency of the terms (without NER)
textData <- workData |>
dplyr::select(year, lyrics)

## Keeps entities that have three or more characters
workData <- workData |>
dplyr::filter(stringr::str_length(word) >= 3)

## Counts the occurrence by type
countData <- workData |>
dplyr::count(entity, word, sort = TRUE)

## Separates the data by type
countData <- countData |>
tidyr::nest(.by = entity)

## Shows the most frequent entities
### Location
countData |>
dplyr::filter(entity == "I-LOC") |>
dplyr::pull(data) |>
purrr::pluck(1) |>
print(n = 15)

### Person
countData |>
dplyr::filter(entity == "I-PER") |>
dplyr::pull(data) |>
purrr::pluck(1) |>
print(n = 15)

### Organization
countData |>
dplyr::filter(entity == "I-ORG") |>
dplyr::pull(data) |>
purrr::pluck(1) |>
print(n = 15)

### Miscellanea
countData |>
dplyr::filter(entity == "I-MISC") |>
dplyr::pull(data) |>
purrr::pluck(1) |>
print(n = 15)

# 3. Counting terms frequency ##########
## Standardizes the lyrics
textData <- textData |>
dplyr::mutate(lyrics = stringi::stri_trans_general(lyrics, "lower; latin-ascii"))

## Filters out stopwords
stops <- tm::stopwords(kind = "portuguese")
stops <- c(stops, "vem", "pra", "vou", "nao", "bis", "vai", "faz", "la")
textData <- textData |>
dplyr::mutate(lyrics = removeWords(lyrics, stops))

## Counting terms
textData <- textData |>
tidytext::unnest_tokens(output = "word", input = "lyrics") |>
dplyr::count(word, sort = TRUE)
Binary file added 2024/weekXX/data.xlsx
Binary file not shown.
Loading

0 comments on commit 1829810

Please sign in to comment.