In [1]:
library(tidyverse)
library(haven)
library(dplyr)
library(scales)
library(tidyr)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.4     [32m✔[39m [34mreadr    [39m 2.1.5
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.1
[32m✔[39m [34mggplot2  [39m 3.5.2     [32m✔[39m [34mtibble   [39m 3.3.0
[32m✔[39m [34mlubridate[39m 1.9.4     [32m✔[39m [34mtidyr    [39m 1.3.1
[32m✔[39m [34mpurrr    [39m 1.1.0     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors

Attaching package: ‘scales’


The following object is masked from ‘package:purrr’:

    discard


The following object is masked from ‘package:readr’:

    col_factor



In [2]:
my_data <- read.csv("censusdata.csv")

In [3]:
columns_data <- colnames(my_data)
columns_data

In [4]:
my_data <- my_data |> select(NOC21)
head(my_data)

Unnamed: 0_level_0,NOC21
Unnamed: 0_level_1,<int>
1,88
2,14
3,99
4,99
5,99
6,99


In [5]:
my_data <- my_data |> filter(NOC21 != 99 & NOC21 != 88, na.rm = TRUE)
head(my_data)

Unnamed: 0_level_0,NOC21
Unnamed: 0_level_1,<int>
1,14
2,25
3,1
4,3
5,6
6,22


In [6]:
total_obs <- my_data |> nrow()
total_obs

In [25]:
summary_stats <- my_data |> group_by(NOC21) |> 
                 summarise(count = n())
#proportion = count/total_obs, 
                           #std_dev = proportion*(1-proportion)/count)
head(summary_stats)

stats_table <- summary_stats |> summarise(mean = mean(count), standard_deviation = sd(count),
                                         max = max(count))
stats_table

NOC21,count
<int>,<int>
1,359
2,3077
3,1144
4,738
5,1190
6,1426


mean,standard_deviation,max
<dbl>,<dbl>,<int>
1145.192,731.9563,3077


In [8]:
result <- summary_stats %>%
  mutate(group = case_when(
    NOC21 %in% c(1, 2, 3, 4, 5, 6, 14) ~ "Administrative + Financial",
    NOC21 %in% c(7, 8, 9, 10, 11, 15, 21) ~ "Technical/Sciences",
    NOC21 %in% c(17, 18, 19, 20, 22) ~ "Sales/Trades",
    NOC21 %in% c(23, 24, 25, 26) ~ "Labourers",
    TRUE ~ "Other"  # Everything else
  )) %>%
  group_by(group) %>%
  summarise(total_count = sum(count),
           proportion = total_count/total_obs, 
           std_dev = proportion*(1-proportion)/total_count)
result

group,total_count,proportion,std_dev
<chr>,<int>,<dbl>,<dbl>
Administrative + Financial,8562,0.2875567,2.392757e-05
Labourers,3091,0.1038119,3.009868e-05
Other,3326,0.1117045,2.98336e-05
Sales/Trades,7833,0.263073,2.474986e-05
Technical/Sciences,6963,0.2338539,2.573119e-05


In [9]:
final_dummified_version <- result |> 
    mutate(dummy_variable = case_when (
        group %in% c("Administrative + Financial", "Other",	"Sales/Trades", "Technical/Sciences") ~
        1,
        TRUE ~ 0
    )) |> group_by(dummy_variable) |> summarise(new_total_count = sum(total_count),
           proportion = new_total_count/total_obs, 
           std_dev = proportion*(1-proportion)/new_total_count)
final_dummified_version

dummy_variable,new_total_count,proportion,std_dev
<dbl>,<int>,<dbl>,<dbl>
0,3091,0.1038119,3.009868e-05
1,26684,0.8961881,3.486547e-06


In [10]:
ai_data <- read.csv("AIusage.csv") |> rename(C1 = Business.characteristics)
head(ai_data)

Unnamed: 0_level_0,REF_DATE,GEO,DGUID,C1,Use.of.artificial.intelligence..AI..by.businesses.or.organizations.in.producing.goods.or.delivering.services.over.the.last.12.months,UOM,UOM_ID,SCALAR_FACTOR,SCALAR_ID,VECTOR,COORDINATE,VALUE,STATUS,SYMBOL,TERMINATED,DECIMALS
Unnamed: 0_level_1,<int>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<dbl>,<chr>,<lgl>,<lgl>,<int>
1,2025,Canada,2021A000011124,"North American Industry Classification System (NAICS), all industries","Yes, business used AI for producing goods or delivering services over the last 12 months",Percent,239,units,0,v1671082146,1.1.1,12.2,A,,,1
2,2025,Canada,2021A000011124,"Agriculture, forestry, fishing and hunting [11]","Yes, business used AI for producing goods or delivering services over the last 12 months",Percent,239,units,0,v1671082366,1.2.1,1.8,A,,,1
3,2025,Canada,2021A000011124,"Mining, quarrying, and oil and gas extraction [21]","Yes, business used AI for producing goods or delivering services over the last 12 months",Percent,239,units,0,v1671082586,1.3.1,5.6,A,,,1
4,2025,Canada,2021A000011124,Construction [23],"Yes, business used AI for producing goods or delivering services over the last 12 months",Percent,239,units,0,v1671082806,1.4.1,3.6,A,,,1
5,2025,Canada,2021A000011124,Manufacturing [31-33],"Yes, business used AI for producing goods or delivering services over the last 12 months",Percent,239,units,0,v1671083026,1.5.1,13.1,B,,,1
6,2025,Canada,2021A000011124,Wholesale trade [41],"Yes, business used AI for producing goods or delivering services over the last 12 months",Percent,239,units,0,v1671083246,1.6.1,10.6,B,,,1


In [20]:
ai_data <- ai_data %>% select(C1, VALUE) |>
  filter(C1 %in% c('Agriculture, forestry, fishing and hunting [11]',
                   'Mining, quarrying, and oil and gas extraction [21]',
                   'Construction [23]',
                   'Manufacturing [31-33]',
                   'Wholesale trade [41]',
                   'Retail trade [44-45]',
                   'Transportation and warehousing [48-49]',
                   'Information and cultural industries [51]',
                   'Finance and insurance [52]',
                   'Real estate and rental and leasing [53]',
                   'Professional, scientific and technical services [54]',
                   'Administrative and support, waste management and remediation services [56]',
                   'Health care and social assistance [62]',
                   'Arts, entertainment and recreation [71]',
                   'Accommodation and food services [72]',
                   'Other services (except public administration) [81]'))
ai_data

stats_table <- summarise(ai_data, mean = mean(VALUE), sd = sd(VALUE), max = max(VALUE))
stats_table

C1,VALUE
<chr>,<dbl>
"Agriculture, forestry, fishing and hunting [11]",1.8
"Mining, quarrying, and oil and gas extraction [21]",5.6
Construction [23],3.6
Manufacturing [31-33],13.1
Wholesale trade [41],10.6
Retail trade [44-45],6.6
Transportation and warehousing [48-49],1.8
Information and cultural industries [51],35.6
Finance and insurance [52],30.6
Real estate and rental and leasing [53],11.8


mean,sd,max
<dbl>,<dbl>,<dbl>
12.675,10.97278,35.6


In [12]:
result <- ai_data %>%
  mutate(group = case_when(
    C1 %in% c('Finance and insurance [52]', 
              'Administrative and support, waste management and remediation services [56]')
      ~ "Administrative + Financial",
    C1 %in% c('Professional, scientific and technical services [54]',
             'Health care and social assistance [62]') ~ "Technical/Sciences",
    C1 %in% c('Wholesale trade [41]',
              'Retail trade [44-45]',
              'Accommodation and food services [72]') ~ "Sales/Trades",
    C1 %in% c('Agriculture, forestry, fishing and hunting [11]',
              'Mining, quarrying, and oil and gas extraction [21]',
              'Construction [23]',
              'Manufacturing [31-33]') ~ "Labourers",
    TRUE ~ "Other"  # Everything else
  )) %>%
  group_by(group) %>%
  summarise(total_percentage = sum(VALUE), 
           mean = mean(VALUE),
           std_dev = sd(VALUE))
result

group,total_percentage,mean,std_dev
<chr>,<dbl>,<dbl>,<dbl>
Administrative + Financial,40.4,20.2,14.707821
Labourers,24.1,6.025,4.965464
Other,70.5,14.1,12.894573
Sales/Trades,18.7,6.233333,4.561067
Technical/Sciences,49.1,24.55,10.111627


In [13]:
final_dummified_version <- ai_data %>%
  mutate(group = case_when(
    C1 %in% c('Finance and insurance [52]', 
              'Administrative and support, waste management and remediation services [56]')
      ~ "Administrative + Financial",
    C1 %in% c('Professional, scientific and technical services [54]',
             'Health care and social assistance [62]') ~ "Technical/Sciences",
    C1 %in% c('Wholesale trade [41]',
              'Retail trade [44-45]',
              'Accommodation and food services [72]') ~ "Sales/Trades",
    C1 %in% c('Agriculture, forestry, fishing and hunting [11]',
              'Mining, quarrying, and oil and gas extraction [21]',
              'Construction [23]',
              'Manufacturing [31-33]') ~ "Labourers",
    TRUE ~ "Other"
  )) %>%
  mutate(dummy_variable = case_when(
    group %in% c("Administrative + Financial", "Other", "Sales/Trades", "Technical/Sciences") ~ 1,
    TRUE ~ 0
  )) %>%
  group_by(dummy_variable) %>%
  summarise(new_total_percentage = sum(VALUE),
            mean = mean(VALUE),
            std_dev = sd(VALUE))
final_dummified_version

dummy_variable,new_total_percentage,mean,std_dev
<dbl>,<dbl>,<dbl>,<dbl>
0,24.1,6.025,4.965464
1,178.7,14.89167,11.662719
