## EDLD 654: Machine Learning for Educational Data Science 

In [1]:
dat1 <- read.csv('../input/llm-detect-ai-generated-text/train_essays.csv')
test <- read.csv('../input/augmented-data-for-llm-detect-ai-generated-text/final_test.csv')

set.seed(12345)

test <- test[sample(1:nrow(test), round(nrow(test) * 0.2)),]


In [2]:
dat1 <- dat1[, !names(dat1) %in% c('id', 'prompt_id')]
colnames(dat1)[colnames(dat1) == "generated"] <- "label"

In [3]:
require(reticulate)
conda_list()
use_condaenv("r-reticulate")
reticulate::conda_install(envname  = 'r-reticulate',
                          packages = 'sentence_transformers',
                          pip      = TRUE)

Loading required package: reticulate



name,python
<chr>,<chr>
base,/root/.local/share/r-miniconda/bin/python
r-reticulate,/root/.local/share/r-miniconda/envs/r-reticulate/bin/python


In [4]:
st <- import('sentence_transformers')
model.name <- 'allenai/longformer-base-4096'
longformer      <- st$models$Transformer(model.name)
pooling_model   <- st$models$Pooling(longformer$get_word_embedding_dimension())
LFmodel <- st$SentenceTransformer(modules = list(longformer,pooling_model))

In [5]:
LFmodel$get_max_seq_length()
LFmodel$get_sentence_embedding_dimension()

In [6]:
Sys.time()
embeddings <- LFmodel$encode(dat1$text,
                            show_progress_bar = TRUE)
Sys.time()

[1] "2023-12-01 18:43:43 UTC"

[1] "2023-12-01 20:14:09 UTC"

In [7]:
dat2 <- read.csv('../input/daigt-data-llama-70b-and-falcon180b/llama_70b_v2.csv')
dat2 <- dat2[, !names(dat2) %in% c('X', 'writing_prompt')]
colnames(dat2)[colnames(dat2) == "generated_text"] <- "text"
colnames(dat2)[colnames(dat2) == "generated"] <- "label"

In [8]:
Sys.time()
embeddings2 <- LFmodel$encode(dat2$text,
                            show_progress_bar = TRUE)
Sys.time()

[1] "2023-12-01 20:14:09 UTC"

[1] "2023-12-01 21:11:36 UTC"

In [9]:
dat <- rbind(dat1, dat2)
train_data <- rbind(embeddings, embeddings2)
df <- as.data.frame(train_data)

In [10]:
dim(df)
head(df)

Unnamed: 0_level_0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,⋯,V759,V760,V761,V762,V763,V764,V765,V766,V767,V768
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
1,-0.01065577,0.17162082,0.07070802,-0.14250818,0.0940149,0.2281031,0.0384746,0.0101683084,0.007156015,0.0080927,⋯,-0.0008820345,-0.05510386,0.02274735,0.0130305253,0.079225592,0.08573294,0.02887944,0.068842202,0.015631704,-0.03130931
2,0.01087508,0.21920277,0.07373214,-0.13705434,0.12080667,0.2224719,0.03767246,0.0205100421,-0.033579458,-0.02234134,⋯,-0.0103599159,-0.01774607,-0.00597649,0.0008034957,0.081369951,0.09313472,0.03644373,0.046994265,-0.002939301,-0.03359382
3,0.02878954,0.15941419,0.11170983,-0.13862039,0.09010795,0.2698942,0.01351254,0.0578485802,-0.019821843,-0.0482122,⋯,-0.015643809,-0.05600847,0.04371329,0.0443126149,0.13832888,0.07420971,0.02502623,-0.006326944,0.041711487,-0.02182001
4,0.03325859,0.23046023,0.10813868,-0.12438165,0.10976613,0.2397555,0.02504187,0.0008743505,0.022798153,0.02452957,⋯,-0.0514836572,-0.01167708,0.07093766,0.0241212081,0.120546617,0.06744061,-0.02056895,0.064800523,0.032108698,-0.01556792
5,0.06590359,0.1144085,0.07370606,-0.13567548,0.01663973,0.1234191,0.00533645,0.0252679978,0.039820533,-0.03309957,⋯,0.0130621456,-0.07256314,0.02431746,0.0094972663,0.092259914,0.06140486,0.14072876,0.024600221,-0.010641599,0.01302812
6,0.01830397,0.07895082,0.15187693,-0.04768516,-0.2325585,0.146431,0.03889984,0.2115037888,0.135829598,-0.04959927,⋯,0.0304254126,-0.1153921,-0.08152495,0.0051791477,-0.002064273,0.03301026,0.14738438,0.086929388,0.003801686,0.06784646


In [11]:
df$text <- dat$text
df$label <- as.factor(dat$label)

In [12]:
write.csv(df, 'df.csv')

In [13]:
require(recipes)
require(dplyr)

predictor_names <- as.character(colnames(df))

embed_names <- colnames(df)[1:768]

non_numeric_cols <- c('text', 'label')

blueprint <- recipe(x = df,
                    vars = c(paste(predictor_names)),
                    roles = c(rep('predictor',769),'outcome')) %>%
                    step_normalize(all_of(embed_names)) %>%
                    step_scale(all_of(setdiff(embed_names, non_numeric_cols))) %>%
                    step_impute_mean(all_predictors(), -all_of(non_numeric_cols))

blueprint

Loading required package: recipes

Loading required package: dplyr


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘recipes’


The following object is masked from ‘package:stats’:

    step




[36m──[39m [1mRecipe[22m [36m──────────────────────────────────────────────────────────────────────[39m



── Inputs 

Number of variables by role

outcome:     1
predictor: 769



── Operations 

[36m•[39m Centering and scaling for: [34mall_of(embed_names)[39m

[36m•[39m Scaling for: [34mall_of(setdiff(embed_names, non_numeric_cols))[39m

[36m•[39m Mean imputation for: [34mall_predictors()[39m, [34m-all_of(non_numeric_cols)[39m



In [14]:
prepare <- prep(blueprint,
                training = df)

prepare



[36m──[39m [1mRecipe[22m [36m──────────────────────────────────────────────────────────────────────[39m



── Inputs 

Number of variables by role

outcome:     1
predictor: 769



── Training information 

Training data contained 2550 data points and no incomplete rows.



── Operations 

[36m•[39m Centering and scaling for: [34mV1[39m, [34mV2[39m, [34mV3[39m, [34mV4[39m, [34mV5[39m, [34mV6[39m, [34mV7[39m, [34mV8[39m, [34mV9[39m, ... | [3mTrained[23m

[36m•[39m Scaling for: [34mV1[39m, [34mV2[39m, [34mV3[39m, [34mV4[39m, [34mV5[39m, [34mV6[39m, [34mV7[39m, [34mV8[39m, [34mV9[39m, [34mV10[39m, [34mV11[39m, [34mV12[39m, ... | [3mTrained[23m

[36m•[39m Mean imputation for: [34mV1[39m, [34mV2[39m, [34mV3[39m, [34mV4[39m, [34mV5[39m, [34mV6[39m, [34mV7[39m, [34mV8[39m, [34mV9[39m, [34mV10[39m, ... | [3mTrained[23m



In [15]:
baked <- bake(prepare, new_data = df)
dim(baked)

In [16]:
write.csv(baked, 'baked.csv')