# Model Predictions

A couple of my features rely on predictions from off-the-shelf models. To keep inference from taking forever, I am running these models on Google Colab so that they can use a GPU.

In [1]:
# Importing libraries
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from tqdm import tqdm
import torch

# Register tqdm with pandas so that `progress_apply` (used for the inference
# runs below) is available and shows a progress bar
tqdm.pandas()

In [2]:
# Getting the data; the path is relative to the notebook's working directory
data = pd.read_csv('preprocessed_data.csv')
# Previewing the first rows as the cell output
data.head()

Unnamed: 0,row_id,essay,word_count,LLM_written,stop_word_count,stop_word_ratio,unique_word_count,unique_word_ratio,count_question,count_exclamation,count_semi,count_colon,grammar_errors
0,1,"Dear State Senator,\n\nI'm writting to you tod...",291,1,137,0.47079,131,0.450172,0,2,0,0,2
1,2,"Uh, hi! So, like, summers are, like, awesome r...",311,1,137,0.440514,121,0.389068,3,4,0,0,11
2,3,"When peoples ask for advices, they sometimes t...",333,1,158,0.474474,155,0.465465,0,1,0,0,6
3,4,I think art edukation is super impotent for ki...,308,1,121,0.392857,130,0.422078,0,0,0,0,48
4,5,I think we should totally switch to renewable ...,307,1,138,0.449511,146,0.47557,0,2,0,0,5


In [3]:
# Picking the inference device: the first CUDA GPU when one is visible to
# torch, otherwise the CPU
if torch.cuda.is_available():
  device = "cuda:0"
else:
  device = "cpu"

## Adding the features from the OpenAI Detector

In [4]:
# Loading the OpenAI detector: tokenizer and classification model come from
# the same checkpoint, so the checkpoint id is written once
detector_checkpoint = "roberta-base-openai-detector"
tokenizer = AutoTokenizer.from_pretrained(detector_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(detector_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base-openai-detector were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
# Putting model on GPU (or leaving it on the CPU when `device` is "cpu").
# The call is the last expression of the cell, so the module repr is displayed.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [6]:
# Defining a function for inference
def detector_pred(essay:str) -> int:
  """Classify one essay with the OpenAI detector, using this dataset's labels.

  Reads the notebook-level `tokenizer`, `model` and `device` at call time, so
  those names must still refer to the detector when this runs (a later
  section of the notebook rebinds `tokenizer` and `model` to the emotion
  model).

  Args:
    essay: Raw essay text. It is tokenized with `truncation=True`, so text
      beyond the tokenizer's truncation limit is not seen by the model.

  Returns:
    1 if the detector's top class is index 0 ("Fake", i.e. LLM written),
    0 if the top class is index 1 ("Real", i.e. student written).

  Raises:
    ValueError: If the loaded model does not emit exactly two logits, which
      means `model` is not the binary detector this function expects.
  """
  # Tokenizing the input essay
  inputs = tokenizer(essay,return_tensors='pt',truncation=True).to(device)

  # Getting the logits
  with torch.no_grad():
    logits = model(**inputs).logits

  # Guard against stale notebook state: with any other number of classes the
  # `1 - argmax` flip below would silently produce values outside {0, 1}.
  num_classes = logits.shape[-1]
  if num_classes != 2:
    raise ValueError(
      f"detector_pred expects a 2-class detector model but got {num_classes} logits; "
      "reload the OpenAI detector model and tokenizer before calling it"
    )

  # The detector has "Real" = class 1 and "Fake" = class 0.
  # My labels are the opposite, 1 = LLM written and 0 = student written,
  # so flip the predicted class index (not the logit value):
  #   argmax 0 = Fake -> 1 - 0 = 1 = LLM written
  #   argmax 1 = Real -> 1 - 1 = 0 = student written
  predicted_class = 1 - logits.argmax(dim=-1).item()
  return predicted_class

In [7]:
# Running the examples through the model, one essay per forward pass.
# `progress_apply` behaves like `apply` but shows a tqdm progress bar.
data['detector_pred'] = data['essay'].progress_apply(detector_pred)

100%|██████████| 49929/49929 [25:33<00:00, 32.56it/s]


In [8]:
# Checking the new feature: `detector_pred` should now be the last column
data.head()

Unnamed: 0,row_id,essay,word_count,LLM_written,stop_word_count,stop_word_ratio,unique_word_count,unique_word_ratio,count_question,count_exclamation,count_semi,count_colon,grammar_errors,detector_pred
0,1,"Dear State Senator,\n\nI'm writting to you tod...",291,1,137,0.47079,131,0.450172,0,2,0,0,2,1
1,2,"Uh, hi! So, like, summers are, like, awesome r...",311,1,137,0.440514,121,0.389068,3,4,0,0,11,0
2,3,"When peoples ask for advices, they sometimes t...",333,1,158,0.474474,155,0.465465,0,1,0,0,6,0
3,4,I think art edukation is super impotent for ki...,308,1,121,0.392857,130,0.422078,0,0,0,0,48,0
4,5,I think we should totally switch to renewable ...,307,1,138,0.449511,146,0.47557,0,2,0,0,5,1


## Adding Features from the Emotion Detection Model

In [9]:
# Loading the emotion classifier. This rebinds the notebook-level `tokenizer`
# and `model`, which `detector_pred` also reads at call time, so the detector
# must not be run again after this cell without reloading it first.
emotion_checkpoint = "j-hartmann/emotion-english-distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(emotion_checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(emotion_checkpoint)

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

In [10]:
# Putting the emotion model on GPU (or leaving it on the CPU when `device` is "cpu").
# The call is the last expression of the cell, so the module repr is displayed.
model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (

In [13]:
# Mapping from index number to class: position i names the class for logit index i.
# NOTE(review): the order presumably mirrors the emotion model's own id-to-label
# mapping (model.config.id2label) - confirm against the model card.
emotions = ['anger','disgust','fear','joy','neutral','sadness','surprise']

In [17]:
# Defining a function for inference
def emotion_detector_pred(essay:str) -> tuple[int,int,int,int]:
  """Classify one essay's dominant emotion and one-hot encode the tracked ones.

  Reads the notebook-level `tokenizer`, `model`, `device` and `emotions` at
  call time, so `tokenizer` and `model` must refer to the emotion classifier
  when this runs.

  Args:
    essay: Raw essay text. It is tokenized with `truncation=True`, so text
      beyond the tokenizer's truncation limit is not seen by the model.

  Returns:
    A 4-tuple of 0/1 flags in the order (anger, surprise, sadness, fear).
    At most one flag is 1; every other predicted emotion gives (0, 0, 0, 0).

  Raises:
    ValueError: If the number of logits does not match `len(emotions)`, which
      means `model` is not the classifier that the `emotions` list describes.
  """
  # Tokenizing the input essay
  inputs = tokenizer(essay,return_tensors='pt',truncation=True).to(device)

  # Getting the logits
  with torch.no_grad():
    logits = model(**inputs).logits

  # Guard against stale notebook state: with a different class count the index
  # lookup below would either mislabel the essay or fail with an IndexError.
  num_classes = logits.shape[-1]
  if num_classes != len(emotions):
    raise ValueError(
      f"emotion_detector_pred expects {len(emotions)} emotion logits but got {num_classes}; "
      "reload the emotion model and tokenizer before calling it"
    )

  # Getting the predicted emotion
  predicted_emotion = emotions[logits.argmax(dim=-1).item()]

  # Only these four emotions become features, in this fixed output order
  tracked_emotions = ('anger','surprise','sadness','fear')
  return tuple(int(predicted_emotion == name) for name in tracked_emotions)

In [18]:
# Inference: one essay per forward pass. The result holds one
# (anger, surprise, sadness, fear) flag tuple per essay.
emotion_rows = data['essay'].progress_apply(emotion_detector_pred)

100%|██████████| 49929/49929 [13:28<00:00, 61.73it/s]


In [19]:
# Adding one 0/1 column per tracked emotion; the column order matches the
# position of each flag in the tuples held in `emotion_rows`
emotion_columns = ['anger_pred','surprise_pred','sadness_pred','fear_pred']
for position, column in enumerate(emotion_columns):
  data[column] = [flags[position] for flags in emotion_rows]

In [20]:
# Checking the final dataset: the four emotion columns should follow `detector_pred`
data.head()

Unnamed: 0,row_id,essay,word_count,LLM_written,stop_word_count,stop_word_ratio,unique_word_count,unique_word_ratio,count_question,count_exclamation,count_semi,count_colon,grammar_errors,detector_pred,anger_pred,surprise_pred,sadness_pred,fear_pred
0,1,"Dear State Senator,\n\nI'm writting to you tod...",291,1,137,0.47079,131,0.450172,0,2,0,0,2,1,0,0,0,0
1,2,"Uh, hi! So, like, summers are, like, awesome r...",311,1,137,0.440514,121,0.389068,3,4,0,0,11,0,0,0,0,0
2,3,"When peoples ask for advices, they sometimes t...",333,1,158,0.474474,155,0.465465,0,1,0,0,6,0,0,0,0,0
3,4,I think art edukation is super impotent for ki...,308,1,121,0.392857,130,0.422078,0,0,0,0,48,0,0,0,1,0
4,5,I think we should totally switch to renewable ...,307,1,138,0.449511,146,0.47557,0,2,0,0,5,1,0,0,0,0


In [21]:
# Saving the final dataset; index=False keeps the DataFrame index out of the CSV
data.to_csv('prepared_training_set.csv',index=False)