In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

: 

In [11]:
import torch
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [4]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
import pandas as pd

df = pd.read_csv('/kaggle/input/aiiscoming2/LLM.csv')
label_mapping = {'student': 0, 'ai': 1}
df = df.dropna(subset=['Label'])
df['Label'] = df['Label'].map(label_mapping).astype('int64')
df.rename(columns={'Text': 'content', 'Label': 'label'}, inplace=True)
train_df, eval_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['content'], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)
trainer.train()

def save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path):
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(tokenizer_path)
    print("Model and tokenizer saved.")

model_path = './bert_finetuned_model'
tokenizer_path = './bert_tokenizer'
save_model_and_tokenizer(model, tokenizer, model_path, tokenizer_path)

def load_model_and_tokenizer(model_path, tokenizer_path):
    model = BertForSequenceClassification.from_pretrained(model_path)
    tokenizer = BertTokenizer.from_pretrained(tokenizer_path)
    return model, tokenizer

model, tokenizer = load_model_and_tokenizer(model_path, tokenizer_path)
labels = {0: "student", 1: "ai"}

def predict_text_category(dialogue, model, tokenizer):
    inputs = tokenizer(dialogue, return_tensors='pt', truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return labels[predicted_class]


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112557266665135, max=1.0…

Epoch,Training Loss,Validation Loss
1,No log,0.001422
2,No log,0.000486
3,No log,0.0003
4,No log,0.000234
5,0.017400,0.000215


Model and tokenizer saved.
The predicted label for the given text is: ai


In [13]:
text = "hello beautiful wanna go on date"
predicted_label = predict_text_category(text, model, tokenizer)
print(f"The predicted label for the given text is: {predicted_label}")

The predicted label for the given text is: student


In [12]:
ai_generated_text = "Blockchain is revolutionizing education, providing personalized learning experiences to students all over the world."
predicted_label = predict_text_category(ai_generated_text, model, tokenizer)
print(f"The predicted label for the given text is: {predicted_label}")

The predicted label for the given text is: ai


Using device: cuda


(1102, 2)

The predicted label for the given text is: student
