In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only side effect).
# NOTE(review): no later cell visibly reads from /content/drive (the CSV is
# loaded via a relative path) — confirm the mount is still required.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install catboost



In [None]:
!pip install datasets



In [None]:
!pip install lightgbm



In [None]:
import sys
import gc

import pandas as pd
from sklearn.model_selection import StratifiedKFold
import numpy as np
from sklearn.metrics import roc_auc_score
import numpy as np
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from tokenizers import (
    decoders,
    models,
    normalizers,
    pre_tokenizers,
    processors,
    trainers,
    Tokenizer,
)

from datasets import Dataset
from tqdm.auto import tqdm
from transformers import PreTrainedTokenizerFast

from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier

In [None]:
# Load the raw corpus from the working directory; the file is expected to
# carry at least the `text` and `label` columns used further down.
dataset = pd.read_csv('dataset.csv')

In [None]:
# Preview the first rows of the raw frame.
dataset.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [None]:
# Keep the first occurrence of every distinct essay text and renumber the
# rows 0..n-1 so the index has no gaps after de-duplication.
df = (
    dataset
    .drop_duplicates(subset=['text'])
    .reset_index(drop=True)
)

In [None]:
# Preview the de-duplicated frame.
df.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [None]:
# Discard metadata columns that are not used as model input.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after the columns are already gone no longer
# raises KeyError.
df = df.drop(columns=['prompt_name', 'source', 'RDizzl3_seven'], errors='ignore')

In [None]:
# Preview the frame after the metadata columns were removed.
df.head()

Unnamed: 0,text,label
0,Phones\n\nModern humans today are always on th...,0
1,This essay will explain if drivers should or s...,0
2,Driving while the use of cellular devices\n\nT...,0
3,Phones & Driving\n\nDrivers should not be able...,0
4,Cell Phone Operation While Driving\n\nThe abil...,0


In [None]:
# (rows, columns) of the cleaned frame.
df.shape

(44868, 2)

In [None]:
import sklearn
# Randomise row order. A fixed seed makes the order (and therefore every
# downstream artefact derived from it) reproducible across kernel restarts.
df = sklearn.utils.shuffle(df, random_state=42)

In [None]:
# Sanity check: shuffling must not change the number of rows or columns.
df.shape

(44868, 2)

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

In [None]:
!pip install sentence-transformers scikit-learn catboost lightgbm




In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
import numpy as np

In [None]:
# Materialise plain Python lists for the encoder and the classifiers:
# every text is coerced to `str`, labels are taken as stored.
text_column = df['text'].astype(str)
label_column = df['label']
texts = text_column.to_list()
labels = label_column.to_list()

In [None]:
from transformers import AutoTokenizer, AutoModel
import torch

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
from transformers import BertTokenizer, BertModel

# Load the matching tokenizer and encoder for the same pretrained checkpoint,
# place the encoder on the selected device and switch it to inference mode.
_BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
bert_model = BertModel.from_pretrained(_BERT_CHECKPOINT).to(device)
bert_model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    Args:
        model_output: object exposing ``last_hidden_state`` with layout
            (batch_size, seq_len, hidden_size).
        attention_mask: (batch_size, seq_len) tensor; non-zero entries mark
            positions that contribute to the mean.

    Returns:
        Tensor of shape (batch_size, hidden_size) with one pooled vector per
        sequence. A sequence whose mask is entirely zero yields a zero vector
        instead of NaN.
    """
    token_embeddings = model_output.last_hidden_state  # (batch_size, seq_len, hidden_size)
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * mask).sum(1)
    # Clamp the token count so a fully masked row divides by a tiny positive
    # number rather than by zero.
    counts = torch.clamp(mask.sum(1), min=1e-9)
    return summed / counts

In [None]:
# Imported under a private alias on purpose: when the notebook is run top to
# bottom, the bare name `Dataset` is still bound to the HuggingFace
# `datasets.Dataset` here, which is not the base class a torch DataLoader
# dataset should derive from.
from torch.utils.data import Dataset as _TorchDataset


class TextDataset(_TorchDataset):
    """Map-style torch dataset that serves raw strings by position.

    Args:
        texts: indexable, sized collection of strings.
    """

    def __init__(self, texts):
        self.texts = texts

    def __len__(self):
        """Return the number of stored texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the text stored at position ``idx``."""
        return self.texts[idx]

def encode_texts(texts, batch_size=16):
    """Embed texts with the notebook-level BERT model using mean pooling.

    Relies on the module-level ``tokenizer``, ``bert_model`` and ``device``.

    Args:
        texts: list of strings to embed.
        batch_size: number of texts tokenised and encoded per forward pass.

    Returns:
        NumPy array with one row per input text, in input order. For empty
        input an array of shape (0, hidden_size) is returned.
    """
    # Function-scope import: the notebook imports DataLoader only in a later
    # cell, so this keeps the function usable regardless of cell order.
    from torch.utils.data import DataLoader

    if len(texts) == 0:
        # np.vstack([]) raises ValueError; hand back an empty matrix instead.
        # float32 presumably matches the model's output dtype — TODO confirm.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    # shuffle=False keeps the output rows aligned with the input order.
    loader = DataLoader(TextDataset(texts), batch_size=batch_size, shuffle=False)

    embeddings = []
    for batch in tqdm(loader, desc="Encoding with BERT"):
        # Inputs longer than 512 tokens are truncated by the tokenizer.
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt', max_length=512)
        encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
        with torch.no_grad():
            model_output = bert_model(**encoded_input)
        sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
        # Move each batch to host memory immediately so results do not pile up
        # on the accelerator.
        embeddings.append(sentence_embeddings.cpu().numpy())
    return np.vstack(embeddings)

In [None]:
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

In [None]:
# Embed every essay with BERT. The recorded run needed roughly 28 minutes for
# 2805 batches on CUDA, so consider caching X to disk before re-running.
X = encode_texts(texts)

Encoding with BERT: 100%|██████████| 2805/2805 [27:51<00:00,  1.68it/s]


In [None]:
# Rescale each embedding dimension using min/max computed over ALL rows of X.
# NOTE(review): those statistics include rows that later land in the test
# split, so X_scaled should not be relied on for a strictly held-out evaluation.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split the raw embeddings first, then fit the scaler on the training part
# only, so no test-set statistics leak into preprocessing.
# stratify=labels keeps the class ratio identical in both parts (the classes
# are imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.3, random_state=42, stratify=labels
)
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
# Test values may fall slightly outside [0, 1]; that is expected.
X_test = scaler.transform(X_test)

In [None]:
from sklearn.ensemble import VotingClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier, RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

In [None]:
# Gradient-boosted trees with row/column subsampling and L1/L2 regularisation.
# `use_label_encoder` was removed from the call: the installed XGBoost reports
# it as an unused parameter and emits a warning on every fit.
xgb = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.5,
    reg_lambda=1,
    eval_metric='logloss',
)

In [None]:
# Extra-trees ensemble: 300 depth-limited trees, seeded for repeatability.
et = ExtraTreesClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=4,
    random_state=42,
)

In [None]:
# Bagging ensemble: each of the 100 members sees 80% of the rows and 80% of
# the features; seeded for repeatability.
bag = BaggingClassifier(
    n_estimators=100,
    max_samples=0.8,
    max_features=0.8,
    random_state=42,
)

In [None]:
# AdaBoost with 200 boosting rounds and a shrunken learning rate; seeded.
ada = AdaBoostClassifier(
    n_estimators=200,
    learning_rate=0.5,
    random_state=42,
)

In [None]:
# Random forest: 300 depth-limited trees, seeded for repeatability.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=10,
    min_samples_split=4,
    random_state=42,
)

In [None]:
# CatBoost with a small learning rate and L2 leaf regularisation; verbose=0
# silences the per-iteration training log.
cat = CatBoostClassifier(
    iterations=500,
    depth=6,
    learning_rate=0.03,
    l2_leaf_reg=5,
    verbose=0,
)

In [None]:
# Linear model trained with SGD on the logistic loss (so it can provide the
# class probabilities that soft voting needs), elastic-net regularised.
sgd = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    alpha=1e-4,
    l1_ratio=0.15,
    learning_rate='optimal',
    early_stopping=True,
    max_iter=2000,
    random_state=42,
)

In [None]:
# LightGBM with many small steps (1000 trees, learning rate 0.01).
# subsample_freq=1 is required for `subsample` to take effect: LightGBM only
# performs row bagging when the bagging frequency is non-zero, so the original
# subsample=0.8 was silently ignored.
lgbm = LGBMClassifier(
    n_estimators=1000,
    learning_rate=0.01,
    num_leaves=32,
    max_depth=8,
    min_child_samples=30,
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
)

In [None]:
# Combine the eight base learners; soft voting averages their predicted class
# probabilities, so every member must be able to produce probabilities.
_voting_members = [
    ('catboost', cat),
    ('sgd', sgd),
    ('lgbm', lgbm),
    ('randomforest', rf),
    ('adaboost', ada),
    ('bagging', bag),
    ('extratree', et),
    ('xgb', xgb),
]
ensemble = VotingClassifier(estimators=_voting_members, voting='soft')  # Use 'hard' for majority vote

In [None]:
# Fit the voting ensemble on the training split, then predict class labels
# for the held-out split.
ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)



[LightGBM] [Info] Number of positive: 12232, number of negative: 19175
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.429788 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 195840
[LightGBM] [Info] Number of data points in the train set: 31407, number of used features: 768
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.389467 -> initscore=-0.449552
[LightGBM] [Info] Start training from score -0.449552


Parameters: { "use_label_encoder" } are not used.



In [None]:
# Per-class precision / recall / F1 on the held-out split. print() is needed
# here because the report is a pre-formatted multi-line string.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99      8196
           1       0.99      0.98      0.99      5265

    accuracy                           0.99     13461
   macro avg       0.99      0.99      0.99     13461
weighted avg       0.99      0.99      0.99     13461



In [None]:
# Accuracy of the ensemble predictions on the held-out split.
print(accuracy_score(y_test, y_pred))

0.9913082237575217


In [None]:
# Precision of the ensemble predictions on the held-out split
# (library defaults for averaging / positive label).
print(precision_score(y_test, y_pred))

0.9934815950920245


In [None]:
# Recall of the ensemble predictions on the held-out split
# (library defaults for averaging / positive label).
print(recall_score(y_test, y_pred))

0.9842355175688509


In [None]:
# F1 score of the ensemble predictions on the held-out split
# (library defaults for averaging / positive label).
print(f1_score(y_test, y_pred))

0.9888369430397863
