In [1]:
import keras.backend as K

print(K.backend())

torch


In [2]:
import nltk
import keras
import pandas as pd
import numpy as np
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.layers import TextVectorization
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

In [3]:
"""
Download dataset SubtaskA.jsonl from 
https://github.com/mbzuai-nlp/M4GT-Bench.

Google drive from repo: https://drive.google.com/drive/folders/1hBgW6sgZfz1BK0lVdUu0bZ4HPKSpOMSY
Direct link to SubtaskA.jsonl: https://drive.google.com/file/d/1zwSfSKe4-0m2td_cP0Sl2LhKtksvtHlf/view
"""
import gdown, os
# DATA_PATH = "C:/Users/Admin/Downloads/SubtaskA.jsonl"
DATA_PATH = "./datasets/SubtaskA.jsonl"
if not os.path.exists(DATA_PATH):
    if not os.path.exists(os.path.dirname(DATA_PATH)):
        os.makedirs(os.path.dirname(DATA_PATH))
    gdown.download("https://drive.google.com/uc?id=1zwSfSKe4-0m2td_cP0Sl2LhKtksvtHlf", DATA_PATH, quiet=False)

# initialize dataframe
df = pd.read_json(DATA_PATH, lines=True)

In [4]:
print(df.source.value_counts())
print()
print(df.model.value_counts())

wikihow      36556
reddit       33999
arxiv        33998
wikipedia    31365
peerread     16891
Name: source, dtype: int64

human      65177
chatGPT    16892
gpt4       14344
davinci    14340
bloomz     14332
dolly      14046
cohere     13678
Name: model, dtype: int64


In [5]:
print(df[df.label == 0].model.value_counts())
print()
print(df[df.label == 1].model.value_counts())

human    65177
Name: model, dtype: int64

chatGPT    16892
gpt4       14344
davinci    14340
bloomz     14332
dolly      14046
cohere     13678
Name: model, dtype: int64


In [6]:
df[['text', 'label']]

Unnamed: 0,text,label
0,We consider a system of many polymers in solut...,1
1,We present a catalog of 66 YSOs in the Serpens...,1
2,Spectroscopic Observations of the Intermediate...,1
3,We present a new class of stochastic Lie group...,1
4,ALMA as the ideal probe of the solar chromosph...,1
...,...,...
152804,The main results presented in this dissertati...,0
152805,Fine-grained sketch-based image retrieval (FG...,0
152806,We present the derivation of the NNLO two-par...,0
152807,The principle of optimism in the face of unce...,0


In [7]:
"""
Pre-process dataframe.
"""
MAX_VOCAB = 10_000
MAX_LENGTH = 200

# init text vectorizer
vectorize_layer = TextVectorization(
    max_tokens=MAX_VOCAB,
    standardize='lower_and_strip_punctuation',
    split='whitespace',
    ngrams=None,
    output_mode='int',
    output_sequence_length=MAX_LENGTH,
    pad_to_max_tokens=False,
    vocabulary=None,
    idf_weights=None,
    sparse=False,
    ragged=False,
    encoding='utf-8',
    name=None,
)

# create vocabulary
vectorize_layer.adapt(df['text'])
vocab = vectorize_layer.get_vocabulary()

In [8]:
# vectorize text data (in subsets for memory constraints)
X = []
y = df['label']

subset_size = df.shape[0] // 100
for i in range(0, df.shape[0], subset_size):
    subset = df['text'][i : i + subset_size]
    X.append(vectorize_layer(subset).cpu())

X = np.vstack(X)
print(X.shape, y.shape)

(152809, 200) (152809,)


In [9]:
"""
LSTM model generator.
"""
EMBEDDING_DIM = 128
N_HIDDEN = 100
OPTIMIZER = 'adam'
N_CLASSES = 2

import torch
class LSTMModel(torch.nn.Module):
    def __init__(self):
        super(LSTMModel, self).__init__()
        self.embeddings = torch.nn.Embedding(
            num_embeddings=MAX_VOCAB,
            embedding_dim=EMBEDDING_DIM,
        )
        self.lstm = torch.nn.LSTM(
            input_size=EMBEDDING_DIM,
            hidden_size=N_HIDDEN,
            num_layers=1,
            batch_first=True,
        )
        self.linear = torch.nn.Linear(N_HIDDEN, N_CLASSES)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, x):
        x = self.embeddings(x)
        x, _ = self.lstm(x)
        x = self.linear(x[:, -1, :])
        x = self.softmax(x)
        return x

def get_model(model_path=None):
    model = LSTMModel()
    if model_path:
        # load existing model
        model.load_state_dict(torch.load(model_path))
    return model

In [27]:
"""
Train and evaluate model.
"""
import torch
from tqdm import tqdm
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# create new model
model = get_model().to(device)

# Convert to torch tensors
if not isinstance(X, torch.Tensor) or not isinstance(y, torch.Tensor):
    X = torch.tensor(X)
    y = torch.tensor(y)
    # Convert to one hot encoding
    y = torch.nn.functional.one_hot(torch.tensor(y), N_CLASSES).to(torch.float32)

# create data splits
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=777,
)

# train the model
dataset = torch.utils.data.TensorDataset(X_train, y_train)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=256, shuffle=True)
# train the model
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-7)
EPOCHS = 5
for epoch in range(1, EPOCHS + 1):
    with tqdm(dataloader, postfix={"epoch": epoch, "loss": 0, "acc": 0}) as pbar:
        for i, (X_batch, y_batch) in enumerate(pbar):
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            model.train()
            model.zero_grad()
            y_pred = model(X_batch)
            loss = loss_fn(y_pred, y_batch)
            loss.backward()
            optimizer.step()
            acc = (y_pred.argmax(dim=1) == y_batch.argmax(dim=1)).float().mean()
            if i % 10 == 0:
                # pbar.set_postfix({"epoch": epoch, "loss": loss.item(), "acc": acc.item()})
                pbar.set_postfix_str(f"epoch={epoch}, loss={loss.item():.4f}, acc={acc.item():.4f}")

# final evaluation of the model
model.eval()
dataset = torch.utils.data.TensorDataset(X_test, y_test)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False)
correct, total = 0, 0
for X_batch, y_batch in dataloader:
    X_batch, y_batch = X_batch.to(device), y_batch.to(device)
    y_pred = model(X_batch)
    correct += (y_pred.argmax(dim=1) == y_batch.argmax(dim=1)).sum().item()
    total += y_batch.size(0)
accuracy = correct / total
# report results
print("Accuracy: %.2f%%" % (accuracy * 100))

# save model
torch.save(model.state_dict(), "model.pth")

100%|██████████| 508/508 [00:20<00:00, 25.02it/s, epoch=1, loss=0.6294, acc=0.6758]
100%|██████████| 508/508 [00:20<00:00, 25.33it/s, epoch=2, loss=0.4775, acc=0.8242]
100%|██████████| 508/508 [00:20<00:00, 25.32it/s, epoch=3, loss=0.4849, acc=0.8164]
100%|██████████| 508/508 [00:20<00:00, 24.61it/s, epoch=4, loss=0.4167, acc=0.8984]
100%|██████████| 508/508 [00:22<00:00, 22.52it/s, epoch=5, loss=0.4015, acc=0.9102]


Accuracy: 87.63%


In [33]:
# Example human

We consider a system of many polymers in solution that interact via an external force that is applied to each pair of polymers. We study the statistical equilibrium of this system, and find that the polymers form clusters whose sizes are given by a power law distribution. This is in contrast to the traditional picture of polymers in solution, where the thermodynamic equilibrium is described by a mean-field theory based on the solution of the mean-field Boltzmann equation. We show that this difference is due to a breakdown of the assumptions that were used to derive the mean-field theory. In particular, we show that the polymer-polymer interactions in the system considered are non-local, and are thus not described by the mean-field theory. We then derive a new theory for the statistical equilibrium in the presence of an external force, which includes a correction to the mean-field theory. The new theory predicts that the polymer clusters become less dense as the external force increases

KeyError: -1