# You are bot!
## Kaggle competition [(competition page)](https://www.kaggle.com/competitions/you-are-bot/overview)

In [None]:
import json
import math
from collections import Counter
from typing import List

import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from catboost import CatBoostClassifier
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from tqdm.auto import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# Apply seaborn's "darkgrid" theme to every figure drawn later in the notebook.
sns.set_theme(style="darkgrid")

In [None]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

# Raw data

## Loading and preprocessing

In [None]:
# Load the raw dialogs of both splits.
# The test split has to come from its own file: reading train.json twice would
# silently make every "test" prediction on the training dialogs.
# NOTE(review): assumes the competition ships the test split as test.json next
# to train.json - confirm against the competition data listing.
with open("/kaggle/input/you-are-bot/train.json", encoding="utf-8") as train_json:
    train_data = json.load(train_json)
with open("/kaggle/input/you-are-bot/test.json", encoding="utf-8") as test_json:
    test_data = json.load(test_json)
data = {"train": train_data, "test": test_data}

In [None]:
# Flatten {dialog id: [message, ...]} into one record per message for each split,
# then turn each split into a dataframe.
flattened_data = {
    split: [
        {
            "id": conv_id,
            "message": msg["message"],
            "text": msg["text"],
            "participant_index": msg["participant_index"],
        }
        for conv_id, messages in data[split].items()
        for msg in messages
    ]
    for split in ("train", "test")
}
train_df = pd.DataFrame(flattened_data["train"])
test_df = pd.DataFrame(flattened_data["test"])

In [None]:
# Per-participant key "<dialog id>_<participant index>", added in place to both splits.
for frame in (train_df, test_df):
    frame["ID"] = frame["id"] + "_" + frame["participant_index"]

In [None]:
# (rows, columns) of the train and test message frames.
tuple(frame.shape for frame in (train_df, test_df))

In [None]:
# Labels, re-keyed with the same "<dialog id>_<participant index>" ID used by the
# message frames; the two source columns are dropped once the key is built.
y_train = pd.read_csv("/kaggle/input/you-are-bot/ytrain.csv")
participant_key = y_train["dialog_id"] + "_" + y_train["participant_index"].astype(str)
y_train = y_train.assign(ID=participant_key).drop(
    columns=["participant_index", "dialog_id"]
)

In [None]:
# Attach the label columns to every training message (left join on the participant key).
train_df = train_df.merge(y_train, on="ID", how="left")

In [None]:
# Order every dialog's messages by their "message" value, dialogs ordered by id.
# A single two-key sort yields the same row order as groupby("id").apply(sort_values)
# while avoiding groupby.apply on the grouping column, which newer pandas
# deprecates (the "id" column may no longer be passed through to the result).
train_df = train_df.sort_values(["id", "message"]).reset_index(drop=True)
test_df = test_df.sort_values(["id", "message"]).reset_index(drop=True)

In [None]:
# Preview the first five training rows.
train_df.head(n=5)

In [None]:
# Preview the first five test rows.
test_df.head(n=5)

In [None]:
# Collapse the per-message rows into one row per dialog with list-valued columns;
# these grouped frames are what gets converted into datasets.

grouped_train_df = train_df.groupby("id", as_index=False).agg(
    dict.fromkeys(("text", "is_bot", "participant_index"), list)
)
grouped_test_df = test_df.groupby("id", as_index=False).agg(
    dict.fromkeys(("text", "participant_index"), list)
)

In [None]:
# Wrap both grouped frames in a DatasetDict keyed by split name.
dataset = datasets.DatasetDict(
    {
        split_name: datasets.Dataset.from_pandas(grouped_frame)
        for split_name, grouped_frame in (
            ("train", grouped_train_df),
            ("test", grouped_test_df),
        )
    }
)
dataset

## Visualising valuable distributions

In [None]:
# Class balance over messages, and the histogram of dialog lengths
# (how many dialogs contain k messages).
bots_cnt = Counter(train_df["is_bot"])
messages_per_dialog = Counter(Counter(train_df["id"]).values())

In [None]:
# Two side-by-side bar charts: class balance (left) and dialog lengths (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
class_ax, length_ax = axes

sns.barplot(x=list(bots_cnt), y=list(bots_cnt.values()), ax=class_ax)
class_ax.set_title("Samples per class (bot/human)")
class_ax.set_xlabel("Role")
class_ax.set_ylabel("Number of Samples")
class_ax.tick_params(axis="x")

sns.barplot(
    x=list(messages_per_dialog),
    y=list(messages_per_dialog.values()),
    ax=length_ax,
)
length_ax.set_title("Number of messages per dialog")
length_ax.set_xlabel("Number of messages")
length_ax.set_ylabel("Number of Samples")
length_ax.tick_params(axis="x")

plt.tight_layout()
plt.show()

## Extracting features

Loading the language model and tokenizer that will be used to calculate perplexities

In [None]:
# GPT-2 checkpoint used for perplexity scoring: tokenizer first, then the causal LM
# (configured to also return hidden states).
ppl_checkpoint = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(ppl_checkpoint, trust_remote_code=True)
model_p = AutoModelForCausalLM.from_pretrained(
    ppl_checkpoint,
    trust_remote_code=True,
    output_hidden_states=True,
)

In [None]:
def calculate_perplexity(
    message: str,
    context: str,
    model: AutoModelForCausalLM,
    tokenizer: AutoTokenizer,
    device: str,
    max_length: int = 512,
) -> float:
    """Calculates the perplexity of the tokens of a message given its context.

    The context and the message are joined with a single space and scored in one
    forward pass; context tokens are masked out of the loss so that only message
    tokens contribute.

    Args:
        message (str): message for calculating perplexity
        context (str): context of a message (may be empty). Perplexity is not
            calculated for tokens of the context!
        model (AutoModelForCausalLM): model for calculating perplexities
        tokenizer (AutoTokenizer): model's tokenizer
        device (str): where to run the model, "cuda" or "cpu"
        max_length (int, optional): maximum number of tokens fed to the model.
            When the joined input is longer, the OLDEST tokens are dropped so the
            message itself is still scored. Defaults to 512.

    Returns:
        float: perplexity of the message, or NaN when no message token can be
            scored (e.g. empty input, or a single-token input with no context)
    """
    full_input = f"{context} {message}" if context != "" else message
    input_ids = tokenizer(full_input, return_tensors="pt")["input_ids"]

    # Measure the context WITHOUT the joining space. With GPT-2 style byte-level
    # BPE the space is merged into the first message token, so tokenizing
    # "context " would count one token too many and mask the first message token.
    if context != "":
        context_length = tokenizer(context, return_tensors="pt")["input_ids"].shape[1]
    else:
        context_length = 0

    # Keep the tail of an over-long input: truncating on the right would cut off
    # the message being scored and leave nothing but masked context.
    overflow = input_ids.shape[1] - max_length
    if overflow > 0:
        input_ids = input_ids[:, overflow:]
        context_length = max(context_length - overflow, 0)

    # Hugging Face causal LMs shift the labels internally, so position 0 is never
    # a prediction target. If nothing is left to score the loss would be NaN
    # anyway; return it directly (this also avoids a forward pass on empty input).
    if input_ids.shape[1] - max(context_length, 1) < 1:
        return float("nan")

    input_ids = input_ids.to(device)
    labels = input_ids.clone()
    labels[:, :context_length] = -100  # -100 = ignored by the loss

    model = model.to(device)
    with torch.no_grad():
        outputs = model(input_ids=input_ids, labels=labels)
        loss = outputs.loss.item()

    # The loss is the mean negative log-likelihood of the scored tokens.
    return math.exp(loss)

In [None]:
def run_perplexities(dataset: datasets.Dataset) -> List[List[float]]:
    """Calculates perplexities for each message of every dialog in a dataset.

    Every message is scored with all previous messages of the same dialog
    (joined with single spaces) as its context. Scoring uses the notebook-level
    `model_p`, `tokenizer` and `device`.

    Args:
        dataset (datasets.Dataset): dataset with dialogs; each sample's "text"
            field is iterated as the ordered messages of one dialog

    Returns:
        List[List[float]]: one list per dialog, holding the perplexity of each
            of its messages in order
    """
    perplexities = []
    for sample in tqdm(dataset):
        context = ""
        sample_perplexities = []
        for message in sample["text"]:
            sample_perplexities.append(
                calculate_perplexity(message, context, model_p, tokenizer, device)
            )
            # Extend the context with the message just scored; strip() removes
            # the leading space added before the very first message.
            context = f"{context} {message}".strip()
        perplexities.append(sample_perplexities)
    return perplexities

In [None]:
# Score every message of every training dialog.
train_perplexities = run_perplexities(dataset=dataset["train"])

In [None]:
# Dialog-level training frame with the per-message perplexities attached.
ppl_train_df = dataset["train"].to_pandas().assign(perplexities=train_perplexities)
ppl_train_df.head(n=5)

In [None]:
def get_features(df: pd.DataFrame, stage="train") -> dict:
    """Extracts features (mean perplexity, min perplexity, perplexity variance,
    number of messages, joined text of messages) for each user in a dialog.

    Every dialog row yields exactly two output entries, for participants 0 and 1,
    with the ID "<dialog id>_<participant>". NaN perplexities are excluded from
    the statistics; a participant without any valid perplexity gets NaN
    statistics.

    Args:
        df (pd.DataFrame): dataframe with one dialog per row. Reads the columns
            "id", "participant_index", "perplexities" and "text" (plus "is_bot"
            when stage is "train"); the list-valued columns are aligned per message.
        stage (str, optional): "train" to also emit the "is_bot" label column;
            any other value (e.g. "val") omits it. Defaults to "train".

    Returns:
        dict: column name -> list of values, ready for pd.DataFrame.from_dict.
            In the "train" stage "is_bot" is NaN for a participant that has no
            message in the dialog (no label can be looked up for them).
    """
    with_labels = stage == "train"

    columns = ["ID", "mean_ppl", "min_ppl", "variance_ppl", "num_messages", "text"]
    if with_labels:
        columns.insert(1, "is_bot")
    data_with_ppl = {column: [] for column in columns}

    for _, row in df.iterrows():
        if with_labels:
            # Key the labels by int so that both "0"/"1" and 0/1 participant
            # indices resolve to the same entry.
            is_user_bot = {
                int(participant): label
                for participant, label in zip(row["participant_index"], row["is_bot"])
            }

        ppl_per_user = {0: [], 1: []}
        text_per_user = {0: [], 1: []}
        for participant, perplexity, text in zip(
            row["participant_index"], row["perplexities"], row["text"]
        ):
            user = int(participant)
            # NaN marks a message that could not be scored; it still counts as
            # a message and contributes its text.
            if not np.isnan(perplexity):
                ppl_per_user[user].append(perplexity)
            text_per_user[user].append(text)

        for participant, user_ppls in ppl_per_user.items():
            if user_ppls:
                min_ppl = np.min(user_ppls)
                mean_ppl = np.mean(user_ppls)
                variance_ppl = np.var(user_ppls)
            else:
                min_ppl = mean_ppl = variance_ppl = np.nan

            user_texts = text_per_user[participant]

            data_with_ppl["ID"].append(row["id"] + "_" + str(participant))
            data_with_ppl["mean_ppl"].append(mean_ppl)
            data_with_ppl["min_ppl"].append(min_ppl)
            data_with_ppl["variance_ppl"].append(variance_ppl)
            data_with_ppl["num_messages"].append(len(user_texts))
            data_with_ppl["text"].append(" ".join(user_texts))
            if with_labels:
                # A participant who never wrote anything has no label here;
                # emit NaN instead of failing with a KeyError.
                data_with_ppl["is_bot"].append(is_user_bot.get(participant, np.nan))
    return data_with_ppl

In [None]:
# Per-participant training features; rows with any missing value are dropped.
train_features = get_features(ppl_train_df)
ppl_train_df = pd.DataFrame.from_dict(train_features).dropna()
ppl_train_df.head(n=5)

### Visualizing extracted perplexity distributions

In [None]:
# Box plot of the mean perplexity per class. The y-axis is capped at 250 so that
# extreme perplexities do not flatten the boxes.
# (The original cell also created an empty `palette` dict that was never used;
# the colors are passed directly to boxplot.)
plt.figure(figsize=(9, 9))
sns.boxplot(
    x="is_bot",
    y="mean_ppl",
    data=ppl_train_df,
    hue="is_bot",
    palette=["lightgreen", "coral"],
)
plt.title("Perplexity for bot/human")
plt.xlabel("0 - human, 1 - bot")
plt.ylabel("Perplexity")
plt.ylim(-5, 250)
plt.tick_params(axis="x")
plt.show()

## Trying to classify based on perplexities only

In [None]:
# Exhaustive search for the mean-perplexity cut-off that maximizes train F1.
# A participant is predicted to be a bot when its mean perplexity is BELOW the cut-off.
perplexities = ppl_train_df["mean_ppl"].values
labels = ppl_train_df["is_bot"].values

# Every observed perplexity is a candidate cut-off, tried in ascending order.
thresholds = np.sort(perplexities)

best_threshold, best_f1 = None, 0

for candidate in thresholds:
    preds = (perplexities < candidate).astype(int)
    candidate_f1 = f1_score(labels, preds)
    # Strict comparison: on ties the smallest cut-off wins.
    if candidate_f1 > best_f1:
        best_f1, best_threshold = candidate_f1, candidate

print(f"Best threshold: {best_threshold}, Train F1: {best_f1:.3f}")

In [None]:
# Train accuracy of the tuned threshold rule.
preds = (perplexities < best_threshold).astype(int)
accuracy_score(y_true=labels, y_pred=preds)

## Extracting features from test split

In [None]:
# Score every message of every test dialog.
test_perplexities = run_perplexities(dataset=dataset["test"])

In [None]:
# Dialog-level test frame with the per-message perplexities attached.
ppl_test_df = dataset["test"].to_pandas().assign(perplexities=test_perplexities)
ppl_test_df.head(n=5)

In [None]:
# Per-participant test features (no labels). A missing mean perplexity is replaced
# by 0 so that every test participant still receives a prediction.
test_features = get_features(ppl_test_df, stage="val")
ppl_test_df = pd.DataFrame.from_dict(test_features).fillna({"mean_ppl": 0})
ppl_test_df.head(n=5)

### Saving a baseline submission

In [None]:
# Baseline submission: a participant is flagged as a bot (1.0) when its mean
# perplexity is below the threshold tuned on the training split.
# The submission is built as a new two-column frame: `submission = ppl_test_df`
# was only an alias, so assigning "is_bot" also modified the feature frame.
submission = ppl_test_df[["ID"]].assign(
    is_bot=(ppl_test_df["mean_ppl"] < best_threshold).astype(float)
)
submission.head()

# Conducting experiments on features and classifiers

## Experiment 1. L2-regularized (ridge) logistic regression on perplexities

In [None]:
# Preview the per-participant training features.
ppl_train_df.head(n=5)

In [None]:
# Single-feature design matrix: mean perplexity capped at 110, then robust-scaled.
scaler = RobustScaler()
clipped_train_ppl = ppl_train_df["mean_ppl"].clip(upper=110)
x = scaler.fit_transform(clipped_train_ppl.values.reshape(-1, 1))
y = ppl_train_df["is_bot"].values

In [None]:
# Test features get the same clipping, and are scaled with the statistics learned
# on the training split. The scaler must only be applied here (transform), not
# re-fitted (fit_transform): re-fitting would put train and test on different
# scales and leak test-set statistics into preprocessing.
x_test = ppl_test_df["mean_ppl"].clip(upper=110).values.reshape(-1, 1)
x_test = scaler.transform(x_test)

In [None]:
# 3-fold cross-validation of a class-balanced, L2-penalized logistic regression.
ridge_logreg = LogisticRegression(penalty="l2", class_weight="balanced")
scores = cross_validate(
    estimator=ridge_logreg,
    X=x,
    y=y,
    cv=3,
    scoring=("accuracy", "roc_auc", "f1", "neg_log_loss"),
    return_train_score=True,
)
pd.DataFrame(scores)

## Experiment 2. Random forest on perplexities

In [None]:
# 3-fold cross-validation of a shallow (depth 3), class-balanced random forest.
forest_clf = RandomForestClassifier(class_weight="balanced", max_depth=3)
scores = cross_validate(
    estimator=forest_clf,
    X=x,
    y=y,
    cv=3,
    scoring=("accuracy", "roc_auc", "f1", "neg_log_loss"),
    return_train_score=True,
)
pd.DataFrame(scores)

In [None]:
# Refit the forest on the full training split, then get class probabilities for the test split.
probas = forest_clf.fit(x, y).predict_proba(x_test)

In [None]:
# Submission for the random forest: participant ID plus the predicted probability
# of the positive ("bot") class.
# Built as a new two-column frame: `submission = ppl_test_df` was only an alias
# (assigning "is_bot" also modified the feature frame), and dropping just
# "mean_ppl"/"text" left the remaining feature columns in the submission.
submission = ppl_test_df[["ID"]].assign(is_bot=probas[:, 1])
submission.head()

## Experiment 3. Extracting message embeddings and training random forest both on perplexities and embeddings

In [None]:
# Sentence encoder used to embed each participant's joined messages.
model = SentenceTransformer(model_name_or_path="all-MiniLM-L6-v2")

In [None]:
# Embed the joined messages of every training participant.
train_texts = ppl_train_df["text"].to_numpy()
train_embeddings = model.encode(train_texts)

In [None]:
# Mean perplexity as the first column, followed by the embedding dimensions.
train_ppls = ppl_train_df["mean_ppl"].values.reshape(-1, 1)
x = np.concatenate([train_ppls, train_embeddings], axis=1)
x.shape

Reducing the embedding size from 384 to 50 dimensions with PCA significantly improved performance

In [None]:
def print_metrics(scores: dict):
    """Prints the fold-averaged metrics of a cross-validation run.

    Args:
        scores (dict): mapping with the keys "<split>_accuracy", "<split>_roc_auc",
            "<split>_f1" and "<split>_neg_log_loss" for the splits "train" and
            "test", each holding per-fold values. The negated log loss is
            printed with its sign flipped back.
    """
    sections = (("Train scores:", "train"), ("\nValidation scores:", "test"))
    for heading, split in sections:
        print(heading)
        print(f"Accuracy: {np.mean(scores[f'{split}_accuracy']):.4f}")
        print(f"ROC AUC: {np.mean(scores[f'{split}_roc_auc']):.4f}")
        print(f"F1: {np.mean(scores[f'{split}_f1']):.4f}")
        print(f"Log Loss: {-np.mean(scores[f'{split}_neg_log_loss']):.4f}")

In [None]:
# Experiment 3 design matrix: mean perplexity followed by the sentence embeddings.
perplexities = ppl_train_df["mean_ppl"].values.reshape(-1, 1)
X = np.concatenate([perplexities, train_embeddings], axis=1)
y = ppl_train_df["is_bot"].values

# PCA down to 50 components, then a class-balanced random forest of depth 10.
pipeline = Pipeline(
    steps=[
        ("pca", PCA(n_components=50)),
        ("classifier", RandomForestClassifier(class_weight="balanced", max_depth=10)),
    ]
)

# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    cv=cv,
    scoring=("accuracy", "roc_auc", "f1", "neg_log_loss"),
    return_train_score=True,
)

print_metrics(scores)

In [None]:
# Test-side counterparts of the training features: embeddings and mean perplexity.
test_texts = ppl_test_df["text"].values
test_embeddings = model.encode(test_texts)
test_ppls = ppl_test_df["mean_ppl"].values.reshape(-1, 1)

In [None]:
# Same column layout as X; refit the pipeline on all training data and predict.
x_test = np.concatenate([test_ppls, test_embeddings], axis=1)
probas = pipeline.fit(X, y).predict_proba(x_test)

In [None]:
# Submission for the PCA + random forest pipeline: participant ID plus the
# predicted probability of the positive ("bot") class.
# Built as a new two-column frame: `submission = ppl_test_df` was only an alias
# (assigning "is_bot" also modified the feature frame), and dropping just
# "mean_ppl"/"text" left the remaining feature columns in the submission.
submission = ppl_test_df[["ID"]].assign(is_bot=probas[:, 1])
submission.head()

## Experiment 4. Using boosting instead of random forest

In [None]:
# Gradient-boosted trees (depth 3) optimized for log loss, with the positive
# class weighted 3:2 against the negative class.
# NOTE(review): no eval set is passed to fit anywhere in this notebook, so
# early_stopping_rounds presumably never triggers - confirm.
cat_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=3,
    loss_function="Logloss",
    eval_metric="Logloss",
    random_seed=42,
    verbose=100,
    class_weights=[2, 3],
    early_stopping_rounds=100,
    # Follow the device detected at the top of the notebook instead of
    # hard-coding the GPU, so the cell also runs on CPU-only machines.
    task_type="GPU" if device == "cuda" else "CPU",
)

In [None]:
# Experiment 4 design matrix: mean perplexity followed by the sentence embeddings.
perplexities = ppl_train_df["mean_ppl"].values.reshape(-1, 1)
X = np.concatenate([perplexities, train_embeddings], axis=1)
y = ppl_train_df["is_bot"].values

# PCA down to 50 components, then the CatBoost classifier configured above.
pipeline = Pipeline(
    steps=[
        ("pca", PCA(n_components=50)),
        ("classifier", cat_model),
    ]
)

# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    cv=cv,
    scoring=("accuracy", "roc_auc", "f1", "neg_log_loss"),
    return_train_score=True,
)

print_metrics(scores)

In [None]:
# Refit the PCA + CatBoost pipeline on the full training split.
pipeline.fit(X=X, y=y)

In [None]:
# Class probabilities for every test participant.
probas = pipeline.predict_proba(X=x_test)

In [None]:
# Submission for the PCA + CatBoost pipeline: participant ID plus the predicted
# probability of the positive ("bot") class.
# Built as a new two-column frame: `submission = ppl_test_df` was only an alias,
# so assigning "is_bot" also modified the feature frame.
submission = ppl_test_df[["ID"]].assign(is_bot=probas[:, 1])
submission.head()

In [None]:
# Write the submission without the dataframe index.
submission.to_csv(path_or_buf="../data/submission.csv", index=False)

## Experiment 5. Extracting additional features: min perplexity, perplexity variance and number of messages

In [None]:
# Gradient-boosted trees (depth 5) optimized for log loss, with the positive
# class weighted 3:2 against the negative class.
# NOTE(review): no eval set is passed to fit anywhere in this notebook, so
# early_stopping_rounds presumably never triggers - confirm.
cat_model = CatBoostClassifier(
    iterations=300,
    learning_rate=0.05,
    depth=5,
    loss_function="Logloss",
    eval_metric="Logloss",
    random_seed=42,
    verbose=100,
    class_weights=[2, 3],
    early_stopping_rounds=100,
    # Follow the device detected at the top of the notebook instead of
    # hard-coding the GPU, so the cell also runs on CPU-only machines.
    task_type="GPU" if device == "cuda" else "CPU",
)

In [None]:
# Experiment 5 design matrix: four perplexity/length features as column vectors,
# followed by the sentence embeddings.
mean_ppl, min_ppl, variance_ppl, num_messages = (
    ppl_train_df[column].values.reshape(-1, 1)
    for column in ("mean_ppl", "min_ppl", "variance_ppl", "num_messages")
)

X = np.concatenate(
    [mean_ppl, min_ppl, variance_ppl, num_messages, train_embeddings], axis=1
)
y = ppl_train_df["is_bot"].values

# PCA down to 100 components, then the CatBoost classifier configured above.
pipeline = Pipeline(
    steps=[
        ("pca", PCA(n_components=100)),
        ("classifier", cat_model),
    ]
)

# Shuffled, stratified 5-fold split with a fixed seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_validate(
    estimator=pipeline,
    X=X,
    y=y,
    cv=cv,
    scoring=("accuracy", "roc_auc", "f1", "neg_log_loss"),
    return_train_score=True,
)

print_metrics(scores)

In [None]:
# Replace every remaining missing test value with 0.
ppl_test_df = ppl_test_df.fillna(value=0)

In [None]:
# Test design matrix with the same column layout as the Experiment 5 training matrix.
test_mean_ppl, test_min_ppl, test_variance_ppl, test_num_messages = (
    ppl_test_df[column].values.reshape(-1, 1)
    for column in ("mean_ppl", "min_ppl", "variance_ppl", "num_messages")
)

x_test = np.concatenate(
    [test_mean_ppl, test_min_ppl, test_variance_ppl, test_num_messages, test_embeddings],
    axis=1,
)

In [None]:
# Refit on the full training split, then get class probabilities for the test split.
probas = pipeline.fit(X, y).predict_proba(x_test)

In [None]:
# Final submission: participant ID plus the predicted probability of the
# positive ("bot") class.
# Built as a new two-column frame: `submission = ppl_test_df` was only an alias,
# so assigning "is_bot" also modified the feature frame.
submission = ppl_test_df[["ID"]].assign(is_bot=probas[:, 1])
submission.head()

Saving the best result

In [None]:
# Write the final submission without the dataframe index.
submission.to_csv(path_or_buf="../data/submission.csv", index=False)