<a href="https://colab.research.google.com/github/MalakhovDenis/DLS-DL2/blob/main/17.%20%F0%9F%A4%93%D0%94%D0%97-6.%20%D0%94%D0%B5%D1%82%D0%B5%D0%BA%D1%86%D0%B8%D1%8F%20%D1%81%D0%B3%D0%B5%D0%BD%D0%B5%D1%80%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%BD%D1%8B%D1%85%20%D1%82%D0%B5%D0%BA%D1%81%D1%82%D0%BE%D0%B2/17.1.%20%D0%94%D0%BE%D0%BC%D0%B0%D1%88%D0%BD%D0%B5%D0%B5%20%D0%B7%D0%B0%D0%B4%D0%B0%D0%BD%D0%B8%D0%B5.%20%D0%94%D0%B5%D1%82%D0%B5%D0%BA%D1%86%D0%B8%D1%8F%20%D1%81%D0%B3%D0%B5%D0%BD%D0%B5%D1%80%D0%B8%D1%80%D0%BE%D0%B2%D0%B0%D0%BD%D0%BD%D1%8B%D1%85%20%D1%82%D0%B5%D0%BA%D1%81%D1%82%D0%BE%D0%B2/Malakhov%5Bhomework-6%5Dbot-detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
# Authenticate before the competition download in the next cell.
# NOTE(review): presumably this prompts interactively for Kaggle credentials - confirm.
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Download the "you-are-bot" competition data and keep the value returned by kagglehub.
# NOTE(review): presumably this is the local directory holding the downloaded
# files - confirm. main() below is invoked without it and reads from the
# relative "you-are-bot/" directory, so verify the files are available there.
you_are_bot_path = kagglehub.competition_download('you-are-bot')

print('Data source import complete.')


In [None]:
import json
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, log_loss


def load_train_data(data_file: str, labels_file: str):
    """Build a per-participant training table from dialogs and their labels.

    Every dialog in *data_file* contributes two rows, in this order: the
    space-joined messages of participant "0", then those of participant "1".

    Args:
        data_file: Path to a JSON file mapping a dialog id to a list of
            message dicts; each message provides "participant_index"
            (compared against the strings "0" and "1") and "text".
        labels_file: Path to a CSV file with the columns "dialog_id",
            "participant_index" and "is_bot".

    Returns:
        A DataFrame with the columns "text" and "is_bot", two rows per
        dialog, following the dialog order of the JSON file.

    Raises:
        KeyError: If a dialog has no label row for participant 0.
    """
    labels_df = pd.read_csv(labels_file)
    # Only participants 0 and 1 are used; anything else (including NaN) is ignored.
    labels_df = labels_df[labels_df["participant_index"].isin([0, 1])]
    labels = {
        (dialog_id, int(participant)): is_bot
        for dialog_id, participant, is_bot in zip(
            labels_df["dialog_id"],
            labels_df["participant_index"],
            labels_df["is_bot"],
        )
    }

    # Parse the whole file first so the handle is not held open while processing.
    with open(data_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    all_texts = []
    all_labels = []

    for dialog_id, messages in data.items():
        text_0 = " ".join(
            m["text"] for m in messages if m["participant_index"] == "0"
        )
        text_1 = " ".join(
            m["text"] for m in messages if m["participant_index"] == "1"
        )

        label_0 = int(labels[(dialog_id, 0)])
        # Prefer participant 1's own label. Assuming it is the complement of
        # participant 0's label mislabels dialogs where both participants
        # belong to the same class, so the complement is only a fallback for
        # label files that list participant 0 alone.
        raw_label_1 = labels.get((dialog_id, 1))
        label_1 = 1 - label_0 if raw_label_1 is None else int(raw_label_1)

        all_texts.extend((text_0, text_1))
        all_labels.extend((label_0, label_1))

    return pd.DataFrame({"text": all_texts, "is_bot": all_labels})


def load_test_data(data_file: str, labels_file: str):
    """Collect the joined message text for every requested test participant.

    Args:
        data_file: Path to a JSON file mapping a dialog id to a list of
            message dicts with "participant_index" and "text".
        labels_file: Path to a CSV file with the columns "ID", "dialog_id"
            and "participant_index"; each row requests one participant.

    Returns:
        A DataFrame with the columns "ID" and "text", one row per CSV row
        in the same order. "text" is the space-joined messages whose
        "participant_index" equals the stringified index from the CSV.
    """
    requests = pd.read_csv(labels_file)

    with open(data_file, "r", encoding="utf-8") as handle:
        dialogs = json.load(handle)

    def participant_text(dialog_id, participant) -> str:
        # Message indices are compared as strings, so stringify the CSV value.
        wanted = str(participant)
        return " ".join(
            message["text"]
            for message in dialogs[dialog_id]
            if message["participant_index"] == wanted
        )

    row_ids = []
    joined_texts = []
    for record in requests.to_dict("records"):
        joined_texts.append(
            participant_text(record["dialog_id"], record["participant_index"])
        )
        row_ids.append(record["ID"])

    return pd.DataFrame({"ID": row_ids, "text": joined_texts})


def main(data_dir: str = "you-are-bot", output_file: str = "preds.csv"):
    """Train a TF-IDF + logistic-regression baseline and write test predictions.

    Loads the training dialogs, holds out a stratified 20% of the rows for
    validation (random_state=42), fits the pipeline on the rest, prints the
    validation accuracy, ROC AUC and log loss, then scores the test
    participants and writes their predictions to *output_file*.

    Args:
        data_dir: Directory containing "train.json", "ytrain.csv",
            "test.json" and "ytest.csv". Defaults to the relative
            "you-are-bot" directory.
        output_file: Destination CSV path for the predictions; it gets the
            columns "ID" and "is_bot", where "is_bot" is the second column
            of the model's predicted probabilities.
    """
    # Local import keeps this change self-contained within the function.
    from pathlib import Path

    data_root = Path(data_dir)

    df = load_train_data(
        str(data_root / "train.json"), str(data_root / "ytrain.csv")
    )
    X = df["text"]
    y = df["is_bot"]

    # The held-out split is a validation set; it is named separately from
    # the competition test set loaded further down.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    pipe = Pipeline(
        [
            ("vectorizer", TfidfVectorizer()),
            ("model", LogisticRegression(random_state=42)),
        ]
    )

    pipe.fit(X_train, y_train)

    val_pred = pipe.predict(X_val)
    val_proba = pipe.predict_proba(X_val)
    print("Val Accuracy:", accuracy_score(y_val, val_pred))
    print("Val ROC AUC:", roc_auc_score(y_val, val_proba[:, 1]))
    print("Val Log Loss:", log_loss(y_val, val_proba))

    df_test = load_test_data(
        str(data_root / "test.json"), str(data_root / "ytest.csv")
    )
    test_proba = pipe.predict_proba(df_test["text"])[:, 1]

    preds_df = pd.DataFrame({"ID": df_test["ID"], "is_bot": test_proba})
    preds_df.to_csv(output_file, index=False)


# Run the full train/validate/predict pipeline only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
