In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from Numeric_Features_model.MLPClassifier import MLPClassifier
import importlib
import Ensemble_model.ensemble
importlib.reload(Ensemble_model.ensemble)
from Ensemble_model.ensemble import BotEnsemble
import torch

# Pick the compute device once: CUDA when a GPU is visible, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


# Load transformer model and tokenizer #

In [2]:
# Checkpoint directory; both the tokenizer and the classifier are read from it.
transformer_path = "../userdesc-LM-model/trained-model/checkpoint-18441"

# AutoTokenizer picks the tokenizer class from the checkpoint itself
# (explicit alternative: DistilBertTokenizer.from_pretrained(transformer_path)).
tokenizer = AutoTokenizer.from_pretrained(transformer_path, use_fast=True)

transformer_model = AutoModelForSequenceClassification.from_pretrained(transformer_path)
transformer_model.eval()  # evaluation mode for inference
# Kept as the last expression so the notebook still displays the module summary.
transformer_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [89]:
# Debug check: show which concrete tokenizer class AutoTokenizer resolved to.
print("TOKEN 1")
print(type(tokenizer))

# No cell visible in this notebook creates `tokenizer2`, so an unconditional
# reference raises NameError on a fresh kernel (Restart & Run All). Report it
# only when it actually exists; the output is unchanged when it does.
if "tokenizer2" in globals():
    print("TOKEN 2")
    print(type(tokenizer2))

TOKEN 1
<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>
TOKEN 2
<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


# Load MLP model #

In [3]:
# Rebuild the numeric-feature MLP (input_dim=2) and restore its trained weights.
mlp_weights_path = "../Numeric_Features_model/trained-model/mlp_model.pt"
mlp_model = MLPClassifier(input_dim=2)

# map_location=device: a checkpoint saved from a GPU run would otherwise fail to
#   load on a CPU-only machine, which the `device` fallback explicitly allows.
# weights_only=True: the file is consumed as a state_dict, so restrict unpickling
#   to tensors/plain containers instead of arbitrary pickled objects.
#   NOTE(review): the warning captured under this cell points at this torch.load
#   call; presumably it is the weights_only FutureWarning. Confirm it disappears.
mlp_state = torch.load(mlp_weights_path, map_location=device, weights_only=True)
mlp_model.load_state_dict(mlp_state)

# Last expression: switches to evaluation mode, moves to `device`, and displays the module.
mlp_model.eval().to(device)


  mlp_model.load_state_dict(torch.load("../Numeric_Features_model/trained-model/mlp_model.pt"))


MLPClassifier(
  (net): Sequential(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=128, out_features=64, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.3, inplace=False)
    (6): Linear(in_features=64, out_features=32, bias=True)
    (7): ReLU()
    (8): Linear(in_features=32, out_features=1, bias=True)
  )
)

In [4]:
# Combine the two loaded models into one ensemble and score a single
# hand-written example as a smoke test.
ensemble = BotEnsemble(transformer_model, tokenizer, mlp_model, alpha=1)

record = {
    "followers": 500,
    "avg_retweetcount": 1.3,
    "acctdesc": "Co-Founder @templatenb #WordPress #Webdevelopment #WooCommerce",
}

# Numeric features are passed in the order [followers, avg_retweetcount].
features = [record[key] for key in ("followers", "avg_retweetcount")]
prob = ensemble.predict_prob(record["acctdesc"], features)

# NOTE(review): this demo cuts at 0.5 while the evaluation/labeling cells use 0.487 — confirm intended.
pred = 1 if prob > 0.5 else 0
print(f"Predicted class: {pred} (probability: {prob:.4f})")

Predicted class: 1 (probability: 0.9998)


In [92]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# --- Load the labelled users; keep only rows that have both numeric features and a label ---
df = pd.read_csv("../data/processed_users_filtered.csv").dropna(  # update path if needed
    subset=["followers", "avg_retweetcount", "label"]
)

# --- Score every remaining user with the ensemble (one predict_prob call per row) ---
y_true = df["label"].tolist()
y_probs = [
    ensemble.predict_prob(desc, [followers, avg_retweets])
    for followers, avg_retweets, desc in zip(
        df["followers"], df["avg_retweetcount"], df["acctdesc"]
    )
]

# --- Convert probabilities to hard labels ---
threshold = 0.487  # or use your previously tuned threshold
y_pred = [int(p > threshold) for p in y_probs]

# --- Report ---
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))

print("ROC AUC Score:", roc_auc_score(y_true, y_probs))


Confusion Matrix:
[[96235  1615]
 [ 1265  5173]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9870    0.9835    0.9853     97850
           1     0.7621    0.8035    0.7822      6438

    accuracy                         0.9724    104288
   macro avg     0.8746    0.8935    0.8838    104288
weighted avg     0.9731    0.9724    0.9727    104288

ROC AUC Score: 0.9136002541755541


# Labeling all the unlabeled users #

In [5]:
import pandas as pd
from tqdm import tqdm


def _label_user(row):
    """Return ensemble.predict_label for one user row, using the 0.487 cutoff."""
    return ensemble.predict_label(
        features=[row["followers"], row["avg_retweetcount"]],
        acctdesc=row["acctdesc"],
        threshold=0.487,
    )


# Read the unlabeled users and discard rows missing either numeric feature.
df_unlabeled = pd.read_csv("../preprocessing/unique_users_no_intersection.csv").dropna(
    subset=["followers", "avg_retweetcount"]
)

# Row-by-row labeling with a progress bar (registers DataFrame.progress_apply).
tqdm.pandas(desc="Labeling users")
df_unlabeled["predicted_label"] = df_unlabeled.progress_apply(_label_user, axis=1)

# Persist the labeled frame.
output_path = "../data/unique_users_after_labeling.csv"
df_unlabeled.to_csv(output_path, index=False)
print(f"✅ Saved labeled data to: {output_path}")


Labeling users:   4%|▍         | 94418/2285391 [11:08<4:18:24, 141.31it/s]


KeyboardInterrupt: 