In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import importlib
import Ensemble_model.ensemble
importlib.reload(Ensemble_model.ensemble)
from Ensemble_model.ensemble import BotEnsemble
import torch

# Select the compute device: CUDA when torch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm


Using device: cuda


# Load the transformer model and tokenizer #

In [2]:
# Checkpoint directory that both the tokenizer and the classifier are loaded from.
transformer_path = "../../twitter-bot-project - Copy/userdesc-LM-model/trained-model/checkpoint-18441"

tokenizer = AutoTokenizer.from_pretrained(transformer_path, use_fast=True)
transformer_model = AutoModelForSequenceClassification.from_pretrained(transformer_path)

# Inference only: switch to eval mode, then move the model to the selected device.
# The trailing expression is kept last so the cell still renders the model summary.
transformer_model.eval()
transformer_model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


# Load random forest model #

In [3]:
import numpy as np
import onnxruntime as ort

# === Load ONNX model ===
onnx_model_path = '../../twitter-bot-project - Copy/Numeric_Features_model/trained-model/rf_model.onnx'
# List the CPU provider after CUDA so the session has an explicit fallback on a
# machine without a usable GPU, matching the cuda-or-cpu choice made for `device`.
session = ort.InferenceSession(
    onnx_model_path,
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)

# === Prepare input (test) ===
features = [[500, 1.3]]  # Single record with 2 features
input_array = np.array(features, dtype=np.float32)

# ONNX input name may vary; get it programmatically:
input_name = session.get_inputs()[0].name

# === Run inference ===
# Passing None as the output list asks the session for all model outputs.
outputs = session.run(None, {input_name: input_array})

# The second output is read as the per-class probabilities; [0] selects the single record.
probs = outputs[1][0]  # [prob_class_0, prob_class_1]
prob = probs[1]
pred = int(prob > 0.5)

print(f"Predicted class: {pred} (probability: {prob:.4f})")

Predicted class: 1 (probability: 0.5525)


In [4]:
import numpy as np
from sklearn.metrics import f1_score

def find_best_threshold(y_true, y_probs, metric=f1_score, step=0.01):
    """
    Finds the best threshold for converting probabilities to labels.

    Every threshold in ``np.arange(0, 1 + step, step)`` is tried in ascending
    order; a probability is mapped to label 1 when it is ``>=`` the threshold.

    Args:
        y_true: Ground-truth labels, passed unchanged to ``metric``.
        y_probs: Predicted probabilities; any array-like (list, tuple, ndarray).
        metric: Callable ``metric(y_true, y_pred)`` whose result is compared
            with ``>``, i.e. a higher score is treated as better.
        step: Spacing between consecutive candidate thresholds.

    Returns:
        best_threshold: Threshold with highest metric score (the lowest such
            threshold when several tie). Stays at 0.5 if no candidate scores
            above -1.
        best_score: The corresponding metric score.
    """
    # Coerce once up front so plain Python lists work too; comparing a list
    # against a float with `>=` would raise TypeError.
    probs = np.asarray(y_probs)

    best_score = -1
    best_threshold = 0.5
    for t in np.arange(0, 1 + step, step):
        y_pred = (probs >= t).astype(int)
        score = metric(y_true, y_pred)
        # Strict comparison keeps the earliest (lowest) threshold on ties.
        if score > best_score:
            best_score = score
            best_threshold = t
    return best_threshold, best_score

In [5]:
# Combine the text classifier and the ONNX random-forest session into one ensemble.
ensemble = BotEnsemble(
    transformer_model=transformer_model,
    tokenizer=tokenizer,
    numeric_model=session,
    alpha=1,
)

# One hand-written account used as a quick sanity check of the ensemble.
record = dict(
    followers=500,
    avg_retweetcount=1.3,
    acctdesc="Co-Founder @templatenb #WordPress #Webdevelopment #WooCommerce",
)

# Numeric features are passed in the order [followers, avg_retweetcount].
features = [record[key] for key in ("followers", "avg_retweetcount")]
prob = ensemble.predict_prob(record["acctdesc"], features)
pred = int(prob > 0.5)
print(f"Predicted class: {pred} (probability: {prob:.4f})\n\n")

Predicted class: 1 (probability: 0.9998)




# Create filtered users dataset #

In [None]:
import pandas as pd
# Numeric features and label of every processed user (other columns are discarded).
df = pd.read_csv('../data/processed_users.csv')
df = df[['userid', 'followers', 'avg_retweetcount', 'label']]

# Account descriptions; rows whose description is missing are removed.
file_path = '../data/userdesc_labeled.csv'
df_desc = pd.read_csv(file_path)
df_clean = df_desc.dropna(subset=["acctdesc"]).copy()

# Check balance
print(df_clean['label'].value_counts())
print(df_clean.head())

# Left join on 'userid': every row of df is kept, and 'acctdesc' is NaN for
# users that have no description in df_clean.
df_merged = df.merge(df_clean[['userid', 'acctdesc']], on='userid', how='left')

print("\nMerged DataFrame with 'acctdesc'. First 5 rows:")
print(df_merged.head())
df_merged.to_csv('../data/processed_users_filtered.csv', index=False)

# Test ensemble with random forest on full data #

In [7]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# === 1. Load Data ===
df = pd.read_csv("../data/processed_users_filtered.csv")

# === 2. Drop rows with missing essential values (if any) ===
# 'acctdesc' is not part of the subset, so users without a description are still scored.
df = df.dropna(subset=["followers", "avg_retweetcount", "label"])

# === 3. Iterate and Predict ===
y_true = []
y_probs = []

# itertuples yields lightweight namedtuples; iterrows would build a full Series per row.
for row in df.itertuples(index=False):
    features = [row.followers, row.avg_retweetcount]
    desc = row.acctdesc
    prob = ensemble.predict_prob(desc, features)

    y_probs.append(prob)
    y_true.append(row.label)

# === 4. Threshold and Metrics ===
# NOTE(review): the threshold is tuned on the same rows the metrics below are
# computed on, so the reported scores are likely optimistic — confirm on held-out data.
best_threshold, best_score = find_best_threshold(np.array(y_true), np.array(y_probs), metric=f1_score, step=0.01)
print(f"Best threshold: {best_threshold:.3f}")
print(f"Best F1 score: {best_score:.4f}")

y_pred = (np.array(y_probs) >= best_threshold).astype(int)

# === 5. Print Metrics ===
print("Confusion Matrix:")
print(confusion_matrix(y_true, y_pred))

print("\nClassification Report:")
print(classification_report(y_true, y_pred, digits=4))

# ROC AUC is computed from the raw probabilities, not the thresholded labels.
print("ROC AUC Score:", roc_auc_score(y_true, y_probs))

Best threshold: 0.980
Best F1 score: 0.8414
Confusion Matrix:
[[97415   435]
 [ 1447  4991]]

Classification Report:
              precision    recall  f1-score   support

           0     0.9854    0.9956    0.9904     97850
           1     0.9198    0.7752    0.8414      6438

    accuracy                         0.9820    104288
   macro avg     0.9526    0.8854    0.9159    104288
weighted avg     0.9813    0.9820    0.9812    104288

ROC AUC Score: 0.9871173464973793


# Labeling all the unlabeled users using the ensemble with random forest #

In [8]:
import pandas as pd

labeled_path = '../data/unique_users_after_labeling.csv'
unlabeled_path = '../data/unique_users_no_intersection_unlabeled.csv'

df_labeled = pd.read_csv(labeled_path)
df_unlabeled = pd.read_csv(unlabeled_path)

# Users from the unlabeled file whose 'userid' does not occur in the labeled file.
remaining = df_unlabeled.loc[~df_unlabeled['userid'].isin(df_labeled['userid'])]

print(f"Total unlabeled users: {len(df_unlabeled)}")
print(f"Remaining unlabeled users: {len(remaining)}")
print(f" labeled users: {len(df_labeled)}")
# Optional, currently disabled: remaining.to_csv('remaining.csv', index=False)

# Drop the references to the three frames once the counts are printed.
del df_labeled, df_unlabeled, remaining

Total unlabeled users: 2285391
Remaining unlabeled users: 0
 labeled users: 2285391


In [7]:
import pandas as pd

from tqdm import tqdm

labeled_path = '../data/unique_users_after_labeling.csv'
unlabeled_path = '../data/unique_users_no_intersection_unlabeled.csv'

batch_size = 5000
# Probability cut-off handed to ensemble.predict_label; 0.98 equals the
# "Best threshold" printed by the evaluation on the labeled users.
label_threshold = 0.98

# Stream the unlabeled users in chunks instead of loading the whole file at once.
reader = pd.read_csv(unlabeled_path, chunksize=batch_size)

# Must start as True: the first written chunk truncates the output file and emits
# the header row; every later chunk is appended without a header. Starting at
# False never writes the header and appends to whatever the file already contains.
first_batch = True

for chunk in tqdm(reader, desc="Batch labeling"):
    chunk = chunk.dropna(subset=["followers", "avg_retweetcount"])
    if chunk.empty:
        # Nothing to label; also avoids a row-wise apply on an empty frame.
        continue
    chunk["predicted_label"] = chunk.apply(
        lambda row: ensemble.predict_label(
            features=[row["followers"], row["avg_retweetcount"]],
            acctdesc=row["acctdesc"],
            threshold=label_threshold
        ),
        axis=1
    )
    chunk.to_csv(labeled_path, mode='w' if first_batch else 'a', index=False, header=first_batch)
    first_batch = False


Batch labeling: 105it [58:34, 33.47s/it]


In [9]:
import pandas as pd

# Read the labeled users back and count how many were tagged bot (1) vs. human (0).
df = pd.read_csv('../data/unique_users_after_labeling.csv')
label_counts = df['predicted_label'].value_counts()

print("Distribution of bots (1) and humans (0):")
print(label_counts)

# Drop both names once the summary has been printed.
del label_counts, df

Distribution of bots (1) and humans (0):
predicted_label
0    2226874
1      58517
Name: count, dtype: int64


# Add the prelabeled users to unique_users_after_labeling.csv #

* The first step is to load all the prelabeled users and merge their "acctdesc" with the numeric features.

In [None]:
from IPython.display import display
# Input files: numeric features with labels, and the account descriptions.
processed_users_path = '../data/processed_users.csv'
userdesc_labeled_path = '../data/userdesc_labeled.csv'

# Columns retained from each file immediately after it is read.
processed_users_cols_to_keep = ['userid', 'totaltweets', 'avg_retweetcount', 'followers', 'following', 'label']
userdesc_labeled_cols_to_keep = ['userid', 'acctdesc']

try:
    # Read each file and narrow it to the wanted columns in a single expression.
    processed_users_df = pd.read_csv(processed_users_path)[processed_users_cols_to_keep]
    userdesc_labeled_df = pd.read_csv(userdesc_labeled_path)[userdesc_labeled_cols_to_keep]

    print("DataFrames loaded and relevant columns selected.")
    print("Processed Users Shape (selected columns):", processed_users_df.shape)
    print("Userdesc Labeled Shape (selected columns):", userdesc_labeled_df.shape)

    # Left join on 'userid': every processed user is kept, with or without a description.
    merged_df = processed_users_df.merge(userdesc_labeled_df, on='userid', how='left')
    merged_df = merged_df.reset_index(drop=True)

    # Exchange the positions of 'acctdesc' and 'label' in the column order.
    current_columns = merged_df.columns.tolist()
    acctdesc_index = current_columns.index('acctdesc')
    label_index = current_columns.index('label')
    current_columns[acctdesc_index] = 'label'
    current_columns[label_index] = 'acctdesc'

    # Select the columns in the new order.
    merged_df_reordered = merged_df[current_columns]
    print("\nDataFrames merged successfully on 'userid'.")
    print("Merged DataFrame Shape:", merged_df.shape)
    display(merged_df_reordered.head()) # Use display for better output formatting

except FileNotFoundError as e:
    print(f"Error: One of the files was not found. {e}")
except KeyError as e:
    print(f"Error: A column specified for selecting or reordering was not found. {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [None]:
# NOTE(review): this cell reads unique_users_path and, on success, overwrites the
# same file with the prelabeled rows added. Re-running it after a successful run
# would add the prelabeled rows a second time — confirm it is executed only once.
unique_users_path = '../data/unique_users_after_labeling.csv'

try:
    # Load the unique users file
    unique_users_df = pd.read_csv(unique_users_path)
    print(f"\nLoaded unique users file: {unique_users_path}")
    print("Unique Users Shape (before modifications):", unique_users_df.shape)
    print("Unique Users Columns (before modifications):", unique_users_df.columns.tolist())

    # Drop the "count" column
    if 'count' in unique_users_df.columns:
        unique_users_df = unique_users_df.drop(columns=['count'])
        print("Dropped 'count' column from unique_users_df.")
    else:
        print("'count' column not found in unique_users_df. Skipping drop.")

    # Rename "predicted_label" to "label"
    if 'predicted_label' in unique_users_df.columns:
        unique_users_df = unique_users_df.rename(columns={'predicted_label': 'label'})
        print("Renamed 'predicted_label' to 'label' in unique_users_df.")
    else:
         print("'predicted_label' column not found in unique_users_df. Skipping rename.")

    # Use the column order of merged_df_reordered, keeping only the columns that
    # unique_users_df actually has, so both frames line up for the concatenation.
    common_columns = list(merged_df_reordered.columns) # Get the column names from merged_df

    columns_to_select = [col for col in common_columns if col in unique_users_df.columns]
    unique_users_df_aligned = unique_users_df[columns_to_select]


    print("\nUnique Users Shape (after modifications and alignment):", unique_users_df_aligned.shape)
    print("Unique Users Columns (after modifications and alignment):", unique_users_df_aligned.columns.tolist())
    print("Merged DataFrame Columns:", merged_df_reordered.columns.tolist())


    # Concatenate the two dataframes
    # Use ignore_index=True to reset the index after concatenation
    combined_df = pd.concat([merged_df_reordered, unique_users_df_aligned], ignore_index=True)

    print("\nDataFrames concatenated successfully.")
    print("Combined DataFrame Shape:", combined_df.shape)
    print("Combined DataFrame Head:")
    display(combined_df.head())

except FileNotFoundError as e:
    print(f"Error processing unique_users_df: The file was not found. {e}")
except Exception as e:
    print(f"An unexpected error occurred while processing unique_users_df: {e}")
else:
    # Runs only when everything in the try block succeeded. Previously this code
    # sat after the try/except, so a failed run could overwrite the CSV with a
    # `combined_df` left in the kernel by an earlier execution (or hit a NameError).
    print("\nLabel Counts in combined_df:")
    label_counts = combined_df['label'].value_counts()
    print(label_counts)
    combined_df.to_csv(unique_users_path, index=False, mode='w')