# Instalação das bibliotecas


In [1]:
!pip install scikit-learn



In [15]:
import pandas as pd
import gdown
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification, TFBertForSequenceClassification
import tensorflow as tf
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Carregamento do DF e Preparação dos dados

In [3]:
# Template for the local file name of each downloaded dataset.
arquivo_destino_base = "dataset_{}.csv"

# Google Drive file IDs, keyed by dataset name.
ids = {
    "bot_detection_data": "1jeTXXmc3NA2g5M_plszwKtApICScVEF_",
}

# Loaded frames, keyed by dataset name.
dataframes = {}

for key, file_id in ids.items():
    url = f"https://drive.google.com/uc?id={file_id}"
    arquivo_destino = arquivo_destino_base.format(key)

    gdown.download(url, arquivo_destino, quiet=False)
    # The file is comma-delimited (the notebook later reads it with
    # delimiter=","), so parsing it with ";" would collapse every row into a
    # single column.
    df = pd.read_csv(arquivo_destino, sep=",")
    dataframes[key] = df

Downloading...
From: https://drive.google.com/uc?id=1jeTXXmc3NA2g5M_plszwKtApICScVEF_
To: /content/dataset_bot_detection_data.csv
100%|██████████| 7.46M/7.46M [00:00<00:00, 53.4MB/s]


In [4]:
# Read the dataset from the relative path gdown wrote it to instead of a
# hard-coded Colab-only absolute path, so the cell also works when the working
# directory is not /content.
df = pd.read_csv(arquivo_destino_base.format("bot_detection_data"), delimiter=",")

In [5]:
# Preview the loaded frame (pandas truncates the rendering of long frames).
df

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention
...,...,...,...,...,...,...,...,...,...,...,...
49995,491196,uberg,Want but put card direction know miss former h...,64,0,9911,True,1,Lake Kimberlyburgh,2023-04-20 11:06:26,teach quality ten education any
49996,739297,jessicamunoz,Provide whole maybe agree church respond most ...,18,5,9900,False,1,Greenbury,2022-10-18 03:57:35,add walk among believe
49997,674475,lynncunningham,Bring different everyone international capital...,43,3,6313,True,1,Deborahfort,2020-07-08 03:54:08,onto admit artist first
49998,167081,richardthompson,Than about single generation itself seek sell ...,45,1,6343,False,0,Stephenside,2022-03-22 12:13:44,star


In [6]:
# Assign the filled column back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form is deprecated and does not update
# `df` when pandas Copy-on-Write is enabled.
df['Hashtags'] = df['Hashtags'].fillna('No hashtags')

# Tokenização dos Tweets

In [10]:
# Hold out 20% of the tweets for testing; the fixed seed keeps the split
# reproducible across runs.
tweets_train, tweets_test, labels_train, labels_test = train_test_split(
    df['Tweet'],
    df['Bot Label'],
    test_size=0.2,
    random_state=42,
)

In [7]:
# Load the pretrained tokenizer for the 'distilbert-base-uncased' checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased',
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



In [11]:
def encode_tweets(tweets, max_length=128):
    """Tokenize a collection of tweets with the notebook-level `tokenizer`.

    Args:
        tweets: Iterable of tweet strings; it is materialized with `list()`
            before being handed to the tokenizer.
        max_length: Number of tokens every sequence is truncated/padded to.
            Defaults to 128, the value previously hard-coded here.

    Returns:
        The tokenizer's output for the batch, requested as TensorFlow tensors
        (`return_tensors='tf'`), with every sequence padded to `max_length`.
    """
    return tokenizer(
        list(tweets),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='tf',
    )

# Tokenize the training split first, then the held-out split.
train_encodings, test_encodings = (
    encode_tweets(split) for split in (tweets_train, tweets_test)
)

# TensorFlow


In [12]:
# Load the pretrained DistilBERT checkpoint with a two-label classification head.
# NOTE(review): `model` is reassigned to a BERT model in a later cell, so this
# instance is replaced before training — confirm which model is intended.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=2,
)

# Compile the model; the loss is configured with from_logits=True.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy'],
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

# Construir o Modelo BERT para Classificação

In [16]:
# Load the pretrained 'bert-base-uncased' checkpoint with a two-label
# classification head; this replaces any previously assigned `model`.
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2,
)

# Build the training components separately, then compile.
bert_optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
bert_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=bert_optimizer, loss=bert_loss, metrics=['accuracy'])

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Treinar o modelo

In [18]:
# Feed the full tokenizer output (input_ids AND attention_mask) to the model.
# The tweets are padded to a fixed max_length, so passing only input_ids would
# make the model attend to the padding tokens as if they were real text.
train_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(train_encodings), labels_train))
    .shuffle(1000)
    .batch(64)
)
# The evaluation split keeps its original order (no shuffle).
test_dataset = (
    tf.data.Dataset.from_tensor_slices((dict(test_encodings), labels_test))
    .batch(64)
)

In [None]:
# Train for a single epoch, using the held-out split as validation data.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=1,
)

In [None]:
# Evaluate on the held-out split; the report below reads index 0 as the loss
# and index 1 as the accuracy.
results = model.evaluate(x=test_dataset)
print(f"Test Loss: {results[0]}, Test Accuracy: {results[1]}")

In [None]:
# NOTE(review): at this point `model` is the TFBertForSequenceClassification
# instance assigned in the BERT cell, despite the directory name — confirm the
# intended naming.
model.save_pretrained('saved_distilbert_model')
# Store the tokenizer next to the weights so the saved directory can be
# reloaded for inference on its own.
tokenizer.save_pretrained('saved_distilbert_model')

# Ir além

In [None]:
# Bar chart of how many training samples fall into each class.
fig, ax = plt.subplots()
sns.countplot(x=labels_train, ax=ax)
ax.set_title('Distribuição de Classes (0=Não Bot, 1=Bot)')
ax.set_xlabel('Classe')
ax.set_ylabel('Número de Amostras')
plt.show()

In [None]:
# Keras `predict` on a Hugging Face TF model returns a model-output object, not
# a bare array, so take its `logits` field before reducing over the class axis
# (calling `.argmax` directly on the output object fails).
logits = model.predict(test_dataset).logits
pred_labels = tf.math.argmax(logits, axis=1).numpy()

# Compare the predicted classes with the true labels of the held-out split.
cm = confusion_matrix(labels_test, pred_labels)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Não Bot', 'Bot'])
disp.plot(cmap='Blues')
plt.title('Matriz de Confusão')
plt.show()