# Install required libraries

In [1]:
!pip install transformers datasets torch pandas scikit-learn

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

# 1. Import Required Libraries

In [2]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

# 2. Load the Dataset

In [3]:
# Location of the raw CSV inside the Colab session.
DATASET_CSV = "/content/Twitter_Suicide_Dataset - Twitter_Suicide_Dataset.csv"
data = pd.read_csv(DATASET_CSV)

# View the first few rows of the dataset

In [4]:
# Preview the first rows to check which columns were loaded.
data.head()

Unnamed: 0,Tweet,Suicide
0,i hate myself so much i want to KILL myself ho...,Potential Suicide post
1,RT @DrugForumsBest: I woke up with a bag over ...,Potential Suicide post
2,i wanna fucking kill myself,Potential Suicide post
3,why do i destroy all my opportunities? am i go...,Potential Suicide post
4,u - understand what the fuck im doing wrong wi...,Potential Suicide post


# 3. Preprocess the Dataset
# Map "Potential Suicide post" to 1 and "Not Suicide post" to 0

In [5]:
# Translate the textual class names in `Suicide` into integer targets.
label_map = {'Potential Suicide post': 1, 'Not Suicide post': 0}
mapped_labels = data['Suicide'].map(label_map)

# `.map` silently produces NaN for any value missing from `label_map`, which
# would also turn the whole column into floats. Fail loudly instead of
# training on broken targets.
unmapped_rows = mapped_labels.isna()
if unmapped_rows.any():
    unexpected_values = data.loc[unmapped_rows, 'Suicide'].unique().tolist()
    raise ValueError(f"Unrecognized values in 'Suicide' column: {unexpected_values}")
data['label'] = mapped_labels.astype(int)

# Extract tweets and labels
# Ensure tweets are strings (note: a missing tweet becomes the literal text "nan")
data['Tweet'] = data['Tweet'].astype(str)  # Convert to string type
tweets = data['Tweet'].tolist()
labels = data['label'].tolist()

# Split the dataset into training and test sets

In [6]:
# Hold out 20% of the examples for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(tweets, labels, test_size=0.2, random_state=42)
train_texts, test_texts, train_labels, test_labels = split_parts

# 4. Load Pre-Trained Tokenizer

In [7]:
# Checkpoint identifier; the same name is reused below when the model is loaded.
model_name = "distilbert-base-uncased"  # Pre-trained model
# Fetch the tokenizer that matches this checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Tokenize the data

In [8]:
# Both splits use identical tokenizer settings: truncate to at most 128 tokens
# and pad every sequence within a split to a common length.
encode_options = {"truncation": True, "padding": True, "max_length": 128}
train_encodings = tokenizer(train_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [9]:
# Preview again: the frame now also carries the integer `label` column.
data.head()

Unnamed: 0,Tweet,Suicide,label
0,i hate myself so much i want to KILL myself ho...,Potential Suicide post,1
1,RT @DrugForumsBest: I woke up with a bag over ...,Potential Suicide post,1
2,i wanna fucking kill myself,Potential Suicide post,1
3,why do i destroy all my opportunities? am i go...,Potential Suicide post,1
4,u - understand what the fuck im doing wrong wi...,Potential Suicide post,1


# 5. Prepare the Dataset for Hugging Face

In [10]:
class SuicidalTweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    :param encodings: Mapping from feature name (e.g. ``input_ids``) to a
        sequence holding one entry per example.
    :param labels: Sequence of class labels, one per example.
    :raises ValueError: If an encoding field does not contain exactly one
        entry per label.
    """

    def __init__(self, encodings, labels):
        # A length mismatch would otherwise only surface later as an IndexError
        # (or as silently ignored examples) in the middle of training.
        for key, values in encodings.items():
            if len(values) != len(labels):
                raise ValueError(
                    f"Encoding field '{key}' has {len(values)} examples "
                    f"but {len(labels)} labels were given"
                )
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the features of example ``idx`` as tensors, plus its ``labels`` tensor."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # Force an integer dtype so the label is always a class index, even if
        # the labels arrived as floats (e.g. 1.0 from a pandas column).
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = SuicidalTweetDataset(train_encodings, train_labels)
test_dataset = SuicidalTweetDataset(test_encodings, test_labels)

# 6. Load the Pre-Trained Model

In [11]:
# Load the checkpoint with a two-class sequence-classification head (num_labels=2).
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# 7. Define Training Arguments

In [12]:
training_args = TrainingArguments(
    # Output locations
    output_dir='./results',
    logging_dir='./logs',
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Per-device batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate once per epoch and log every 10 steps
    evaluation_strategy="epoch",
    logging_steps=10,
    report_to="none",  # Disable W&B logging
)



# 8. Train the Model

In [13]:
# Bundle the model, the training arguments and both datasets into a Trainer.
# The held-out test split doubles as the evaluation set reported after each epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.202,0.225547
2,0.1059,0.231359
3,0.0146,0.216045


TrainOutput(global_step=240, training_loss=0.17217737541844447, metrics={'train_runtime': 37.5253, 'train_samples_per_second': 102.251, 'train_steps_per_second': 6.396, 'total_flos': 89345638238040.0, 'train_loss': 0.17217737541844447, 'epoch': 3.0})

# 9. Evaluate the Model

In [14]:
# Evaluate on the eval dataset given to the Trainer. No metric function was
# configured, so the result holds the loss plus runtime/throughput figures.
evaluation_results = trainer.evaluate()
print("Evaluation Results:", evaluation_results)

Evaluation Results: {'eval_loss': 0.21604549884796143, 'eval_runtime': 0.766, 'eval_samples_per_second': 417.73, 'eval_steps_per_second': 26.108, 'epoch': 3.0}


# Evaluation Metrics and Confusion Matrix

In [23]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import torch
from transformers import Trainer

# Assuming `test_dataset` contains ground truth labels for the test data
# and `model`, `tokenizer`, and `device` are already defined.

def evaluate_model(test_dataset, model, tokenizer, device, batch_size=16):
    """
    Evaluate the model and print accuracy, F1 score, precision, recall and the confusion matrix.

    The model is run directly on the token ids already stored in the dataset.
    Decoding them back to text and tokenizing again is avoided because it is
    lossy (special tokens such as [UNK] are dropped) and does redundant work.

    :param test_dataset: Dataset object exposing ``encodings`` (feature name ->
        per-example lists of equal length) and ``labels``.
    :param model: Trained Hugging Face model.
    :param tokenizer: Unused; kept so existing calls with this argument keep working.
    :param device: Device (e.g., 'cuda' or 'cpu').
    :param batch_size: Number of examples per forward pass.
    :return: None. The metrics are printed.
    """
    # Extract test labels
    true_labels = [int(label) for label in test_dataset.labels]

    # Collect predictions with dropout disabled and without tracking gradients
    model = model.to(device)
    model.eval()
    predictions = []
    with torch.no_grad():
        for start in range(0, len(test_dataset), batch_size):  # Batch processing
            batch = {
                key: torch.tensor(values[start: start + batch_size]).to(device)
                for key, values in test_dataset.encodings.items()
            }
            logits = model(**batch).logits
            predictions.extend(torch.argmax(logits, dim=1).tolist())

    # Calculate metrics
    acc = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions)
    precision = precision_score(true_labels, predictions)
    recall = recall_score(true_labels, predictions)
    conf_matrix = confusion_matrix(true_labels, predictions)

    # Print the metrics
    print("Accuracy:", acc)
    print("F1 Score:", f1)
    print("Precision:", precision)
    print("Recall:", recall)
    print("Confusion Matrix:\n", conf_matrix)

# Determine the device here so this cell also works on a fresh top-to-bottom run
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Call the evaluation function
evaluate_model(test_dataset, model, tokenizer, device)


Accuracy: 0.946875
F1 Score: 0.9216589861751152
Precision: 0.9523809523809523
Recall: 0.8928571428571429
Confusion Matrix:
 [[203   5]
 [ 12 100]]


# 10. Make Predictions

In [21]:
# Two hand-written example tweets used as a quick check of the prediction function.
new_tweets = [
    "I want to end it all, there's no point in living.",
    "I am so happy to share my life with friends and family."
]

In [22]:
def predict_suicide(tweets, model, tokenizer, device):
    """
    Predict whether tweets are potential suicide posts or not.

    The model is switched to evaluation mode before the forward pass so that
    dropout is disabled and repeated calls give the same predictions.

    :param tweets: List of tweets (strings).
    :param model: Pre-trained Hugging Face model.
    :param tokenizer: Corresponding tokenizer for the model.
    :param device: Device to run the model (e.g., 'cuda' or 'cpu').
    :return: List of predictions ("Potential Suicide post" or "Not Suicide post"),
        one per tweet and in the same order; empty when ``tweets`` is empty.
    """
    # Nothing to classify: skip the tokenizer and model calls entirely.
    if isinstance(tweets, (list, tuple)) and not tweets:
        return []

    # Tokenize the tweets
    encodings = tokenizer(tweets, truncation=True, padding=True, max_length=128, return_tensors="pt")

    # Move encodings to the correct device
    encodings = {key: val.to(device) for key, val in encodings.items()}

    # Move model to the same device and disable training-only layers (dropout)
    model = model.to(device)
    model.eval()

    # Get model predictions
    with torch.no_grad():
        outputs = model(**encodings)
        predictions = torch.argmax(outputs.logits, dim=1)

    # Map predicted class indices to their textual labels
    sentiment_map = {0: "Not Suicide post", 1: "Potential Suicide post"}
    return [sentiment_map[pred.item()] for pred in predictions]

# Determine the device (use GPU if available)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Classify the example tweets and print the predicted labels
predicted_sentiments = predict_suicide(new_tweets, model, tokenizer, device)
print("Predicted Sentiments:", predicted_sentiments)


Predicted Sentiments: ['Potential Suicide post', 'Not Suicide post']


# Predict labels for new tweets in batches

In [25]:
def predict_suicide_in_batches(tweets, model, tokenizer, device, batch_size=16):
    """
    Predict whether tweets are potential suicide posts or not in batches.

    The model is switched to evaluation mode first so that dropout is disabled
    and repeated calls give the same predictions.

    :param tweets: List of tweets (strings).
    :param model: Pre-trained Hugging Face model.
    :param tokenizer: Corresponding tokenizer for the model.
    :param device: Device to run the model (e.g., 'cuda' or 'cpu').
    :param batch_size: Number of tweets to process in each batch (must be >= 1).
    :return: List of predictions ("Potential Suicide post" or "Not Suicide post"),
        one per tweet and in the same order.
    :raises ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    # Move model to the correct device and disable training-only layers (dropout)
    model = model.to(device)
    model.eval()

    # Class index -> textual label; built once because it is the same for every batch
    sentiment_map = {0: "Not Suicide post", 1: "Potential Suicide post"}

    all_predictions = []

    for i in range(0, len(tweets), batch_size):
        batch_tweets = tweets[i:i + batch_size]

        # Tokenize the batch
        encodings = tokenizer(batch_tweets, truncation=True, padding=True, max_length=128, return_tensors="pt")

        # Move encodings to the correct device
        encodings = {key: val.to(device) for key, val in encodings.items()}

        # Get model predictions
        with torch.no_grad():
            outputs = model(**encodings)
            predictions = torch.argmax(outputs.logits, dim=1)

        # Map predictions to labels, keeping the input order
        all_predictions.extend(sentiment_map[pred.item()] for pred in predictions)

    return all_predictions

# Example tweets to classify in batches (replace or extend with your own list).
# This rebinds `new_tweets`, replacing the two-item list defined earlier.
new_tweets = [
    "I want to end it all, there's no point in living.",
    "I am so happy to share my life with friends and family.",
        "You're the only one who can see this cause no one else is following me this is for you because you're pretty awesome",

   "&lt;---Sad level is 3. I was writing a massive blog tweet on Myspace and my comp shut down. Now it's all lost *lays in fetal position*",

   "...  Headed to Hospitol : Had to pull out of the Golf Tourny in 3rd place!!!!!!!!!!! I Think I Re-Ripped something !!! Yeah THAT !!",

   "BoRinG   ): whats wrong with him??     Please tell me........   :-/",

   "can't be bothered. i wish i could spend the rest of my life just sat here and going to gigs. seriously.",

   "Feeeling like shit right now. I really want to sleep, but nooo I have 3 hours of dancing and an art assignment to finish.",

   "goodbye exams, HELLO ALCOHOL TONIGHT ",

   "I didn't realize it was THAT deep. Geez give a girl a warning atleast!",

   "I hate it when any athlete appears to tear an ACL on live television.",

   "i miss you guys too     i think i'm wearing skinny jeans a cute sweater and heels   not really sure   what are you doing today",

  "-- Meet your Meat http://bit.ly/15SSCI",
    # Add more tweets here...
]   # 13 example tweets; nothing is replicated here

# Pick the GPU when torch can see one, otherwise stay on the CPU
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")

# Classify every tweet in the list, 16 at a time
predicted_sentiments = predict_suicide_in_batches(
    new_tweets, model, tokenizer, device, batch_size=16
)

# Show each tweet next to its predicted label
for idx, (tweet, prediction) in enumerate(zip(new_tweets, predicted_sentiments)):
    print(f"Tweet {idx + 1}: {tweet}")
    print(f"Prediction: {prediction}")
    print()



Tweet 1: I want to end it all, there's no point in living.
Prediction: Potential Suicide post

Tweet 2: I am so happy to share my life with friends and family.
Prediction: Not Suicide post

Tweet 3: You're the only one who can see this cause no one else is following me this is for you because you're pretty awesome
Prediction: Not Suicide post

Tweet 4: &lt;---Sad level is 3. I was writing a massive blog tweet on Myspace and my comp shut down. Now it's all lost *lays in fetal position*
Prediction: Potential Suicide post

Tweet 5: ...  Headed to Hospitol : Had to pull out of the Golf Tourny in 3rd place!!!!!!!!!!! I Think I Re-Ripped something !!! Yeah THAT !!
Prediction: Not Suicide post

Tweet 6: BoRinG   ): whats wrong with him??     Please tell me........   :-/
Prediction: Not Suicide post

Tweet 7: can't be bothered. i wish i could spend the rest of my life just sat here and going to gigs. seriously.
Prediction: Not Suicide post

Tweet 8: Feeeling like shit right now. I really want 

# **For Future Use**

In [None]:
# Persist the fine-tuned model and its tokenizer side by side in one directory
SAVE_DIR = '/content/sentiment_model'
model.save_pretrained(SAVE_DIR)  # Model weights and config
tokenizer.save_pretrained(SAVE_DIR)  # Tokenizer files


('/content/sentiment_model/tokenizer_config.json',
 '/content/sentiment_model/special_tokens_map.json',
 '/content/sentiment_model/vocab.txt',
 '/content/sentiment_model/added_tokens.json',
 '/content/sentiment_model/tokenizer.json')