In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv('/content/Stress.csv')

# Check
print(df.head())

# Features and labels
X = df['text']
y = df['label']

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


          subreddit post_id sentence_range  \
0              ptsd  8601tu       (15, 20)   
1        assistance  8lbrx9         (0, 5)   
2              ptsd  9ch1zh       (15, 20)   
3     relationships  7rorpp        [5, 10]   
4  survivorsofabuse  9p2gbc         [0, 5]   

                                                text  label  confidence  \
0  He said he had not felt that way before, sugge...      1         0.8   
1  Hey there r/assistance, Not sure if this is th...      0         1.0   
2  My mom then hit me with the newspaper and it s...      1         0.8   
3  until i met my new boyfriend, he is amazing, h...      1         0.6   
4  October is Domestic Violence Awareness Month a...      1         0.8   

   social_timestamp  
0        1521614353  
1        1527009817  
2        1535935605  
3        1516429555  
4        1539809005  


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Convert text to TF-IDF features
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train SVM
svm_model = SVC(kernel='linear', C=1)
svm_model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred = svm_model.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


SVM Accuracy: 0.7676056338028169
              precision    recall  f1-score   support

           0       0.76      0.74      0.75       270
           1       0.77      0.79      0.78       298

    accuracy                           0.77       568
   macro avg       0.77      0.77      0.77       568
weighted avg       0.77      0.77      0.77       568



In [9]:
pip install transformers tensorflow




In [10]:
from transformers import DistilBertTokenizerFast
import tensorflow as tf

# Initialize tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# Tokenize the text
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True, max_length=128)

# Convert to tf.data.Dataset
train_dataset = tf.data.Dataset.from_tensor_slices((
    dict(train_encodings),
    list(y_train)
)).shuffle(1000).batch(16)

test_dataset = tf.data.Dataset.from_tensor_slices((
    dict(test_encodings),
    list(y_test)
)).batch(16)


In [11]:
pip install transformers==4.57.2




In [12]:
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

# ----------------------
# Tokenizer
# ----------------------
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased", use_auth_token=False)

# Encode your text data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True)
test_encodings = tokenizer(list(X_test), truncation=True, padding=True)

# ----------------------
# Dataset class
# ----------------------
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = Dataset(train_encodings, list(y_train))
test_dataset = Dataset(test_encodings, list(y_test))

# ----------------------
# Model
# ----------------------
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

# ----------------------
# Training arguments
# ----------------------
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,  # evaluate at end of each epoch
    save_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10
)

# ----------------------
# Trainer
# ----------------------
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# ----------------------
# Train
# ----------------------
trainer.train()

# ----------------------
# Evaluate
# ----------------------
trainer.evaluate()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmanasabr0708[0m ([33mmanasabr0708-maharaja-institute-of-technology-mysore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
10,0.667
20,0.5566
30,0.5662
40,0.4804
50,0.5126
60,0.4146
70,0.4405
80,0.428
90,0.4552
100,0.5276


{'eval_loss': 0.6690376400947571,
 'eval_runtime': 5.7218,
 'eval_samples_per_second': 99.27,
 'eval_steps_per_second': 6.292,
 'epoch': 3.0}

In [13]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import os

# Disable wandb completely
os.environ["WANDB_DISABLED"] = "true"

# Get predictions only
pred = trainer.predict(test_dataset)

# Convert logits to labels
y_pred = np.argmax(pred.predictions, axis=1)
y_true = np.array(y_test)

# Accuracy
acc = accuracy_score(y_true, y_pred)
print("DistilBERT Test Accuracy:", acc)

print("\nClassification Report:")
print(classification_report(y_true, y_pred))


DistilBERT Test Accuracy: 0.8151408450704225

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.79      0.80       270
           1       0.81      0.84      0.83       298

    accuracy                           0.82       568
   macro avg       0.82      0.81      0.81       568
weighted avg       0.82      0.82      0.81       568



In [14]:
from transformers import AutoTokenizer

save_path = "distilbert_stress_model"

model.save_pretrained(save_path)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenizer.save_pretrained(save_path)

print("Model saved to:", save_path)


Model saved to: distilbert_stress_model


In [15]:
import shutil

shutil.make_archive("distilbert_stress_model", "zip", "distilbert_stress_model")


'/content/distilbert_stress_model.zip'

In [16]:
from google.colab import files
files.download("distilbert_stress_model.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>