In [1]:
!pip install wandb



In [2]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer
from sklearn.preprocessing import LabelEncoder
import torch
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AdamW, get_scheduler
from tqdm.auto import tqdm
from transformers import TrainingArguments, Trainer
import wandb
import numpy as np

# Authenticate with Weights & Biases.
# The API key is a secret and must never appear in notebook source. It is read
# from the WANDB_API_KEY environment variable instead; when that variable is
# unset, `key=None` lets wandb fall back to its own credential lookup
# (already-configured credentials or an interactive prompt).
# NOTE(review): any key that was ever committed in this notebook must be
# treated as leaked and revoked/rotated in the W&B account settings.
import os

wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Read the labelled posts from the Kaggle input CSV.
data = pd.read_csv('/kaggle/input/dianostic-final-dataset/final_dataset.csv')

# Report the dtype of every column, followed by one blank line.
print("Data types of each column in the dataset:", data.dtypes, "", sep="\n")

# Swap the values of the 'subreddit' column for integer class ids.
# The fitted encoder is kept so ids can be mapped back through
# `label_encoder.classes_` later on.
label_encoder = LabelEncoder()
label_encoder.fit(data['subreddit'])
data['subreddit'] = label_encoder.transform(data['subreddit'])

# Tokenizer belonging to the multilingual cased BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

# Tokenize function
def tokenize_function(examples):
    """Tokenize the 'post' entry of a batch of examples.

    The texts are handed to the module-level `tokenizer` with
    ``padding='max_length'`` and ``truncation=True``.

    Args:
        examples: Mapping that contains a 'post' entry with the text(s)
            to tokenize.

    Returns:
        The value returned by `tokenizer` for those texts.
    """
    posts = examples['post']
    encoded = tokenizer(posts, padding='max_length', truncation=True)
    return encoded

# Convert the pandas DataFrame into a Hugging Face Dataset.
dataset = Dataset.from_pandas(data)

# Tokenize all posts; batched=True passes groups of rows to tokenize_function.
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Expose the integer class ids from 'subreddit' under the column name "labels".
tokenized_datasets = tokenized_datasets.add_column("labels", data['subreddit'].tolist())

# Hold out 20% of the rows for evaluation.
# The seed is fixed so the train/eval split (and therefore the reported
# metrics) is the same on every Restart & Run All instead of changing per run.
train_test_split = tokenized_datasets.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Class-id <-> class-name mappings taken from the fitted LabelEncoder.
# Storing them in the model config means a saved/exported copy of the model
# carries its own label names and does not depend on the in-memory encoder.
id2label = {class_id: str(class_name) for class_id, class_name in enumerate(label_encoder.classes_)}
label2id = {class_name: class_id for class_id, class_name in id2label.items()}

# Load model for sequence classification, with one output per encoded class.
model = AutoModelForSequenceClassification.from_pretrained(
    "google-bert/bert-base-multilingual-cased",
    num_labels=len(label_encoder.classes_),
    id2label=id2label,
    label2id=label2id,
)

# Number of passes over the training set; read by TrainingArguments below.
num_epochs = 3

# NOTE: no manual DataLoader, optimizer or LR scheduler is created here.
# The Trainer in this cell is constructed without an `optimizers=` argument,
# so it builds its own batches, optimizer and schedule from TrainingArguments
# (batch size 8, warmup_steps=500, weight_decay=0.01). The hand-built
# DataLoaders / AdamW / get_scheduler objects that used to live here were never
# passed to the Trainer and contradicted those settings (e.g. zero warmup), so
# they were removed as dead code.

# Install and import the evaluate module
!pip install evaluate
import evaluate

# Define metric
metric = evaluate.load("accuracy")

# Compute metrics function
def compute_metrics(eval_pred):
    """Score evaluation predictions with the module-level `metric`.

    Args:
        eval_pred: An object that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max class of each row of
        logits (taken over the last axis) against the reference labels.
    """
    raw_scores, gold_labels = eval_pred
    # The predicted class is the index of the largest score on the last axis.
    predicted_ids = np.argmax(raw_scores, axis=-1)
    return metric.compute(references=gold_labels, predictions=predicted_ids)

# Training configuration handed to the Trainer.
training_args = TrainingArguments(
    # Output locations.
    output_dir="/kaggle/working/",
    logging_dir="/kaggle/working/logs",
    # Run length and per-device batch sizes.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Optimisation settings.
    warmup_steps=500,
    weight_decay=0.01,
    # Evaluate and log on a step basis, every 1000 steps.
    evaluation_strategy="steps",
    eval_steps=1000,
    logging_steps=1000,
)

# Wire the model, the training configuration, both dataset splits and the
# metric callback into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Fine-tune the model (the run is tracked with WandB).
trainer.train()

# Close the WandB run once training has returned.
wandb.finish()


2024-07-07 10:53:10.132581: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-07 10:53:10.132694: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-07 10:53:10.299382: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Data types of each column in the dataset:
subreddit    object
post         object
dtype: object



tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Map:   0%|          | 0/26852 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

[34m[1mwandb[0m: Currently logged in as: [33msiddharth8shukla8[0m ([33msid48[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.4 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.17.0
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240707_105359-c38koy1i[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33m/kaggle/working/[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/sid48/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/sid48/huggingface/runs/c38koy1i[0m


Step,Training Loss,Validation Loss,Accuracy
1000,0.6453,0.441686,0.851052
2000,0.3781,0.421737,0.859616
3000,0.2911,0.429228,0.878235
4000,0.2113,0.408211,0.886613


[34m[1mwandb[0m:                                                                                
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run history:
[34m[1mwandb[0m:           eval/accuracy ▁▃▆█
[34m[1mwandb[0m:               eval/loss █▄▅▁
[34m[1mwandb[0m:            eval/runtime ▃▂▁█
[34m[1mwandb[0m: eval/samples_per_second ▆▇█▁
[34m[1mwandb[0m:   eval/steps_per_second ▆▇█▁
[34m[1mwandb[0m:             train/epoch ▁▁▃▃▆▆███
[34m[1mwandb[0m:       train/global_step ▁▁▃▃▆▆███
[34m[1mwandb[0m:         train/grad_norm ▃█▃▁
[34m[1mwandb[0m:     train/learning_rate █▆▃▁
[34m[1mwandb[0m:              train/loss █▄▂▁
[34m[1mwandb[0m: 
[34m[1mwandb[0m: Run summary:
[34m[1mwandb[0m:            eval/accuracy 0.88661
[34m[1mwandb[0m:                eval/loss 0.40821
[34m[1mwandb[0m:             eval/runtime 104.8933
[34m[1mwandb[0m:  eval/samples_per_second 51.204
[34m[1mwandb[0m:    eval/steps_per_second 3.203
[34m[1mwandb[0m:          

In [3]:
# import pandas as pd
# from datasets import Dataset
# from transformers import AutoTokenizer
# from sklearn.preprocessing import LabelEncoder
# import torch
# from torch.utils.data import DataLoader
# from transformers import AutoModelForSequenceClassification, AdamW, get_scheduler
# from tqdm.auto import tqdm
# from transformers import TrainingArguments, Trainer

# # Load your dataset from CSV
# data = pd.read_csv('/kaggle/input/dianostic-final-dataset/final_dataset.csv')

# # Print data types of each column in the dataset
# print("Data types of each column in the dataset:")
# print(data.dtypes)
# print()

# # Encode labels as integers
# label_encoder = LabelEncoder()
# data['subreddit'] = label_encoder.fit_transform(data['subreddit'])

# # Load BERT tokenizer
# tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

# # Tokenize function
# def tokenize_function(examples):
#     return tokenizer(examples['post'], padding='max_length', truncation=True)

# # Convert Pandas DataFrame to Dataset
# dataset = Dataset.from_pandas(data)

# # Tokenize dataset
# tokenized_datasets = dataset.map(tokenize_function, batched=True)

# # Add labels to the tokenized dataset
# tokenized_datasets = tokenized_datasets.add_column("labels", data['subreddit'].tolist())

# # Split into train and test datasets
# train_test_split = tokenized_datasets.train_test_split(test_size=0.2)
# train_dataset = train_test_split['train']
# eval_dataset = train_test_split['test']

# # Define DataLoader
# train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=8)
# eval_dataloader = DataLoader(eval_dataset, batch_size=8)

# # Load model for sequence classification
# model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-base-multilingual-cased", num_labels=len(label_encoder.classes_))

# # Define optimizer
# optimizer = AdamW(model.parameters(), lr=5e-5)

# # Define learning rate scheduler
# num_epochs = 3
# num_training_steps = num_epochs * len(train_dataloader)
# lr_scheduler = get_scheduler(
#     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
# )

# # Install and import the evaluate module
# !pip install evaluate
# import evaluate

# # Define metric
# metric = evaluate.load("accuracy")

# # Compute metrics function
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="/kaggle/working/",
#     evaluation_strategy="epoch",
#     num_train_epochs=num_epochs,
#     per_device_train_batch_size=8,
#     per_device_eval_batch_size=8,
#     warmup_steps=500,
#     weight_decay=0.01,
#     logging_dir='/kaggle/working/logs',
# )

# # Initialize Trainer
# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=train_dataset,
#     eval_dataset=eval_dataset,
#     compute_metrics=compute_metrics,
# )

# # Train the model
# trainer.train()

In [4]:
# Write the model and the tokenizer files into the same directory so both can
# be reloaded from that one path.
saved_model_dir = "/kaggle/working/saved_model"
model.save_pretrained(saved_model_dir)
tokenizer.save_pretrained(saved_model_dir)

('/kaggle/working/saved_model/tokenizer_config.json',
 '/kaggle/working/saved_model/special_tokens_map.json',
 '/kaggle/working/saved_model/vocab.txt',
 '/kaggle/working/saved_model/added_tokens.json',
 '/kaggle/working/saved_model/tokenizer.json')

In [5]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import torch
import numpy as np

# Load the model and tokenizer back from the directory they were saved to.
model_path = "/kaggle/working/saved_model"
print(f"Loading model from: {model_path}")
model = AutoModelForSequenceClassification.from_pretrained(model_path)
print("Model loaded successfully.")

tokenizer = AutoTokenizer.from_pretrained(model_path)
print("Tokenizer loaded successfully.")

# Example usage:
input_text = "Losing it. Lately I haven’t felt like a real person. And I don’t know if that’s the right way to describe it. I’m always in my head, always thinking and overthinking but never ever about anything that actually matters. Nothing excites me anymore, I can’t focus on anything for too long. All I do is sleep, I am exhausted 99% of the time. When I’m not at work I’ll sometimes sleep for literally my entire off day. I can’t keep up with chores. I have a hard time showering/brushing my teeth which makes me feel so disgusting. I hate my job. I’m getting very bored in my relationship even though I love him to death. I feel very alone because he doesn’t get it and we go through the same motions and conversations every day. Basically I don’t have any plans or directions for my life, everything scares me, and I have no idea what to do. Nothing seems to help and no one seems to understand. I just downloaded this app and decided to post here because I have no one to talk to and I am so scared."
print(f"Input text: {input_text}")

# truncation=True matches how the training data was tokenized and keeps a very
# long post from exceeding the model's maximum input length.
inputs = tokenizer(input_text, return_tensors="pt", truncation=True)
print("Tokenized input:", inputs)

# Inference only: make sure dropout is off and skip gradient tracking.
model.eval()
with torch.no_grad():
    outputs = model(**inputs)
print("Model outputs:", outputs)

# Pick the class with the highest logit.
logits = outputs.logits
predictions = torch.argmax(logits, dim=-1)
predicted_class_index = predictions.item()

# Decode the predicted class id with the LabelEncoder fitted in the training
# cell (that cell must have been run in this kernel session).
predicted_class = label_encoder.classes_[predicted_class_index]

print("Predicted Class:", predicted_class)


Loading model from: /kaggle/working/saved_model
Model loaded successfully.
Tokenizer loaded successfully.
Input text: Losing it. Lately I haven’t felt like a real person. And I don’t know if that’s the right way to describe it. I’m always in my head, always thinking and overthinking but never ever about anything that actually matters. Nothing excites me anymore, I can’t focus on anything for too long. All I do is sleep, I am exhausted 99% of the time. When I’m not at work I’ll sometimes sleep for literally my entire off day. I can’t keep up with chores. I have a hard time showering/brushing my teeth which makes me feel so disgusting. I hate my job. I’m getting very bored in my relationship even though I love him to death. I feel very alone because he doesn’t get it and we go through the same motions and conversations every day. Basically I don’t have any plans or directions for my life, everything scares me, and I have no idea what to do. Nothing seems to help and no one seems to under

In [6]:
# Recursively zip the saved model directory into file.zip. The archive path is
# relative, so file.zip is written to the current working directory.
!zip -r file.zip /kaggle/working/saved_model

  pid, fd = os.forkpty()
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/saved_model/ (stored 0%)
  adding: kaggle/working/saved_model/vocab.txt (deflated 45%)
  adding: kaggle/working/saved_model/model.safetensors (deflated 7%)
  adding: kaggle/working/saved_model/tokenizer.json (deflated 67%)
  adding: kaggle/working/saved_model/special_tokens_map.json (deflated 42%)
  adding: kaggle/working/saved_model/config.json (deflated 55%)
  adding: kaggle/working/saved_model/tokenizer_config.json (deflated 76%)


In [7]:
from IPython.display import FileLink
# Show a link to the zip archive as this cell's output.
FileLink("file.zip")

In [8]:
!pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Package                                  Version
---------------------------------------- ------------------
absl-py                                  1.4.0
accelerate                               0.30.1
access                                   1.1.9
affine                                   2.4.0
aiobotocore                              2.13.0
aiofiles                                 22.1.0
aiohttp                                  3.9.1
aioitertools                             0.11.0
aiorwlock                                1.3.0
aiosignal                                1.3.1
aiosqlite                                0.19.0
albumentations                           1.4.0
alembic                                  1.13.1
altair                                   5.3.0
annotated-types                          0.6.0
annoy                                    1.17.3
anyio                                    4.2.0
apache-beam                              2.46.0
aplus            