<a href="https://colab.research.google.com/github/MarcPal08/2024-bracciano-iris/blob/main/sentiment_analysis_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load dataset

Create a secret variable named "gh_username" with your github/huggingface username

In [48]:
from google.colab import userdata
username = userdata.get('gh_username')
print(username)

MarcPal08


In [49]:
!pip install transformers datasets



In [50]:
from datasets import load_dataset

# Change the username with yours
dataset = load_dataset(f"{username}/sentiment-analysis-test")


# Tokenize

In [51]:
# Add label column to dataset

label2id = {"negative": 0, "neutral": 1, "positive": 2}

def add_label_column(examples):
    examples["label"] = label2id[examples["sentiment"]]
    return examples

dataset = dataset.map(add_label_column)
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment', 'label'],
        num_rows: 279
    })
    test: Dataset({
        features: ['text', 'sentiment', 'label'],
        num_rows: 70
    })
})


In [52]:
print(dataset["train"][0])
print(dataset["test"][0])

{'text': 'i professori, lo spazio interno ed esterno della struttura con il giardino', 'sentiment': 'positive', 'label': 2}
{'text': 'il 10 che ho preso ad educazione fisica', 'sentiment': 'positive', 'label': 2}


In [53]:
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/xlm-roberta-base-tweet-sentiment-it")

def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
print(tokenized_dataset)


DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment', 'label', 'input_ids', 'attention_mask'],
        num_rows: 279
    })
    test: Dataset({
        features: ['text', 'sentiment', 'label', 'input_ids', 'attention_mask'],
        num_rows: 70
    })
})


In [54]:
print(tokenized_dataset["train"][0])
print(tokenized_dataset["test"][0])

{'text': 'i professori, lo spazio interno ed esterno della struttura con il giardino', 'sentiment': 'positive', 'label': 2, 'input_ids': [0, 17, 16030, 14, 4, 459, 40481, 46188, 2223, 6, 127585, 832, 37778, 158, 211, 107300, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'text': 'il 10 che ho preso ad educazione fisica', 'sentiment': 'positive', 'label': 2, 'input_ids': [0, 211, 209, 290, 739, 29377, 606, 6, 199466, 130198, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


# Fine tuning a pre-trained model

In [55]:
from transformers import AutoModelForSequenceClassification

checkpoint = "cardiffnlp/xlm-roberta-base-tweet-sentiment-it"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [56]:
print(model)

XLMRobertaForSequenceClassification(
  (roberta): XLMRobertaModel(
    (embeddings): XLMRobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): XLMRobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x XLMRobertaLayer(
          (attention): XLMRobertaAttention(
            (self): XLMRobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): XLMRobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=

In [57]:
print(model.config)

XLMRobertaConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "XLMRobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "negative",
    "1": "neutral",
    "2": "positive"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 0,
    "neutral": 1,
    "positive": 2
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "xlm-roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}



In [58]:
# Create training parameters

from transformers import TrainingArguments

training_args = TrainingArguments("sentiment-analysis-test")



In [59]:
# Create trainer

from transformers import Trainer

trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator
)

In [60]:
# Train the model

trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmarcopalumbo068[0m ([33mmarcopalumbo068-liceo-scientifico-statale-ignazio-vian[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


TrainOutput(global_step=105, training_loss=0.6555173601422991, metrics={'train_runtime': 527.805, 'train_samples_per_second': 1.586, 'train_steps_per_second': 0.199, 'total_flos': 19028444593032.0, 'train_loss': 0.6555173601422991, 'epoch': 3.0})

In [61]:
# Push the new model to the hub
trainer.push_to_hub()


events.out.tfevents.1746623264.544c5f084b7a.986.0:   0%|          | 0.00/5.25k [00:00<?, ?B/s]

events.out.tfevents.1746626026.544c5f084b7a.986.2:   0%|          | 0.00/5.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

events.out.tfevents.1746625476.544c5f084b7a.986.1:   0%|          | 0.00/15.9k [00:00<?, ?B/s]

Upload 7 LFS files:   0%|          | 0/7 [00:00<?, ?it/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/5.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/MarcPal08/sentiment-analysis-test/commit/139267e835f77fa7690591542abe0bc3065454d2', commit_message='End of training', commit_description='', oid='139267e835f77fa7690591542abe0bc3065454d2', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MarcPal08/sentiment-analysis-test', endpoint='https://huggingface.co', repo_type='model', repo_id='MarcPal08/sentiment-analysis-test'), pr_revision=None, pr_num=None)

# Evaluate!

In [62]:
from transformers import pipeline, AutoModelForSequenceClassification

# Load and test the original model
sentiment_pipeline_orig = pipeline("sentiment-analysis", model="cardiffnlp/xlm-roberta-base-tweet-sentiment-it")

prompt = "Personale docente"
print(f"Sentiment for prompt {prompt}:")
print(sentiment_pipeline_orig(prompt))


prompt = "Interrogazioni a sorpresa"
print(f"Sentiment for prompt {prompt}:")
print(sentiment_pipeline_orig(prompt))

Device set to use cuda:0


Sentiment for prompt Personale docente:
[{'label': 'neutral', 'score': 0.9373170733451843}]
Sentiment for prompt Interrogazioni a sorpresa:
[{'label': 'negative', 'score': 0.9806037545204163}]


In [63]:
from transformers import pipeline, AutoModelForSequenceClassification

# Load and test our model
sentiment_pipeline = pipeline("sentiment-analysis", model=f"{username}/sentiment-analysis-test")

prompt = "Personale docente"

print(f"Sentiment for prompt {prompt}:")
print(sentiment_pipeline(prompt))

prompt = "Interrogazioni a sorpresa"
print(f"Sentiment for prompt {prompt}:")
print(sentiment_pipeline(prompt))

config.json:   0%|          | 0.00/889 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Device set to use cuda:0


Sentiment for prompt Personale docente:
[{'label': 'positive', 'score': 0.9237983226776123}]
Sentiment for prompt Interrogazioni a sorpresa:
[{'label': 'negative', 'score': 0.7684286832809448}]


# Gradio interface

In [64]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.29.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.0 (from gradio)
  Downloading gradio_client-1.10.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.8-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6

In [65]:
import gradio as gr

# Load and test our model
sentiment_pipeline = pipeline("sentiment-analysis", model=f"{username}/sentiment-analysis-test")

def analyze_sentiment(text):
    result = sentiment_pipeline(text)
    label = result[0]['label']
    score = result[0]['score']
    return f"Label: {label}, Score: {score}"

iface = gr.Interface(
    fn=analyze_sentiment,
    inputs=gr.Textbox(label="Prompt", lines=2, placeholder="Scrivi qui qualcosa sulla tua scuola..."),
    outputs=gr.Textbox(label="Sentiment Analysis Result"),
    title="Sentiment Analysis for VIAN",
    description="Analizza i sentimenti riguardo alla tua scuola con un modello fine-tuned",
)

iface.launch()


Device set to use cuda:0


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6e94f7b67a060c1da7.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


