<a href="https://colab.research.google.com/github/MarcPal08/2024-bracciano-iris/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create dataset

In [6]:
!pip install transformers datasets



In [8]:
from datasets import load_dataset

In [9]:
# CSV exports that make up the v01 data drop. Loaded together they yield the
# `text` and `sentiment` columns of the dataset.
v01_files = [
    "v01/m00 - m00-1.csv",
    "v01/m00 - m00-2.csv",
    "v01/m00 - m00-3.csv",
    "v01/m00 - m00-4.csv",
    "v01/m01 - m01-1.csv",
    "v01/m02 - m02-1.csv",
    "v01/m03 - m03-1.csv",
    "v01/m04 - m04-1.csv",
]

# The generic "csv" builder reads the flat list of files into a single
# "train" split.
dataset = load_dataset("csv", data_files=v01_files)

Generating train split: 0 examples [00:00, ? examples/s]

In [10]:
# Display the loaded DatasetDict to check its splits, columns and row count.
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment'],
        num_rows: 349
    })
})

In [12]:
# Draw ten rows in a reproducible random order (fixed shuffle seed) and print
# the first three as a quick sanity check of the texts and their labels.
sample = (
    dataset["train"]
    .shuffle(seed=40)
    .select(range(10))
)
print(sample[:3])

{'text': ['Sono importanti perché utili alle competenze lavorative necessarie', 'Ottima la dimensione didattica, mediocre la dimensione organizzativa e di coordinamento.', 'la scuola mi sta aiutando a crescere formalmente e mi sta insegnando cose importanti per la mia vita'], 'sentiment': ['positive', 'positive', 'positive']}


In [13]:
# Keep 80% of the rows for training and hold out the rest as a test split;
# the fixed seed makes the partition reproducible across runs.
dataset_clean = dataset["train"].train_test_split(
    train_size=0.8,
    seed=42,
)

In [14]:
# Display the resulting DatasetDict to confirm the train/test row counts.
dataset_clean

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment'],
        num_rows: 279
    })
    test: Dataset({
        features: ['text', 'sentiment'],
        num_rows: 70
    })
})

In [15]:
# Persist the train/test split locally; the tokenization section reloads it
# from this same path with load_from_disk.
dataset_clean.save_to_disk('v01/dataset_clean')

Saving the dataset (0/1 shards):   0%|          | 0/279 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/70 [00:00<?, ? examples/s]

# Upload dataset

In [20]:
# Authenticate against the Hugging Face Hub through the interactive notebook
# widget, so no access token is written into the notebook source.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [21]:
# Upload both splits to the Hugging Face Hub. The repo id has no namespace, so
# the dataset lands under the account that is currently logged in.
# NOTE(review): the `text` column looks like free-form survey answers; confirm
# the repository visibility is intended before publishing.
dataset_clean.push_to_hub("sentiment-analysis-test")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/MarcPal08/sentiment-analysis-test/commit/28afeaae0f512fea971a3f517948c323c621f74c', commit_message='Upload dataset', commit_description='', oid='28afeaae0f512fea971a3f517948c323c621f74c', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/MarcPal08/sentiment-analysis-test', endpoint='https://huggingface.co', repo_type='dataset', repo_id='MarcPal08/sentiment-analysis-test'), pr_revision=None, pr_num=None)

# Tokenize it!

In [22]:
from transformers import AutoTokenizer
from datasets import load_from_disk

# Reload the train/test split that was written with save_to_disk.
# NOTE: this rebinds `dataset`, which earlier in the notebook held the raw
# CSV load (a single "train" split).
dataset = load_from_disk("v01/dataset_clean")

# Load the tokenizer that belongs to the MilaNLProc/feel-it-italian-sentiment
# checkpoint.
tokenizer = AutoTokenizer.from_pretrained("MilaNLProc/feel-it-italian-sentiment")



tokenizer_config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/847 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/794k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.68M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/299 [00:00<?, ?B/s]

In [23]:
def tokenize_function(examples, max_length: int = 128):
    """Tokenize the ``text`` field of a batch with the notebook's ``tokenizer``.

    Intended to be passed to ``Dataset.map(..., batched=True)``. To use a
    different length limit, forward it through ``map`` with
    ``fn_kwargs={"max_length": ...}``.

    Args:
        examples: Mapping with a ``"text"`` entry holding the strings to
            tokenize (a batch of rows when used with ``batched=True``).
        max_length: Maximum number of tokens kept per text; longer inputs are
            truncated. Defaults to 128, the value previously hard-coded.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch. No
        padding is requested here, so sequences keep their own lengths.
    """
    return tokenizer(examples["text"], truncation=True, max_length=max_length)

In [29]:
# Apply the tokenizer to every split; batched=True passes groups of rows to
# tokenize_function in one call instead of one row at a time.
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
)
print(tokenized_dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 279
    })
    test: Dataset({
        features: ['text', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 70
    })
})


In [31]:
# Take ten tokenized training rows in a reproducible random order and show
# the first three, to eyeball the input_ids / attention_mask columns.
samples = (
    tokenized_dataset["train"]
    .shuffle(seed=40)
    .select(range(10))
)
samples[:3]

{'text': ['cogestione, dibattiti sugli argomenti trattati,progetti ',
  'Sicuramente la mancanza di comprensione e empatia da parte di alcuni professori che mettono verifiche su verifiche la stessa settimana ',
  'essere bocciato o essere rimandato a settembre'],
 'sentiment': ['positive', 'negative', 'negative'],
 'input_ids': [[5,
   253,
   23414,
   31914,
   21624,
   3634,
   6542,
   6505,
   31914,
   1686,
   23756,
   31897,
   6],
  [5,
   10852,
   51,
   4874,
   21,
   8604,
   26,
   30826,
   95,
   352,
   21,
   1076,
   17265,
   58,
   9048,
   15009,
   78,
   15009,
   51,
   1458,
   1989,
   31897,
   6],
  [5, 321, 1139, 8478, 73, 321, 2658, 1203, 13, 1889, 6]],
 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}