Main resources:
- https://brighteshun.medium.com/sentiment-analysis-part-1-finetuning-and-hosting-a-text-classification-model-on-huggingface-9d6da6fd856b

# 1. Install dependencies

In [1]:
# !pip install -q transformers

In [2]:
#Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import csv

#finetuning
from datasets import load_dataset
from transformers import pipeline, AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer, DistilBertTokenizerFast, DefaultDataCollator

# 2. Run sentiment analysis predictions by using Pipeline

Data: https://www.airlinequality.com/airline-reviews/scoot/

In [3]:
sample_data = 'I travelled with my sister, my elderly parent and my toddler son taking Scoot from Haikou to Singapore. The journey was a pleasant one despite the episode of my child became very unwell, alternating between drowsy and cranky in the mid of the journey. We sought the crew for help. They are very professional, helpful and friendly. They checked us out first, discussed with their captain and then prep us on the travel duration and preparation on ground. Besides, they offered ice pack/wet kitchen towels along the way. Nearing to the destination, they even moved me and my son to the front row. The flight arrived 20 minutes earlier. After that, they connected me and my son with the ground crew Firliza who accompanied us to airport clinic for medical treatment.'

In [4]:
sentiment_pipeline = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

In [5]:
data = [sample_data]
analysis = sentiment_pipeline(data)

analysis

[{'label': 'POSITIVE', 'score': 0.9988247752189636}]

# 3. Fine tuning on custom dataset

In [6]:
def remove_non_utf8(input_file, output_file, replace_char=''):
  with open(input_file, 'r') as f:
    content = f.read()
  cleaned_content = ''.join(char if char.isascii() else replace_char for char in content)
  with open(output_file, 'w') as f:
    f.write(cleaned_content)


def text_to_csv(input_file, output_file, delimiter=','):
  """
  Converts a text file to a CSV file, using the specified delimiter.

  Args:
      input_file (str): Path to the input text file.
      output_file (str): Path to the output CSV file.
      delimiter (str, optional): Delimiter separating data in the text file. Defaults to ','.
  """

  with open(input_file, 'r') as input_file_handle, open(output_file, 'w', newline='') as output_csv:
    reader = csv.reader(input_file_handle, delimiter=delimiter)
    writer = csv.writer(output_csv)
    # Assuming the first line contains headers (optional)
    headers = next(reader)  # Read and store the header row (if present)
    writer.writerow(headers)  # Write the header row to the CSV

    for row in reader:
      writer.writerow(row)

In [7]:
# Example usage
data_path = 'base.csv'
cleaned_txt_path = 'cleaned.txt'
cleaned_csv_path = 'cleaned.csv'

remove_non_utf8(data_path, cleaned_txt_path) 
text_to_csv(cleaned_txt_path, cleaned_csv_path)

In [8]:
df = pd.read_csv(cleaned_csv_path)

# Rename 'old_name' to 'new_name'
df = df.rename(columns={'is_negative_sentiment': 'label'})

df.head()

Unnamed: 0,Year,Cleaned_Review,label
0,2016,Gold Coast to Bangkok via Singapore with Scoot...,1
1,2016,My Scoot flight from Melbourne to Singapore wa...,0
2,2016,Flew back from Amritsar to Singapore on 19th S...,1
3,2016,$500 round trip from Tokyo to Taipei for a fam...,1
4,2016,"Overall excellent service from Scoot, however ...",0


In [9]:
# Split the train data => {train, eval}
train, eval = train_test_split(df, test_size=0.2, random_state=42, stratify=df['label'])

In [10]:
# Save splitted subsets

df.to_csv("train.csv", index=False)
eval.to_csv("eval.csv", index=False)

In [11]:
dataset = load_dataset('csv',
                        data_files =
                        {'train': 'train.csv',
                        'eval': 'eval.csv'
                        }
                      )

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [12]:
#Create a tokenizer instance
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

In [13]:
#create a function to convert label
def transform_labels(label):
    return {'labels': label['label']}

# let's tokenize the data for the model to be able to understand
def tokenize_data(example):
    return tokenizer(example['Cleaned_Review'], truncation=True, padding='max_length')

# Transform labels and remove the useless columns
remove_label = ['label']
remove_text = ['Cleaned_Review']

dataset = dataset.map(transform_labels, remove_columns=remove_label)
dataset = dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

Map:   0%|          | 0/1599 [00:00<?, ? examples/s]

Map:   0%|          | 0/320 [00:00<?, ? examples/s]

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Year', 'Cleaned_Review', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1599
    })
    eval: Dataset({
        features: ['Year', 'Cleaned_Review', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 320
    })
})

In [15]:
# Loading a pretrain model while specifying the number of labels in our dataset for fine-tuning
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)

In [16]:
# freeze all layers except the final classifier
for param in model.distilbert.parameters():
    param.requires_grad = False

for param in model.pre_classifier.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

In [17]:
output_dir = "finetune_sentiments_analysis_2"
#set the training arguments
training_args = TrainingArguments(
                            num_train_epochs=1,
                            evaluation_strategy='epoch',
                            save_strategy='epoch',
                            learning_rate=2e-5,
                            load_best_model_at_end=True,
                            output_dir=output_dir
                            )

In [18]:
# shuffle the datasets

train_dataset = dataset['train'].shuffle(seed=10) 
eval_dataset = dataset['eval'].shuffle(seed=10)

In [19]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset
)


In [20]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Year, Cleaned_Review. If Year, Cleaned_Review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1599
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 200
  Number of trainable parameters = 1538


  0%|          | 0/200 [00:00<?, ?it/s]

The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: Year, Cleaned_Review. If Year, Cleaned_Review are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 320
  Batch size = 8


  0%|          | 0/40 [00:00<?, ?it/s]

Saving model checkpoint to finetune_sentiments_analysis_2\checkpoint-200
Configuration saved in finetune_sentiments_analysis_2\checkpoint-200\config.json


{'eval_loss': 4.3411736488342285, 'eval_runtime': 230.8135, 'eval_samples_per_second': 1.386, 'eval_steps_per_second': 0.173, 'epoch': 1.0}


Model weights saved in finetune_sentiments_analysis_2\checkpoint-200\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from finetune_sentiments_analysis_2\checkpoint-200 (score: 4.3411736488342285).


{'train_runtime': 2014.3307, 'train_samples_per_second': 0.794, 'train_steps_per_second': 0.099, 'train_loss': 4.2719595336914065, 'epoch': 1.0}


TrainOutput(global_step=200, training_loss=4.2719595336914065, metrics={'train_runtime': 2014.3307, 'train_samples_per_second': 0.794, 'train_steps_per_second': 0.099, 'train_loss': 4.2719595336914065, 'epoch': 1.0})

In [28]:
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model saved successfully to local directory:", output_dir)

Configuration saved in finetune_sentiments_analysis_2\config.json
Model weights saved in finetune_sentiments_analysis_2\pytorch_model.bin
tokenizer config file saved in finetune_sentiments_analysis_2\tokenizer_config.json
Special tokens file saved in finetune_sentiments_analysis_2\special_tokens_map.json


Model saved successfully to local directory: finetune_sentiments_analysis_2


In [29]:
def load_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return model, tokenizer

# Part 5: Inference Pipeline

In [30]:
from scipy.special import softmax

In [31]:
model, tokenizer = load_model(output_dir)

loading file vocab.txt
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file finetune_sentiments_analysis_2\config.json
Model config DistilBertConfig {
  "_name_or_path": "finetune_sentiments_analysis_2",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "finetuning_task": "sst-2",
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transfor

In [32]:
# Process the input text and return sentiment prediction
def sentiment_analysis(text):
    # text = preprocess(text)
    encoded_input = tokenizer(text, return_tensors="pt")  # for PyTorch-based models
    output = model(**encoded_input)
    scores_ = output[0][0].detach().numpy()
    scores_ = softmax(scores_)

    # Format output dictionary of scores
    labels = ["Negative", "Positive"]
    scores = {l: float(s) for (l, s) in zip(labels, scores_)}
    return scores

In [33]:
sample_text = 'Scoot is awesome'

sentiment_analysis(sample_text)

{'Negative': 0.00037257905933074653, 'Positive': 0.9996273517608643}

In [34]:
sample_text = 'Scoot is horrible'

sentiment_analysis(sample_text)

{'Negative': 0.9993104934692383, 'Positive': 0.0006894702091813087}