Main resources:
- https://brighteshun.medium.com/sentiment-analysis-part-1-finetuning-and-hosting-a-text-classification-model-on-huggingface-9d6da6fd856b

In [1]:
#Import Libraries
import pandas as pd
import numpy as np
from scipy.special import softmax
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

#finetuning
from transformers import pipeline, AutoTokenizer, TrainingArguments, AutoModelForSequenceClassification, Trainer, DistilBertTokenizerFast, DefaultDataCollator

In [15]:
base_data = pd.read_csv("base.csv")
base_data

Unnamed: 0,Year,Cleaned_Review,is_negative_sentiment
0,2016,Gold Coast to Bangkok via Singapore with Scoot...,1
1,2016,My Scoot flight from Melbourne to Singapore wa...,0
2,2016,Flew back from Amritsar to Singapore on 19th S...,1
3,2016,$500 round trip from Tokyo to Taipei for a fam...,1
4,2016,"Overall excellent service from Scoot, however ...",0
...,...,...,...
1594,2012,SYD-OOL. Arrived at the airport on time. Fligh...,1
1595,2012,Cairns-Sydney-Phuket in Business class. Was no...,1
1596,2012,Had the misfortune of flying Business class fr...,1
1597,2012,I have just flown from Melbourne to Sydney and...,1


In [5]:
test1_data = pd.read_csv("time_period1.csv")
test1_data

Unnamed: 0,Year,Cleaned_Review,is_negative_sentiment
0,2019,Our trip to Athens this month certainly met ou...,0
1,2019,"Singapore to Sydney. For the most part, this a...",0
2,2019,Coimbatore to Singapore. Friendly Check-in sta...,1
3,2019,Singapore to Jakarta. There's no space to put ...,1
4,2019,Wuhan to Singapore. I am very disappointed tha...,1
...,...,...,...
1383,2017,Jetstar Airways exceeded expectations and one ...,0
1384,2017,Sydney to Melbourne with Jetstar Airways. I bo...,0
1385,2017,Adelaide to Denpasar. My first experience with...,1
1386,2017,Flew Hobart to Sydney direct yesterday. Booked...,0


In [6]:
test2_data = pd.read_csv("time_period2.csv")
test2_data

Unnamed: 0,Year,Cleaned_Review,is_negative_sentiment
0,2024,My flight to Singapore was uneventful as usual...,1
1,2024,Using super old plane. Aircon was blowing warm...,1
2,2024,"I travelled with my sister, my elderly parent ...",0
3,2024,Was assigned last two row at seat 39F with the...,0
4,2024,Very rude male flight attendant. Accessed the ...,1
...,...,...,...
1125,2020,Sydney to Launceston. For an extra two kilos i...,1
1126,2020,Denpasar to Melbourne. I had a really disappoi...,1
1127,2020,"Sydney to Launceston. Yet again, Jetstar canno...",1
1128,2020,Honolulu to Sydney. I flew with Jetstar for my...,1


In [7]:
sample = test2_data.iloc[5]
sample_txt, sample_label = sample['Cleaned_Review'], sample['is_negative_sentiment']
print(sample_txt)
print(sample_label)

Upon check-in to Phuket airport, I could see the flight had been delayed. When I advised ground staff that this would mean I would most likely miss my connecting flight in Singapore (dep 1am) through to Melbourne which would cause onward delays (including an unplanned overnight stay), and could they assist by allowing me to move to the front of the plane, or communicate with Singapore ground staff, they advised nothing could be done and that I could sort it at Singapore Changi airport. As it was, the plane was further delayed and the connecting flight was well and truly missed. As I was seated at the very back of the plane, & I could not exit from the rear, I was last off the plane, which meant I was at the back of a very long line which took 2 hours (1am - 3am Singapore time) for me to arrive at the counter in Singapore. I was told I would be on a flight at 11am to MEL. I explained that the arrival time into MEL 2130 meant that I could not get my connections across Victoria into South

In [8]:
def load_model(model_path):
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return model, tokenizer

In [9]:
output_dir = "baseline_pipeline_sentiments_analysis_distilbert"
model, tokenizer = load_model(output_dir)

In [10]:
# Process the input text and return sentiment prediction
def is_negative_sentiment_score(text):
    encoded_input = tokenizer(text, truncation=True, return_tensors="pt")  # for PyTorch-based models
    output = model(**encoded_input)
    scores_ = output[0][0].detach().numpy()
    scores_ = softmax(scores_)

    # Format output dictionary of scores
    labels = ["Negative", "Positive"]
    scores = {l: float(s) for (l, s) in zip(labels, scores_)}
    return scores.get("Negative", 0.0)

In [11]:
def predict(test_data_df): 
    th = 0.5

    test_data_df['negative_sentiment_score'] = test_data_df['Cleaned_Review'].apply(is_negative_sentiment_score)
    test_data_df['sentiment_acc'] = test_data_df['negative_sentiment_score'].apply(lambda score: 1 if score >= th else 0)
    return test_data_df

In [12]:
print(sample_txt)
print(sample_label)
is_negative_sentiment_score(sample_txt)

Upon check-in to Phuket airport, I could see the flight had been delayed. When I advised ground staff that this would mean I would most likely miss my connecting flight in Singapore (dep 1am) through to Melbourne which would cause onward delays (including an unplanned overnight stay), and could they assist by allowing me to move to the front of the plane, or communicate with Singapore ground staff, they advised nothing could be done and that I could sort it at Singapore Changi airport. As it was, the plane was further delayed and the connecting flight was well and truly missed. As I was seated at the very back of the plane, & I could not exit from the rear, I was last off the plane, which meant I was at the back of a very long line which took 2 hours (1am - 3am Singapore time) for me to arrive at the counter in Singapore. I was told I would be on a flight at 11am to MEL. I explained that the arrival time into MEL 2130 meant that I could not get my connections across Victoria into South

0.9997226595878601

In [13]:
def model_inference(test_data):
    TH = 0.5

    test_data_df = predict(test_data)
    test_data_df['sentiment_acc'] = test_data_df['negative_sentiment_score'].apply(lambda score: 1 if score >= TH else 0)

    pred_labels = test_data_df['sentiment_acc']
    true_labels = test_data_df['is_negative_sentiment']

    confusion_matrix_result = confusion_matrix(pred_labels, true_labels)
    accuracy = accuracy_score(true_labels, pred_labels)

    return test_data_df, confusion_matrix_result, accuracy

In [16]:
output_df, confusion_matrix_result, accuracy = model_inference(base_data)

In [17]:
print(confusion_matrix_result)
print(accuracy)

output_df.head()

[[641  27]
 [278 653]]
0.809255784865541


Unnamed: 0,Year,Cleaned_Review,is_negative_sentiment,negative_sentiment_score,sentiment_acc
0,2016,Gold Coast to Bangkok via Singapore with Scoot...,1,0.996442,1
1,2016,My Scoot flight from Melbourne to Singapore wa...,0,0.996511,1
2,2016,Flew back from Amritsar to Singapore on 19th S...,1,0.999636,1
3,2016,$500 round trip from Tokyo to Taipei for a fam...,1,0.998273,1
4,2016,"Overall excellent service from Scoot, however ...",0,0.330671,0


In [18]:
test1_output, test1_confusion_matrix_result, test1_accuracy = model_inference(test1_data)

In [19]:
print(test1_confusion_matrix_result)
print(test1_accuracy)

test1_output.head()

[[307  13]
 [128 940]]
0.8984149855907781


Unnamed: 0,Year,Cleaned_Review,is_negative_sentiment,negative_sentiment_score,sentiment_acc
0,2019,Our trip to Athens this month certainly met ou...,0,0.001487,0
1,2019,"Singapore to Sydney. For the most part, this a...",0,0.996364,1
2,2019,Coimbatore to Singapore. Friendly Check-in sta...,1,0.992684,1
3,2019,Singapore to Jakarta. There's no space to put ...,1,0.998239,1
4,2019,Wuhan to Singapore. I am very disappointed tha...,1,0.999063,1


In [20]:
test2_output, test2_confusion_matrix_result, test2_accuracy = model_inference(test2_data)

In [21]:
print(test2_confusion_matrix_result)
print(test2_accuracy)

test2_output.head()

[[153  12]
 [ 56 909]]
0.9398230088495575


Unnamed: 0,Year,Cleaned_Review,is_negative_sentiment,negative_sentiment_score,sentiment_acc
0,2024,My flight to Singapore was uneventful as usual...,1,0.999516,1
1,2024,Using super old plane. Aircon was blowing warm...,1,0.998806,1
2,2024,"I travelled with my sister, my elderly parent ...",0,0.001175,0
3,2024,Was assigned last two row at seat 39F with the...,0,0.008209,0
4,2024,Very rude male flight attendant. Accessed the ...,1,0.999674,1


In [22]:
# Save the DataFrame to a CSV file named 'test2_output.csv'
test2_output.to_csv('baseline_test2_output.csv', index=False)