Clone github repo to have access to data files.

In [39]:
!git clone https://github.com/Hananxx/SentimentAnalysisPromptExp.git

fatal: destination path 'SentimentAnalysisPromptExp' already exists and is not an empty directory.


### Install and import needed packages

In [90]:
!pip install transformers torch pandas accelerate



In [91]:
import pandas as pd
import json
from transformers import pipeline, AutoTokenizer
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support



### Set filepath prefix

In [92]:
filepath_prefix = "SentimentAnalysisPromptExp/"

### Load testing dataset

In [93]:
df = pd.read_csv(filepath_prefix + 'data/app-test.csv')
print(df.head())  # Inspect the first few rows

                                            sentence  label
0                  its nice this apps is must lovely      1
1        this is really good this app is really good      1
2  ? freezes and force closes a lot on droid incr...      2
3  favorite i use this application every day is v...      1
4  ? probally the biggest flop ever. as soon as y...      2


### Load templates and set the used template

In [94]:
with open(filepath_prefix + 'prompts/zero-shot-prompt-template.json', 'r') as f:
    templates = json.load(f)
print(templates)
prompt = templates['vicuna-0']

{'vicuna-0': "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nUSER: Please perform Sentiment Classification task. Given the sentence from {}, assign a sentiment label from ['negative', 'neutral', 'positive']. Return label only without any other text.\nASSISTANT: Sure!</s>\nUSER: Sentence: {}\nASSISTANT:", 'vicuna-jira-0': "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.\nUSER: Please perform Sentiment Classification task. Given the sentence from {}, assign a sentiment label from ['negative', 'positive']. Return label only without any other text.\nASSISTANT: Sure!</s>\nUSER: Sentence: {}\nASSISTANT:", 'llama2-0': "<s>[INST] <<SYS>>\nA chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's q

### Load sentences

In [95]:
sentences = df['sentence'].tolist() # For test purposes only the first 50 sentences.

### Extract sentiment labels from different responses

In [96]:
def extract_sentiment_label(response):
    labels = ['positive', 'negative', 'neutral']
    for label in labels:
        if label in response:
            return label
    return None  # if no label found

### Load tokenizer and model

In [97]:
models = ['lmsys/vicuna-13b-v1.5', 'WizardLM/WizardLM-13B-V1.2', 'meta-llama/Llama-2-13b-chat-hf']
def load_model(model_name):
    model_output = []
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # create a text generation pipeline
    model_pipeline = pipeline(
        'text-generation',
        model=model_name,
        tokenizer=tokenizer,
        dtype=torch.float16,
        device_map='auto',
    )

    for sentence in sentences:
        full_prompt = prompt.format("APP reviews", sentence)
        output = model_pipeline(
            full_prompt,
            max_new_tokens=1024,
            # max_new_tokens= 512,
            temperature=0.7,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
            return_full_text=False
        )
        response = output[0]['generated_text'].strip()  # Extract just the label.
        model_output.append(extract_sentiment_label(response.lower()))

    return model_output

### Post process metrics

In [98]:
labels = { 0: 'neutral', 1: 'positive', 2: 'negative'}
def get_data_frame(model_output):
      return pd.DataFrame({
        'text': df['sentence'].tolist(),
        'true_label': [labels[label_num] for label_num in df['label'].tolist()],
        'pred_label': model_output
    })

def calc_precision_recall_fscore(df, average='weighted'):
  return precision_recall_fscore_support(
    df['true_label'], df['pred_label'], average=average
)

#### Weighted metrics

In [99]:
def weighted_metrics(data_frame):
    accuracy = accuracy_score(data_frame['true_label'], data_frame['pred_label'])
    precision, recall, f1, _ = calc_precision_recall_fscore(data_frame, average='weighted')
    print(f"Accuracy: {accuracy:.2f}")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1-Score: {f1:.2f}")

#### Micro and Macro metrics

In [100]:
def macro_metrics(df):
    precision_macro, recall_macro, f1_macro, _ = calc_precision_recall_fscore(
        df, average='macro'
    )
    print(f"Macro Precision: {precision_macro:.2f}")
    print(f"Macro Recall: {recall_macro:.2f}")
    print(f"Macro F1-Score: {f1_macro:.2f}")

def micro_metrics(df):
    precision_micro, recall_micro, f1_micro, _ = calc_precision_recall_fscore(
    df, average='micro'
    )
    print(f"Micro Precision: {precision_micro:.2f}")
    print(f"Micro Recall: {recall_micro:.2f}")
    print(f"Micro F1-Score: {f1_micro:.2f}")

### Start analysis - Vicuna model

In [101]:
vicuna_output = load_model(models[0])
print('=====( Vicuna model metrics )=====')
vicuna_data_frame = get_data_frame(vicuna_output)
weighted_metrics(vicuna_data_frame)
print('=' * 20)
macro_metrics(vicuna_data_frame)
print('=' * 20)
micro_metrics(vicuna_data_frame)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Device set to use cuda:0


=====( Vicuna model metrics )=====
Accuracy: 0.91
Precision: 0.90
Recall: 0.91
F1-Score: 0.91
Macro Precision: 0.79
Macro Recall: 0.75
Macro F1-Score: 0.77
Micro Precision: 0.91
Micro Recall: 0.91
Micro F1-Score: 0.91


### Analysis - WizardLM model

In [102]:
wizardlm_output = load_model(models[1])
print('=====( WizardLM model metrics )=====')
wizardlm_data_frame = get_data_frame(wizardlm_output)
weighted_metrics(wizardlm_data_frame)
print('=' * 20)
macro_metrics(wizardlm_data_frame)
print('=' * 20)
micro_metrics(wizardlm_data_frame)

Device set to use cuda:0


=====( WizardLM model metrics )=====
Accuracy: 0.86
Precision: 0.88
Recall: 0.86
F1-Score: 0.85
Macro Precision: 0.83
Macro Recall: 0.79
Macro F1-Score: 0.80
Micro Precision: 0.86
Micro Recall: 0.86
Micro F1-Score: 0.86
