## Import Libraries

In [None]:
!pip install --upgrade huggingface_hub
!pip install datasets

In [None]:
from transformers import pipeline
import pandas as pd
from sklearn.metrics import confusion_matrix

from datasets import load_dataset, Dataset

## Load Dataset

In [None]:
### START OF CODE FROM EXTERNAL SOURCE (URL: https://huggingface.co/datasets/florentgbelidji/car-reviews)
# Load the car-reviews dataset from the Hugging Face Hub. The cells below
# access it by split name (ds['train']).
ds = load_dataset("florentgbelidji/car-reviews")
### END OF CODE FROM EXTERNAL SOURCE (URL: https://huggingface.co/datasets/florentgbelidji/car-reviews)

## EDA

In [None]:
# Display one record (index 10) of the train split to see which fields it has.
ds['train'][10]

In [None]:
# Pull the 'Review' column out of the train split.
reviews = ds['train']['Review']

In [None]:
# Display the review text at index 10 (same record as inspected above).
reviews[10]

## Model 1 - Transformers

### Balanced Sampling

In [None]:
# Materialize the train split as a pandas DataFrame so it can be grouped and
# sampled by 'Rating' in the following cells.
df = pd.DataFrame(ds['train'])

In [None]:
# Number of reviews in the rarest rating class; used below as the per-class
# sample size when balancing the data.
rating_counts = df['Rating'].value_counts()
min_samples = rating_counts.min()
print(min_samples)

In [None]:
# Down-sample every rating class to `min_samples` rows so all classes are
# equally represented. GroupBy.sample keeps every column (including 'Rating')
# and the original row labels, and the fixed seed (the same 97 used for the
# inspection samples later in the notebook) makes the balanced subset, and
# therefore every downstream result, reproducible across runs.
balanced_df = df.groupby('Rating').sample(n=min_samples, random_state=97)

In [None]:
# Convert the balanced DataFrame back into a Hugging Face Dataset; the
# pipelines below read its 'Review' column.
sampled_dataset = Dataset.from_pandas(balanced_df)

### Filtering Negative Feedback

#### Sentiment Analysis Model 1

In [None]:
### START OF CODE FROM EXTERNAL SOURCE (URL:https://huggingface.co/learn/nlp-course/chapter1/3?fw=pt)
# Sentiment pipeline with no explicit model, so the library's default
# sentiment-analysis checkpoint is used. device=0 places it on GPU 0, so this
# cell needs a CUDA-capable runtime.
sentiment_classifier = pipeline("sentiment-analysis", device=0)

In [None]:
# Classify every balanced review. truncation=True lets over-long reviews be
# cut to the model's input limit instead of raising. The cells below read
# 'label' and 'score' from each returned item.
SA = sentiment_classifier(sampled_dataset['Review'], truncation=True)
### END OF CODE FROM EXTERNAL SOURCE (URL:https://huggingface.co/learn/nlp-course/chapter1/3?fw=pt)

In [None]:
# Display the raw Model 1 predictions.
SA

In [None]:
# Keep only the reviews that Model 1 labels NEGATIVE with a confidence of at
# least 0.8; these feed the zero-shot topic classification further down.
negative_reviews = [
    review_text
    for review_text, prediction in zip(sampled_dataset['Review'], SA)
    if prediction['label'] == 'NEGATIVE' and prediction['score'] >= 0.8
]

In [None]:
# Display the reviews Model 1 flagged as confidently negative.
negative_reviews

#### Sentiment Analysis Model 2

In [None]:
### START OF CODE FROM EXTERNAL SOURCE (URL:https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest)
# Second sentiment model: a RoBERTa checkpoint from cardiffnlp, run on GPU 0
# in batches of 4. The evaluation cell further down handles a third,
# non-positive/non-negative label from this model in its `else` branch.
sentiment_classifier_2 = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest", batch_size=4, device=0)

In [None]:
# Classify every balanced review with Model 2, truncating inputs to at most
# 512 tokens (max_length=512).
SA2 = sentiment_classifier_2(sampled_dataset['Review'], truncation=True, max_length=512)
### END OF CODE FROM EXTERNAL SOURCE (URL:https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment-latest)

In [None]:
# Display the raw Model 2 predictions.
SA2

In [None]:
# Keep only the reviews that Model 2 labels negative with a confidence of at
# least 0.8.
#
# The cardiffnlp checkpoint does not emit upper-case labels (its config maps
# the classes to 'negative' / 'neutral' / 'positive'), so an exact comparison
# with 'NEGATIVE' never matches and the list would always be empty. Compare
# case-insensitively instead.
negative_reviews2 = [
    review
    for review, result in zip(sampled_dataset['Review'], SA2)
    if result['label'].upper() == 'NEGATIVE' and result['score'] >= 0.8
]

In [None]:
# Display the reviews Model 2 flagged as confidently negative.
negative_reviews2

### Zero-Shot Classification

In [None]:
### START OF CODE FROM EXTERNAL SOURCE (URL:https://huggingface.co/facebook/bart-large-mnli)
# Zero-shot topic classifier backed by facebook/bart-large-mnli, run on GPU 0
# in batches of 16.
zeroshot_classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli", device=0, batch_size=16)

In [None]:
# Complaint topics offered to the zero-shot classifier. Labels are passed to
# the model and echoed back verbatim in the results, so they must carry no
# stray whitespace ("efficiency " previously had a trailing space, which would
# surface as a distinct key when grouping by category).
candidate_labels = [
    "performance",
    "efficiency",
    "safety",
    "comfort",
    "design",
    "price",
    "service",
]

In [None]:
# Score every confidently-negative review (from Model 1) against all candidate
# topics. multi_label=True asks the pipeline to score each topic independently
# rather than as mutually exclusive classes.
ZC = zeroshot_classifier(negative_reviews, candidate_labels, truncation=True, multi_label=True)
### END OF CODE FROM EXTERNAL SOURCE (URL:https://huggingface.co/facebook/bart-large-mnli)

In [None]:
# Display the raw zero-shot classification results.
ZC

In [None]:
# For each negative review keep the first and second entries of the
# classifier's 'labels' / 'scores' lists (presumably ranked best-first by the
# pipeline -- confirm against the zero-shot pipeline docs) and tabulate them
# next to the review text.
categories1 = [ranked['labels'][0] for ranked in ZC]
categories2 = [ranked['labels'][1] for ranked in ZC]
scores1 = [ranked['scores'][0] for ranked in ZC]
scores2 = [ranked['scores'][1] for ranked in ZC]

negative_columns = {
    "Text": negative_reviews,
    "Category1": categories1,
    "Score1": scores1,
    "Category2": categories2,
    "Score2": scores2,
}
negative_df = pd.DataFrame(negative_columns)
negative_df

In [None]:
# Per first-choice category: how many reviews landed there and the mean
# score the classifier gave them.
grouped1 = negative_df.groupby('Category1')
result1 = grouped1.agg(
    Count=pd.NamedAgg(column='Category1', aggfunc='size'),
    Average=pd.NamedAgg(column='Score1', aggfunc='mean'),
)
result1

In [None]:
# Per second-choice category: how many reviews landed there and the mean
# score the classifier gave them.
grouped2 = negative_df.groupby('Category2')
result2 = grouped2.agg(
    Count=pd.NamedAgg(column='Category2', aggfunc='size'),
    Average=pd.NamedAgg(column='Score2', aggfunc='mean'),
)
result2

## Evaluation

### Evaluation - Sentiment Analysis Model 1

In [None]:
# Range of Model 1 confidence scores; the rating mapping in the next cell is
# built around this range.
scores = [prediction['score'] for prediction in SA]
highest_score = max(scores)
lowest_score = min(scores)
print(highest_score, lowest_score)

In [None]:
# Turn each Model 1 prediction into a 1-5 star rating:
#   NEGATIVE -> 5 - 4 * score  (more confident => closer to 1 star)
#   otherwise -> 4 * score + 1 (more confident => closer to 5 stars)
# rounded to the nearest integer.
predictions = [
    round(5 - result['score'] * 4)
    if result['label'] == 'NEGATIVE'
    else round(result['score'] * 4 + 1)
    for result in SA
]
# Ground-truth ratings, in the same row order as the predictions.
label = balanced_df['Rating']

In [None]:
# Confusion matrix of true ratings (first argument) against the ratings
# derived from Model 1 (second argument).
matrix = confusion_matrix(label, predictions)
matrix

In [None]:
# Tabulate each balanced review next to Model 1's label and confidence so
# individual predictions can be inspected by eye.
SA_label = [prediction['label'] for prediction in SA]
SA_score = [prediction['score'] for prediction in SA]

sa_columns = {
    "Text": balanced_df['Review'],
    "Label": SA_label,
    "Score": SA_score,
}
SA_df = pd.DataFrame(sa_columns)
SA_df

In [None]:
# Draw 20 rows of Model 1 output for manual review; the fixed random_state
# keeps the same rows across runs for a given SA_df.
sentiment_sample = SA_df.sample(n=20, random_state=97)
sentiment_sample

### Evaluation - Sentiment Analysis Model 2

In [None]:
# Range of Model 2 confidence scores.
# NOTE: this rebinds `scores2`, which the zero-shot cell earlier used for the
# second-ranked category scores; run the cells top to bottom.
scores2 = [prediction['score'] for prediction in SA2]
highest_score2 = max(scores2)
lowest_score2 = min(scores2)
print(highest_score2, lowest_score2)

In [None]:
# Turn each Model 2 prediction into a 1-5 star rating.
#
# The cardiffnlp checkpoint does not emit upper-case labels (its config maps
# the classes to 'negative' / 'neutral' / 'positive'), so exact comparisons
# with 'NEGATIVE' / 'POSITIVE' never matched and every prediction fell through
# to the neutral branch. Normalize the label's case before branching.
predictions2 = []
for result in SA2:
    score2 = result['score']
    label2 = result['label'].upper()
    if label2 == 'NEGATIVE':
        # More confident negative => closer to 1 star.
        rating2 = round(5 - score2 * 4)
    elif label2 == 'POSITIVE':
        # More confident positive => closer to 5 stars.
        rating2 = round(score2 * 4 + 1)
    else:
        # Neutral: rescales a score in [0.3, 1.0] onto [1, 5].
        # NOTE(review): a highly confident *neutral* prediction therefore maps
        # to 5 stars; a fixed mid-scale rating (3) may be what is intended.
        # Left unchanged pending confirmation.
        rating2 = round((score2 - 0.3) / 0.7 * 4 + 1)
    predictions2.append(rating2)
# Ground-truth ratings, in the same row order as the predictions.
label2 = balanced_df['Rating']

In [None]:
# Confusion matrix of true ratings (first argument) against the ratings
# derived from Model 2 (second argument).
matrix2 = confusion_matrix(label2, predictions2)
matrix2

In [None]:
# Tabulate each balanced review next to Model 2's label and confidence so
# individual predictions can be inspected by eye.
SA_label2 = [prediction['label'] for prediction in SA2]
SA_scores2 = [prediction['score'] for prediction in SA2]

sa2_columns = {
    "Text": balanced_df['Review'],
    "Label": SA_label2,
    "Score": SA_scores2,
}
SA_df2 = pd.DataFrame(sa2_columns)
SA_df2

In [None]:
# Draw 20 rows of Model 2 output for manual review; the fixed random_state
# keeps the same rows across runs for a given SA_df2.
sentiment_sample2 = SA_df2.sample(n=20, random_state=97)
sentiment_sample2

### Evaluation - Zero-Shot Classification

In [None]:
# Draw 20 zero-shot results for manual review of the assigned categories; the
# fixed random_state keeps the same rows across runs for a given negative_df.
negative_df_sample = negative_df.sample(n=20, random_state=97)
negative_df_sample