In [None]:
!pip install transformers google-generativeai



In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [None]:
# Hugging Face Hub id of the checkpoint handed to the zero-shot classification pipeline.
model_name = "facebook/bart-large-mnli"

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset paths used further down live under it.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch

In [None]:
# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
import os
from getpass import getpass

# SECURITY: a Gemini API key used to be pasted here in plain text. Any key that
# was ever saved in this notebook must be considered leaked: revoke it and
# create a new one.
# The key is now taken from the GEMINI_API_KEY environment variable; when that
# is unset or empty the user is prompted for it (input is not echoed).
GEMINI_APIKEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ")

In [None]:
from transformers import pipeline
import google.generativeai as genai

class DataInference:
  """Aspect-based sentiment extraction for free-text reviews.

  A zero-shot classification pipeline scores a review against a fixed list of
  aspects; for every aspect whose score passes a threshold, a Gemini model is
  asked for the opinion phrase tied to that aspect, and the same zero-shot
  classifier then labels that phrase as Positive / Negative / Neutral.
  """

  # Candidate aspect labels scored against every review.
  ASPECT_LABELS = ("Price fairness", "Cleanliness", "Facilities", "Staff quality", "Convenience",
                   "Punctuality", "Accessibility", "Safety", "Data availability")

  # Candidate sentiment labels scored against every extracted opinion.
  SENTIMENT_LABELS = ("Positive", "Negative", "Neutral")

  # Gemini harm categories whose threshold is set to BLOCK_NONE.
  HARM_CATEGORIES = ("HARM_CATEGORY_HARASSMENT", "HARM_CATEGORY_HATE_SPEECH",
                     "HARM_CATEGORY_SEXUALLY_EXPLICIT", "HARM_CATEGORY_DANGEROUS_CONTENT")

  def __init__(self, model_name, device):
    """Create the zero-shot classifier and the Gemini client.

    Args:
      model_name: checkpoint name passed to the `transformers` pipeline.
      device: device the classifier should run on (e.g. "cuda" or "cpu").

    Reads the module-level GEMINI_APIKEY to configure `google.generativeai`.
    """
    self.model_name = model_name
    self.device = device
    self.task = "zero-shot-classification"
    # Fix: the device used to be stored but never given to the pipeline, so the
    # caller's choice of device had no effect on where the model ran.
    self.classifier = pipeline(self.task, model=model_name, device=device)
    self.gemini_name = "gemini-1.5-flash"
    genai.configure(api_key=GEMINI_APIKEY)

    # One BLOCK_NONE entry per harm category.
    safety_setting = [
        {"category": category, "threshold": "BLOCK_NONE"}
        for category in self.HARM_CATEGORIES
    ]

    self.llm = genai.GenerativeModel(self.gemini_name, safety_settings=safety_setting)

  def extract_aspect(self, sentence):
    """Score `sentence` against ASPECT_LABELS and return the classifier's raw result."""
    return self.classifier(sentence, list(self.ASPECT_LABELS))

  def get_sentiment_based_on_aspect(self, sentence, aspect):
    """Score `sentence` against SENTIMENT_LABELS and return the classifier's raw result.

    NOTE(review): `aspect` is accepted but not used by the classification; it
    is kept so existing callers keep working.
    """
    return self.classifier(sentence, list(self.SENTIMENT_LABELS))

  def llm_prediction(self, prompt):
    """Send `prompt` to the Gemini model and return the text of its response.

    Raises:
      ValueError: on any failure while generating or reading the response; the
        original exception is attached as `__cause__`.
    """
    try:
      response = self.llm.generate_content(prompt)
      return response.text
    except Exception as e:
      print(e)
      raise ValueError(e) from e

  def inference_pipeline(self, review, threshold=0.22):
    """Run aspect detection, opinion extraction and sentiment labelling on one review.

    Args:
      review: the review text.
      threshold: an aspect is kept only when its score is strictly greater
        than this value (default 0.22, the value previously hard-coded).

    Returns:
      A list with one dict per kept aspect, each holding the keys "aspect",
      "sentiment" and "opinion". Empty when no aspect passes the threshold.

    Raises:
      ValueError: propagated from `llm_prediction`.
    """
    r1 = self.extract_aspect(review) # dict read here through its "labels" and "scores" keys
    kept_aspects = [
        label for label, score in zip(r1["labels"], r1["scores"]) if score > threshold
    ]

    rs = []

    for aspect in kept_aspects:
      # Fix: the backslash is escaped so the model reads the two characters
      # '\n' instead of a line break in the middle of the instruction.
      prompt = f"""Get the opinion of user in this review: {review} that corresponded to aspect: {aspect}
      ***EXAMPLE***
      review example: 'I am impressed about the cleanliness, facilitiy at station is really good'
      output: 'Impressed about cleanliness'
      ***NOTE***
      Do not have any markup syntax used in Markdown.
      Do not have symbol '\\n' and the end or start
      Do not summarize the review
      The output must be original do not change the text
      """
      # The model may still return line breaks; strip them before classifying.
      opinion = self.llm_prediction(prompt).replace("\n", "").strip()
      result = self.get_sentiment_based_on_aspect(opinion, aspect)

      # Keep the sentiment label with the highest score.
      sentiment_score = dict(zip(result["labels"], result["scores"]))
      sentiment = max(sentiment_score, key=sentiment_score.get)
      rs.append({"aspect": aspect, "sentiment": sentiment, "opinion": opinion})

    return rs


In [None]:
# JSON file on the mounted Drive that holds the records (with their reviews) to process.
path = (
    "/content/drive/MyDrive/nguyenanh-projects/internship-2024/"
    "dataset/ds-4-storage/subset-in/subset_7.json"
)

In [None]:
# Build the inference object once; ccc() reads it as a module-level global.
data_inference = DataInference(model_name, device)

In [None]:
import json

# Load the input records. The context manager closes the file handle, which the
# previous `json.load(open(...))` form left open.
with open(path, 'r', encoding='utf-8') as infile:
    ds4inf = json.load(infile)

In [None]:
import time

# Output folder on the mounted Drive; the trailing slash matters because ccc()
# appends the file name by plain string concatenation.
outpath = (
    "/content/drive/MyDrive/nguyenanh-projects/internship-2024/"
    "dataset/ds-4-storage/subset-out/sub-7/"
)

def ccc(outpath, start_index, end_index, delay=10):
    """Run the inference pipeline over ds4inf[start_index:end_index] and save the results.

    Each record in the slice gets a "result" key holding the output of
    `data_inference.inference_pipeline(record["review"])`. The records are
    updated in place, so the entries of the global `ds4inf` are modified too.
    The processed records are written as one JSON array to
    `outpath + "storage-out-sub-3-{start_index}-{end_index}.json"`.

    Args:
        outpath: output folder prefix; it is concatenated with the file name,
            so it must end with a path separator.
        start_index: slice start into the global `ds4inf` (may be negative).
        end_index: slice end into the global `ds4inf` (may be negative).
        delay: seconds to sleep after each record (default 10, the value
            previously hard-coded).

    The output file is opened only after every record has been processed, and
    through a context manager, so the JSON is flushed and closed and a failure
    midway no longer truncates a file left by an earlier run.
    """
    # NOTE(review): the file name says "sub-3" while the notebook's output
    # folder is ".../sub-7/" -- confirm which one is intended.
    filepath = outpath + f"storage-out-sub-3-{start_index}-{end_index}.json"
    rs = []

    # Index notes left by the author (presumably batch boundaries -- confirm):
    # 0 - 300 - 600 - 900 - 1300 - 1600 - 1900 - 2200 - 2500 - 2800
    # 3100 - 3400 - 3700 - 4000 - 4300 - 4600 - 4900 - 5200 - 5500 - 5800
    # 6100 - 6400 - 6700 - 7000 - 7300 - 7600 - 7900 - 8100 - 8400 - 8700
    # 9000 - 9300 - 9600 - 9900.

    for point in ds4inf[start_index:end_index]:
        point["result"] = data_inference.inference_pipeline(point["review"])
        print(point)
        rs.append(point)
        # Pause between records (presumably to stay under the LLM API rate limit).
        time.sleep(delay)

    with open(filepath, 'w', encoding="utf-8") as outfile:
        json.dump(rs, outfile, indent=4, ensure_ascii=False)

In [None]:
# Negative indices count from the end of ds4inf: this processes the slice ds4inf[-700:-600].
ccc(outpath, -700, -600)

{'id': 78013, 'title': 'Using the Train for Long-Distance Travel', 'review': "While Bangkok's train system is great for getting around the city, it can also be used for long-distance travel. There are several train lines that connect Bangkok to other major cities in Thailand. It's a comfortable and affordable way to travel.", 'country': 'Thailand', 'aspect': '', 'sentiment': '', 'opinion': '', 'month': 'Dec', 'year': 2015, 'social': 'pantip', 'original_text': 'แม้ว่าระบบรถไฟของกรุงเทพฯ จะดีเยี่ยมสำหรับการเดินทางในเมือง แต่ก็สามารถใช้สำหรับการเดินทางไกลได้เช่นกัน มีสายรถไฟหลายสายที่เชื่อมต่อกรุงเทพฯ กับเมืองใหญ่ๆ อื่นๆ ในประเทศไทย มันเป็นวิธีการเดินทางที่สะดวกสบายและราคาไม่แพง', 'result': [{'aspect': 'Data availability', 'sentiment': 'Neutral', 'opinion': 'The review does not express an opinion about data availability.'}]}
{'id': 78014, 'title': 'Navigating the Train Stations', 'review': "Bangkok's train stations can be confusing, especially for first-timers. It's important to familiari