In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sentiment Analysis for large scale data using Gemini

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/sentiment_analysis/sentiment_analysis_movie_reviews.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/sentiment_analysis/sentiment_analysis_movie_reviews.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/ai-ml-recipes/main/notebooks/generative_ai/sentiment_analysis/sentiment_analysis_movie_reviews.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/bigquery/import?url=https://github.com/GoogleCloudPlatform/ai-ml-recipes/blob/main/notebooks/generative_ai/sentiment_analysis/sentiment_analysis_movie_reviews.ipynb">
      <img src="https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcTW1gvOovVlbZAIZylUtf5Iu8-693qS1w5NJw&s" alt="BQ logo" width="35">
      Open in BQ Studio
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fai-ml-recipes%2Fmain%2Fnotebooks%2Fgenerative_ai%2Fsentiment_analysis%2Fsentiment_analysis_movie_reviews.ipynb">
    <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo">
    Open in Colab Enterprise
    </a>
  </td>

</table>

## Overview

This notebook shows how to perform sentiment analysis on large-scale data using an LLM.
The dataset used is a public dataset from BigQuery Public Datasets.

#### **Steps**
Using Spark,
1) This notebook reads data from the BigQuery public dataset **bigquery-public-data.imdb.reviews**
2) It calls [Vertex AI Gemini API](https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/api-quickstart#try_text_prompts) to find the sentiment of each review (positive vs negative)
3) We compare the predicted sentiment with the dataset label side by side
4) We compute the accuracy, then trim the input and observe the accuracy increase

#### Related content

- [Text Prompt](https://cloud.google.com/vertex-ai/docs/generative-ai/text/text-prompts)
- [Content Classification](https://cloud.google.com/vertex-ai/docs/generative-ai/text/text-prompts#content-classification)

In [None]:
import sys
import time
import google.auth
import google.auth.transport.requests
import requests

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, ArrayType
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer

In [None]:
# Install/upgrade the Vertex AI and Gen AI SDKs; protobuf and numpy are constrained with the
# compatible-release operator (~=) so that only patch-level upgrades are picked up.
# When using Dataproc Serverless, installed packages are automatically available on all nodes
!pip3 install --upgrade -q google-cloud-aiplatform google-genai "protobuf~=4.25.3" "numpy~=1.26.4" 
# When using a Dataproc cluster, you will need to install these packages during cluster creation: https://cloud.google.com/dataproc/docs/tutorials/python-configuration

In [None]:
# Google Colab only: the runtime has to be restarted before the newly installed packages can be used.
# Outside Colab this cell does nothing.
if "google.colab" in sys.modules:
    import IPython

    # do_shutdown(True) asks the kernel to shut down and restart
    IPython.Application.instance().kernel.do_shutdown(True)

#### Get credentials to authenticate with Google APIs


In [None]:
# Authenticate your notebook environment (Colab only).
# Outside Colab this cell is skipped; google.auth.default() is called in the next cell.
if "google.colab" in sys.modules:
    from google.colab import auth
    auth.authenticate_user(project_id="PROJECT_ID") # Replace with your actual project ID (Colab only)

In [None]:
# HTTP transport object that google-auth uses to perform the token refresh
auth_req = google.auth.transport.requests.Request()

# Resolve the default credentials and the project they are associated with
credentials, project_id = google.auth.default()

# Refresh the credentials using the transport created above
credentials.refresh(auth_req)

### Create Spark Session for the notebook

In [None]:
# Create (or reuse) the Spark session for this notebook. The BigQuery connector is requested
# through spark.jars.packages, and the project resolved by google.auth is passed as "projectId".
spark = (
    SparkSession.builder
    .appName("Sentimental Analysis using Dataproc and Gemini on Vertex AI")
    .config("spark.jars.packages", "com.google.cloud.spark:spark-bigquery-with-dependencies_2.12:0.42.2")
    .config("projectId", project_id)
    .getOrCreate()
)

### Read data from BigQuery Public Dataset

In [None]:
# Read the public IMDb reviews table using the "bigquery" data source
movie_reviews = (
    spark.read
    .format("bigquery")
    .option("table", "bigquery-public-data.imdb.reviews")
    .load()
)

|                                                                                              review|split|   label| movie_id|reviewer_rating|                           movie_url|title|
|----------------------------------------------------------------------------------------------------|-----|--------|---------|---------------|------------------------------------|-----|
|I had to see this on the British Airways plane. It was terribly bad acting and a dumb story. Not ...| test|Negative|tt0158887|              2|http://www.imdb.com/title/tt0158887/| null|
|This is a family movie that was broadcast on my local ITV station at 1.00 am a couple of nights a...| test|Negative|tt0158887|              4|http://www.imdb.com/title/tt0158887/| null|
|I would like to comment on how the girls are chosen. why is that their are always more white wome...| test|Negative|tt0391576|              2|http://www.imdb.com/title/tt0391576/| null|
|Tyra & the rest of the modeling world needs to know that real women like myself and my daughter d...| test|Negative|tt0391576|              3|http://www.imdb.com/title/tt0391576/| null|

### Get Positive Reviews from Dataset

In [None]:
# Keep the four columns used later in the notebook and take 100 rows labelled "Positive"
positive_movie_reviews = (
    movie_reviews
    .select("review", "reviewer_rating", "movie_id", "label")
    .filter(col("label") == "Positive")
    .limit(100)
)

### Get Negative Reviews from Dataset

In [None]:
# Keep the four columns used later in the notebook and take 100 rows labelled "Negative"
negative_movie_reviews = (
    movie_reviews
    .select("review", "reviewer_rating", "movie_id", "label")
    .filter(col("label") == "Negative")
    .limit(100)
)

### Mix positive and negative 
Take the union of the positive and negative reviews to get a balanced, mixed dataset. For the purposes of this notebook, each class contributes 100 rows.

In [None]:
# Combine the positive and the negative samples into a single DataFrame
movie_reviews_mixed = positive_movie_reviews.union(negative_movie_reviews)

|              review|reviewer_rating| movie_id|   label|
|--------------------|---------------|---------|--------|
|This movie is ama...|             10|tt0187123|Positive|
|THE HAND OF DEATH...|             10|tt0187123|Positive|
|The Hand of Death...|              7|tt0187123|Positive|
|Just as a reminde...|             10|tt0163955|Positive|
|Like an earlier c...|              9|tt0163955|Positive|

### Final count is 200 as can be seen below

In [None]:
# Row count of the combined DataFrame (the two samples are limited to 100 rows each)
movie_reviews_mixed.count()

### Creating a UDF to get predictions from Gemini Model
This function receives the prompt containing the text whose sentiment is to be predicted and returns the label produced by the model.

In [None]:
def gemini_predict(prompt, model_name="gemini-2.0-flash", max_retries=3, initial_delay=1, location="us-central1"):
    """Ask a Gemini model on Vertex AI to label a prompt as "Positive" or "Negative".

    The response is constrained to an enum ("text/x.enum" MIME type with a two-value
    schema). Failed calls are retried with exponential backoff: the wait starts at
    ``initial_delay`` seconds and doubles after every failed attempt.

    Args:
        prompt: Full prompt text sent to the model.
        model_name: Name of the Gemini model to call.
        max_retries: Number of retries after the first failed attempt, so at most
            ``max_retries + 1`` calls are made.
        initial_delay: Seconds to sleep before the first retry.
        location: Vertex AI region used when creating the client.

    Returns:
        The text of the model response, or ``None`` when every attempt failed
        (or when ``max_retries`` is negative and no attempt is made).

    Note:
        Reads the notebook-level ``project_id`` variable to select the project.
    """
    # Imports are kept inside the function, as in the original cell - presumably so the
    # function stays self-contained when it is executed inside a Spark UDF (TODO confirm).
    import enum
    import logging
    import time
    from google import genai
    from google.genai import types

    client = genai.Client(
        vertexai=True,
        project=project_id,
        location=location,
    )

    class ResponseSchema(enum.Enum):
        POSITIVE = "Positive"
        NEGATIVE = "Negative"

    generate_content_config = types.GenerateContentConfig(
        response_mime_type="text/x.enum",
        response_schema=ResponseSchema,
    )

    delay = initial_delay
    for attempt in range(max_retries + 1):
        try:
            response = client.models.generate_content(
                model=model_name,
                contents=prompt,
                config=generate_content_config,
            )
            return response.text
        except Exception as exc:
            # Best effort: a failing row must not abort the whole job, but the failure is
            # logged so that missing predictions can be diagnosed afterwards.
            if attempt >= max_retries:
                logging.warning(
                    "gemini_predict: giving up after %d attempt(s): %r", attempt + 1, exc
                )
                break
            logging.warning(
                "gemini_predict: attempt %d failed (%r); retrying in %s s", attempt + 1, exc, delay
            )
            time.sleep(delay)
            delay *= 2

    # Single, explicit failure sentinel for every "no prediction" path
    return None

In [None]:
def classify_sentiment(text):
    """Classify the sentiment of a single movie review with Gemini.

    Builds a zero-shot classification prompt around ``text`` and passes it to
    ``gemini_predict``.

    Args:
        text: The review text. ``None`` or whitespace-only input is not sent to the model.

    Returns:
        Whatever ``gemini_predict`` returns for the prompt, or ``None`` when ``text``
        is ``None`` or blank.
    """
    # A null or blank review has nothing to classify; skip the API call instead of
    # sending "None" / an empty string to the model and getting an arbitrary label back.
    if text is None or not str(text).strip():
        return None

    prompt = f"""You are an expert at analyzing movie reviews from IMDb. Your task is to classify the sentiment of the provided review text.
                When classifying, pay close attention to:
                - **Overall sentiment**: Consider the entire review, not just individual words. Criticism is not always negative.
                - **Sarcasm and irony**: Identify when negative language is used to express positive sentiment, or vice-versa.
                - **Conditional statements**: Understand if the sentiment is dependent on certain conditions.
                - **Comparative language**: Determine if the review is comparing the current movie favorably or unfavorably to others.
                
                Provide the sentiment classification from one of the two classes:
                - Negative
                - Positive
                
                Always choose the most appropriate classification.
                
                Text: {text}
                Sentiment:"""

    return gemini_predict(prompt)
    
# Wrap classify_sentiment as a Spark UDF so it can be applied to a DataFrame column.
# No return type is passed here, so PySpark's default return type for udf() applies.
classify_sentiment_udf = udf(classify_sentiment)

### Get prediction from Gemini using the UDF on the movie reviews

In [None]:
# Add the Gemini prediction as column "pred".
# Spark evaluates lazily: without caching, every later action (show, count, evaluate) would
# re-run the UDF and call the Gemini API again for the same reviews, and could return
# different labels each time. Marking the DataFrame as cached here, before the first action,
# lets the predictions computed once be reused afterwards.
movie_review_sentiment_pred = (
    movie_reviews_mixed
    .withColumn("pred", classify_sentiment_udf(movie_reviews_mixed["review"]))
    .cache()
)

### Let's check the predicted values and do a quick comparison of predicted output vs. actual label

In [None]:
# Print the first 20 predictions next to their labels, truncating each cell to 50 characters
(
    movie_review_sentiment_pred
    .select("pred", "label")
    .show(n=20, truncate=50)
)

In [None]:
# Mark the predictions DataFrame for caching so the following evaluation cells can reuse
# the computed rows instead of re-running the Gemini UDF for every action.
movie_review_sentiment_pred.cache()

### Evaluation

Let's index the classes Negative and Positive to 1 and 0.  

In [None]:
from pyspark.sql.functions import when

# Encode the dataset label and the Gemini prediction as doubles:
# "Positive" -> 0.0 and "Negative" -> 1.0. Any other value (for example a missing
# prediction) matches neither branch and is left as null.
indexed_df = (
    movie_review_sentiment_pred
    .withColumn(
        "label_indexed",
        when(col("label") == "Positive", 0.0).when(col("label") == "Negative", 1.0),
    )
    .withColumn(
        "pred_indexed",
        when(col("pred") == "Positive", 0.0).when(col("pred") == "Negative", 1.0),
    )
)

And use the BinaryClassificationEvaluator to output our Area Under the ROC curve (AUC-ROC)

In [None]:
# Configure the evaluator in a single constructor call: the indexed prediction is used as
# the raw prediction column, the indexed label as the label column.
evaluator = BinaryClassificationEvaluator(
    rawPredictionCol="pred_indexed",
    labelCol="label_indexed",
    metricName="areaUnderROC",
)

# area_under_roc keeps the value returned by the evaluator (a fraction, not a percentage)
area_under_roc = evaluator.evaluate(indexed_df)

# The printed label advertises a percentage, so scale the fraction by 100 for display only
print(f"area_under_roc (%): {area_under_roc * 100:.2f}")

Without any prior training or prompt chaining, using only a zero-shot prompt, the model has been able to predict sentiments properly with 94% AUC-ROC.

#### Count the number of unsuccessful predictions

In [None]:
# Flag every row: 1 when the indexed prediction equals the indexed label, 0 otherwise
match_predictions_df = indexed_df.withColumn(
    "if_match",
    when(col("pred_indexed") == col("label_indexed"), 1).otherwise(0),
)

In [None]:
# Number of rows whose prediction did not match the label
match_predictions_df.filter(col("if_match") == 0).count()

#### Percentage Accuracy  

Total Rows = 200
* Mislabeled rows = 11
* Accuracy = (True positives + True negatives) / (True positives + True negatives + False positives + False negatives)
* Percentage Accuracy = 189 / 200 = 94.5%

#### Check the mismatch predictions
Find the mismatched rows and show them

In [None]:
# Keep only the mismatched rows, with the prediction, the label and the review text
mismatch_df = (
    match_predictions_df
    .filter(col("if_match") == 0)
    .select("pred", "label", "review")
)

In [None]:
# Print up to 20 mismatched rows, truncating each cell to 200 characters
mismatch_df.show(n=20, truncate=200)

|    pred|   label|                                                                                                                                                                                                  review|
|--------|--------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|Negative|Positive|Like one of the previous commenters said, this had the foundations of a great movie but something happened on the way to delivery. Such a waste because Collette's performance was eerie and Williams...|
|Negative|Positive|If there is one thing to recommend about this film is that it is intriguing. The premise certainly draws the audience in because it is a mystery, and throughout the film there are hints that there ...|
|Negative|Positive|Sure, Titanic was a good movie, the first time you see it, but you really should see it a second time and your opinion of the film will definetly change. The first time you see the movie you see th...|
|Negative|Positive|Verhoeven's movie was utter and complete garbage. He's a disgusting hack of a director and should be ashamed. By his own admission, he read 2 chapters of the book, got bored, and decided to make th...|
|Negative|Positive|quote by Nicolas Martin (nicmart) from Houston, TX: "Fine film, but DVD "reformatted for TV", 8 April 2002 - This is a charming and emotive film. On the other hand, the DVD I purchased has been "re...|
|Negative|Positive|In the rapid economic development of 1990's in China, there is a resurgence of traditional Chinese culture, partially due to the rise of nationalism accompanied by the increase in wealth, and more ...|
|Positive|Negative|Earth has been destroyed in a nuclear holocaust. Well, parts of the Earth, because somewhere in Italy, a band of purebred survivors--those without radioactive contamination--are holed up in a massi...|
|Positive|Negative|Everything everyone has said already pretty much rings true when it comes to 'The Prey'. Endless nature footage, bad acting - Aside from these elements, this is a watchable film for slasher fans th...|
|Positive|Negative|This tale of the upper-classes getting their come-uppance and wallowing in their high-class misery is like a contemporary Mid-Sommerish version of an old Joan Crawford movie in which she suffered i...|
|Positive|Negative|                                  Looking for a REAL super bad movie? If you wanna have great fun, don't hesitate and check this one!Ferrigno is incredibly bad but is also the best of this mediocrity.|
|Positive|Negative|From the fertile imagination which brought you the irresistible HERCULES (1983), comes its even more preposterous (read goofier) sequel: right off the bat, we get another unwieldy "beginning of tim...|