In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Sentiment Analysis of Large-Scale Data Using an LLM (PaLM API, Vertex GenAI)

## Overview

This notebook shows how to perform sentiment analysis on large-scale data using an LLM. The dataset used here is a public dataset available in the BigQuery Public Datasets.
#### **Steps**
Using Spark, 
1) This notebook reads data from the BigQuery public dataset bigquery-public-data.imdb.reviews
2) It selects the review data needed for sentiment analysis from that table
3) It calls [Vertex AI Text Bison](https://cloud.google.com/vertex-ai/docs/generative-ai/start/quickstarts/api-quickstart#try_text_prompts) to find the sentiment of each review
4) We compare the result side by side
5) We compute the accuracy, then retry with shortened input text and observe the accuracy increase.


#### Related content

- [Text Prompt](https://cloud.google.com/vertex-ai/docs/generative-ai/text/text-prompts)
- [Content Classification](https://cloud.google.com/vertex-ai/docs/generative-ai/text/text-prompts#content-classification)

In [177]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, ArrayType

import google.auth
import google.auth.transport.requests
import requests

#### Get credentials to authenticate with Google APIs


In [178]:
# Resolve the default Google credentials and the project ID associated with them.
credentials, project_id = google.auth.default()
# Refresh once so that credentials.token is populated; the token is later sent
# as the Bearer token in the REST call to the model endpoint.
auth_req = google.auth.transport.requests.Request()
credentials.refresh(auth_req)

### Create Spark Session for the notebook

In [179]:
# Build the notebook's Spark session (or pick up an already running one).
spark = (
    SparkSession.builder
    .appName("Sentimental Analysis using Dataproc and Vertex LLM")
    .getOrCreate()
)

### Read data from Bigquery Public Dataset 

In [180]:
# Load the public IMDB reviews table through the "bigquery" data source.
movieReview = (
    spark.read
    .format("bigquery")
    .option("table", "bigquery-public-data.imdb.reviews")
    .load()
)

### Get Positive Reviews from Dataset

In [181]:
# Keep the four columns used downstream and take up to 100 rows labelled "Positive".
positive_movie_review = (
    movieReview
    .select("review", "reviewer_rating", "movie_id", "label")
    .where(col("label") == "Positive")
    .limit(100)
)

### Get Negative Reviews from Dataset

In [182]:
# Keep the four columns used downstream and take up to 100 rows labelled "Negative".
negative_movie_review = (
    movieReview
    .select("review", "reviewer_rating", "movie_id", "label")
    .where(col("label") == "Negative")
    .limit(100)
)

In [None]:
# Preview the positive sample as a text table.
positive_movie_review.show()

+--------------------+---------------+---------+--------+
|              review|reviewer_rating| movie_id|   label|
+--------------------+---------------+---------+--------+
|This movie is ama...|             10|tt0187123|Positive|
|THE HAND OF DEATH...|             10|tt0187123|Positive|
|The Hand of Death...|              7|tt0187123|Positive|
|Just as a reminde...|             10|tt0163955|Positive|
|Like an earlier c...|              9|tt0163955|Positive|
|>>> Great News th...|             10|tt0163955|Positive|
|Insanely well cra...|             10|tt0163955|Positive|
|Oppenheimer was a...|              9|tt0163955|Positive|
|'Oppenheimer' wit...|             10|tt0163955|Positive|
|President Harry S...|             10|tt0163955|Positive|
|I, too, found "Op...|              9|tt0163955|Positive|
|I can't remember ...|             10|tt0163955|Positive|
|I watched this mi...|              9|tt0163955|Positive|
|Absolutely the be...|             10|tt0163955|Positive|
|I saw this se

In [None]:
# Preview the negative sample as a text table.
negative_movie_review.show()

+--------------------+---------------+---------+--------+
|              review|reviewer_rating| movie_id|   label|
+--------------------+---------------+---------+--------+
|I had to see this...|              2|tt0158887|Negative|
|This is a family ...|              4|tt0158887|Negative|
|I would like to c...|              2|tt0391576|Negative|
|Tyra & the rest o...|              3|tt0391576|Negative|
|In watching this ...|              2|tt0391576|Negative|
|Tyra Banks needs ...|              1|tt0391576|Negative|
|This is by far th...|              1|tt0391576|Negative|
|Ik know it is imp...|              3|tt0676157|Negative|
|Terrific producti...|              3|tt0215002|Negative|
|Earnest effort wh...|              4|tt0215002|Negative|
|Although not a bi...|              3|tt0215002|Negative|
|I know my summary...|              3|tt0225272|Negative|
|I love documentar...|              2|tt0374071|Negative|
|Some ugly weirdo ...|              2|tt0374071|Negative|
|A frustrating

### Mix positive and negative 
Take the union of the positive and negative reviews to get a mixed dataset. For the purposes of this notebook, each class contributes 100 rows.

In [183]:
# union() combines rows by column position; both inputs were built with the
# same select(...) column order, so the columns line up.
movieReviewMixed=positive_movie_review.union(negative_movie_review)

### Final count is 200 as can be seen below

In [184]:
# Row count of the mixed set: each class was capped with limit(100), so at most 200.
movieReviewMixed.count()

200

### Creating a UDF to get predictions from the Vertex AI Text Bison model
The text whose sentiment is to be predicted is passed to this function. The model used is text-bison-32k, which allows up to 32k input tokens, so that longer movie reviews can be accommodated.

In [199]:
def find_sentiment_zero_shot(text):
    """Classify the sentiment of a review with a zero-shot prompt to Vertex AI.

    Embeds ``text`` in a fixed classification prompt and posts it to the
    ``text-bison-32k`` model's ``:predict`` REST endpoint, authenticating with
    the notebook-level ``credentials`` and ``project_id``.

    Args:
        text: The review text to classify.

    Returns:
        The ``content`` string of the first prediction on success. Otherwise a
        string beginning with "Error getting prediction" (also returned when
        quota errors persist after every retry).
    """
    # Imported locally so the function carries its own dependency; the
    # notebook's import cell does not import ``time``.
    import time

    MODEL_ID = "text-bison-32k"
    MAX_ATTEMPTS = 5          # total calls allowed while the API keeps answering 429
    RETRY_DELAY_SECONDS = 5   # pause before retrying after a quota error

    def vertex_get_prediction(prompt):
        """POST ``prompt`` to the model; return its content or an error string."""
        quota_error = None
        # Bounded loop instead of recursion so a persistent 429 cannot recurse forever.
        for attempt in range(MAX_ATTEMPTS):
            prediction = requests.post(
                f"https://us-central1-aiplatform.googleapis.com/v1/projects/{project_id}/locations/us-central1/publishers/google/models/{MODEL_ID}:predict",
                headers={'Authorization': 'Bearer %s' % credentials.token,
                         'Content-Type': 'application/json'},
                json={
                    "instances": [
                        {"prompt": prompt}
                    ],
                    "parameters": {
                        "temperature": 0.2,
                        "maxOutputTokens": 256,
                        "topK": 40,
                        "topP": 0.95
                    }
                }
            ).json()

            if "predictions" in prediction:
                predictions = prediction["predictions"]
                if predictions and "content" in predictions[0]:
                    return predictions[0]["content"]
                # A prediction without "content" is reported explicitly
                # rather than falling through to an implicit None.
                return "Error getting predictions"

            error = prediction.get("error")
            if error is None:
                return "Error getting predictions"
            if error.get("code") != 429:
                return f"Error getting prediction: {error}"

            # 429: quota exceeded. Wait, then try again if attempts remain.
            quota_error = error
            if attempt < MAX_ATTEMPTS - 1:
                time.sleep(RETRY_DELAY_SECONDS)

        return f"Error getting prediction: {quota_error}"

    prompt = f"""For the given text below, provide the sentimental classification with two classes mentioned below:
    The two classes are: Negative, Positive
    text: {text}
    Sentiment:"""

    return vertex_get_prediction(prompt)

In [200]:
# Register the classifier as a Spark UDF; StringType() is udf()'s default
# return type, written out here for readers.
find_sentiment_zero_shot_udf = udf(find_sentiment_zero_shot, returnType=StringType())

In [None]:
# Print the column names and types of the mixed review set.
movieReviewMixed.printSchema()

root
 |-- review: string (nullable = true)
 |-- reviewer_rating: long (nullable = true)
 |-- movie_id: string (nullable = true)
 |-- label: string (nullable = true)



### Get prediction from the LLM using the UDF on the movie reviews

In [187]:
# Add the model's answer for every review; the result is marked for caching so
# that later actions can reuse it.
movie_review_sentiment_pred = (
    movieReviewMixed
    .withColumn(
        "predicted_sentiment",
        find_sentiment_zero_shot_udf(movieReviewMixed["review"]),
    )
    .cache()
)

### Let's check the predicted values and do a quick comparison of the predicted output vs. the actual label

In [None]:
# Show prediction and ground-truth label side by side, without truncating values.
(
    movie_review_sentiment_pred
    .select("predicted_sentiment", "label")
    .show(truncate=False)
)

[Stage 258:>                                                        (0 + 1) / 1]

+-------------------+--------+
|predicted_sentiment|label   |
+-------------------+--------+
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Positive          |Positive|
| Negative          |Positive|
| Positive          |Positive|
| Positive          |Positive|
+-------------------+--------+
only showing top 20 rows



                                                                                

### Evaluation
Let's create an evaluation DataFrame to check how many predictions match the actual label. For every match, the `if_match` column stores 1, otherwise 0. We then count the rows where `if_match` is 0 to see how many reviews were predicted incorrectly.

In [188]:
# if_match is 1 when the trimmed prediction equals the trimmed label, else 0.
evaluation_df = movie_review_sentiment_pred.withColumn(
    "if_match",
    when(trim(col("predicted_sentiment")) == trim(col("label")), 1).otherwise(0),
)

#### Count the number of unsuccessful predictions

In [None]:
# Number of rows whose prediction did not match the label.
evaluation_df.filter(col("if_match") == 0).count()

                                                                                

28

#### Percentage Accuracy  
Total Rows= 200

* Rows not well predicted= 28

* Percentage Accuracy= 100-(28/200*100) = 86% 

We can observe that **86%** of the data has been correctly predicted.

#### Check the mismatch predictions
Find the mismatched rows and show it

In [189]:
# Rows where prediction and label disagree, with the review text for inspection.
mismatch_df = (
    evaluation_df
    .filter(col("if_match") == 0)
    .select("predicted_sentiment", "label", "review")
)

In [None]:
# Preview the mismatched rows.
mismatch_df.show()

+-------------------+--------+--------------------+
|predicted_sentiment|   label|              review|
+-------------------+--------+--------------------+
|           Negative|Positive|Has anyone found ...|
|                   |Positive|If it is true tha...|
|           Negative|Positive|Overall I found t...|
|           Negative|Positive|Isabelle Huppert ...|
|                   |Positive|At the end of my ...|
|                   |Positive|First of all, tho...|
|           Negative|Positive|Erika Kohut is a ...|
|                   |Positive|In the first twen...|
|           Negative|Positive|Michael Haneke is...|
|           Negative|Positive|I saw this film a...|
|                   |Positive|The theme is cont...|
|           Negative|Positive|The notion of mar...|
|           Negative|Positive|For a long time I...|
|                   |Positive|This was a bold m...|
|                   |Negative|Tyra & the rest o...|
|                   |Negative|This is by far th...|
|           

### Observation
It can be observed that there are a few empty predictions, possibly because of the length of the input text. Let's shorten the input text by taking a substring (the first 35 characters) of each review and then make the prediction again.

In [203]:
# Number of leading characters of each review sent to the model on the second pass.
REVIEW_PREFIX_CHARS = 35

# Re-predict only the mismatched rows using a short prefix of the review.
# The column is referenced by name on mismatch_df itself rather than through the
# ancestor DataFrame movieReviewMixed, and substring() positions are 1-based,
# so the prefix starts at position 1.
mismatch_pred = mismatch_df.withColumn(
    "predicted_sentiment_2",
    find_sentiment_zero_shot_udf(substring(col("review"), 1, REVIEW_PREFIX_CHARS)),
)

In [204]:
# Compare the first-pass and second-pass predictions against the label.
mismatch_pred.show()

[Stage 294:>                                                        (0 + 1) / 1]

+-------------------+--------+--------------------+---------------------+
|predicted_sentiment|   label|              review|predicted_sentiment_2|
+-------------------+--------+--------------------+---------------------+
|           Negative|Positive|Has anyone found ...|             Negative|
|                   |Positive|If it is true tha...|             Negative|
|           Negative|Positive|Overall I found t...|             Positive|
|           Negative|Positive|Isabelle Huppert ...|             Positive|
|                   |Positive|At the end of my ...|             Negative|
|                   |Positive|First of all, tho...|             Negative|
|           Negative|Positive|Erika Kohut is a ...|             Positive|
|                   |Positive|In the first twen...|             Negative|
|           Negative|Positive|Michael Haneke is...|             Negative|
|           Negative|Positive|I saw this film a...|             Positive|
|                   |Positive|The them

                                                                                

In [209]:
# if_match is 1 when the trimmed second-pass prediction equals the trimmed label, else 0.
final_eval = mismatch_pred.withColumn(
    "if_match",
    when(trim(col("predicted_sentiment_2")) == trim(col("label")), 1).otherwise(0),
)

In [210]:
# Number of rows still mismatched after the second pass.
final_eval.filter(col("if_match") == 0).count()

                                                                                

12

### Observation
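We can see that by trimming the length of the text, we are able to bring the number of mismatched predictions down to just 12.
* Hence, after trimming, the number of mismatched predictions is 12
* Hence the percentage accuracy = 100-(12/200*100) = 94%

Hence the final accuracy obtained is ***94%***, which means that without any prior training or prompt chaining, using only zero-shot prompting, the model has been able to predict sentiments correctly with 94% accuracy. Note that this figure combines full-text predictions for the 172 rows that matched initially with shortened-text predictions for only the 28 rows that did not; because those 28 rows were selected using the true labels, the number is an illustration rather than an unbiased accuracy estimate.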