In [None]:
# Colab setup: install the packages this notebook needs.
# NOTE(review): spark-nlp-display is installed without a version pin, unlike the two
# packages below — consider pinning it so reruns resolve the same version.
! pip install spark-nlp-display
# Install PySpark and Spark NLP
! pip install -q pyspark==3.1.2 spark-nlp==4.2.8



In [None]:
import pandas as pd
import numpy as np
import json

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

import sparknlp
from sparknlp.annotator import *
from sparknlp.base import *
from sparknlp.pretrained import PretrainedPipeline
from pyspark.sql.types import StringType, IntegerType

In [None]:
# Start the Spark session via Spark NLP, report the library version, and end the
# cell with the session object so the notebook displays it.
spark = sparknlp.start()
nlp_version = sparknlp.version()
print("Spark NLP Version :", nlp_version)
spark

Spark NLP Version : 4.2.8


In [None]:
# Mount Google Drive at /content/drive so the dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the summarized-articles JSON file from the mounted Drive into a Spark DataFrame.
df = spark.read.json('/content/drive/MyDrive/Capstone-esya.ai/DATA/summarized_df1000_1.json')

In [None]:
# Preview the first 10 rows of the loaded articles.
df.show(10)

+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+----------+
|         article_url|    section|         summary_gpt|                text|               title|             website|word_count|
+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+----------+
|https://www.dhaka...|      world| The government a...|An Indian court j...|Self-styled India...|www.dhakatribune.com|       404|
|https://www.dhaka...|latest-news| The St Louis Cou...|US police said ea...|Police arrest 31 ...|www.dhakatribune.com|       571|
|http://www.catchn...|       NEWS| The next hearing...|Here's a trivia q...|Shady deal: how a...|   www.catchnews.com|       840|
|https://www.dhaka...|      world|                    |An Australian tra...|Australian blogge...|www.dhakatribune.com|       742|
|https://www.dhaka...|      world|                    |Ten people were k...|Landslides kil

In [None]:
import pyspark.sql.functions as F
from pyspark.sql.functions import regexp_replace
#df = df.filter(F.regexp_extract(df.text, r"\b[a-zA-Z]+\b", 0) != "")
#df= df.select('article_url','hashtag','section','text','title','website')
#df.count()

In [None]:
# Stage 1: wrap the raw "text" column into Spark NLP document annotations.
document_assembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("documents")
)

# Stage 2: pretrained English "t5_small" model run with the "summarize:" task prefix
# and a maximum output length of 100; reads "documents", writes "summaries".
t5 = (
    T5Transformer()
    .pretrained("t5_small", 'en')
    .setTask("summarize:")
    .setMaxOutputLength(100)
    .setInputCols(["documents"])
    .setOutputCol("summaries")
)

# Chain both stages into a single Spark ML pipeline.
pipeline_stages = [document_assembler, t5]
summarizer_pp = Pipeline(stages=pipeline_stages)

t5_small download started this may take some time.
Approximate size to download 141.1 MB
[OK!]


In [None]:
# Fit the summarization pipeline on the loaded articles, then apply the fitted model
# to the same DataFrame. The output keeps the input columns and adds the
# "documents" and "summaries" annotation columns.
summarizer_model = summarizer_pp.fit(df)
result = summarizer_model.transform(df)

In [None]:
# Preview the first 10 rows of the pipeline output.
result.show(10)

+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+
|         article_url|    section|         summary_gpt|                text|               title|             website|word_count|           documents|           summaries|
+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+----------+--------------------+--------------------+
|https://www.dhaka...|      world| The government a...|An Indian court j...|Self-styled India...|www.dhakatribune.com|       404|[{document, 0, 24...|[{document, 0, 20...|
|https://www.dhaka...|latest-news| The St Louis Cou...|US police said ea...|Police arrest 31 ...|www.dhakatribune.com|       571|[{document, 0, 34...|[{document, 0, 17...|
|http://www.catchn...|       NEWS| The next hearing...|Here's a trivia q...|Shady deal: how a...|   www.catchnews.com|       840|[{document,

In [None]:
# Print the column tree of the pipeline output, including the nested annotation structs.
result.printSchema()

root
 |-- article_url: string (nullable = true)
 |-- section: string (nullable = true)
 |-- summary_gpt: string (nullable = true)
 |-- text: string (nullable = true)
 |-- title: string (nullable = true)
 |-- website: string (nullable = true)
 |-- word_count: long (nullable = true)
 |-- documents: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- summaries: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable

In [None]:
# Keep the article metadata and text, plus the generated summary strings taken from
# the nested "result" field of the "summaries" annotations.
summary_columns = [
    "article_url",
    "section",
    "summary_gpt",
    "title",
    "website",
    "text",
    "summaries.result",
]
df1 = result.select(*summary_columns)

In [None]:
# Preview the first 10 rows of the selected columns.
df1.show(10)

+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|         article_url|    section|         summary_gpt|               title|             website|                text|              result|
+--------------------+-----------+--------------------+--------------------+--------------------+--------------------+--------------------+
|https://www.dhaka...|      world| The government a...|Self-styled India...|www.dhakatribune.com|An Indian court j...|[a judge jails a ...|
|https://www.dhaka...|latest-news| The St Louis Cou...|Police arrest 31 ...|www.dhakatribune.com|US police said ea...|[police say 31 pe...|
|http://www.catchn...|       NEWS| The next hearing...|Shady deal: how a...|   www.catchnews.com|Here's a trivia q...|[TERI's governing...|
|https://www.dhaka...|      world|                    |Australian blogge...|www.dhakatribune.com|An Australian tra...|[australian coupl...|
|https://www.dhaka..

In [None]:
# Build a second DataFrame that selects every column of df1.
df1_copy = df1.select('*')

In [None]:
# Turn the "result" column into a plain string so it no longer prints as [ ... ].
# The column lives on df1 (the raw pipeline output `result` has no "result" column),
# and it is an array of strings, so regexp_replace cannot be applied to it;
# concat_ws joins the array elements with a space instead. concat_ws also accepts a
# plain string column, so re-running this cell is harmless.
# df1_copy was built earlier from the previous df1 and is not changed by this cell.
df1 = df1.withColumn("result", F.concat_ws(" ", F.col("result")))


In [None]:
# Write df1_copy as JSON to the mounted Drive.
# NOTE(review): no explicit save mode is set here — confirm the behaviour you want
# when the target path already exists (e.g. on a second run of the notebook).
df1_copy.write.json('/content/drive/MyDrive/Capstone-esya.ai/DATA/final_df1000.json')

Evaluate the generated summaries against the reference summaries using the ROUGE score

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

In [None]:
# Define the scoring function for the ROUGE UDF (ROUGE-1 F1, standard library only).
def calculate_rouge(reference, candidate):
    """Return the ROUGE-1 F1 score of ``candidate`` measured against ``reference``.

    ROUGE-1 is the clipped unigram overlap between the two texts: precision is
    overlap / candidate length, recall is overlap / reference length, and the
    returned value is their harmonic mean. Tokens are obtained by lower-casing
    and splitting on whitespace.

    Args:
        reference: Reference summary as a string, a list/tuple of strings
            (joined with spaces), or None.
        candidate: Generated summary, same accepted types as ``reference``.

    Returns:
        A float in [0.0, 1.0]. 0.0 is returned when either side is None or
        empty, or when the texts share no token.
    """
    # Imported locally so the function stays self-contained when used as a UDF.
    from collections import Counter

    def _tokens(text):
        # Normalise None / list-of-strings / string input into a token list.
        if text is None:
            return []
        if isinstance(text, (list, tuple)):
            text = " ".join(part for part in text if part)
        return str(text).lower().split()

    reference_tokens = _tokens(reference)
    candidate_tokens = _tokens(candidate)
    if not reference_tokens or not candidate_tokens:
        return 0.0

    # Counter intersection keeps the minimum count per token (clipped overlap).
    overlap = sum((Counter(reference_tokens) & Counter(candidate_tokens)).values())
    if overlap == 0:
        return 0.0

    precision = overlap / len(candidate_tokens)
    recall = overlap / len(reference_tokens)
    return 2.0 * precision * recall / (precision + recall)

# Wrap the scoring function as a Spark SQL UDF that returns a float column.
calculate_rouge_udf = udf(f=calculate_rouge, returnType=FloatType())


In [None]:
# Score every generated summary against its reference summary.
# The reference text is in "summary_gpt" (df1 has no "summary" column).
# concat_ws flattens "result" into one string whether it is still an array of
# strings or already a plain string, so the UDF always receives text.
df2 = df1.withColumn(
    "rouge_score",
    calculate_rouge_udf(df1["summary_gpt"], F.concat_ws(" ", df1["result"])),
)


In [None]:
# Preview the first 10 scored rows with long cell values truncated.
df2.show(n=10, truncate=True)

In [None]:
# Calculate the average ROUGE score across all rows.
df2.agg(F.avg('rouge_score')).show()


In [None]:
# Stop the Spark session at the end of the notebook.
spark.stop()