### **Read a JSON file**

In [1]:
# Load the Bing news JSON file into a Spark DataFrame. The "multiline" option
# lets one JSON document span several lines of the file.
df = (
    spark.read
    .option("multiline", "true")
    .json("Files/bing_latest_news.json")
)
display(df)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 3, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, a6c7fd21-adaf-4573-9c23-6a0825e79da5)

### **Select only the `value` column from the JSON object**

In [2]:
# Narrow the frame to the `value` column; every other top-level column is dropped.
df = df.select("value")

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 4, Finished, Available, Finished)

In [3]:
# Render the single-column DataFrame to confirm the selection.
display(df)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 5, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 570e6914-1dec-40cd-b81d-d6bc234c8df3)

### **Use the `explode` function to convert a single row into multiple rows**

In [4]:
from pyspark.sql.functions import explode

# Emit one output row per element of the `value` column, exposing each
# element under the column name `json_object`.
df_exploded = df.select(
    explode("value").alias("json_object")
)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 6, Finished, Available, Finished)

In [5]:
# Render the exploded DataFrame (one `json_object` per row).
display(df_exploded)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 7, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 22dafa7f-95ac-4365-b18e-b6c53862c46d)

### Convert the exploded JSON DataFrame into JSON strings

In [6]:
# Turn every row of the exploded frame into a JSON string and bring the
# strings back to the driver as a plain Python list.
json_list = (
    df_exploded
    .toJSON()    # one JSON document (string) per row
    .collect()   # materialise the result on the driver
)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 8, Finished, Available, Finished)

In [7]:
# Print the last collected JSON string to inspect its raw shape.
print(json_list[-1])

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 9, Finished, Available, Finished)

{"json_object":{"about":[{"name":"Dallas","readLink":"https://api.bing.microsoft.com/api/v7/entities/9ed1db6b-57c5-47b3-9d50-8fb2a9898b35"},{"name":"Demographics of Dallas–Fort Worth","readLink":"https://api.bing.microsoft.com/api/v7/entities/481c718f-6a39-d83b-f015-7bf813fac2a0"},{"name":"MarketWatch","readLink":"https://api.bing.microsoft.com/api/v7/entities/efd5e2b3-3670-5032-ebea-a4b8c8342a02"}],"category":"LifeStyle","datePublished":"2024-07-30T21:24:00.0000000Z","description":"Jarai Howard, a 29-year-old tech worker with a growing family, is hunting for his second home in the Dallas-Fort Worth area, but the process has turned into a nightmare. Howard aimed to snag a","image":{"thumbnail":{"contentUrl":"https://www.bing.com/th?id=OVFT.YrFdNLnoa_uycOk1FXHPtC&pid=News","height":465,"width":700}},"mentions":[{"name":"Settlement"},{"name":"National Association of Realtors"},{"name":"Shutterstock"}],"name":"Dallas tech worker battles new real estate madness amid historic NAR settlement

In [10]:
import json
# Decode one collected row (index 1) into a Python dict so its structure can be explored.
json_news = json.loads(json_list[1])

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 12, Finished, Available, Finished)

In [11]:
# Show the decoded sample article as a Python dict.
print(json_news)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 13, Finished, Available, Finished)

{'json_object': {'about': [{'name': 'India', 'readLink': 'https://api.bing.microsoft.com/api/v7/entities/85fa63d3-9596-adb9-b4eb-502273d84f56'}], 'datePublished': '2024-07-30T10:17:00.0000000Z', 'description': "India TV News provides you with all the breaking news, latest news, breaking story videos, and Live TV on a single platform to ensure you don't miss the biggest happenings in India and the world.", 'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=OVFT.ZiQvLfIU5vYy7l-61hdWES&pid=News', 'height': 393, 'width': 700}}, 'name': 'Breaking News, July 30 | LIVE updates', 'provider': [{'_type': 'Organization', 'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=ODF.IaXDAOnJaO-iU5IbMMsMHw&pid=news'}}, 'name': 'India TV'}], 'url': 'https://www.indiatvnews.com/news/india/breaking-news-july-30-live-updates-parliament-session-nirmala-sitharaman-budget-rahul-gandhi-lok-sabha-delhi-coaching-centre-deaths-pm-modi-bjp-rains-944277'}}


In [12]:
# Check that a nested field can be reached through the `json_object` wrapper.
print(json_news['json_object']['description'])

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 14, Finished, Available, Finished)

India TV News provides you with all the breaking news, latest news, breaking story videos, and Live TV on a single platform to ensure you don't miss the biggest happenings in India and the world.


In [17]:
# Preview, for the sample article, each field that the extraction step reads.
news_item = json_news['json_object']
print(news_item['name'])
print(news_item['description'])
# No 'category' line here: the sample article shown earlier has no such key.
print(news_item['url'])
print(news_item['image']['thumbnail']['contentUrl'])
print(news_item['provider'][0]['name'])
print(news_item['datePublished'])

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 19, Finished, Available, Finished)

Breaking News, July 30 | LIVE updates
India TV News provides you with all the breaking news, latest news, breaking story videos, and Live TV on a single platform to ensure you don't miss the biggest happenings in India and the world.
https://www.indiatvnews.com/news/india/breaking-news-july-30-live-updates-parliament-session-nirmala-sitharaman-budget-rahul-gandhi-lok-sabha-delhi-coaching-centre-deaths-pm-modi-bjp-rains-944277
https://www.bing.com/th?id=OVFT.ZiQvLfIU5vYy7l-61hdWES&pid=News
India TV
2024-07-30T10:17:00.0000000Z


In [16]:
# Column-wise accumulators: the i-th entry of every list belongs to the same article.
title = []
description = []
category = []
url = []
image = []
provider = []
datePublished = []

# Append targets, in the same order as the values gathered for each article below.
columns = (title, description, category, url, image, provider, datePublished)

# Process each JSON string in the list.
for json_str in json_list:
    try:
        # Parse the JSON string and unwrap the "json_object" payload.
        article = json.loads(json_str)
        news = article["json_object"]

        # Keep only articles that carry both a category and a thumbnail URL.
        if not (news.get("category") and news.get("image", {}).get("thumbnail", {}).get("contentUrl")):
            continue

        # Resolve every field BEFORE appending anything. Appending one field at
        # a time would leave the lists with different lengths whenever a later
        # lookup fails (e.g. a missing or empty "provider"), and the rows built
        # from them afterwards would mix values from different articles.
        values = (
            news["name"],
            news["description"],
            news["category"],
            news["url"],
            news["image"]["thumbnail"]["contentUrl"],
            news["provider"][0]["name"],
            news["datePublished"],
        )

    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as e:
        # Best effort: report the malformed article and move on to the next one.
        # ValueError covers json.JSONDecodeError; the others cover missing keys,
        # empty lists and unexpected null / non-dict values.
        print(f"Error processing Json Object: {e}")
        continue

    # Every field resolved, so commit the article to all columns at once.
    for column, value in zip(columns, values):
        column.append(value)


StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 18, Finished, Available, Finished)

In [18]:
# Bare expression: the notebook echoes the list of extracted titles.
title

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 20, Finished, Available, Finished)

['Here are the latest news and big news stories to follow today',
 'Transfer news LIVE: Filip Jorgensen to have Chelsea MEDICAL, Alvarez drops hint over Man City future, Xavi Simons latest',
 'NFL training camp news and live updates: Hall of Fame Game, injuries, analysis, holdouts, position battles and more',
 'Wayanad Landslides News Live Updates: Over 100 killed, many still trapped; rescue operations halted',
 'Norah O’Donnell to Exit as ‘CBS Evening News’ Anchor to Become Senior Correspondent',
 'When is MLB trade deadline 2024? Latest updates and news on Yankees and Mets moves',
 "Transfer news LIVE: Osimehn agent speaks out, Conor Gallagher deal 'accelerates', Arsenal boost",
 'Latest Paris 2024 tennis news and results: What you may have missed',
 'Stellantis offering new round of voluntary buyouts to US salaried workers',
 "Netanyahu's new conditions delaying Gaza cease-fire, says Hamas",
 "Former Oilers Forward Skates in New Team's Colours",
 'Man Utd transfer news LIVE: United 

In [19]:
from pyspark.sql.types import StructType, StructField, StringType

# Turn the seven parallel lists into row tuples (one tuple per article).
data = list(zip(title, description, category, url, image, provider, datePublished))

# Every column is a nullable string; the field order matches the tuple order above.
schema = StructType([
    StructField(column_name, StringType(), True)
    for column_name in (
        "title",
        "description",
        "category",
        "url",
        "image",
        "provider",
        "datePublished",
    )
])

# Build the cleaned Spark DataFrame from the rows and the explicit schema.
df_cleaned = spark.createDataFrame(data=data, schema=schema)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 21, Finished, Available, Finished)

In [20]:
# Render the flattened, seven-column DataFrame.
display(df_cleaned)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 22, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, 5e6fbfaa-1872-4c16-b68c-1e6b3aa4f546)

In [21]:
from pyspark.sql.functions import to_date, date_format

# Replace the raw `datePublished` string with a date rendered as "dd-MMM-yyyy":
# first convert the string to a date, then format that date back to text.
df_cleaned_final = df_cleaned.withColumn(
    "datePublished",
    date_format(to_date(df_cleaned["datePublished"]), "dd-MMM-yyyy"),
)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 23, Finished, Available, Finished)

In [22]:
# Render the final DataFrame with the reformatted `datePublished` column.
display(df_cleaned_final)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 24, Finished, Available, Finished)

SynapseWidget(Synapse.DataFrame, c2174974-49fb-42f0-aa31-0b320146d373)

In [30]:
# Persist the final frame as the Delta table "latest_news_db".
# Mode "overwrite" replaces the table's existing contents on every run.
(
    df_cleaned_final.write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("latest_news_db")
)

StatementMeta(, 4e828378-ad90-4782-b3c4-ff782fd9c6ed, 32, Finished, Available, Finished)