In [1]:
from azure.storage.blob import BlobServiceClient, BlobClient 
import os

connection_string = ""
container_name = "testtech"
local_download_path = "../World_news_tutorial/blob_files"

blob_service_client = BlobServiceClient.from_connection_string(connection_string)
container_client = blob_service_client.get_container_client(container_name)
blob_list = container_client.list_blobs()

for blob in blob_list:
    blob_client = container_client.get_blob_client(blob.name)
    local_file_path = os.path.join(local_download_path,blob.name)

    with open(local_file_path, 'wb') as f:
        download_stream = blob_client.download_blob()
        f.write(download_stream.read())
        print(f'Downloaded blob: {blob.name}')


Downloaded blob: news-2024-04-12.parquet
Downloaded blob: news-2024-04-15.parquet
Downloaded blob: news-2024-04-17.parquet
Downloaded blob: news-2024-04-19.parquet
Downloaded blob: news-2024-04-21.parquet
Downloaded blob: news-2024-04-22.parquet
Downloaded blob: news-2024-04-24.parquet
Downloaded blob: news-2024-04-26.parquet
Downloaded blob: news-2024-04-28.parquet


In [8]:
import pandas as pd
import pyarrow.parquet as pq
import os

# Define the directory where the Parquet files are downloaded
directory = "../World_news_tutorial/blob_files"

# List all Parquet files in the directory
parquet_files = [file for file in os.listdir(directory) if file.endswith('.parquet')]

# Read each Parquet file into a pandas DataFrame
dfs = []
for file in parquet_files:
    file_path = os.path.join(directory, file)
    df = pd.read_parquet(file_path)
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
combined_df = pd.concat(dfs, ignore_index=True)

# Now you have all data from Parquet files in a single DataFrame
combined_df

Unnamed: 0,source,title,publishedAt,author,url,content,word_count,Sentiment,Compound_Score
0,Hindustan Times,Horoscope Today: Astrological prediction for A...,2024-04-13 19:30:18+00:00,"Dr Prem Kumar Sharma, Manisha Koushik",https://www.hindustantimes.com/astrology/horos...,All zodiac signs have their own characteristic...,836,Positive,0.9999
1,Hindustan Times,"Weekly Horoscope Aquarius, April 14-20, 2024 p...",2024-04-13 18:40:16+00:00,Dr J.N Pandey,https://www.hindustantimes.com/astrology/horos...,Aquarius – (20th January to 18th February) Wee...,472,Positive,0.9991
2,Hindustan Times,"Weekly Horoscope Capricorn, April 14-20, 2024 ...",2024-04-13 18:39:16+00:00,Dr J.N Pandey,https://www.hindustantimes.com/astrology/horos...,Capricorn - 24th March 2024 Weekly Horoscope P...,435,Positive,0.9989
3,Hindustan Times,"Weekly Horoscope Libra, April 14-20, 2024 pred...",2024-04-13 18:36:15+00:00,Dr J.N Pandey,https://www.hindustantimes.com/astrology/horos...,Libra - (23rd September to 22nd October) Weekl...,463,Positive,0.9993
4,Page Six,'Golden Bachelor' alum Theresa Nist still wear...,2024-04-13 17:53:00+00:00,Sara Whitman,https://pagesix.com/2024/04/13/entertainment/g...,Click to email a link to a friend (Opens in ne...,403,Positive,0.9664
...,...,...,...,...,...,...,...,...,...
718,Koreaboo.com,"Hit K-Drama Breaks ""Crash Landing On You"" Hist...",2024-04-29 08:08:43+00:00,,https://www.koreaboo.com/news/hit-k-drama-brea...,The final episode of the popular romantic dram...,185,Positive,0.9928
719,NDTV News,"Lara Dutta Once ""Bashed Up"" Man Who Pinched He...",2024-04-29 07:47:09+00:00,,https://www.ndtv.com/entertainment/lara-dutta-...,Image instagrammed by Lara Dutta. (courtesy: L...,317,Positive,0.9929
720,Indiaforums.com,Yeh Rishta Kya Khelata Hai: Abhira to get arre...,2024-04-29 07:39:53+00:00,Team India Forums,https://www.indiaforums.com/article/yeh-rishta...,Yeh Rishta Kya Kehlata Hai Abhira. Image Court...,142,Negative,-0.9320
721,PINKVILLA,Ghilli box office collections: Vijay starrer s...,2024-04-29 07:12:47+00:00,Jatinder Singh,https://www.pinkvilla.com/entertainment/box-of...,Ghilli (re-release) saw an outstanding second ...,365,Positive,0.1505


In [19]:

sentiment_counts = df.groupby(['publishedAt', 'Sentiment'])['Sentiment'].count().unstack(fill_value=0)
fig = px.bar(sentiment_counts, x=sentiment_counts.index, y=sentiment_counts.columns, title='Sentiment Distribution by Day')
sentiment_counts

Sentiment,Negative,Neutral,Positive
publishedAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-04-29 06:20:22+00:00,0,0,1
2024-04-29 07:12:47+00:00,0,0,1
2024-04-29 07:39:53+00:00,1,0,0
2024-04-29 07:47:09+00:00,0,0,1
2024-04-29 08:08:43+00:00,0,0,1
...,...,...,...
2024-04-29 19:52:00+00:00,1,0,0
2024-04-29 20:00:00+00:00,0,0,1
2024-04-29 20:08:39+00:00,1,0,0
2024-04-29 20:33:57+00:00,0,0,1


In [None]:
#Checking for NaN values
# removing duplicates 
# # Check DataFrame's datatype and shape
# Top Keywords for the last day and overall
# Word Cloud for both 
# Sentiment distribution
# sentiment per author and per source
# sentiment overtime


In [3]:
empty_rows = combined_df.isnull().any(axis=1)
combined_df[empty_rows]
combined_df = combined_df.dropna()
combined_df.reset_index(drop=True, inplace=True)
combined_df

Unnamed: 0,source,title,publishedAt,author,url,content,word_count,Sentiment,Compound_Score
0,Hindustan Times,Horoscope Today: Astrological prediction for A...,2024-04-13 19:30:18+00:00,"Dr Prem Kumar Sharma, Manisha Koushik",https://www.hindustantimes.com/astrology/horos...,All zodiac signs have their own characteristic...,836,Positive,0.9999
1,Hindustan Times,"Weekly Horoscope Aquarius, April 14-20, 2024 p...",2024-04-13 18:40:16+00:00,Dr J.N Pandey,https://www.hindustantimes.com/astrology/horos...,Aquarius – (20th January to 18th February) Wee...,472,Positive,0.9991
2,Hindustan Times,"Weekly Horoscope Capricorn, April 14-20, 2024 ...",2024-04-13 18:39:16+00:00,Dr J.N Pandey,https://www.hindustantimes.com/astrology/horos...,Capricorn - 24th March 2024 Weekly Horoscope P...,435,Positive,0.9989
3,Hindustan Times,"Weekly Horoscope Libra, April 14-20, 2024 pred...",2024-04-13 18:36:15+00:00,Dr J.N Pandey,https://www.hindustantimes.com/astrology/horos...,Libra - (23rd September to 22nd October) Weekl...,463,Positive,0.9993
4,Page Six,'Golden Bachelor' alum Theresa Nist still wear...,2024-04-13 17:53:00+00:00,Sara Whitman,https://pagesix.com/2024/04/13/entertainment/g...,Click to email a link to a friend (Opens in ne...,403,Positive,0.9664
...,...,...,...,...,...,...,...,...,...
413,Eonline.com,Jill Duggar Shares Emotional Message Following...,2024-04-24 21:35:00+00:00,Gabrielle Chung,https://www.eonline.com/news/1400108/jill-dugg...,Watch : Jill Duggar Suffers Pregnancy Loss and...,167,Positive,0.9022
414,Fortune,Why David Beckham is suing Mark Wahlberg's fit...,2024-04-24 21:14:00+00:00,Jasmine Li,https://fortune.com/2024/04/24/david-beckham-m...,David Beckham is taking on Mark Wahlberg—but n...,497,Negative,-0.9438
415,News18,Jacqueline Fernandez Is A Bundle Of Sunshine A...,2024-04-24 19:58:30+00:00,Aditi Giri,https://www.news18.com/movies/jacqueline-ferna...,Jacqueline Fernandez delighted her Instagram f...,218,Positive,0.9854
416,Hindustan Times,"Capricorn Daily Horoscope Today,April 25, 2024...",2024-04-24 18:39:02+00:00,Dr J.N Pandey,https://www.hindustantimes.com/astrology/horos...,Capricorn – (22nd December to 19th January) Da...,437,Positive,0.9991


In [2]:
import os
import pandas as pd

# Function to extract date from filename
def extract_date(filename):
    return pd.to_datetime(filename.split('.')[0].split('news-')[1])

# Path to the folder containing parquet files
folder_path = "/workspace/World_news_tutorial/blob_files"

# Get list of parquet files in the folder
parquet_files = [f for f in os.listdir(folder_path) if f.endswith('.parquet')]

# Extract dates from filenames
dates = [extract_date(f) for f in parquet_files]

# Find the latest date
latest_date_index = dates.index(max(dates))

# Read the parquet file with the latest date into a DataFrame
latest_dataframe = pd.read_parquet(os.path.join(folder_path, parquet_files[latest_date_index]))
sentiment_data = latest_dataframe.Sentiment.value_counts()
sentiment_data = sentiment_data.reset_index()
sentiment_data


Unnamed: 0,Sentiment,count
0,Positive,63
1,Negative,14
2,Neutral,8


In [7]:
import streamlit as st
import plotly.graph_objects as go
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sample text data (replace this with your actual text data)
text_data = latest_dataframe.content.to_list()

tokens = word_tokenize(text_data)

# Remove stopwords
stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

# Join the filtered tokens back into a single string
filtered_text = ' '.join(filtered_tokens)

# Generate word cloud without stopwords
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(filtered_text)

# Create Plotly figure
fig3 = go.Figure()

# Add word cloud image as a trace
fig3.add_trace(go.Image(z=wordcloud.to_array(), hoverinfo='skip'))

# Update layout
fig3.update_layout(title='Word Cloud')

# Display the word cloud using Streamlit
st.plotly_chart(fig3)


Unnamed: 0,source,count
0,NDTV News,7
1,Deadline,5
2,Hindustan Times,5
3,The Times of India,4
4,Billboard,3
5,Fox News,3
6,BBC News,3
7,Variety,3
8,The Indian Express,3
9,YouTube,3
