In [4]:
import os
from io import StringIO

import pandas as pd
from azure.storage.blob import BlobServiceClient

# Azure Blob location (non-secret settings)
account_url = "https://storageaccountmuskan.blob.core.windows.net"
container_name = "input-data"
blob_name = "news_sentiment_analysis.csv"

# The SAS token is a credential and must never be committed with the notebook.
# It is read from the environment instead; set it before starting Jupyter, e.g.
#   export AZURE_STORAGE_SAS_TOKEN="sp=r&st=...&sig=..."
# Any token that was previously embedded in this file should be treated as
# leaked and revoked/rotated.
sas_token = os.environ.get("AZURE_STORAGE_SAS_TOKEN")
if not sas_token:
    raise RuntimeError(
        "Set the AZURE_STORAGE_SAS_TOKEN environment variable to a valid SAS "
        "token for the blob before running this cell."
    )

# Create client
blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token)
blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)

# Download the whole blob and decode it as UTF-8 text
download_stream = blob_client.download_blob()
data = download_stream.readall().decode('utf-8')

# Load into pandas
df = pd.read_csv(StringIO(data))
print(df.head())


          Source          Author  \
0        stgnews  Bridger Palmer   
1  Zimbabwe Mail  Staff Reporter   
2      4-traders             NaN   
3      4-traders             NaN   
4         PLANET             NaN   

                                               Title  \
0  Pine View High teacher wins Best in State awar...   
1  Businesses Face Financial Strain Amid Liquidit...   
2  Musk donates to super pac working to elect Tru...   
4                          Rooftop solar's dark side   

                                         Description  \
0  ST. GEORGE — Kaitlyn Larson, a first-year teac...   
1  Harare, Zimbabwe – Local businesses are grappl...   
2  (marketscreener.com) Billionaire Elon Musk has...   
3  (marketscreener.com) A U.S. trade regulator on...   
4  4.5 million households in the U.S. have solar ...   

                                                 URL  \
0  https://www.stgeorgeutah.com/news/archive/2024...   
1  https://www.thezimbabwemail.com/business/busin... 

In [2]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(3500, 8)

In [3]:
# Print a per-column summary of the frame: non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3500 entries, 0 to 3499
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Source        3500 non-null   object        
 1   Author        2512 non-null   object        
 2   Title         3500 non-null   object        
 3   Description   3500 non-null   object        
 4   URL           3500 non-null   object        
 5   Published At  3500 non-null   datetime64[ns]
 6   Sentiment     3500 non-null   object        
 7   Type          3500 non-null   object        
dtypes: datetime64[ns](1), object(7)
memory usage: 218.9+ KB


Cleaning and Preprocessing the Data 

In [4]:
# Build the cleaned frame in one chain:
#   1. discard rows that lack a Title or a Description,
#   2. cast both columns to str and trim leading/trailing whitespace,
#   3. join them as "<Title>. <Description>" to give the sentiment step more context.
# Later keyword arguments of .assign() see the columns produced by earlier ones,
# so combined_text is built from the already-stripped Title and Description.
df = (
    df
    .dropna(subset=['Title', 'Description'])
    .assign(
        Title=lambda frame: frame['Title'].astype(str).str.strip(),
        Description=lambda frame: frame['Description'].astype(str).str.strip(),
        combined_text=lambda frame: frame['Title'] + ". " + frame['Description'],
    )
)

Calculating Sentiment Score

In [5]:
import sys
# Run pip through the interpreter at sys.executable (the Python running this
# kernel) instead of whichever `pip` happens to be first on PATH.
# NOTE(review): versions are not pinned, so a re-run may install different releases.
!{sys.executable} -m pip install textblob
!{sys.executable} -m pip install nltk



In [6]:
from textblob import TextBlob

def get_sentiment(text, threshold=0.1):
    """Classify ``text`` as 'positive', 'negative' or 'neutral'.

    The label is derived from TextBlob's polarity score for the text
    (a float from -1 to 1).

    Parameters
    ----------
    text : str
        The text to analyse.
    threshold : float, optional
        Non-negative half-width of the neutral band around zero. Polarity
        strictly above ``threshold`` is 'positive', strictly below
        ``-threshold`` is 'negative', anything else (including the two
        boundary values) is 'neutral'. Defaults to 0.1.

    Returns
    -------
    str
        One of 'positive', 'negative' or 'neutral'.

    Raises
    ------
    ValueError
        If ``threshold`` is negative, since the positive and negative
        bands would then overlap.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    polarity = TextBlob(text).sentiment.polarity  # -1 to 1

    if polarity > threshold:
        return 'positive'
    if polarity < -threshold:
        return 'negative'
    return 'neutral'

# Apply sentiment analysis: run get_sentiment on every row's combined_text and
# store the returned label in a new 'calculated_sentiment' column
df['calculated_sentiment'] = df['combined_text'].apply(get_sentiment)


In [7]:
# Preview the first rows: each Title with the dataset's own Sentiment label
# next to the label calculated in this notebook
df[['Title', 'Sentiment', 'calculated_sentiment']].head()

Unnamed: 0,Title,Sentiment,calculated_sentiment
0,Pine View High teacher wins Best in State awar...,positive,positive
1,Businesses Face Financial Strain Amid Liquidit...,neutral,neutral
2,Musk donates to super pac working to elect Tru...,positive,positive
3,US FTC issues warning to franchisors over unfa...,negative,negative
4,Rooftop solar's dark side,positive,positive


Compare the original sentiment labels with the calculated ones:

In [8]:
# Emit the header and, on the following line, the count of rows for every
# (Sentiment, calculated_sentiment) pair, most frequent pair first.
print(
    "Original vs Calculated Sentiment Comparison:",
    df[['Sentiment', 'calculated_sentiment']].value_counts(),
    sep="\n",
)

Original vs Calculated Sentiment Comparison:
Sentiment  calculated_sentiment
positive   positive                1447
neutral    neutral                  736
positive   neutral                  678
negative   neutral                  312
           negative                 249
neutral    positive                  40
negative   positive                  16
neutral    negative                  13
positive   negative                   9
dtype: int64


In [9]:
# Write the current frame to a CSV file in the working directory, omitting the
# row index (index=False)
df.to_csv('cleaned_dataset_sentiment_analysis.csv', index=False)