# AIDI 1006: AI Infrastructure and Architecture
# Assignment 4: Sentiment Analysis
# Jay Babulal Patel : 200543276

In [12]:
import json
import feedparser
import requests
from lxml import html
import pandas as pd

# Load RSS feeds from JSON file. JSON text is UTF-8 by specification, so the
# encoding is pinned rather than left to the platform default.
with open('NewsPapers.json', 'r', encoding='utf-8') as file:
    # The top-level 'newspapers' key holds a mapping that is iterated below as
    # (name, info) pairs, where info['rss'] is the feed URL.
    newspapers = json.load(file)['newspapers']

# Function to fetch and parse article content
def fetch_article_content(url, timeout=10):
    """Download an article page and return the text of its body container.

    Parameters
    ----------
    url : str
        Address of the article page.
    timeout : float or tuple, optional
        Passed to ``requests.get``. Without it a single unresponsive server
        would block the whole scraping loop indefinitely. Defaults to 10
        seconds.

    Returns
    -------
    str or None
        Whitespace-stripped text of the first node matched by the hard-coded
        XPath. ``None`` when the request or parsing raises, the HTTP status
        is not 200, or the XPath matches nothing.

    Notes
    -----
    This is deliberately best-effort: any exception is printed and converted
    into a ``None`` result so one bad article does not stop the run.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            tree = html.fromstring(response.content)
            # Positional XPath tied to the page layout being scraped; an
            # empty match list simply falls through to the final return.
            content = tree.xpath('//*[@id="main-content"]/article/div[3]')
            if content:
                return content[0].text_content().strip()
    except Exception as e:
        print(f"Error fetching {url}: {e}")
    return None

# Column order of the exported CSV. Passing it to DataFrame explicitly keeps
# the header row present even when no article could be scraped.
ARTICLE_COLUMNS = ['newspaper', 'title', 'link', 'published', 'content']

# Initialize list to hold article data (one dict per scraped article)
articles = []

# Iterate through each newspaper and fetch articles
for name, info in newspapers.items():
    print(f"Fetching articles from {name}...")
    feed = feedparser.parse(info['rss'])
    for entry in feed.entries:
        # Feed entries are not guaranteed to carry every field. An entry
        # without a link cannot be scraped, so it is skipped instead of
        # raising AttributeError and discarding everything collected so far.
        link = entry.get('link')
        if not link:
            continue
        title = entry.get('title', 'N/A')
        published = entry.get('published', 'N/A')
        content = fetch_article_content(link)
        # Only keep articles whose body text was actually retrieved.
        if content:
            articles.append({
                'newspaper': name,
                'title': title,
                'link': link,
                'published': published,
                'content': content
            })

# Create a DataFrame and save to CSV
df = pd.DataFrame(articles, columns=ARTICLE_COLUMNS)
df.to_csv('scraped_articles.csv', index=False)
print("Articles saved to scraped_articles.csv")


Fetching articles from BBC world...
Fetching articles from BBC Asia...
Fetching articles from BBC UK...
Fetching articles from BBC Business...
Fetching articles from BBC Politics...
Fetching articles from BBC Health...
Fetching articles from BBC Science...
Fetching articles from BBC Technology...
Articles saved to scraped_articles.csv


In [15]:
import pandas as pd
from textblob import TextBlob

# Read the CSV file written by the scraping cell; its 'content' column is the
# text that gets scored below.
df = pd.read_csv('scraped_articles.csv')

# Score a piece of text with TextBlob
def analyze_sentiment(text):
    """Return the ``(polarity, subjectivity)`` pair TextBlob reports for ``text``."""
    scores = TextBlob(text).sentiment
    return scores.polarity, scores.subjectivity

# Apply sentiment analysis to the content. The scores are gathered into a list
# first instead of being unpacked from zip(*...), so a frame with zero rows
# produces two empty columns rather than raising ValueError.
sentiment_scores = [analyze_sentiment(text) for text in df['content']]
df['polarity'] = [polarity for polarity, _ in sentiment_scores]
df['subjectivity'] = [subjectivity for _, subjectivity in sentiment_scores]

# Save the DataFrame with sentiment analysis results to a new CSV file
df.to_csv('scraped_articles_with_sentiment.csv', index=False)
print("Sentiment analysis completed and saved to scraped_articles_with_sentiment.csv")


Sentiment analysis completed and saved to scraped_articles_with_sentiment.csv


In [27]:
import pandas as pd
import csv

# Reload the sentiment-scored articles
df = pd.read_csv('scraped_articles_with_sentiment.csv')

# Write a copy in which every field is wrapped in double quotes
df.to_csv(
    'scraped_articles_with_sentiment_corrected.csv',
    quotechar='"',
    quoting=csv.QUOTE_ALL,
    index=False,
)


In [3]:
# Reload the sentiment-scored articles
df = pd.read_csv('scraped_articles_with_sentiment.csv')

# Parse the 'published' timestamps (weekday, day, month name, year, time, zone
# name) and keep only the calendar date, rendered as YYYY-MM-DD text
published_at = pd.to_datetime(df['published'], format='%a, %d %b %Y %H:%M:%S %Z')
df['published'] = published_at.dt.strftime('%Y-%m-%d')

In [4]:
# Persist df as left by the previous cell (no index column); this file is the
# input of the quote/comma cleaning step further down.
df.to_csv('scraped_articles_with_sentiment_corrected_modified.csv', index=False)

In [14]:
import csv

def remove_quotes_and_commas(input_file, output_file, delimiter=','):
    """Copy a CSV file, deleting every double quote and comma inside data cells.

    The header row is written through unchanged. Every following row has the
    characters ``"`` and ``,`` removed from each cell. Rows whose cell count
    differs from the header's are still written, but are reported afterwards
    together with their 1-based row number (the header being row 1).

    Parameters
    ----------
    input_file : str
        Path of the UTF-8 CSV file to read.
    output_file : str
        Path of the UTF-8 CSV file to write. It is only opened after the
        whole input has been read.
    delimiter : str, optional
        Field separator used for both reading and writing. Defaults to ``,``.

    Returns
    -------
    None
        Progress and errors are reported with ``print``. Any exception,
        including an empty input file, is caught and printed rather than
        raised, in which case no output is written.
    """
    try:
        cleaned_rows = []
        problematic_rows = []

        # newline='' hands line-ending handling to the csv module, so line
        # breaks embedded in quoted fields are read back exactly as stored.
        with open(input_file, 'r', newline='', encoding='utf-8') as infile:
            reader = csv.reader(infile, delimiter=delimiter, quotechar='"')
            headers = next(reader, None)
            if headers is None:
                # Give the caller a readable reason instead of the blank
                # message a bare StopIteration would produce.
                raise ValueError(f"{input_file} has no header row found (file is blank)")
            cleaned_rows.append(headers)

            for row_number, row in enumerate(reader, start=2):
                cleaned_row = [cell.replace('"', '').replace(',', '') for cell in row]
                if len(cleaned_row) != len(headers):
                    problematic_rows.append((row_number, row))
                cleaned_rows.append(cleaned_row)

        with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
            writer = csv.writer(outfile, delimiter=delimiter, quotechar='"', quoting=csv.QUOTE_MINIMAL)
            writer.writerows(cleaned_rows)

        print(f"CSV file has been successfully cleaned and saved to {output_file}")
        if problematic_rows:
            print(f"Problematic rows found and cleaned: {problematic_rows}")
    except Exception as e:
        print(f"Error while processing the CSV file: {e}")

# Source: the date-normalized sentiment CSV; destination: the final data file
input_file = 'scraped_articles_with_sentiment_corrected_modified.csv'
output_file = 'data.csv'

# Strip quotes and commas from every data cell using the default delimiter
remove_quotes_and_commas(input_file=input_file, output_file=output_file)


CSV file has been successfully cleaned and saved to data.csv
