In [1]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import json

# Base URLs of news sites
BASE_BBC_URL = "https://www.bbc.com"
BASE_DAWN_URL = "https://www.dawn.com"


articles = []

# Function to get article links
def extract_article_links(base_url):
    response = requests.get(base_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    article_links = [urljoin(base_url, link['href']) for link in soup.find_all('a', href=True)]
    return article_links

# Function to get article content
def extract_article_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    title = soup.find('h1').text if soup.find('h1') else ''
    description = soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else ''
    return title, description

# Fetch article links and their content from BBC
links = extract_article_links(BASE_BBC_URL)

for link in links:
    try:
        title, description = extract_article_content(link)
        # Add each article to the articles list
        articles.append({"title": title, "description": description})
    except Exception as e:
        print(f"Error with URL {link}: {e}")

# Save articles
output_path = 'scrapped_bbc_articles.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(articles, json_file, ensure_ascii=False, indent=4)

print(f"The Data has been successfully written to: {output_path}")

The Data has been successfully written to: scrapped_bbc_articles.json


In [2]:
import json
import re

# Function to clean text
def cleaned_text(text):
    text = re.sub('<.*?>', '', text)  # Remove HTML tags
    text = re.sub('\s+', ' ', text).strip()  # Remove extra spaces and newlines
    return text

# Load data from the existing JSON file
input_path = 'scrapped_bbc_articles.json'
with open(input_path, 'r', encoding='utf-8') as json_file:
    articles = json.load(json_file)

# Clean each article's title and description
for article in articles:
    article['title'] = cleaned_text(article.get('title', ''))
    article['description'] = cleaned_text(article.get('description', ''))

# Save cleaned data
output_path = 'bbc_articles_cleaned.json'
with open(output_path, 'w', encoding='utf-8') as json_file:
    json.dump(articles, json_file, ensure_ascii=False, indent=4)

print(f"Cleaned data successfully written to {output_path}")

Cleaned data successfully written to bbc_articles_cleaned.json


  text = re.sub('\s+', ' ', text).strip()  # Remove extra spaces and newlines


In [1]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

# Set the path to your client secrets file
client_secrets_path = 'client_secrets.json'

# Authenticate with Google Drive
gauth = GoogleAuth()
gauth.LoadClientConfigFile(client_secrets_path)
gauth.LocalWebserverAuth()

# Initialize Google Drive client
drive = GoogleDrive(gauth)

# Example function to upload a file
def upload_file_to_drive(file_path, drive_folder_id):
    file = drive.CreateFile({'parents': [{'id': drive_folder_id}]})
    file.SetContentFile(file_path)
    file.Upload()


drive_folder_id = "1ykMTaoSCHoW7QzFWmzmlaxd7IcCaqftD"
file_path = r"C:\Users\Ahsan\Downloads\i201787_Assignment#2\bbc_articles_cleaned.json"

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=461645083159-nq6vu5pdur3ua86lldafjik3msc9cetp.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8080%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&access_type=offline&response_type=code

Authentication successful.
