In [1]:
import time
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager



In [2]:

# Resolve a chromedriver binary via webdriver_manager, wrap it in a Selenium
# Service, and start the Chrome session used by the following cells.
chromedriver_path = ChromeDriverManager().install()
service = Service(chromedriver_path)
driver = webdriver.Chrome(service=service)

In [3]:
# Landing page of the AllSides media bias ratings.
allsides_url = "https://www.allsides.com/media-bias/ratings"

# Fixed pause (seconds) after navigation so the page has time to render
# before the next cell starts interacting with it.
PAGE_SETTLE_SECONDS = 3

driver.get(allsides_url)
time.sleep(PAGE_SETTLE_SECONDS)

In [4]:

# CSS selector for the first option of the featured-bias-rating filter.
# NOTE(review): presumably this is the "All" option — confirm against the live page.
ALL_SOURCES_SELECTOR = "#edit-field-featured-bias-rating-value > div:nth-child(1) > label"

# Step 1: locate the option. Only this lookup can legitimately report
# "not found", so it gets its own try block.
try:
    all_sources = WebDriverWait(driver, 10).until(
        EC.element_to_be_clickable((By.CSS_SELECTOR, ALL_SOURCES_SELECTOR))
    )
except TimeoutException:
    print("Element not found within the time limit")
else:
    # Step 2: scroll the element into view and click it.
    # (The former bare `WebDriverWait(driver, 2)` was removed: constructing a
    # wait object without calling .until() does not wait for anything.)
    driver.execute_script("arguments[0].scrollIntoView(true);", all_sources)
    print("Attempting to click the element using Selenium's click method.")
    try:
        all_sources.click()
        print("Element clicked successfully using Selenium's click method.")
    except ElementClickInterceptedException:
        # Another element covers the label; dispatch the click through the
        # DOM instead of simulating a mouse click.
        print("ElementClickInterceptedException occurred, attempting JavaScript click.")
        driver.execute_script("arguments[0].click();", all_sources)
        print("Element clicked successfully using JavaScript click.")





Attempting to click the element using Selenium's click method.
ElementClickInterceptedException occurred, attempting JavaScript click.
Element clicked successfully using JavaScript click.


In [5]:
from selenium.common.exceptions import TimeoutException, ElementClickInterceptedException

def try_click_element(element, description):
    """
    Click ``element``, falling back to a JavaScript click when intercepted.

    A normal Selenium click is attempted first. If it raises
    ``ElementClickInterceptedException``, the click is re-issued through
    ``driver.execute_script`` using the notebook-level ``driver``.

    Parameters:
    element: Object exposing ``click()`` (a Selenium WebElement in this notebook).
    description (str): Human-readable label used in the progress messages.

    Returns:
    None
    """
    try:
        element.click()
    except ElementClickInterceptedException:
        print(f"ElementClickInterceptedException occurred for '{description}', attempting JavaScript click.")
        driver.execute_script("arguments[0].click();", element)
        print(f"'{description}' clicked successfully using JavaScript click.")
    else:
        # Reached only when the native click raised nothing.
        print(f"'{description}' clicked successfully.")

# Selector template for the labels of the "news source type" filter checkboxes.
# `{tid}` is the numeric suffix of the `form-item-edit-field-news-source-type-tid-N`
# class on the checkbox wrapper.
SOURCE_TYPE_LABEL_SELECTOR = (
    "#edit-field-news-source-type-tid-wrapper > div > div > div > div > "
    "div.form-item.form-type-bef-checkbox.form-item-edit-field-news-source-type-tid-{tid} > label"
)

# Checkbox name -> tid suffix, in the order they should be clicked.
source_type_filters = {
    "Author": 1,
    "Think Tank / Policy Group": 3,
}

for filter_name, tid in source_type_filters.items():
    try:
        checkbox_label = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.CSS_SELECTOR, SOURCE_TYPE_LABEL_SELECTOR.format(tid=tid))
            )
        )
    except TimeoutException:
        # Name the checkbox that failed, and stop: as before, a timeout
        # aborts the remaining clicks.
        print(f"'{filter_name}' checkbox was not found within the time limit.")
        break
    print(f"Attempting to click '{filter_name}' checkbox using Selenium's click method.")
    try_click_element(checkbox_label, f"{filter_name} checkbox")


Attempting to click 'Author' checkbox using Selenium's click method.
ElementClickInterceptedException occurred for 'Author checkbox', attempting JavaScript click.
'Author checkbox' clicked successfully using JavaScript click.
Attempting to click 'Think Tank / Policy Group' checkbox using Selenium's click method.
ElementClickInterceptedException occurred for 'Think Tank / Policy Group checkbox', attempting JavaScript click.
'Think Tank / Policy Group checkbox' clicked successfully using JavaScript click.


A key challenge was scraping the dynamically loaded (infinite-scroll) content efficiently. Rather than scrolling the page with Selenium, which is slow, the scraper requests each page of results directly by setting the `page` query parameter in the site's URL.

In [None]:
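# NOTE: Superseded approach, kept for reference only. It scrolled the page with Selenium
# until the infinite-scroll loader stopped appearing; the data is now fetched page by page
# through the `page` URL parameter instead.
# from selenium.common.exceptions import NoSuchElementException, TimeoutException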
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# def scroll_until_loaded(driver, timeout=100):
#     check_interval = 0.5  # How often to check for the loading indicator's disappearance
#     max_attempts = timeout / check_interval
#     attempts = 0

#     while True:
#         # Scroll down by one viewport height
#         time.sleep(3 )
#         driver.execute_script("window.scrollBy(0, window.innerHeight);")
        

#         try:
#             # Wait for the loading indicator to be visible
#             loader = WebDriverWait(driver, check_interval).until(
#                 EC.visibility_of_element_located((By.CSS_SELECTOR, "div#views_infinite_scroll-ajax-loader"))
#             )
#             attempts = 0  # Reset attempts counter after loader has been found

#             # Now wait until the loading indicator is no longer visible
#             WebDriverWait(driver, check_interval).until_not(
#                 EC.visibility_of_element_located((By.CSS_SELECTOR, "div#views_infinite_scroll-ajax-loader"))
#             )
#         except TimeoutException:
#             # If the loading indicator wasn't found or didn't disappear after check_interval seconds,
#             # increment the failed attempts counter.
#             attempts += 1
        
#         # If the loading indicator hasn't been seen for several checks, we may be at the end
#         if attempts >= max_attempts:
#             break

# # Use the scroll_until_loaded function within your main scraping logic
# try:
#     scroll_until_loaded(driver)
#     print("Finished scrolling, the full page content should be loaded now.")
# except Exception as e:
#     print(f"An error occurred during scrolling: {e}")


In [6]:

def page_creator(page_num):
    """
    Build the AllSides media-bias ratings URL for one page of results.

    The query string is fixed except for the trailing ``page`` parameter,
    which is filled in with ``page_num``.

    Parameters:
    page_num: Page index to request; formatted into the URL as-is.

    Returns:
    str: The full URL for that page.
    """
    base_url = "https://www.allsides.com/media-bias/ratings"
    # Query fragments are kept pre-encoded (%5B / %5D are "[" / "]").
    query_fragments = [
        "field_featured_bias_rating_value=All",
        "field_news_source_type_tid%5B%5D=1",
        "field_news_source_type_tid%5B%5D=2",
        "field_news_source_type_tid%5B%5D=3",
        "field_news_bias_nid_1%5B1%5D=1",
        "field_news_bias_nid_1%5B2%5D=2",
        "field_news_bias_nid_1%5B3%5D=3",
        "title=",
        f"page={page_num}",
    ]
    return base_url + "?" + "&".join(query_fragments)


In [7]:
"""
Fetches and processes media bias data from allsides.com for a given page number.

    This function constructs a URL for the specified page, sends an HTTP request, 
    and parses the response content using BeautifulSoup. It extracts the media bias column, 
    internal URLs, and data table, converting it into a pandas DataFrame. The function 
    then cleans up the DataFrame by renaming columns, separating 'agree' and 'disagree' 
    counts, and mapping bias ratings to numerical values.

    Parameters:
    page_num (int): The page number to scrape data from.

    Returns:
    DataFrame: A pandas DataFrame containing the scraped data with additional 
    columns for 'agree', 'disagree', 'perc_agree', and 'rating_num'.
"""

def get_bias(page_num):

    url = page_creator(page_num)
    response = requests.get(url)
    if response.status_code != 200:
        print('Failed to retrieve the webpage')
        return
    
    soup = BeautifulSoup(response.content, 'html.parser')

    print("Fetching media bias column...")
    media_bias_col =  [link['href'].split('/')[-1] for link in soup.select(".views-field-field-bias-image a") if link.get('href')]
    
    print("Fetching internal URLs...")
    internal_url = ["https://www.allsides.com" + a['href'] for a in soup.select(".source-title a")]
    
    print("Extracting the data table...")
    table = soup.select_one(".views-table")
    df = pd.read_html(str(table))[0]
    print("Table extracted successfully.")
    
    df['bias_rating'] = media_bias_col
    df['news_source_internal_url'] = internal_url
    print("Bias rating links and internal URLs columns added.")
    
    # Cleaning column names
    print("Cleaning column names...")
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    print("Column names cleaned.")
    df = df.drop(columns=['allsides_bias_rating', 'what_do_you_think?'])

    df = df.rename(columns={'community_feedback_(biased,_not_normalized)': 'agree_disagree'})
       
    df[['agree', 'disagree']] = df['agree_disagree'].str.extract(r'(\d+)/(\d+)')

    df['agree'] = pd.to_numeric(df['agree'], errors='coerce')

    df['disagree'] = pd.to_numeric(df['disagree'], errors='coerce')

    df['perc_agree'] = df['agree'] / (df['agree'] + df['disagree'])

    rating_conversion = {
        "left": 1,
        "left-center": 2,
        "center": 3,
        "right-center": 4,
        "right": 5
    }
    df['rating_num'] = df['bias_rating'].map(rating_conversion)
    
    print(df.head())
        
    return df


In [9]:

# Number of result pages to request (page indices 0 .. NUM_PAGES - 1).
NUM_PAGES = 46

# Collect the per-page frames and concatenate once at the end. Calling
# pd.concat inside the loop re-copies all accumulated rows on every iteration.
page_frames = []
for page_num in range(NUM_PAGES):
    page_df = get_bias(page_num)
    if page_df is None:
        print(f"Failed to retrieve data for page {page_num}")
        continue
    page_frames.append(page_df)

# pd.concat raises on an empty list, so fall back to an empty frame when
# every page failed (the same result the original loop produced).
all_pages_df = pd.concat(page_frames, ignore_index=True) if page_frames else pd.DataFrame()



Fetching media bias column...
Fetching internal URLs...
Extracting the data table...
Table extracted successfully.
Bias rating links and internal URLs columns added.
Cleaning column names...
Column names cleaned.
                      news_source          agree_disagree   bias_rating  \
0                 Karol Markowicz  agree disagree 145/249         right   
1                    Philip Klein   agree disagree 60/135  right-center   
2  "The Conversation" Contributor   agree disagree 115/97   left-center   
3       "The Fulcrum" Contributor    agree disagree 76/95        center   
4         "USA Today" Contributor  agree disagree 106/112   left-center   

                            news_source_internal_url  agree  disagree  \
0  https://www.allsides.com/news-source/karol-mar...    145       249   
1  https://www.allsides.com/news-source/philip-kl...     60       135   
2  https://www.allsides.com/news-source/conversat...    115        97   
3  https://www.allsides.com/news-source/fulc

In [10]:
# Write the combined ratings to disk, without the positional index column.
OUTPUT_CSV_PATH = 'allsides_bias_data.csv'
all_pages_df.to_csv(OUTPUT_CSV_PATH, index=False)

In [11]:
# Preview the first five rows of the combined table as a quick sanity check.
all_pages_df.head(n=5)

Unnamed: 0,news_source,agree_disagree,bias_rating,news_source_internal_url,agree,disagree,perc_agree,rating_num
0,Karol Markowicz,agree disagree 145/249,right,https://www.allsides.com/news-source/karol-mar...,145,249,0.36802,5.0
1,Philip Klein,agree disagree 60/135,right-center,https://www.allsides.com/news-source/philip-kl...,60,135,0.307692,4.0
2,"""The Conversation"" Contributor",agree disagree 115/97,left-center,https://www.allsides.com/news-source/conversat...,115,97,0.542453,2.0
3,"""The Fulcrum"" Contributor",agree disagree 76/95,center,https://www.allsides.com/news-source/fulcrum-c...,76,95,0.444444,3.0
4,"""USA Today"" Contributor",agree disagree 106/112,left-center,https://www.allsides.com/news-source/usa-today...,106,112,0.486239,2.0
