In [2]:
from urllib.parse import quote

import pandas as pd
import plotly.express as px
import requests
from bs4 import BeautifulSoup


In [None]:
"""
    Constructs a search URL for the AllSides website based on the given topic and page number.

    Parameters:
    topic (str): A keyword that defines the topic of interest, such as 'israel_hamas_war' or 'ukraine_russia_war'.
    page_num (int): The page number to access in the paginated search results.

    Returns:
    str: A complete URL string that can be used to fetch the search results page for the given topic and page number.
"""
def page_creator(topic, page_num):
    """Build the AllSides search URL for one results page of a topic.

    The two known topic keys are mapped to a hand-picked search phrase. Any other
    topic is used as the search phrase itself (underscores become spaces) rather
    than silently producing a URL with an empty search term.

    Parameters:
    topic (str): Topic key such as 'israel_hamas_war' or 'ukraine_russia_war', or
        free text to search for.
    page_num (int): Page index, inserted verbatim as the `page` query parameter.

    Returns:
    str: The complete search URL for the requested topic and page.
    """
    search_phrases = {
        'israel_hamas_war': 'middle east',
        'ukraine_russia_war': 'ukraine war',
    }
    base_url = "https://www.allsides.com/search"

    phrase = search_phrases.get(topic)
    if phrase is None:
        # Unknown key: search for the topic text itself; None/'' still yields an empty term.
        phrase = str(topic or '').replace('_', ' ')

    # Percent-encode so spaces become %20 and reserved characters cannot break the query string.
    search_term = quote(phrase, safe='')
    return f"{base_url}?search={search_term}&item_bundle=All&sort_by=search_api_relevance&page={page_num}"

In [None]:
# Topic keys understood by page_creator; later cells pick one by position.
topics = [
    'israel_hamas_war',    # topics[0]
    'ukraine_russia_war',  # topics[1]
]


In [27]:
"""
    Scrapes articles from the AllSides website for a given topic over a specified number of pages.

    Parameters:
    topic (str): The topic for which articles are to be scraped, used to generate the correct URL.
    pages (int): The number of pages to scrape for the topic, where each page contains multiple articles.

    Returns:
    pandas.DataFrame: A DataFrame containing the scraped data with columns for the date, title, and bias rating of each article.
"""
def scrape_allside_articles(topic, pages, timeout=30):
    """Scrape AllSides search results for a topic into a DataFrame.

    Requests result pages 0 .. pages - 1 (URLs come from page_creator) and extracts
    one record per '.views-row' element found on each page. A page that cannot be
    fetched (network error, timeout, or a non-200 status) is reported with print()
    and skipped, so a single bad page does not discard the rows already collected.

    Parameters:
    topic (str): Topic passed to page_creator to build each page URL.
    pages (int): Number of result pages to request, starting at page 0.
    timeout (float): Seconds to wait for each HTTP response before giving up on that page.

    Returns:
    pandas.DataFrame: Columns 'Date', 'Title' and 'Bias Rating', one row per result.
        Missing values are filled with the placeholders 'Date not found',
        'No headline found' and 'not rated'. The columns are present even when
        nothing could be scraped.
    """
    all_articles = []

    # One session for the whole run so the connection to the site can be reused.
    with requests.Session() as session:
        for page_num in range(pages):
            url = page_creator(topic, page_num)
            try:
                response = session.get(url, timeout=timeout)
            except requests.RequestException as error:
                print(f'Failed to retrieve the webpage for page {page_num}: {error}')
                continue

            if response.status_code != 200:
                print(f'Failed to retrieve the webpage for page {page_num}')
                continue

            soup = BeautifulSoup(response.content, 'html.parser')
            all_articles.extend(
                _parse_search_result(article) for article in soup.select('.views-row')
            )

    # Explicit columns keep the frame's schema stable even when no rows were collected.
    return pd.DataFrame(all_articles, columns=['Date', 'Title', 'Bias Rating'])


def _parse_search_result(article):
    """Extract date, headline and bias rating from one '.views-row' search result element."""
    date_element = article.select_one('p.search-result-publish-date > span.field-content')
    headline_tag = article.select_one('h3.search-result-title > span > a')
    bias_image_tag = article.select_one('.search-result-source img')

    return {
        'Date': date_element.get_text(strip=True) if date_element else 'Date not found',
        'Title': headline_tag.get_text(strip=True) if headline_tag else 'No headline found',
        'Bias Rating': _bias_rating_from_image_url(
            bias_image_tag.get('src') if bias_image_tag else None
        ),
    }


def _bias_rating_from_image_url(image_url):
    """Derive the bias label from the file name of a result's source rating image."""
    if not image_url:
        return 'not rated'

    # Keep the last path segment up to its first dot, e.g. '.../bias-left.png' -> 'bias-left'.
    file_stem = image_url.split('/')[-1].split('.')[0]
    # Dropping 'leaning-' folds the "leaning" variants into the plain label;
    # the 'allsides-new-12_0' image name is reported as 'not rated'.
    return (
        file_stem
        .replace('bias-', '')
        .replace('leaning-', '')
        .replace('allsides-new-12_0', 'not rated')
    )

# Scrape 1270 result pages for the first topic and show the (truncated) frame.
df_articles_israel_hamas = scrape_allside_articles(
    topic=topics[0],
    pages=1270,
)
print(df_articles_israel_hamas)

                 Date                                              Title  \
0         Feb 01 2024                 How to end the Middle East’s agony   
1         Jan 30 2024   Biden struggles to keep a lid on the Middle East   
2      Date not found                                    Middle East Eye   
3         Feb 01 2024  A Biden Doctrine for the Middle East Is Formin...   
4         Feb 01 2024  Will America Get Pulled Into Another War in th...   
...               ...                                                ...   
12691     Oct 06 2015  Two Reasons We Have Limited Options for President   
12692  Date not found                      How AllSides Rates Media Bias   
12693     Sep 29 2016                        Who Won the Hofstra Debate?   
12694     Feb 01 2016              Menace or Victim? Depends Who You Ask   
12695  Date not found                                              Press   

      Bias Rating  
0            left  
1            left  
2       not rated  
3      

In [28]:
# Persist the scraped rows to CSV; the DataFrame index is not written.
df_articles_israel_hamas.to_csv(
    'israel_hamas_war_article_data.csv',
    index=False,
)

In [37]:
# Missing dates are scraped as the placeholder string 'Date not found', not as NaN, so
# dropping before parsing removes nothing. Parse first (unparseable values become NaT
# via errors='coerce'), then drop the undated rows. No inplace mutation, so the cell
# can be re-run safely.
df_articles_israel_hamas = (
    df_articles_israel_hamas
    .assign(
        Date=lambda frame: pd.to_datetime(frame['Date'], format='%b %d %Y', errors='coerce')
    )
    .dropna(subset=['Date'])
)

In [46]:
def count_articles_per_day(articles):
    """Return one row per calendar day ('Date') with that day's number of articles ('Article Count')."""
    return articles.groupby(articles['Date'].dt.date).size().reset_index(name='Article Count')


sorted_articles = df_articles_israel_hamas.sort_values(by='Bias Rating')

# One subset per bias rating shown in the chart.
left_articles = sorted_articles[sorted_articles['Bias Rating'] == 'left']
center_articles = sorted_articles[sorted_articles['Bias Rating'] == 'center']
right_articles = sorted_articles[sorted_articles['Bias Rating'] == 'right']

left_agg = count_articles_per_day(left_articles)
center_agg = count_articles_per_day(center_articles)
right_agg = count_articles_per_day(right_articles)

fig = px.line(left_agg, x='Date', y='Article Count', title='Number of Articles Over Time by Bias')
# px.line creates a single unnamed trace that is left out of the legend; name it so
# the legend lists all three series instead of only Center and Right.
fig.update_traces(name='Left', showlegend=True)
fig.add_scatter(x=center_agg['Date'], y=center_agg['Article Count'], mode='lines', name='Center')
fig.add_scatter(x=right_agg['Date'], y=right_agg['Article Count'], mode='lines', name='Right')

fig.update_xaxes(title_text='Time')
fig.update_yaxes(title_text='Number of Articles')
fig.update_layout(legend_title_text='Bias')

fig.show()

In [52]:
# Scrape 1429 result pages for the second topic, show the (truncated) frame,
# then save it to CSV without the DataFrame index.
df_ukraine_war_articles = scrape_allside_articles(
    topic=topics[1],
    pages=1429,
)
print(df_ukraine_war_articles)
df_ukraine_war_articles.to_csv(
    'ukraine_war_article_data.csv',
    index=False,
)