In [94]:
import time
from copy import deepcopy
import json

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm

In [1]:
# Function for saving html

def save_html(html, path):
    with open(path, mode='wb')as f:
        f.write(html)

In [2]:
# Function for reading a local html file

def open_html(path):
    with open(path, mode='rb') as f:
        return f.read()

## # Sending GET request

Sending a get request and storing the response in response variable

In [4]:
url = "https://www.allsides.com/media-bias/media-bias-ratings"

response = requests.get(url)

In [5]:
print(response.content[:100])

b'<!DOCTYPE html>\n<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->\n<!--[if lte'


## # Creating a soup object

A soup object defines many methods that can be used to find elements and their contents.

In [7]:
soup = BeautifulSoup(response.content, features='html.parser')

In [8]:
rows = soup.select("tbody tr")

In [21]:
# print(rows[0].select_one(".source-title").text.strip())

name = rows[0].select_one(".source-title").text.strip()
name

'ABC News (Online)'

In [28]:
# getting the relative URL in href of anchor tag
# use brackets to select HTML attribute values

abc = rows[0].select_one('.source-title a')['href']
abc

'/news-source/abc-news-media-bias'

In [30]:
# creating an absolute URL

allsides_abcpage = "https://www.allsides.com" + abc

allsides_abcpage

https://www.allsides.com/news-source/abc-news-media-bias


### Getting political bias info

In [38]:
rows[0].select_one(".views-field-field-bias-image a")['href'].split('/')[-1]

'left-center'

In [39]:
bias = rows[0].select_one(".views-field-field-bias-image a")['href'].split('/')[-1]
bias

'left-center'

In [52]:
agree = int(rows[0].select_one('.agree').text)
agree

32966

In [54]:
disagree = int(rows[0].select_one('.disagree').text)
disagree

16881

In [55]:
agree_ratio = agree/ disagree
agree_ratio

1.9528463953557254

In [59]:
print(f"Agree: {agree}, disagree: {disagree}, ratio: {agree_ratio: .2f}")

Agree: 32966, disagree: 16881, ratio:  1.95


In [60]:
def get_agreeance_text(ratio):
    if ratio > 3: return "absolutely agrees"
    elif 2 < ratio <= 3: return "strongly agrees"
    elif 1.5 < ratio <= 2: return "agrees"
    elif 1 < ratio <= 1.5: return "somewhat agrees"
    elif ratio == 1: return "neutral"
    elif 0.67 < ratio < 1: return "somewhat disagrees"
    elif 0.5 < ratio <= 0.67: return "disagrees"
    elif 0.33 < ratio <= 0.5: return "strongly disagree"
    elif ratio <= 0.33: return "absolutly disagrees"
    else: return None

In [61]:
data = list()

for row in rows:
    
    d = dict()
    d['name'] = row.select_one('.source-title').text.strip()
    d['allsides_page'] = "https://www.allsides.com" + row.select_one('.source-title a')['href']
    d['bias'] = row.select_one(".views-field-field-bias-image a")['href'].split('/')[-1]
    d['agree'] = int(row.select_one('.agree').text)
    d['disagree'] = int(row.select_one('.disagree').text)
    d['agree_ratio'] = d['agree'] / d['disagree']
    d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])
    
    data.append(d)

In [62]:
data

[{'name': 'ABC News (Online)',
  'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias',
  'bias': 'left-center',
  'agree': 32966,
  'disagree': 16881,
  'agree_ratio': 1.9528463953557254,
  'agreeance_text': 'agrees'},
 {'name': 'AlterNet',
  'allsides_page': 'https://www.allsides.com/news-source/alternet-media-bias',
  'bias': 'left',
  'agree': 12408,
  'disagree': 2536,
  'agree_ratio': 4.892744479495268,
  'agreeance_text': 'absolutely agrees'},
 {'name': 'AP Politics & Fact Check',
  'allsides_page': 'https://www.allsides.com/news-source/ap-fact-check-media-bias',
  'bias': 'left-center',
  'agree': 3680,
  'disagree': 4441,
  'agree_ratio': 0.82864219770322,
  'agreeance_text': 'somewhat disagrees'},
 {'name': 'Associated Press',
  'allsides_page': 'https://www.allsides.com/news-source/associated-press-media-bias',
  'bias': 'center',
  'agree': 24763,
  'disagree': 18518,
  'agree_ratio': 1.3372394427043957,
  'agreeance_text': 'somewhat agrees'},
 {'name'

In [71]:
pages = [
    'https://www.allsides.com/media-bias/media-bias-ratings',
    'https://www.allsides.com/media-bias/media-bias-ratings?page=1'
]

In [72]:
data = list()
n = 1
for page in pages:
     
    r = requests.get(page)
    soup = BeautifulSoup(r.content, 'html.parser')
    rows = soup.select("tbody tr")

    for row in rows:

        d = dict()
        
        d['name'] = row.select_one('.source-title').text.strip()
        d['allsides_page'] = "https://www.allsides.com" + row.select_one('.source-title a')['href']
        d['bias'] = row.select_one(".views-field-field-bias-image a")['href'].split('/')[-1]
        d['agree'] = int(row.select_one('.agree').text)
        d['disagree'] = int(row.select_one('.disagree').text)
        d['agree_ratio'] = d['agree'] / d['disagree']
        d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])

        data.append(d)
    print(f"loop {n}")
    n += 1
    time.sleep(10)
print("done")

loop 1
loop 2
done


In [74]:
data[0]

{'name': 'ABC News (Online)',
 'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias',
 'bias': 'left-center',
 'agree': 32972,
 'disagree': 16882,
 'agree_ratio': 1.9530861272361095,
 'agreeance_text': 'agrees'}

In [78]:
# Getting the news provider detail page

response = requests.get(data[0]['allsides_page'])

In [82]:
# Getting the URL to home page of the news provider

k_soup = BeautifulSoup(response.content, 'html.parser')
web_url = k_soup.select_one('.dynamic-grid a')['href']
web_url

'http://abcnews.go.com/'

In [84]:
pd.DataFrame(data)

Unnamed: 0,name,allsides_page,bias,agree,disagree,agree_ratio,agreeance_text
0,ABC News (Online),https://www.allsides.com/news-source/abc-news-...,left-center,32972,16882,1.953086,agrees
1,AlterNet,https://www.allsides.com/news-source/alternet-...,left,12412,2537,4.892393,absolutely agrees
2,AP Politics & Fact Check,https://www.allsides.com/news-source/ap-fact-c...,left-center,3683,4445,0.828571,somewhat disagrees
3,Associated Press,https://www.allsides.com/news-source/associate...,center,24766,18526,1.336824,somewhat agrees
4,Axios,https://www.allsides.com/news-source/axios,center,5089,5349,0.951393,somewhat disagrees
5,BBC News,https://www.allsides.com/news-source/bbc-news-...,center,27413,23426,1.170196,somewhat agrees
6,Bloomberg,https://www.allsides.com/news-source/bloomberg...,left-center,14804,20200,0.732871,somewhat disagrees
7,Breitbart News,https://www.allsides.com/news-source/breitbart,right,37067,10790,3.43531,absolutely agrees
8,BuzzFeed News,https://www.allsides.com/news-source/buzzfeed-...,left,22603,8891,2.542234,strongly agrees
9,CBS News (Online),https://www.allsides.com/news-source/cbs-news-...,left-center,17770,11371,1.562747,agrees


In [92]:
loop = 0
for d in tqdm(data):
    r = requests.get(d['allsides_page'])
    soup = BeautifulSoup(r.content, 'html.parser')
    
    try:
        web_url = soup.select_one(".dynamic-grid a")['href']
        d['website'] = web_url
    except TypeError:
        pass
    
    print(f"Loop: {loop}")
    loop += 1
    time.sleep(10)

  0%|          | 0/53 [00:00<?, ?it/s]

Loop: 0
Loop: 1
Loop: 2
Loop: 3
Loop: 4
Loop: 5
Loop: 6
Loop: 7
Loop: 8
Loop: 9
Loop: 10
Loop: 11
Loop: 12
Loop: 13
Loop: 14
Loop: 15
Loop: 16
Loop: 17
Loop: 18
Loop: 19
Loop: 20
Loop: 21
Loop: 22
Loop: 23
Loop: 24
Loop: 25
Loop: 26
Loop: 27
Loop: 28
Loop: 29
Loop: 30
Loop: 31
Loop: 32
Loop: 33
Loop: 34
Loop: 35
Loop: 36
Loop: 37
Loop: 38
Loop: 39
Loop: 40
Loop: 41
Loop: 42
Loop: 43
Loop: 44
Loop: 45
Loop: 46
Loop: 47
Loop: 48
Loop: 49
Loop: 50
Loop: 51
Loop: 52


In [95]:
# saving our list of dictionaries in a JSON file.

with open('allsides.json', 'w') as f:
    json.dump(data, f)

In [96]:
# Reading from the saved JSON file above.

with open('allsides.json', 'r') as f:
    k = json.load(f)

print(k)

[{'name': 'ABC News (Online)', 'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias', 'bias': 'left-center', 'agree': 32972, 'disagree': 16882, 'agree_ratio': 1.9530861272361095, 'agreeance_text': 'agrees', 'website': 'http://abcnews.go.com/'}, {'name': 'AlterNet', 'allsides_page': 'https://www.allsides.com/news-source/alternet-media-bias', 'bias': 'left', 'agree': 12412, 'disagree': 2537, 'agree_ratio': 4.892392589672842, 'agreeance_text': 'absolutely agrees', 'website': 'http://www.alternet.org/'}, {'name': 'AP Politics & Fact Check', 'allsides_page': 'https://www.allsides.com/news-source/ap-fact-check-media-bias', 'bias': 'left-center', 'agree': 3683, 'disagree': 4445, 'agree_ratio': 0.8285714285714286, 'agreeance_text': 'somewhat disagrees', 'website': 'https://apnews.com/APFactCheck'}, {'name': 'Associated Press', 'allsides_page': 'https://www.allsides.com/news-source/associated-press-media-bias', 'bias': 'center', 'agree': 24766, 'disagree': 18526, 'agree_ra