In [1]:
import time
from copy import deepcopy
import json

import requests
from bs4 import BeautifulSoup
import pandas as pd
from tqdm.notebook import tqdm

# A warning on robots
  
go to example.com/robots.txt and you should find a text file that defines what & how web crawlers(robots) can crawl the website.  

Examle:  
        User-agent: *  
        Crawl-delay: 10   
        Allow: /pages/   
        Disallow: /scripts/  

it means general bots should make a request once per 10 seconds, and they are allowed to scrape pages and not scripts.

In [2]:
# Function for saving html

def save_html(html, path):
    with open(path, mode='wb')as f:
        f.write(html)

In [3]:
# Function for reading a local html file

def open_html(path):
    with open(path, mode='rb') as f:
        return f.read()

## # Sending GET request

Sending a get request and storing the response in response variable

In [4]:
url = "https://www.allsides.com/media-bias/media-bias-ratings"

response = requests.get(url)

In [5]:
print(response.content[:100])

b'<!DOCTYPE html>\n<!--[if IEMobile 7]><html class="iem7"  lang="en" dir="ltr"><![endif]-->\n<!--[if lte'


## # Creating a soup object

A soup object defines many methods that can be used to find elements and their contents.

In [6]:
soup = BeautifulSoup(response.content, features='html.parser')

In [7]:
rows = soup.select("tbody tr")

In [8]:
# print(rows[0].select_one(".source-title").text.strip())

name = rows[0].select_one(".source-title").text.strip()
name

'ABC News (Online)'

In [9]:
# getting the relative URL in href of anchor tag
# use brackets to select HTML attribute values

abc = rows[0].select_one('.source-title a')['href']
abc

'/news-source/abc-news-media-bias'

In [10]:
# creating an absolute URL

allsides_abcpage = "https://www.allsides.com" + abc

allsides_abcpage

'https://www.allsides.com/news-source/abc-news-media-bias'

### # Getting political bias info

In [12]:
bias = rows[0].select_one(".views-field-field-bias-image a")['href'].split('/')[-1]
bias

'left-center'

#### # How many people agree with the bias rating?

In [13]:
agree = int(rows[0].select_one('.agree').text)
agree

33387

#### # How many people disagree with the bias rating?

In [14]:
disagree = int(rows[0].select_one('.disagree').text)
disagree

17034

In [15]:
agree_ratio = agree/ disagree
agree_ratio

1.9600211342021838

In [16]:
print(f"Agree: {agree}, disagree: {disagree}, ratio: {agree_ratio: .2f}")

Agree: 33387, disagree: 17034, ratio:  1.96


In [17]:
def get_agreeance_text(ratio):
    if ratio > 3: return "absolutely agrees"
    elif 2 < ratio <= 3: return "strongly agrees"
    elif 1.5 < ratio <= 2: return "agrees"
    elif 1 < ratio <= 1.5: return "somewhat agrees"
    elif ratio == 1: return "neutral"
    elif 0.67 < ratio < 1: return "somewhat disagrees"
    elif 0.5 < ratio <= 0.67: return "disagrees"
    elif 0.33 < ratio <= 0.5: return "strongly disagree"
    elif ratio <= 0.33: return "absolutly disagrees"
    else: return None

In [18]:
data = list()

for row in rows:
    
    d = dict()
    d['name'] = row.select_one('.source-title').text.strip()
    d['allsides_page'] = "https://www.allsides.com" + row.select_one('.source-title a')['href']
    d['bias'] = row.select_one(".views-field-field-bias-image a")['href'].split('/')[-1]
    d['agree'] = int(row.select_one('.agree').text)
    d['disagree'] = int(row.select_one('.disagree').text)
    d['agree_ratio'] = d['agree'] / d['disagree']
    d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])
    
    data.append(d)

In [21]:
data[0:3]

[{'name': 'ABC News (Online)',
  'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias',
  'bias': 'left-center',
  'agree': 33387,
  'disagree': 17034,
  'agree_ratio': 1.9600211342021838,
  'agreeance_text': 'agrees'},
 {'name': 'AlterNet',
  'allsides_page': 'https://www.allsides.com/news-source/alternet-media-bias',
  'bias': 'left',
  'agree': 12619,
  'disagree': 2574,
  'agree_ratio': 4.902486402486402,
  'agreeance_text': 'absolutely agrees'},
 {'name': 'AP Politics & Fact Check',
  'allsides_page': 'https://www.allsides.com/news-source/ap-fact-check-media-bias',
  'bias': 'left-center',
  'agree': 3893,
  'disagree': 4630,
  'agree_ratio': 0.8408207343412527,
  'agreeance_text': 'somewhat disagrees'}]

In [20]:
pages = [
    'https://www.allsides.com/media-bias/media-bias-ratings',
    'https://www.allsides.com/media-bias/media-bias-ratings?page=1'
]

In [21]:
data = list()
n = 1
for page in pages:
     
    r = requests.get(page)
    soup = BeautifulSoup(r.content, 'html.parser')
    rows = soup.select("tbody tr")

    for row in rows:

        d = dict()
        
        d['name'] = row.select_one('.source-title').text.strip()
        d['allsides_page'] = "https://www.allsides.com" + row.select_one('.source-title a')['href']
        d['bias'] = row.select_one(".views-field-field-bias-image a")['href'].split('/')[-1]
        d['agree'] = int(row.select_one('.agree').text)
        d['disagree'] = int(row.select_one('.disagree').text)
        d['agree_ratio'] = d['agree'] / d['disagree']
        d['agreeance_text'] = get_agreeance_text(d['agree_ratio'])

        data.append(d)
    print(f"loop {n}")
    n += 1
    time.sleep(10)
print("done")

loop 1
loop 2
done


In [22]:
data[0]

{'name': 'ABC News (Online)',
 'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias',
 'bias': 'left-center',
 'agree': 32991,
 'disagree': 16893,
 'agree_ratio': 1.95293908719588,
 'agreeance_text': 'agrees'}

In [25]:
pd.DataFrame(data)

Unnamed: 0,name,allsides_page,bias,agree,disagree,agree_ratio,agreeance_text
0,ABC News (Online),https://www.allsides.com/news-source/abc-news-...,left-center,32991,16893,1.952939,agrees
1,AlterNet,https://www.allsides.com/news-source/alternet-...,left,12420,2538,4.893617,absolutely agrees
2,AP Politics & Fact Check,https://www.allsides.com/news-source/ap-fact-c...,left-center,3699,4460,0.829372,somewhat disagrees
3,Associated Press,https://www.allsides.com/news-source/associate...,center,24780,18558,1.335273,somewhat agrees
4,Axios,https://www.allsides.com/news-source/axios,center,5094,5360,0.950373,somewhat disagrees
5,BBC News,https://www.allsides.com/news-source/bbc-news-...,center,27426,23447,1.169702,somewhat agrees
6,Bloomberg,https://www.allsides.com/news-source/bloomberg...,left-center,14822,20209,0.733436,somewhat disagrees
7,Breitbart News,https://www.allsides.com/news-source/breitbart,right,37090,10794,3.436168,absolutely agrees
8,BuzzFeed News,https://www.allsides.com/news-source/buzzfeed-...,left,22624,8893,2.544023,strongly agrees
9,CBS News (Online),https://www.allsides.com/news-source/cbs-news-...,left-center,17782,11380,1.562566,agrees


In [23]:
# Getting the news provider detail page

response = requests.get(data[0]['allsides_page'])

In [24]:
# Getting the URL to home page of the news provider

k_soup = BeautifulSoup(response.content, 'html.parser')
web_url = k_soup.select_one('.dynamic-grid a')['href']
web_url

'http://abcnews.go.com/'

In [27]:
# loop = 0
for d in tqdm(data):
    r = requests.get(d['allsides_page'])
    soup = BeautifulSoup(r.content, 'html.parser')
    
    try:
        web_url = soup.select_one(".dynamic-grid a")['href']
        d['website'] = web_url
    except TypeError:
        pass
    
#     print(f"Loop: {loop}")
#     loop += 1
    time.sleep(10)

  0%|          | 0/53 [00:00<?, ?it/s]

In [95]:
# saving our list of dictionaries in a JSON file.

with open('allsides.json', 'w') as f:
    json.dump(data, f)

In [96]:
# Reading from the saved JSON file above.

with open('allsides.json', 'r') as f:
    k = json.load(f)

print(k)

[{'name': 'ABC News (Online)', 'allsides_page': 'https://www.allsides.com/news-source/abc-news-media-bias', 'bias': 'left-center', 'agree': 32972, 'disagree': 16882, 'agree_ratio': 1.9530861272361095, 'agreeance_text': 'agrees', 'website': 'http://abcnews.go.com/'}, {'name': 'AlterNet', 'allsides_page': 'https://www.allsides.com/news-source/alternet-media-bias', 'bias': 'left', 'agree': 12412, 'disagree': 2537, 'agree_ratio': 4.892392589672842, 'agreeance_text': 'absolutely agrees', 'website': 'http://www.alternet.org/'}, {'name': 'AP Politics & Fact Check', 'allsides_page': 'https://www.allsides.com/news-source/ap-fact-check-media-bias', 'bias': 'left-center', 'agree': 3683, 'disagree': 4445, 'agree_ratio': 0.8285714285714286, 'agreeance_text': 'somewhat disagrees', 'website': 'https://apnews.com/APFactCheck'}, {'name': 'Associated Press', 'allsides_page': 'https://www.allsides.com/news-source/associated-press-media-bias', 'bias': 'center', 'agree': 24766, 'disagree': 18526, 'agree_ra

# # Data Analysis

### # Extracting news organizations on which the community agrees on absolutly

In [32]:
abs_agree = list(d for d in data if d['agreeance_text'] == 'absolutely agrees')

print(f"{'Outlet':<28} {'Bias':<20}")
print("-" * 38)

for d in abs_agree:
    print(f"{d['name']:<28} {d['bias']:<20}")

Outlet                       Bias                
--------------------------------------
AlterNet                     left                
Breitbart News               right               
CNN (Opinion)                left                
Daily Beast                  left                
Democracy Now                left                
Fox News (Opinion)           right               
Mother Jones                 left                
MSNBC                        left                
New York Times (Opinion)     left                
The Federalist               right               
The Intercept                left                
The New Yorker               left                


In [33]:
# Creating a dataframe of of our data

In [34]:
df = pd.DataFrame(data)
df.head(5)

Unnamed: 0,name,allsides_page,bias,agree,disagree,agree_ratio,agreeance_text,website
0,ABC News (Online),https://www.allsides.com/news-source/abc-news-...,left-center,32991,16893,1.952939,agrees,http://abcnews.go.com/
1,AlterNet,https://www.allsides.com/news-source/alternet-...,left,12420,2538,4.893617,absolutely agrees,http://www.alternet.org/
2,AP Politics & Fact Check,https://www.allsides.com/news-source/ap-fact-c...,left-center,3699,4460,0.829372,somewhat disagrees,https://apnews.com/APFactCheck
3,Associated Press,https://www.allsides.com/news-source/associate...,center,24780,18558,1.335273,somewhat agrees,https://apnews.com/
4,Axios,https://www.allsides.com/news-source/axios,center,5094,5360,0.950373,somewhat disagrees,https://www.axios.com/


In [40]:
df[df['agreeance_text'] == 'disagrees']

Unnamed: 0,name,allsides_page,bias,agree,disagree,agree_ratio,agreeance_text,website
37,The Economist,https://www.allsides.com/news-source/economist,left-center,5251,9251,0.567614,disagrees,http://www.economist.com/
