## Web Scraping Script for all 20 Websites

This Script first gets the link for the newest articles of a website and then builds a DataFrame with the most important features of each article.

The News Site used are:
- CNN
- FOX NEWS
- ABC NEWS
- CNBC NEWS
- Politico
- Washington Post
- New York Times
- MSNBC
- Reuters
- USA Today
- Bloomberg
- CBS News
- The Wall Street Journal
- Los Angeles Times
- Chicago Tribune
- HuffPost
- Al Jazeera English
- Time
- NPR
- BBC


In [1]:
import pyforest
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.edge.options import Options
from dateutil import parser

In [2]:
from bs4 import BeautifulSoup
import re

def clean_html_tags(html_text):
    # Use BeautifulSoup to parse the HTML content
    soup = BeautifulSoup(html_text, 'html.parser')

    # Extract text content without HTML tags
    text_content = soup.get_text()

    # Remove extra whitespaces and newline characters
    cleaned_text = re.sub(r'\s+', ' ', text_content).strip()

    return cleaned_text
def remove_words(input_string, words_to_remove):
    return ' '.join([word for word in input_string.split() if word not in words_to_remove])

## CNN

In [3]:
# links for all articles
def links_cnn():
    index = 0
    dict_cnn = {}  
    links = []
    driver = webdriver.Edge()
    for i in range(1):
        url = f"https://edition.cnn.com/search?q=&from={i * 100}&size=100&page={i}&sort=newest&types=article&section="
        driver.get(url)
        time.sleep(5)
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        links_on_page = soup.find_all("span", attrs = {"class":"container__headline-text"})
        links.extend(links_on_page)

    driver.quit()

    for i in range(len(links)):
        link = links[i]["data-zjs-href"]
        dict_cnn[index]= link
        index +=1
    return dict_cnn

In [4]:
# getting all the features for one article
def attributes_cnn(article):
    """This function receives an url and returns a DF with different attributes"""
    response = requests.get(article)
    soup = BeautifulSoup(response.content)
    content = soup.find_all("p", attrs = {"class": "paragraph inline-placeholder"})
    # definitions
    text = []
    new_dict= {}
    text_count = {}
    # text and lenght
    for paragraph in content:
        y = paragraph.get_text().strip()
        text.append(y)
    text ="".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
    try:   
        date = soup.find("div", attrs = {"class":"timestamp"}).get_text().strip()
        date = date.split("     ")[-1]
    except AttributeError:
        date = None
    try:
        reading_time = soup.find("div", attrs = {"class": "headline__sub-description"}).get_text().strip()
        reading_time = reading_time.split(" ")[0]
    except AttributeError:
        reading_time = None
    try:
        author = soup.find("span", attrs = {"class": "byline__name"}).get_text().strip()
    except AttributeError:
        author = None  
    try:
        imageURL = soup.find_all("img")[0]["src"]
    except Exception:
        imageURL= None
        
    url = article
    
    x = (article.split(".com/")[1])
    x = x.split("/")
    if x[0]=="2023":
        category = x[3]
    else:
        category = x[0]

    new_dict[0]={"Source":"CNN", "Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : url, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [5]:
# building a DatFrame with all articles
def cnn_df():
    dict_cnn = links_cnn()
    dataframes_CNN = []
    for url in dict_cnn.keys():
        df = attributes_cnn(dict_cnn[url])
        dataframes_CNN.append(df)

    cnn_df = pd.concat(dataframes_CNN, ignore_index=True)
    cnn_df["Date"] = cnn_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    cnn_df["Date"].dropna(inplace=True)
    cnn_df["Date"] = cnn_df["Date"].astype(str)
    cnn_df["Date"] = cnn_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    cnn_df = cnn_df[cnn_df["Text lenght"]>100]
    return cnn_df

## FOX NEWS

In [6]:
def links_fox():
    index = 0
    dict_fox = {}  
    urls = 'https://www.foxnews.com/search-results/search?q=news'
    driver = webdriver.Edge()
    driver.get(urls)
    time.sleep(60)
    html = driver.page_source
    soup = BeautifulSoup(html)
    items = soup.find_all("h2", attrs = {"class":"title"})
    items = items[2:]
    for i in range(len(items)):
        dict_fox[index]= items[i].find_all("a")[0]["href"]
        index +=1
    driver.quit()
    return dict_fox

In [7]:
def attributes_fox(article):
    """This function receives an url and returns a DF with different attributes"""
    response = requests.get(article)
    soup = BeautifulSoup(response.content)
    content = soup.find_all("p")
    # definitions
    text = []
    new_dict= {}
    text_count = {}
    # text and lenght
    for paragraph in content:
        y = paragraph.get_text().strip()
        text.append(y)
    text ="".join(text[2:-6])
    text = clean_html_tags(text)
    lenght = len(text)
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
        
    try:   
        date = soup.find("span", attrs = {"class":"article-date"}).get_text().strip()
        date = date.split("     ")[-1]
    except AttributeError:
        date = None
    source = "Fox News"
    url = article
    try:
        x = soup.find_all("div", attrs={"class":"author-byline"})
        author = x[0].get_text().strip()
        author = author.split("\n")[1]
        author = author.split("Fox")[0].strip()
    except IndexError:
        author = None
    try:
        imageURL = soup.find_all("img")[0]["src"]
    except Exception:
        imageURL= None

    category = article.split("/")[3]
    new_dict[0]={"Source":source, "Title":title, "Date" : date,"Author":author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : url, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [8]:
def fox_df():
    dict_fox = links_fox()
    dataframes_FOX = []
    for url in dict_fox.keys():
        df = attributes_fox(dict_fox[url])
        dataframes_FOX.append(df)

    fox_df = pd.concat(dataframes_FOX, ignore_index=True)
    fox_df["Date"] = fox_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    fox_df["Date"].dropna(inplace=True)
    fox_df["Date"] = fox_df["Date"].astype(str)
    fox_df["Date"] = fox_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    fox_df = fox_df[fox_df["Text lenght"]>100]
    return fox_df

## ABC

In [9]:
try:
    index = 0
    dict_abc = {}
    driver = webdriver.Edge()

    for i in range(10):
        url = f"https://abcnews.go.com/search?searchtext=news&sort=date&page={i}"
        driver.get(url)
        time.sleep(2)
        html = driver.page_source
        soup = BeautifulSoup(html)
        section = soup.find("section", {"class":"ContentRoll"})
        links = section.find_all("a", {"class":"AnchorLink"})
        for i in links:
            dict_abc[index]= i["href"]
            index +=1

    driver.quit()
except Exception:
        index = 0
        dict_abc = {}
        driver = webdriver.Edge()

        for i in range(10):
            url = f"https://abcnews.go.com/search?searchtext=news&sort=date&page={i}"
            driver.get(url)
            time.sleep(2)
            html = driver.page_source
            soup = BeautifulSoup(html)
            section = soup.find("section", {"class":"ContentRoll"})
            links = section.find_all("a", {"class":"AnchorLink"})
            for i in links:
                dict_abc[index]= i["href"]
                index +=1

        driver.quit()

In [10]:
# links for all articles
def links_abc():
    index = 0
    dict_abc = {}
    driver = webdriver.Edge()

    for i in range(2):
        url = f"https://abcnews.go.com/search?searchtext=news&sort=date&page={i}"
        driver.get(url)
        html = driver.page_source
        time.sleep(2)
        soup = BeautifulSoup(html)
        section = soup.find("section", {"class":"ContentRoll"})
        links = section.find_all("a", {"class":"AnchorLink"})
        for i in links:
            dict_abc[index]= i["href"]
            index +=1

    driver.quit()
    return dict_abc


In [11]:
# getting all the features for one article
def attributes_abc(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    text = []
    new_dict= {}
    text_count = {}
    
    
    response = requests.get(article)
    soup = BeautifulSoup(response.content)
    content = soup.find_all("p")

    # text and lenght
    for paragraph in content[1:-2]:
        y = paragraph.get_text().strip()
        text.append(y)
    text ="".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
    
    x = soup.find_all("div")
    if x==[]:
        author = None
        date = None
    for i in x:
        try:
            if i["data-testid"] == "prism-byline":
                info = i.get_text().strip()
                author = info
                date = info
                break
        except (AttributeError, KeyError):
            author = None
            date = None 
    try:
        imageURL = soup.find_all("img")[1]["src"]
    except Exception:
        imageURL= None

        
    category = article.split(".com")[1].split("/")[1]

    new_dict[0]={"Source":"ABC","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [12]:
def clean_date(row):
    if row["Date"]==None:
        row["Author"] = None
        row["Date"] = None
    elif "December" in row["Date"]:
        row["Author"] = row["Author"].split("December")[0].split("By")[1].strip()
        row["Date"] = f"December {row['Date'].split('December')[1]}".strip()
    else:
        row["Author"] = None
        row["Date"] = None
    return row

In [13]:
def abc_df(dict_abc):
    dataframes_ABC = []
    for url in dict_abc.keys():
        df = attributes_abc(dict_abc[url])
        dataframes_ABC.append(df)

    abc_df = pd.concat(dataframes_ABC, ignore_index=True)
    abc_df["Date"].dropna(inplace=True)
    abc_df = abc_df.apply(clean_date, axis=1)
    abc_df["Date"].dropna(inplace=True)
    abc_df["Date"] = abc_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    abc_df["Date"].dropna(inplace=True)
    abc_df["Date"] = abc_df["Date"].astype(str)
    abc_df["Date"] = abc_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    abc_df = abc_df[abc_df["Text lenght"]>100]    
    return abc_df

## Politico

In [14]:
# links for all articles
def links_politico():
    index = 0
    dict_politico = {} 
    for i in range(1,6):
        url = f"https://www.politico.com/search/{i}?q=news&adv=true&c=a9d449a5-b61d-32fc-b70c-e15227dcdca7"
        r = requests.get(url)
        soup = BeautifulSoup(r.content)
        sections = soup.find_all("div", {"class":"summary"})
        for i in range(len(sections)):
            link = sections[i].find("a")["href"]
            if "subscriber" not in link:
                dict_politico[index] = link
                index +=1
    return dict_politico

In [15]:
# getting all the features for one article
def attributes_politico(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    text = []
    new_dict= {}
    text_count = {}
    
    r = requests.get(article)
    soup = BeautifulSoup(r.content)
    content = soup.find_all("p", {"class":"story-text__paragraph"})
    
    # text and lenght
    for i in content:
        y = i.get_text().strip()
        text.append(y)
    text ="".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("h2", {"class":"headline"}).get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("time").get_text()
    except AttributeError:
        date = None
    
    try:
        author = soup.find("span", {"class":"vcard"}).get_text()
    except AttributeError:
        author = None
    try:
        category = soup.find("p", {"class":"category"}).get_text().strip()
    except AttributeError:
        category = None
    try:
        imageURL = soup.find_all("img")[0]["src"]
    except Exception:
        imageURL= None

    new_dict[0]={"Source":"Politico","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [16]:
def politico_df():
    dict_politico = links_politico()
    dataframes_politico = []
    for url in dict_politico.keys():
        df = attributes_politico(dict_politico[url])
        dataframes_politico.append(df)

    politico_df = pd.concat(dataframes_politico, ignore_index=True)
    politico_df["Date"] = politico_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    politico_df["Date"].dropna(inplace=True)
    politico_df["Date"] = politico_df["Date"].astype(str)
    politico_df["Date"] = politico_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    politico_df = politico_df[politico_df["Text lenght"]>100]
    return politico_df

## Washington Post

In [17]:
# links for all articles
def links_wapo():
    index = 0
    dict_wapost = {} 
    url = "https://www.washingtonpost.com/latest-headlines/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    sections = soup.find_all("a", {"data-pb-local-content-field":"web_headline"})
    for i in sections:
        link = i["href"]
        if "podcasts" not in link:
            dict_wapost[index] = link
            index +=1
    return dict_wapost

In [18]:
# getting all the features for one article
def attributes_wapo(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    text = []
    new_dict= {}
    text_count = {}
    
    r = requests.get(article)
    soup = BeautifulSoup(r.content)
    content = soup.find_all("p", {"data-testid":"drop-cap-letter"})
    
    # text and lenght
    for i in content[:-1]:
        y = i.get_text().strip()
        text.append(y)
    text ="".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("span", {"data-qa":"headline-text"}).get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("span", {"data-testid":"display-date"}).get_text().strip()
    except AttributeError:
        try:
            date = soup.find("span", {"data-testid": "updated-date"}).get_text().strip()
        except AttributeError:
            date = None
            
    try:
        x = soup.find_all("a", {"data-qa":"author-name"})
        author = []
        for i in x:
            y = i.get_text().strip()
            author.append(y)
        author = ", ".join(author)
    except AttributeError:
        author = None
    try:
        imageURL = soup.find_all("img")[0]["srcset"]
    except Exception:
        imageURL= None
        
    category = article.split(".com")[1].split("/")[1]

    new_dict[0]={"Source":"Washington Post","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [19]:
from dateutil import parser
import pandas as pd

def parse_date(x):
    try:
        return parser.parse(x)
    except Exception:
        return None

In [20]:
def wapo_df():
    dict_wapost = links_wapo()
    dataframes_wapo = []
    for url in dict_wapost.keys():
        df = attributes_wapo(dict_wapost[url])
        dataframes_wapo.append(df)

    wapo_df = pd.concat(dataframes_wapo, ignore_index=True)
    wapo_df["Date"] = wapo_df["Date"].apply(lambda x: None if x is None or "ago" in x else x)    
    wapo_df["Date"] = wapo_df["Date"].apply(parse_date)
    wapo_df["Date"].dropna(inplace=True)
    wapo_df["Date"] = wapo_df["Date"].astype(str)
    wapo_df["Date"] = wapo_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    wapo_df = wapo_df[wapo_df["Text lenght"]>100]
    return wapo_df

## New York Times | Do again

In [21]:
# links for all articles
def links_nytimes():
    index = 0
    dict_nytimes = {} 
    url = "https://www.nytimes.com/search?dropmab=false&query=news&sort=newest&types=article"
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    sections = soup.find_all("div", {"class":"css-e1lvw9"})
    for i in sections:
        link = f"https://www.nytimes.com/{i.find('a')['href']}"
        link = link.split("?")[0]
        if "podcast" not in link:
            dict_nytimes[index] = link
            index +=1
    return dict_nytimes

In [22]:
# getting all the features for one article
def attributes_nytimes(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    text = []
    new_dict= {}
    text_count = {}
    
    driver = webdriver.Edge()
    driver.get(article)
    time.sleep(2)
    html = driver.page_source
    soup = BeautifulSoup(html)
    driver.quit()
    # text and lenght
    content = soup.find_all("p")
    for i in content[4:-3]:
        y = i.get_text().strip()
        text.append(y)
    text ="".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("time").get_text().strip()
    except AttributeError:
        date = None
            
    try:
        x = soup.find_all("span", {"class":"css-1baulvz"})
        author = []
        for i in x:
            y = i.get_text().strip()
            author.append(y)
        author = ", ".join(author)
    except AttributeError:
        author = None
    try:
        imageURL = soup.find_all("img")[0]["srcset"]
    except Exception:
        imageURL= None
        
    category = article.split("2023")[1].split("/")[3]

    new_dict[0]={"Source":"New York Times","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [23]:
def nytimes_df():
    dataframes_nytimes = []
    dict_nytimes = links_nytimes()
    for url in dict_nytimes.keys():
        df = attributes_nytimes(dict_nytimes[url])
        dataframes_nytimes.append(df)
    nytimes_df = pd.concat(dataframes_nytimes, ignore_index=True)
    nytimes_df["Date"] = nytimes_df["Date"].apply(clean_html_tags)
    nytimes_df["Date"] = [remove_words(x, ["Updated", "hours", "ago"]) for x in nytimes_df["Date"]]
    nytimes_df["Date"] = nytimes_df["Date"].apply(lambda x: f"{x.split('ET')[0]}ET" if "ET" in x else x)
    nytimes_df["Date"] = nytimes_df["Date"].apply(parse_date)
    nytimes_df["Date"].dropna(inplace=True)
    nytimes_df["Date"] = nytimes_df["Date"].astype(str)
    nytimes_df["Date"] = nytimes_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    nytimes_df = nytimes_df[nytimes_df["Text lenght"]>100]
    return nytimes_df

## MSNBC

In [24]:
# links for all articles
def links_msnbc():
    index = 0
    dict_msnbc = {} 
    url = "https://www.msnbc.com/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    section = soup.find("div", {"class":"rail__container styles_rightRail__S621d layout-rightRail undefined layout-grid-container"})
    articles = section.find_all("h3")
    for i in articles:
        link = i.find("a")["href"]
        dict_msnbc[index] = link
        index +=1
        if index >= 11:
            break
    return dict_msnbc

In [25]:
# getting all the features for one article
def attributes_msnbc(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    text = []
    new_dict= {}
    
    r = requests.get(article)
    soup = BeautifulSoup(r.content)
    
    
    # text and lenght
    sec = soup.find_all("div", {"class":"article-body__content"})
    for i in sec:
        x = i.find_all("p")
        for p in x:
            y = p.get_text().strip()
            text.append(y)
    text = " ".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("time").get_text().strip()
    except AttributeError:
        date = None
            
    try:
         author = soup.find("span", {"class":"byline-name"}).get_text().strip().split(",")[0].split("By")[0]
    except AttributeError:
        author = None
        
    try:
        imageURL = soup.find_all("img")[11]["src"]
    except Exception:
        imageURL= None
        
    category = article.split(".com")[1].split("/")[1]

    new_dict[0]={"Source":"MSNBC","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [26]:
def msnbc_df():
    dict_msnbc = links_msnbc()
    dataframes_msnbc = []
    for url in dict_msnbc.keys():
        df = attributes_msnbc(dict_msnbc[url])
        dataframes_msnbc.append(df)

    msnbc_df = pd.concat(dataframes_msnbc, ignore_index=True)
    msnbc_df["Date"] = msnbc_df["Date"].apply(lambda x : x.split("/")[0])
    msnbc_df["Date"] = msnbc_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    msnbc_df["Date"].dropna(inplace=True)
    msnbc_df["Date"] = msnbc_df["Date"].astype(str)
    msnbc_df["Date"] = msnbc_df["Date"].apply(lambda x : x.split("+")[0] if pd.notnull(x) else None)
    msnbc_df = msnbc_df[msnbc_df["Text lenght"]>100]
    return msnbc_df

## Reuters

In [27]:
# links for all articles
def links_reuters():
    index = 0
    dict_reuters = {} 
    url = "https://www.reuters.com/site-search/?query=news&date=past_24_hours&offset=0"
    driver = webdriver.Edge()
    driver.get(url)
    html = driver.page_source
    soup = BeautifulSoup(html)
    sections = soup.find_all("li", {"class":"search-results__item__2oqiX"})
    for i in range(len(sections)):
        link = sections[i].find("a")["href"]
        link = f"https://www.reuters.com{link}"
        dict_reuters[index] = link
        index +=1
    driver.quit()
    return dict_reuters

In [28]:
# getting all the features for one article
def attributes_reuters(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    new_dict= {}
    author_list = []
    
    r = requests.get(article)
    soup = BeautifulSoup(r.content)
    
    
    # text and lenght
    info = soup.find("div", {"class":"article-body__content__17Yit"})
    try:
        content = info.find_all("p")
    except AttributeError:
        content = soup.find_all("p")

    text = []
    for i in content[:-4]:
        y = i.get_text().strip()
        text.append(y)
    text = "".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("time").get_text().strip()
    except AttributeError:
        date = None
            
    try:
        info = soup.find("div", {"class":"info-content__author-date__1Epi_"})
        authors = info.find_all("a")
        for a in authors:
            y = a.get_text()
            author_list.append(y)
        author = ", ".join(author_list)
    except AttributeError:
        author = None
    try:
        imageURL = soup.find_all("img")[0]["src"]
    except Exception:
        imageURL= None
        
    category = article.split(".com")[1].split("/")[1]

    new_dict[0]={"Source":"Reuters","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [29]:
def reuters_df():
    dict_reuters = links_reuters()
    dataframes_reuters = []
    for url in dict_reuters.keys():
        df = attributes_reuters(dict_reuters[url])
        dataframes_reuters.append(df)

    reuters_df = pd.concat(dataframes_reuters, ignore_index=True)
    reuters_df["Date"] = reuters_df["Date"].apply(lambda x : x.split("Updated")[0] if pd.notnull(x) else None)
    reuters_df["Date"] = reuters_df["Date"].apply(lambda x : " ".join(x.split("2023")) if pd.notnull(x) else None)
    reuters_df["Date"] = reuters_df["Date"].apply(lambda x : parser.parse(x, fuzzy=True) if pd.notnull(x) else None)
    reuters_df["Date"].dropna(inplace=True)
    reuters_df["Date"] = reuters_df["Date"].astype(str)
    reuters_df["Date"] = reuters_df["Date"].apply(lambda x : x.split("+")[0] if pd.notnull(x) else None)
    reuters_df = reuters_df[reuters_df["Text lenght"]>100]
    return reuters_df

## USA TODAY

In [30]:
# links for all articles
def links_usatoday():
    index = 0
    dict_usatoday = {} 
    url = "https://eu.usatoday.com/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    section = soup.find("div", {"class":"content-well"})
    links = section.find_all("a")
    for i in links:
        link = i["href"]
        link = f"https://eu.usatoday.com{link}"
        dict_usatoday[index] = link
        index +=1
    return dict_usatoday

In [31]:
# getting all the features for one article
def attributes_usatoday(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    text = []
    new_dict= {}
    
    r = requests.get(article)
    soup = BeautifulSoup(r.content)
    
    
    # text and lenght
    content = soup.find_all("p")
    for i in content:
        y = i.get_text().strip()
        text.append(y)
    text = " ".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("lit-timestamp")["publishdate"]
    except AttributeError:
        date = None
            
    try:
        authors = soup.find("span", {"class":"author"})
        authors = authors.find_all("a")
        author_list = []
        for a in authors:
            y = a.get_text().strip()
            author_list.append(y)
        author = ", ".join(author_list)
    except AttributeError:
        author = None
        
    try:
        imageURL = soup.find_all("img")[2]["src"]
    except Exception:
        imageURL= None
        
    c = article.split("story")[1]
    if "news" in c:
        category = c.split("/")[2]
    else:
        category = c.split("/")[1]

    new_dict[0]={"Source":"USA TODAY","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [32]:
def usatoday_df():
    dict_usatoday = links_usatoday()
    dataframes_usatoday = []
    for url in dict_usatoday.keys():
        df = attributes_usatoday(dict_usatoday[url])
        dataframes_usatoday.append(df)

    usatoday_df = pd.concat(dataframes_usatoday, ignore_index=True)
    usatoday_df["Date"] = usatoday_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    usatoday_df["Date"].dropna(inplace=True)
    usatoday_df["Date"] = usatoday_df["Date"].astype(str)
    usatoday_df["Date"] = usatoday_df["Date"].apply(lambda x : x.split("+")[0] if pd.notnull(x) else None)
    usatoday_df = usatoday_df[usatoday_df["Text lenght"]>100]
    return usatoday_df

In [33]:
    dict_usatoday = links_usatoday()
    dataframes_usatoday = []
    for url in dict_usatoday.keys():
        df = attributes_usatoday(dict_usatoday[url])
        dataframes_usatoday.append(df)

    usatoday_df = pd.concat(dataframes_usatoday, ignore_index=True)
    usatoday_df["Date"] = usatoday_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    usatoday_df["Date"].dropna(inplace=True)
    usatoday_df["Date"] = usatoday_df["Date"].astype(str)
    usatoday_df["Date"] = usatoday_df["Date"].apply(lambda x : x.split("+")[0] if pd.notnull(x) else None)
    usatoday_df = usatoday_df[usatoday_df["Text lenght"]>100]

## NPR

In [34]:
def links_npr():
    index = 0
    dict_npr = {} 
    url = "https://www.npr.org/sections/news/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    sections = soup.find_all("h2", {"class":"title"})
    for i in sections:
        link = i.find("a")["href"]
        dict_npr[index] = link
        index += 1
    return dict_npr

In [35]:
# getting all the features for one article
def attributes_npr(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    text = []
    new_dict= {}
    
    r = requests.get(article)
    soup = BeautifulSoup(r.content)
    
    
    # text and lenght
    content = soup.find_all("p")
    for i in content[4:-2]:
        y = i.get_text().strip()
        text.append(y)
    text = " ".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("span", {"class":"date"}).get_text().strip()
        date = f"{date} {soup.find('span', {'class':'time'}).get_text().strip()}"
    except AttributeError:
        date = None
            
    try:
        author = soup.find("p", {"class":"byline__name byline__name--block"}).get_text().strip()
    except AttributeError:
        author = None
    try:
        category = soup.find("a", {"class":"tag tag--story"}).get_text().strip()
    except AttributeError:
        category = None
    try:
        imageURL = soup.find_all("img")[15]["src"]
    except Exception:
        imageURL= None
    
    new_dict[0]={"Source":"NPR","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [36]:
def npr_df():
    dict_npr = links_npr()
    dataframes_npr = []
    for url in dict_npr.keys():
        df = attributes_npr(dict_npr[url])
        dataframes_npr.append(df)

    npr_df = pd.concat(dataframes_npr, ignore_index=True)
    npr_df["Date"] = npr_df["Date"].apply(lambda x : parser.parse(x, fuzzy=True) if pd.notnull(x) else None)
    npr_df["Date"].dropna(inplace=True)
    npr_df["Date"] = npr_df["Date"].astype(str)
    npr_df["Date"] = npr_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    npr_df = npr_df[npr_df["Text lenght"]>100]
    return npr_df

## Chicago Tribune

In [37]:
# get all links
def links_ct():
    index = 0
    dict_ct = {} 
    url = "https://www.chicagotribune.com/news/breaking/"
    r = requests.get(url)
    soup = BeautifulSoup(r.content)
    sections = soup.find_all("article", {"class":"container-fluid row flex_row"})
    for i in sections:
        link = i.find("a")["href"]
        link = f"https://www.chicagotribune.com{link}"
        dict_ct[index] = link
        index +=1
    return dict_ct

In [38]:
# getting all the features for one article
def attributes_ct(article):
    """This function receives an url and returns a DF with different attributes"""
    # definitions
    new_dict= {}
    
    r = requests.get(article)
    soup = BeautifulSoup(r.content)
    
    
    # text and lenght
    content = soup.find_all("p", {"class":"default__StyledText-sc-1wxyvyl-0 fxgoSg body-paragraph"})
    text = []
    for i in content:
        text.append(i.get_text().strip())
    text = "".join(text)
    text = clean_html_tags(text)
    lenght = len(text)
    
    try:
        title = soup.find("h1").get_text().strip()
    except AttributeError:
        title = None
        
    try:
        date = soup.find("time").get_text().strip()
    except AttributeError:
        date = None
            
    try:
        x = soup.find("div", {"class":"article_byline"})
        x = x.find_all("a")
        y = []
        for i in x:
            y.append(i.get_text().strip())
        author = ", ".join(y)
    except AttributeError:
        author = None
    
    c = article.split(".com")[1]
    if "news" in c:
        category = c.split("/")[2]
    else:
        category = c.split("/")[1]
    try:
        imageURL = soup.find_all("img")[3]["src"]
    except Exception:
        imageURL= None
    
    new_dict[0]={"Source":"Chicago Tribune","Title": title, "Date" : date,"Author" : author, "Category": category, "Text" : text , "Text lenght" : lenght, "URL" : article, "imageURL":imageURL}
    df = pd.DataFrame.from_dict(new_dict, orient="index")
    return df

In [39]:
def ct_df():
    dict_ct = links_ct()
    dataframes_ct = []
    for url in dict_ct.keys():
        df = attributes_ct(dict_ct[url])
        dataframes_ct.append(df)

    ct_df = pd.concat(dataframes_ct, ignore_index=True)
    ct_df["Date"] = ct_df["Date"].apply(lambda x : parser.parse(x) if pd.notnull(x) else None)
    ct_df["Date"].dropna(inplace=True)
    ct_df["Date"] = ct_df["Date"].astype(str)
    ct_df["Date"] = ct_df["Date"].apply(lambda x : x.split(".")[0] if pd.notnull(x) else None)
    ct_df = ct_df[ct_df["Text lenght"]>100]
    return ct_df

## Whole DF

In [40]:
final_df1 = pd.concat([cnn_df(), fox_df(), abc_df(dict_abc), politico_df(), wapo_df(), nytimes_df(), msnbc_df()], ignore_index=True)



In [42]:
final_df2 = pd.concat([usatoday_df, npr_df(), ct_df()], ignore_index=True)



In [43]:
final_df = pd.concat([final_df1, final_df2], ignore_index=True)
final_df

Unnamed: 0,Source,Title,Date,Author,Category,Text,Text lenght,URL,imageURL
0,CNN,Details emerge about UNLV gunman who killed 3 ...,2023-12-08 03:06:00,Elizabeth Wolfe,us,Investigators searching for the motive of Anth...,3999,https://www.cnn.com/2023/12/08/us/university-o...,https://media.cnn.com/api/v1/images/stellar/pr...
1,CNN,Azerbaijan and Armenia agree to prisoner swap ...,2023-12-08 02:44:00,Angela Dewan,europe,Azerbaijan and Armenia have agreed to a prison...,3408,https://www.cnn.com/2023/12/08/europe/azerbaij...,https://media.cnn.com/api/v1/images/stellar/pr...
2,CNN,Italy quits Belt and Road plan as Europe rethi...,2023-12-08 02:38:00,Simone McCarthy,china,"Italy, the only G7 country to join China’s fla...",4743,https://www.cnn.com/2023/12/08/china/italy-bel...,https://media.cnn.com/api/v1/images/stellar/pr...
3,CNN,How the impasse over Ukraine aid could have cr...,2023-12-08 00:01:00,Stephen Collinson,politics,America’s paralyzing political estrangement ma...,10601,https://www.cnn.com/2023/12/08/politics/congre...,https://media.cnn.com/api/v1/images/stellar/pr...
4,CNN,‘Laws need to change’: Stella McCartney calls ...,2023-12-07 22:58:00,Christy Choi,style,Stella McCartney has called on world leaders t...,4517,https://www.cnn.com/style/stella-mccartney-lea...,https://media.cnn.com/api/v1/images/stellar/pr...
...,...,...,...,...,...,...,...,...,...
440,Chicago Tribune,"Kennedy Expressway lanes to reopen, as first y...",2023-12-06 17:15:00,Sarah Freishtat,business,Kennedy Expressway drivers heading toward down...,2971,https://www.chicagotribune.com/business/ct-biz...,https://www.chicagotribune.com/resizer/5cAZOcy...
441,Chicago Tribune,Advisory council created to ensure diversity i...,2023-12-06 16:57:00,Rick Pearson,politics,The host committee for next year’s Democratic ...,1896,https://www.chicagotribune.com/politics/ct-dnc...,https://www.chicagotribune.com/resizer/Zzsk8IB...
442,Chicago Tribune,McDonald’s will open first CosMc’s spinoff in ...,2023-12-06 16:16:00,Talia Soglin,business,The week after photos of its new spinoff CosMc...,3937,https://www.chicagotribune.com/business/ct-biz...,https://www.chicagotribune.com/resizer/wyE3Veq...
443,Chicago Tribune,Dolton trustees at odds with Mayor Tiffany Hen...,2023-12-06 15:28:00,Mike Nolan,suburbs,A day after walking out on a Village Board mee...,3656,https://www.chicagotribune.com/suburbs/daily-s...,https://www.chicagotribune.com/resizer/c1NKmKy...


## Data Cleaning and Formatting

In [44]:
df = final_df.copy()

In [45]:
df["Category"] = df["Category"].astype(str)

In [46]:
df.drop_duplicates(subset="URL", inplace=True)

In [47]:
df.dropna(subset="Title", inplace=True)
df["Title"] = df["Title"].apply(clean_html_tags)

  soup = BeautifulSoup(html_text, 'html.parser')


In [48]:
def categorize_categories(x):
    category_mapping = {
        'International News': ['world',"ukraine",'israel-hamas war','israel','gaza', 'israel-gaza war' 'australia','india', 'china', 'americas', 'middleeast','international','israel hamas War', 'africa', 'asia', 'europe'],
        'Politics': ['nation', "election", "immigration", 'new york', 'Congress','us', 'politics'],
        'Business and Economy': ['economy','investing', 'business', 'markets', 'money'],
        'Entertainment and Lifestyle': ["lifestyle","travel", 'entertainment', 'cars', 'culture', 'food', 'style', 'tech','advice', 'success', 'books', 'cruise ship', 'wellness', 'family', 'life expectancy'],
        'Climate and Environment': ['climate','energy',"environment", 'climate-environment', 'climate-solutions'],
        'Health | Science | Technology': ['health', 'science', 'technology', 'artificial intelligence'],
        'Sports': ['sport', 'sports'],
        'Law and Justice': ['national-security','legal', 'criminajustice', "retail theft", "financial crimes", "crime",  ],
        'del' : ["weather" ] 
    }

    categorized_result = {category: [] for category in category_mapping.keys()}
    matched_category = None
    for main_category, sub_categories in category_mapping.items():
        if any(sub_category in x.lower() for sub_category in sub_categories):
            matched_category = main_category
            break
    
    return matched_category

df["cat_label"] = df["Category"].apply(categorize_categories)

In [49]:
df = df[df["cat_label"]!="del"]

In [50]:
df['Date'] = pd.to_datetime(df['Date'])
df["Date"]

0     2023-12-08 03:06:00
1     2023-12-08 02:44:00
2     2023-12-08 02:38:00
3     2023-12-08 00:01:00
4     2023-12-07 22:58:00
              ...        
440   2023-12-06 17:15:00
441   2023-12-06 16:57:00
442   2023-12-06 16:16:00
443   2023-12-06 15:28:00
444   2023-12-06 14:56:00
Name: Date, Length: 332, dtype: datetime64[ns]

In [51]:
img_replacements = {
    "CNN":"https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx3PvP_-LsgjQHaE7&pid=Api&P=0&h=180",
    "Fox News": "https://tse1.mm.bing.net/th?id=OIP.cgyUI9sP8zj7Zhf5tpOI2wHaHa&pid=Api&P=0&h=180",
    "ABC" : "https://tse1.mm.bing.net/th?id=OIP.KAIEmdZnussbmh0xL1UqIAHaHa&pid=Api&P=0&h=180",
    "Politico": "https://tse1.mm.bing.net/th?id=OIP.GNggW5ET63WsrjcQpARO4QHaHa&pid=Api&P=0&h=180",
    "Washington Post" : "https://tse3.explicit.bing.net/th?id=OIP.cysHEGJuAL_FgzGaxQpl1QHaHa&pid=Api&P=0&h=180",
    'New York Times': "https://tse2.mm.bing.net/th?id=OIP.jVot1lhu9PmkBYqVP_y4QAHaGg&pid=Api&P=0&h=180",
    'MSNBC': "https://tse4.mm.bing.net/th?id=OIP.0QnXrS2STCJkb2iD0VvN8AHaHa&pid=Api&P=0&h=180",
    'Reuters': "https://tse1.mm.bing.net/th?id=OIP.jw0U2QGUAVScpEN8_Ik8UQHaHa&pid=Api&P=0&h=180",
    'USA TODAY': "https://tse4.mm.bing.net/th?id=OIP.SY6y2UDBYYuKUPjSwFXbegHaHa&pid=Api&P=0&h=180",
    'NPR': "https://tse4.mm.bing.net/th?id=OIP.VhogjnWRzUbgzzIcAE8DswHaHa&pid=Api&P=0&h=180",
    'Chicago Tribune': "https://tse1.mm.bing.net/th?id=OIP.bdQ4hj2mS6umETrhkh7uTQAAAA&pid=Api&P=0&h=180"
}
df["sourceURL"] = df["Source"].apply(lambda x: img_replacements[x])
df.loc[df["imageURL"].isnull(), "imageURL"] = df.loc[df["imageURL"].isnull(), "sourceURL"]

In [52]:
df["cat_label"].value_counts()

Politics                         134
Entertainment and Lifestyle       46
Sports                            29
International News                28
Health | Science | Technology     10
Business and Economy               6
Climate and Environment            4
Law and Justice                    1
Name: cat_label, dtype: int64

In [53]:
df.dropna(subset="Date", inplace=True)

In [54]:
df.reset_index(drop=True, inplace=True)

In [55]:
df["Source"].value_counts()

CNN                92
ABC                58
Fox News           44
Washington Post    38
Chicago Tribune    25
NPR                22
Politico           20
MSNBC              11
USA TODAY          10
New York Times      6
Name: Source, dtype: int64

## Prep for model

In [56]:
import nltk #Natural Language tool kit -- this pacakge is quite a mess. Was poorly design and the documentation is not great
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
lemmatizer = WordNetLemmatizer()

In [57]:
def tokenizer_and_remove_punctuation(row):
    tokens = word_tokenize(row['Text'])
    return [word.lower() for word in tokens if word.isalpha()]

df['tokenized'] = df.apply(tokenizer_and_remove_punctuation,axis=1)

In [58]:
def get_wordnet_pos(word):
    tag = nltk.pos_tag([word], lang='eng')[0][1][0].upper() # gets first letter of POS categorization
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}
    return tag_dict.get(tag, wordnet.NOUN)

In [59]:
def lemmatizer_with_pos(row):
      return [lemmatizer.lemmatize(word,get_wordnet_pos(word)) for word in row['tokenized']]

df['lemmatized'] = df.apply(lemmatizer_with_pos,axis=1)

In [60]:
# remove stopwords

def remove_sw(row):
      return list(set(row['lemmatized']).difference(stopwords.words()))

df['no_stopwords'] = df.apply(remove_sw,axis=1)

In [61]:
def re_blob(row):
      return " ".join(row['no_stopwords'])

df['clean_blob'] = df.apply(re_blob,axis=1)

In [62]:
df.drop(["tokenized", "lemmatized", "no_stopwords"], axis=1, inplace=True)
df.head()

Unnamed: 0,Source,Title,Date,Author,Category,Text,Text lenght,URL,imageURL,cat_label,sourceURL,clean_blob
0,CNN,Details emerge about UNLV gunman who killed 3 ...,2023-12-08 03:06:00,Elizabeth Wolfe,us,Investigators searching for the motive of Anth...,3999,https://www.cnn.com/2023/12/08/us/university-o...,https://media.cnn.com/api/v1/images/stellar/pr...,Politics,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,police country slain return relative patricia ...
1,CNN,Azerbaijan and Armenia agree to prisoner swap ...,2023-12-08 02:44:00,Angela Dewan,europe,Azerbaijan and Armenia have agreed to a prison...,3408,https://www.cnn.com/2023/12/08/europe/azerbaij...,https://media.cnn.com/api/v1/images/stellar/pr...,International News,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,country european climate prosperous export enc...
2,CNN,Italy quits Belt and Road plan as Europe rethi...,2023-12-08 02:38:00,Simone McCarthy,china,"Italy, the only G7 country to join China’s fla...",4743,https://www.cnn.com/2023/12/08/china/italy-bel...,https://media.cnn.com/api/v1/images/stellar/pr...,International News,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,country european ursula raise reason refer top...
3,CNN,How the impasse over Ukraine aid could have cr...,2023-12-08 00:01:00,Stephen Collinson,politics,America’s paralyzing political estrangement ma...,10601,https://www.cnn.com/2023/12/08/politics/congre...,https://media.cnn.com/api/v1/images/stellar/pr...,Politics,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,country philosophical raise spring anna increa...
4,CNN,‘Laws need to change’: Stella McCartney calls ...,2023-12-07 22:58:00,Christy Choi,style,Stella McCartney has called on world leaders t...,4517,https://www.cnn.com/style/stella-mccartney-lea...,https://media.cnn.com/api/v1/images/stellar/pr...,Entertainment and Lifestyle,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,climate showcased available enter combination ...


In [63]:
df["cat_label"].value_counts()

Politics                         132
Entertainment and Lifestyle       46
Sports                            29
International News                28
Health | Science | Technology     10
Business and Economy               6
Climate and Environment            4
Law and Justice                    1
Name: cat_label, dtype: int64

## Category model

In [66]:
with open('bow_vect.pkl', 'rb') as file:
    bow_vect = pickle.load(file)

<IPython.core.display.Javascript object>

In [67]:
with open('cat_model.pkl', 'rb') as file:
    cat_model = pickle.load(file)

<IPython.core.display.Javascript object>

In [68]:
X_unlabeled = bow_vect.transform(df['clean_blob']).toarray()

In [69]:
df["pred_cat"] = cat_model.predict(X_unlabeled)

In [70]:
check = df[["Title","Source","Category","cat_label", "pred_cat"]]
check[check["cat_label"].isnull()].tail(60)

Unnamed: 0,Title,Source,Category,cat_label,pred_cat
131,Pennsylvania school board president sworn into...,Fox News,media,,Politics
134,Hunter indictment a 'nuclear bomb for the Bide...,Fox News,media,,Politics
194,Washington Post staffers launch 24-hour walkout,Politico,Labor,,Politics
196,California’s budget deficit swells to record $...,Politico,California,,Politics
199,Blinken explains his reaction to Biden calling...,Politico,foreign affairs,,Politics
201,‘The Pain and the Trauma Lasts Longer Than a N...,Politico,q&a,,Politics
202,A China brawl looms for House Republicans,Politico,Finance & Tax,,Politics
208,McCarthy’s exit and the ripple effects back home,Politico,California,,Politics
209,Legal weed takes effect in Ohio as lawmakers s...,Politico,Cannabis,,Politics
210,House Education committee to launch probe into...,Politico,Education,,Politics


In [71]:
df.loc[df["cat_label"].isnull(), "cat_label"] = df.loc[df["cat_label"].isnull(), "pred_cat"]
df.drop(["Category", "pred_cat"], axis=1, inplace=True)
df.head()

Unnamed: 0,Source,Title,Date,Author,Text,Text lenght,URL,imageURL,cat_label,sourceURL,clean_blob
0,CNN,Details emerge about UNLV gunman who killed 3 ...,2023-12-08 03:06:00,Elizabeth Wolfe,Investigators searching for the motive of Anth...,3999,https://www.cnn.com/2023/12/08/us/university-o...,https://media.cnn.com/api/v1/images/stellar/pr...,Politics,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,police country slain return relative patricia ...
1,CNN,Azerbaijan and Armenia agree to prisoner swap ...,2023-12-08 02:44:00,Angela Dewan,Azerbaijan and Armenia have agreed to a prison...,3408,https://www.cnn.com/2023/12/08/europe/azerbaij...,https://media.cnn.com/api/v1/images/stellar/pr...,International News,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,country european climate prosperous export enc...
2,CNN,Italy quits Belt and Road plan as Europe rethi...,2023-12-08 02:38:00,Simone McCarthy,"Italy, the only G7 country to join China’s fla...",4743,https://www.cnn.com/2023/12/08/china/italy-bel...,https://media.cnn.com/api/v1/images/stellar/pr...,International News,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,country european ursula raise reason refer top...
3,CNN,How the impasse over Ukraine aid could have cr...,2023-12-08 00:01:00,Stephen Collinson,America’s paralyzing political estrangement ma...,10601,https://www.cnn.com/2023/12/08/politics/congre...,https://media.cnn.com/api/v1/images/stellar/pr...,Politics,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,country philosophical raise spring anna increa...
4,CNN,‘Laws need to change’: Stella McCartney calls ...,2023-12-07 22:58:00,Christy Choi,Stella McCartney has called on world leaders t...,4517,https://www.cnn.com/style/stella-mccartney-lea...,https://media.cnn.com/api/v1/images/stellar/pr...,Entertainment and Lifestyle,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,climate showcased available enter combination ...


## Scoring

In [72]:
# sentiment score
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

df['sentiment_scores1'] = df['clean_blob'].apply(lambda x: sia.polarity_scores(x)["compound"])
df['sentiment_scores2'] = df['Text'].apply(lambda x: sia.polarity_scores(x)["compound"])
df["sentiment_score"] = (df['sentiment_scores1']+df['sentiment_scores2'])/2
df.drop(["sentiment_scores1", "sentiment_scores2"], axis=1, inplace=True)
df.head(2)

Unnamed: 0,Source,Title,Date,Author,Text,Text lenght,URL,imageURL,cat_label,sourceURL,clean_blob,sentiment_score
0,CNN,Details emerge about UNLV gunman who killed 3 ...,2023-12-08 03:06:00,Elizabeth Wolfe,Investigators searching for the motive of Anth...,3999,https://www.cnn.com/2023/12/08/us/university-o...,https://media.cnn.com/api/v1/images/stellar/pr...,Politics,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,police country slain return relative patricia ...,-0.98715
1,CNN,Azerbaijan and Armenia agree to prisoner swap ...,2023-12-08 02:44:00,Angela Dewan,Azerbaijan and Armenia have agreed to a prison...,3408,https://www.cnn.com/2023/12/08/europe/azerbaij...,https://media.cnn.com/api/v1/images/stellar/pr...,International News,https://tse2.mm.bing.net/th?id=OIP.W3gWO3a5Tnx...,country european climate prosperous export enc...,0.94895


In [73]:
# sensationalism score

sensationalism_keywords = ['breaking', 'exclusive', 'shocking', 'explosive', 'revealed', 'urgent', 'unbelievable', 'mind-blowing', 'scandalous', 'outrageous', 'sensational', 'never-before-seen', 'dramatic', 'jaw-dropping', 'killer', 'massive', 'insane', 'terrifying', 'banned', 'controversial', 'secret', 'conspiracy', 'nightmare', 'apocalyptic', 'incredible', 'forbidden', 'sinister', 'catastrophic', 'shock', 'danger', 'fear', 'panic', 'monster', 'death-defying', 'omg', 'alarming', 'tremendous', 'never', 'deadly', 'hellish', 'paranormal', 'bewildering', 'menacing', 'twisted', 'kill', 'wild', 'devious', 'exposed', 'unprecedented', 'crisis', 'apocalypse', 'bizarre', 'explosive revelation', 'apocalyptic', 'deadly virus', 'unearthed', 'censored', 'life-threatening', 'cataclysmic', 'underground', 'illegal', 'menace', 'gruesome', 'intense', 'unholy', 'insidious', 'doomsday', 'untold', 'mysterious', 'censored', 'horrifying', 'unexplained', 'phenomenal', 'surreal', 'freakish', 'clash', 'supernatural', 'reckless', 'banned', 'taboo', 'untamed', 'monstrous', 'forbidden', 'pandemonium', 'infernal', 'no holds barred', 'ruthless', 'ghostly', 'frightening', 'dangerous liaison', 'freak accident', 'aberration', 'anarchy', 'untouchable', 'eerie', 'conspiracy theory', 'lost civilization', 'apocalyptic nightmare', 'inexplicable']

In [74]:
def sensationalism_score(x):
    sensationalism_score = 0  # base score
    x_lower = x.lower()  

    for keyword in sensationalism_keywords:
        keyword_count = x_lower.count(keyword.lower()) 
        sensationalism_score += 0.3 * keyword_count  # Add to score based on keyword occurrences

    divisor = len(x) / 1000  
    sensationalism_score /= divisor  # Normalize the score with dividing by text length
    return sensationalism_score

In [75]:
df["sensationalism_score"] = df["Text"].apply(sensationalism_score)

In [76]:
# readability score
import nltk
from nltk import sent_tokenize, word_tokenize

In [77]:
def coleman_liau_index(x):
    words = word_tokenize(x)
    words_count = len(words)
    sentences = sent_tokenize(x)
    sentences = len(sentences)
    letters = sum(len(letter) for word in words for letter in word)
    L = (letters / words_count) * 100
    S = (sentences / words_count) * 100
    return 0.0588 * L - 0.296 * S - 15.8
df["Readability"] = df["Text"].apply(coleman_liau_index)

In [None]:
from textblob import TextBlob
def calculate_bias_score(text):
    # Use TextBlob for sentiment analysis
    analysis = TextBlob(text)
    sentiment_score = analysis.sentiment.subjectivity
    bias_score = sentiment_score
    bias_score = np.sqrt(bias_score**2)
    return bias_score
df["bias_score"] = df["Text"].apply(calculate_bias_score)

<IPython.core.display.Javascript object>

## Clickbait prediction | not finished yet

In [None]:
with open('bow_vect_clickbait.pkl', 'rb') as file:
    bow_vect_clickbait = pickle.load(file)

In [None]:
with open('clickbait_model.pkl', 'rb') as file:
    clickbait_model = pickle.load(file)

In [None]:
df["title_lenght"] = df["Title"].apply(lambda x: len(x))   # characters of title
df["uppercased"] = df["Title"].apply(lambda x: len([l for l in x if l.isupper()])) # num of uppercased
features = df[["title_lenght", "uppercased"]]
features = features.reset_index(drop=True)
df.drop(["title_lenght", "uppercased"], axis=1, inplace=True)

In [None]:
bow_vect_clickbait

In [None]:
headlines = df["Title"].reset_index(drop=True)
vs = bow_vect_clickbait.transform(headlines).toarray()
vs = pd.DataFrame(vs, columns=bow_vect_clickbait.get_feature_names_out())

In [None]:
X = pd.concat([features, vs], axis=1)
df["isClickbait"] = clickbait_model.predict(X)

In [None]:
df.to_csv("articles-tuesday-morning.csv", index=False)

## TO SQL

In [None]:
old_df = pd.read_csv("./datasets/all_articles.csv")
full_df = pd.concat([df, old_df], ignore_index=True)
full_df.drop_duplicates(subset="URL",keep="first", inplace=True)
full_df.drop_duplicates(subset="Title", keep="first", inplace=True)
try:
    full_df.drop("Unnamed: 0", axis=1, inplace=True)
except KeyError:
    full_df = full_df
    
full_df.to_csv("./datasets/all_articles.csv", index=False)

In [None]:
full_df = full_df.reset_index()
full_df.rename(columns={'index': 'Article_ID'}, inplace=True)

In [None]:
articles = full_df.copy()
to_drop = ["Author", "Text", "Text lenght", "cat_label", "clean_blob"]
for i in to_drop:
    articles.drop(i, axis=1, inplace=True)

In [None]:
text = full_df.copy()
to_drop = ["Author", "Source", "Title", "Date", "URL", "cat_label", "sentiment_score", "sensationalism_score", "sourceURL", "imageURL", "Readability", "bias_score", "isClickbait"]
for i in to_drop:
    text.drop(i, axis=1, inplace=True)

In [None]:
data = {"Author":list(full_df["Author"].unique())}
authors = pd.DataFrame(data)
authors.reset_index(inplace=True)
authors.columns = ["Author_ID", "Author"]

In [None]:
article_author = pd.merge(authors, full_df, on="Author", how="inner")
to_drop = ["Source", "Title", "Date", "URL", "cat_label", "Text", "Text lenght", "clean_blob", "sentiment_score","sourceURL", "imageURL", "sensationalism_score", "Readability", "bias_score", "isClickbait"]
for i in to_drop:
    article_author.drop(i, axis=1, inplace=True)
article_author = article_author.set_index("Author_ID").reset_index()

In [None]:
datas = {"Category":list(full_df["cat_label"].unique())}
categories = pd.DataFrame(datas)
categories.reset_index(inplace=True)
categories.columns = ["Category_ID", "Category"]
categories = categories[categories["Category"]!="Breaking News"]

In [None]:
article_category = pd.merge(categories, full_df, left_on="Category",right_on="cat_label", how="inner")
to_drop = ["Source", "Title", "Author", "Date", "URL", "cat_label", "Text", "Text lenght", "clean_blob", "sourceURL", "imageURL","sentiment_score", "sensationalism_score", "Readability", "bias_score"]
for i in to_drop:
    article_category.drop(i, axis=1, inplace=True)

In [None]:
datas = {"Source":list(full_df["Source"].unique()), "sourceURL":list(full_df["sourceURL"].unique())}
source = pd.DataFrame(datas)
source.reset_index(inplace=True)
source.columns = ["source_id", "Source", "sourceURL"]

In [None]:
article_source = pd.merge( source, full_df, on="Source", how="inner")
to_drop = ["Title", "Author", "Date", "URL", "cat_label", "Text", "Text lenght", "clean_blob", "sentiment_score", "sensationalism_score", "Readability", "imageURL","bias_score", "isClickbait"]
for i in to_drop:
    article_source.drop(i, axis=1, inplace=True)

In [None]:
import pymysql
from sqlalchemy import create_engine
from getpass import getpass
password = getpass()

engine = create_engine("mysql+pymysql://{user}:{pw}@localhost/{db}"
                       .format(user="root",
                               pw=password,
                               db="headlinehub"))

In [None]:
articles.to_sql("articles",con = engine, if_exists = 'replace', chunksize = 1000,index=False)

In [None]:
text.to_sql("text",con = engine, if_exists = 'replace', chunksize = 1000,index=False)

In [None]:
authors.to_sql("author",con = engine, if_exists = 'replace', chunksize = 1000,index=False)

In [None]:
article_author.to_sql("article_author",con = engine, if_exists = 'replace', chunksize = 1000,index=False)

In [None]:
categories.to_sql("category",con = engine, if_exists = 'replace', chunksize = 1000,index=False)

In [None]:
article_category.to_sql("article_category",con = engine, if_exists = 'replace', chunksize = 1000,index=False)

In [None]:
source.to_sql("source",con = engine, if_exists = 'replace', chunksize = 1000,index=False)

In [None]:
article_source.to_sql("article_source",con = engine, if_exists = 'replace', chunksize = 1000,index=False)