In [None]:
apiKey = 'e938341216df4163be5f15cb92d413e6'

# Web-crawling

In [1]:
import pandas as pd
import requests
import time
# import json

################################## Web-crawling News data ##################################
# 1. Set a few News websites to crawl from
# 2. Use Newsapi to crawl all of their news article links in the past 3 years
# 3. Crawl all of the news article's content from these links and store as data
# 4. Format all of the news article content to make them vectorizable by a word2vec model
# 5. Load the pre-trained Google-News Word2Vec model & turn all news article content into
#       an average of all of the vector of its words
# 6. Create a similarity matrix that stores the similarity between every pair of news articles
# Everything above just need to be packged into a function + make a few changes to make it
# ready for integration into a web app.
# 7. Test case - Simulating some random user myself

##############################     1     ##############################
# Set the sources that we are going to crawl from
# Since this is a draft we won't worry about #sourcequality yet.
# The sources are decided from this list: https://www.top10.com/news-websites
# There isn't New York Times & NPR in Newsapi, so it won't be included either for now
# Google News is a search engine that shows news from other news websites, which makes crawling
# too hard, so won't be included either
sources = 'cnn,reuters,the-wall-street-journal,bbc-news,fox-news,nbc-news,the-washington-post'
# Note: To see all of the sources offered by Newsapi, visit: https://newsapi.org/v2/sources?apiKey=e938341216df4163be5f15cb92d413e6

# Use a dictionary of dictionaries to set the tags & attributes for which news
# content is stored in each of the above news sources, will be used later in crawling
contentLocation = {
    'cnn': {
        'tag': 'p',
        'class': 'paragraph inline-placeholder'
    },
    'reuters': {
        'tag': 'p',
        'class': 'text__text__1FZLe text__dark-grey__3Ml43 text__regular__2N1Xr text__large__nEccO body__full_width__ekUdw body__large_body__FV5_X article-body__element__2p5pI'
    },
    'the-wall-street-journal': {
        'tag': 'p',
        'class': 'css-xbvutc-Paragraph e3t0jlg0'
    },
    'bbc-news': {
        'tag': 'p',
        'class': 'ssrcss-1q0x1qg-Paragraph eq5iqo00'
    },
    'fox-news': {
        'tag': 'p' # Fox news' article content has no attribute but just a tag <p>
    },
    'nbc-news': {
        'tag': 'p',
        'class': '' # Content is stored under the class attribute with no value on NBC news
    },
    'the-washington-post': {
        'tag': 'p',
        'class': 'wpds-c-cYdRxM wpds-c-cYdRxM-iPJLV-css font-copy'
    }
}

##############################     2     ##############################
# use headers to hide our API key
headers = {'Authorization': 'e938341216df4163be5f15cb92d413e6'}

# Set the API endpoint to crawl data from
everything = "https://newsapi.org/v2/everything?"
top_headlines = "https://newsapi.org/v2/top-headlines?"

# Define keyword for how sources will be sorted
sorby = "popularity"

# Store keywords into a dictionary for use in crawling
params = {'apiKey': apiKey,
          'sources': sources,
          'sortBy': sorby,
          'language': 'en',
          'page': 1}

# Set html requests and get a response object, this is first run to establish
# the total number of articles that we need to crawl
response = requests.get(url = everything, headers = headers, params = params)

# Turn response into a json object, and get the 'totalResults' field
output = response.json()
totalResults = output['totalResults']
# Since each crawl gives a maximum of 100 articles, we would need to crawl
# floor division by 100 + 1 times in total
totalCrawlsNeeded = totalResults//100 + 1

# However, due to using an upaid plan, I am actually only allowed 100 requests
# per day. Thus this will be instead set to 100
totalCrawlsNeeded = 100

# Crawl all of the article urls and store them into our dataframe
for i in range(1, totalCrawlsNeeded+1):
    # Set the page number crawled in this run
    params['page'] = i

    # Send html requests and get response
    response = requests.get(url = everything, headers = headers, params = params)

    # Turn response into a json object
    output = response.json()

    ## Turn the json result into a pandas.dataframe object

    # Our unpaid version limits the number of urls we can get, thus we need to wrap
    # the next line in a try, except block
    try:
        # Variable to hold the list of article information
        articles = output['articles']
    except KeyError:
        break

    # Create dataframe from the list of dictionaries, if not present, else just concatenate
    try:
        df
        temp = pd.DataFrame(articles)
        df = pd.concat([df, temp], ignore_index = True)
    except NameError:
        df = pd.DataFrame(articles)        

    # Sleep for 2 seconds to avoid overloading the API
    time.sleep(2)

In [2]:
##############################     3     ##############################
from bs4 import BeautifulSoup
import sys

def get_news_content(url, tag, class_=None):
    '''
    Crawls a new's content, given the url, tag, and class.

    Returns True, news_content if crawling was successful, else
    returns False, [url, error_type, error_info] if error was encountered.
    '''
    # Since our code might produce errors in the process for various
    # reasons, calling it in a try-except block will make the code run better
    try:
        # Send requests to the url & obtain response object
        response = requests.get(url)

        # Use BeautifulSoup to parse the html response & finding data
        soup = BeautifulSoup(response.text, "html.parser")

        # Find the news content using the given tag & attribute
        if class_:
            content = soup.find_all(tag, class_ = class_)
        else:
            content = soup.find_all(tag)

        # Content is a list of all the html elements found, we need to 
        # further concatenate them together into a string and strip it
        news_content = ''
        for tag_found in content:
            news_content += tag_found.text + ' '

        # Return True & content
        return True, news_content

    # this describes what to do if an exception is thrown 
    except Exception:
        
        # get the exception information
        error_type, error_obj, error_info = sys.exc_info()
        
        # Return False & error info
        return False, [url, error_type, error_obj, error_info]

# Create a list to hold all of the failure info
failure_info = []

# Crawl news article data from all of the urls in the df
for index, row in df.iterrows():
    # Get the url
    url = row['url']

    # Get the id of the news website, then obtain tag & class info
    # using our predefined dictionary
    id = row['source']['id']
    tag = contentLocation[id]['tag']
    # Fox news is the only news website where its article content
    # doesn't have any html attribute but just a tag <p>
    if contentLocation[id] != 'fox-news':
        class_ = contentLocation[id]['class']

    # Crawl news content given the inputs
    if class_:
        successful, content = get_news_content(url, tag, class_)
    else:
        successful, content = get_news_content(url, tag)

    # If successful, then replace the 'content' section of our df with the content
    # Which is the last column, thus can be accessed by df.iloc[index, -1]
    if successful:
        df.iloc[index, -1] = content
    # If failed then we gather the failure's info
    else:
        failure_info.append(content)

    # Sleep for 2 seconds to avoid overloading
    time.sleep(2)

In [3]:
print(failure_info)

[]


\#cs110-PythonProgramming: Explicitly shows that the PythonProgram passes verification tests.

The placeholder list failure_info is completely empty shows that the above function of get_news_content() worked perfectly without any errors. There are certain runs in which the news_content variable returned from the function 

In [4]:
##############################     4     ##############################
# First we need to do some formatting. Right now the 'content' column
# of our df is really messy. Let's define a function to do that
def format_string(text):
    '''
    Formats a string of text to make it more standardized. Includes
    the following operations in the exact order:
    1. Removes contractions, e.g. I'll -> I will
    2. Removes punctuations, e.g. That is it. -> That is it
    3. Removes numbers, e.g. 300 turtles -> turtles
    4. Removes extra space, e.g. you  are right -> you are right
    5. Makes words lowercase, e.g. Terminal -> terminal
    6. Removes stop words (i.e. words that don't add value to our analysis),
        e.g. the library -> library
    '''
    # 1. Removes contractions
    from contractions import fix
    text = fix(text)

    # 2. Removes punctuation
    from string import punctuation
    translator = str.maketrans(punctuation, ' '*len(punctuation)) # map punctuation to space
    text = text.translate(translator)

    # 3. Removes numbers
    text = ''.join([i for i in text if not i.isdigit()])

    # 4. Removes extra space
    text = ' '.join(text.split())

    # 5. Makes lowercase
    text = text.lower()

    # 6. Removes stop words
    from nltk.corpus import stopwords
    stop = stopwords.words('english')
    text = " ".join([word for word in text.split() if word not in (stop)])

    # Return the result
    return text

# Next we are going to format all of the news article content
# Define a list to hold all of the formatted strings
formatted_contents = []
for index, row in df.iterrows():
    # Format the text
    formatted_string = format_string(row['content'])

    # Add the formatted text to list
    formatted_contents.append(formatted_string)

# Insert this list as a new column into our df
df['Formatted content'] = formatted_contents

# If formatted content is empty, then something went wrong in the previous
# process (likely with web crawling). To avoid it interrupting subsequent
# code, we are going to delete the row from the df
delete_indexes = []
for index, row in df.iterrows():
    if not row['Formatted content']:
        delete_indexes.append(index)

df = df.drop(index = delete_indexes)
# Reset indexes
df = df.reset_index(drop = True)

In [5]:
##############################     5     ##############################
import gensim
import numpy as np

# Load the pre-tained google-news model
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

# Define a function, that takes the model & a string of text as input
# and outputs the averaged vector over the text.
def text_to_average_vector(model, text):
    '''
    Using the gensim model, converts each word in the text string into
    a vector, averages over all these vectors, and returns the average.
    '''
    # Split the text into a list of words
    words = text.split()

    # Create an empty numpy array, with ncol = number of words, and
    # nrow = output dimensions of the model
    all_vectors = np.zeros((model.vector_size, len(words)))

    # If our model is large, then it would be reasonable for us to assume that 
    # any word that cannot be found in the model is not a word, e.g. â or ©
    # If such words are encountered, it also means that we need to delete a
    # column from our vector of all the words, to avoid disrupting the subsequent
    # average, thus define a list to hold the column indexes that we are going to
    # delete later:
    col_to_delete = []

    # Looping over all words, turn them into vectors and insert into np array
    for i in range(len(words)):
        try:
            vector = model.get_vector(words[i])
        # If the word cannot be found in the model, we will need to delete this
        # column from our array of vectors
        except KeyError:
            col_to_delete.append(i)
            continue

        # Insert this vector at the right position
        # a[:, 0] means select all rows from column 0
        all_vectors[:, i] = vector

    # Delete the columns in which the word cannot be found in the model
    final_vec = np.delete(all_vectors, col_to_delete, axis = 1)

    # Average over the columns, and return the averaged vector
    averaged_vec = np.mean(final_vec, axis = 1)
    return averaged_vec

# For all of our news articles, obtain an averaged vector of its news content,
# and add as a new column to our df
df_vectors = []
for index, row in df.iterrows():
    news_vector = text_to_average_vector(model, row['Formatted content'])
    df_vectors.append(news_vector)

df['Vector'] = df_vectors

In [6]:
##############################     6     ##############################
# I've tried the n_similarity function given by the model, but it did
# not work so well. I think it is because of the enormous amount of words
# that made the model think every pair of articles is similar. The minimum
# of all of them is 0.97 (which does not make enough sense), and it takes
# a long time to run as well. Thus instead I created an average vector on my
# own, and computed their cosine similarity. It is still not very good, but
# it's better than all news articles being similar (good enough for a 1st draft).

similarity_matrix = np.zeros((len(df), len(df)))
from sklearn.metrics.pairwise import cosine_similarity

# Looping over all the rows in df
for index1, row1 in df.iterrows():
    # Compare row's vector against every other row's vector
    for index2, row2 in df.iterrows():
        # Reshape the vectors for use in sklearn's function
        vec1 = row1['Vector'].reshape(1, -1)
        vec2 = row2['Vector'].reshape(1, -1)

        # Compute score using cosine similarity
        score = cosine_similarity(vec1, vec2)

        # Add the score to our matrix
        similarity_matrix[index1, index2] = score

In [7]:
# In a deployment environment, this matrix will be stored in the server,
# and updated every once in a while
similarity_matrix

array([[1.        , 0.89798543, 0.77919876, ..., 0.80570019, 0.79856328,
        0.76625432],
       [0.89798543, 1.        , 0.76081528, ..., 0.77043512, 0.76672343,
        0.73776677],
       [0.77919876, 0.76081528, 1.        , ..., 0.67246047, 0.78875015,
        0.80760098],
       ...,
       [0.80570019, 0.77043512, 0.67246047, ..., 1.        , 0.83692341,
        0.76949084],
       [0.79856328, 0.76672343, 0.78875015, ..., 0.83692341, 1.        ,
        0.79670984],
       [0.76625432, 0.73776677, 0.80760098, ..., 0.76949084, 0.79670984,
        1.        ]])

In [8]:
##############################     7     ##############################
# (Intended) Upon login, 10 random articles is shown to the user to
# learn their preference.
# Choose 10 random articles, show them to the user, obtain their 
# satisfaction score on these articles on a scale of 1 to 10, with
# 10 the most satisfied and 1 being the least satisfied, then compute
# the user's preference vector by averaging over these vectors using
# their rating as the weights.

# Set numpy random seed to ensure replicability
np.random.seed(125)
ten_random_articles = np.random.choice(df.index, size=10, replace=False)

# Holder variable to record vector and score
user_scores = []

# Looping over the 10 articles
for index in ten_random_articles:
    # Show the article to the user and inquire a score from the user
    score = int(input("Please give a satisfaction score in the range of 1 to 10 on the recommended article: \n\n" + df['content'][index]))

    # Turn it into a dictionary and add to holder list
    temp = {'vector': df['Vector'][index],
            'score' : score}
    user_scores.append(temp)

# Obtain an average of the user's vector weighted by score, to generate the initial
# learned preference of the user
sum_vector = np.zeros((model.vector_size, ))
for i in range(len(user_scores)):
    weighted_score = user_scores[i]['score'] / 10
    
    sum_vector += weighted_score * user_scores[i]['vector']

# Average over all vectors to get the initial preference
user_preference = sum_vector / len(user_scores)
user_preference = user_preference.reshape(1, -1)

# Find the article that is most similar to the learned preference of the user,
# Excluding the articles that was used for training
df_user = df.drop(index=ten_random_articles)
similarities = []
for index, row in df_user.iterrows():
    vector_article = row['Vector'].reshape(1, -1)
    sim = cosine_similarity(user_preference, vector_article)
    similarities.append(sim)

index = similarities.index(max(similarities))

df['content'][index]

'\n      In my most vulnerable moments, I have pictured having to tell my young child that his moms’ marriage is no longer recognized. I imagined choking back tears and reminding him that it doesn’t mean he is any less loved or protected. I envisioned telling him that our family is just as important as any other family – including ones with one mom and one dad. \n   \n      Thankfully, Congress just ensured I will not have to have that conversation with my child anytime soon. \n   \n      On Tuesday, the Senate voted to pass the Respect for All Marriage Act, a law that received a strong showing of bipartisan support in the House some months ago. It now goes to the House for a final vote, where it is expected to pass before the end of the year.\n   \nThe Respect for All Marriage Act repeals the long-outdated Defense of Marriage Act, which defined marriage as between one man and one woman and allowed states to deny recognition of same-sex marriages that originated in states where they we

# Results analysis

The above results are reasonable and expected, since I gave two ratings of 9 to two Ukrainian war news articles, and two 8s and one 7 to three other political articles (if you want to check out the articles out themselves, you can run all of the above code cells, it should output the same data), and the most similar piece of text that the model recommends is also about the Ukrainian war. It makes perfect sense for the recommender to be recommending news articles about the Ukrainian war, after it sees that I am interested in the Ukranian war and politics. This is defintely not the ideal version of the recommender that we want, since we want it to be able to extrapolate our preference and recommend thing that we do not know we are interested in.

I have thought about a few interesting applications that might strength this current Word2Vec model, and I will be testing one of them out in the next iteration of the product. They are:

1. Instead of recommending articles that has the closest similarity to the learned user's preference, we always attempt to recommend articles that are a given distance away from the user's preference. In other words, if we were to think of the endpoint of the vector of user's preference as a point in a high-dimensional word-embedding space, we always attempt to recommend articles that is on the surface of a hyper-sphere that has its center as the preference of the user. With every news article that we recommend to the user, instead of asking for the user to score their satisfaction, we ask the user whether this is too convergent to their ideas (by which we move the center of the hyper-sphere away from this point), or too divergent to their ideas (by which we move the center of the hyper-sphere closer to this point); and whether they would like more convergent new (by which we decrease the radius of the hyper-sphere), or they would like more divergent news (by which we then increase the radius of the hyper-sphere).

2. The limitation of the above method is that it allows users to create their own "media bubble". Stubborn people only gets more stubborn. A potential improvement to the model is if we do not allow the user to control that. In addition, we include some randomness into the model prediction, by adding a random vector of a set magnitude on top of the learned user's preference every time we are generating a news recommendation, that is to say, instead of recommending news articles that are the most similar to the preference of the user, we recommend news articles that are always a set semantic distance away from the preference of the user (in a random direction). Obviously, the limitation of this method is that the setting of the magnitude of the random vector becomes very influential in the recommendation process.

In [9]:
import random as rd
# Prototype of how user preference learning will be like (next step)
# Here's how I imagined the user preference learning process to be:
# 1. User logins onto their account and their preference will be learned
#       by supplying 10 articles for them to rate on a scale of 1 to 10,
#       with respect to how satisfied they are with the article with 10
#       being the most satisfied and 1 being not satisfied at all.
# 2. After the rating of each article, we learn a little bit more about
#       the user. We aim to give user the most semantically different
#       article to rate (by maximizing the difference in similarity
#       score compared to their previous ratings), to get more info of
#       the user.
# 3. After the user rates all 10 articles, we will get an average of these
#       vectors, weighted by the ratings they gave to each of them. This
#       will be the initial learned preference of the user.
# 4. Whenever the user clicks on an article recommended to them (which pops
#       up in a separate window), the website will show a window which asks
#       them to rate the article that they just read on a scale of 1 to 10.
#       We use this to learn a little bit further about the user.


# Taking myself as the test case, I am going to simulate being a user here
# Create a holder list that will hold all of the vectors that the user
# has rated, the index of the article, and their rating of the articles
zichen = []
# Set a seed to ensure replicability
rd.seed(12454)
# Determine a random first article to show
first_article_index = rd.choice(df.index)

# Hidden here to avoid excessive amount of output, but basically it was
# about shooting crimes in Virginia which I honestly don't care for,
# so I am going to give it a score of 1
#df['content'][first_article_index]
rating1 = {'vector': df['Vector'][first_article_index],
           'index' : first_article_index,
           'score' : 1}
zichen.append(rating1)

# Next we are going to find the most different article from the first one
# to give to the user for rating, so lets find the least similar article
# from our similarity matrix compared to the first article shown
first_sim = similarity_matrix[first_article_index]
second_article_indexes = np.where(first_sim == first_sim.min())

# We found two, 63 and 147, so let's just randomly choose one
if rd.random() < 0.5:
    second_article_index = 63
else:
    second_article_index = 147

# The random run gave 63, now show the second article
# Hidden to avoid excessive output
df['content'][second_article_index]

'France has suspended a plan to take in 3,500 refugees currently in Italy after Rome refused to let a migrant rescue ship disembark on its shores. Tensions between the two neighbours have escalated since Italy\'s new government barred the Ocean Viking ship from docking with 230 migrants. France has denounced the "unacceptable behaviour", but Italy insists it has been taking in its share of migrants. The boat will be allowed to dock on Friday in the French port of Toulon. The charity that runs the vessel, SOS MÃ©diterranÃ©e, said it was both relieved by the French decision and angry that for three weeks those on board had been let down by Europe\'s "dramatic failure". French Interior Minister GÃ©rald Darmanin warned of "extremely severe consequences for our bilateral relations" with Italy. For a start, France would not take in 3,500 refugees it had earlier agreed with Italy to accept in protest at Rome\'s actions. He also said France would take measures to strengthen controls on the bor

In [10]:
python.__version__

NameError: name 'python' is not defined