In [1]:
#\-- IMPORT MODULES, CLASSES AND METHODS --/#

import zipfile                          #############################
import os                               # || FILE SYSTEM / UTILS || #
import copy                             #############################
from prettytable import PrettyTable

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import numpy as np                  ###################################
import pandas as pd                 # || EXPLORATIVE DATA ANALYSIS || #
import matplotlib.pyplot as plt     ###################################
import seaborn as sns
# https://towardsdatascience.com/handling-missing-data-like-a-pro-part-3-model-based-multiple-imputation-methods-bdfe85f93087 NumPyro, impyute,

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import sklearn
import re
import importlib
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from sklearn import naive_bayes                         #########################
from sklearn import neural_network                      #  |-----------------|  #
from sklearn import svm                                 # || MODEL SELECTION || #
from sklearn import tree                                #  |-----------------|  #
from sklearn import linear_model                        #########################

# from PrunedCV import PrunedCV

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.model_selection import StratifiedKFold     ##########################
from sklearn.model_selection import ParameterGrid       # || MODEL VALIDATION || #
                                                        ##########################

%matplotlib inline


from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import re
from datetime import datetime, timedelta


In [2]:
#\-- SET ENVIRONMENT --/#
# Before starting we need to store the data properly. We define an ad-hoc folder where we will store everything.
main_PATH = os.getcwd()

# We check whether we already have the data.                        
if 'data' not in os.listdir():                                      
                                                                    
    # Unzip files.
    with zipfile.ZipFile(r'summer_project_dataset.zip') as zip_ref:

        os.mkdir(main_PATH + '/data')   # We create the 'data' directory,
        os.chdir(main_PATH + '/data')   # we change directory,
    
        data_PATH = os.getcwd()         # we get the data path
        zip_ref.extractall(data_PATH)   # and we unzip there.       #####################
                                                                    # || FILE SYSTEM || #    
    file_PATH = data_PATH + '/summer_project_dataset'               #####################

else:

    # We just build the paths.
    data_PATH = main_PATH + '/data'
    file_PATH = data_PATH + '/summer_project_dataset'

# Finally, we go back to our main path.
os.chdir(main_PATH)

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# We also set a seed for reproducibility purposes.      #####################
SEED = 42                                               # || RANDOM SEED || #
np.random.seed(SEED)                                    #####################

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# LaTeX style plots.
plt.rcParams['figure.figsize'] = (12, 8)
# plt.rcParams['text.usetex']    = True         ############################
# plt.rcParams['font.family']    = 'serif'      # || DEFAULT PARAMETERS || #
# plt.rcParams['font.size']      = '10'         ############################

pd.set_option('display.max_rows', 20)
# pd.set_option('display.max_rows', 10)

In [3]:
#\-- DATASET LOADING AND PREPROCESSING --/#
# Aome variables are stored as float, but they are actually int. Two reasons why:
#       -) nan values are considered as float --> first estimate them and then change the data type.
#       -) there are inconsistencies, especially in kw_max_min where some int values are float instead.
# for the moment just let's store everything as float, but further inspections are needed.

data_types = {
              'url' : str, 'timedelta' : int, 'shares' : int, 'data_channel' : str, 'weekday' : str, 
              
              'n_tokens_title'          : int, 'n_tokens_content'       : int, 'n_unique_tokens' : float, 'n_non_stop_words' : float,
              'n_non_stop_unique_tokens': float, 'average_token_length' : float,

              'num_hrefs' : int, 'num_self_hrefs' : int, 'num_imgs' : float, 'num_videos' : float,
              
              'kw_min_min' : float, 'kw_max_min' : float, 'kw_avg_min' : float, 'kw_min_max' : float, 'kw_max_max'   : float,
              'kw_avg_max' : float, 'kw_min_avg' : float, 'kw_max_avg' : float, 'kw_avg_avg' : float, 'num_keywords' : float,
              
              'self_reference_min_shares' : float, 'self_reference_max_shares' : float, 'self_reference_avg_sharess' : float,
              
              'LDA_00' : float, 'LDA_01' : float, 'LDA_02' : float, 'LDA_03' : float, 'LDA_04' : float,
              
              'global_subjectivity' : float, 'global_sentiment_polarity' : float, 'global_rate_positive_words' : float, 'global_rate_negative_words' : float,
              
              'rate_positive_words' : float, 'rate_negative_words' : float,
              
              'avg_positive_polarity' : float, 'min_positive_polarity' : float, 'max_positive_polarity' : float, 'avg_negative_polarity' : float,
              'min_negative_polarity' : float, 'max_negative_polarity' : float,

              'title_subjectivity' : float, 'title_sentiment_polarity' : float, 'abs_title_subjectivity' : float, 'abs_title_sentiment_polarity' : float,
              }                                                    
                                                                   
                                                                   
data = pd.read_csv(file_PATH + r'/development.csv',                 
                   usecols = lambda column: column != 'id', dtype = data_types)              
data


Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,data_channel,weekday
0,http://mashable.com/2014/09/08/safest-cabbies-...,121,12,1015,0.422018,1.0,0.545031,10,6,33.0,...,-0.160714,-0.500000,-0.071429,0.000000,0.000,0.500000,0.000,2900,bus,tuesday
1,http://mashable.com/2013/07/25/3d-printed-rifle/,532,9,503,0.569697,1.0,0.737542,9,0,,...,-0.157500,-0.250000,-0.100000,0.000000,0.000,0.500000,0.000,1300,tech,thursday
2,http://mashable.com/2013/10/30/digital-dinosau...,435,9,232,0.646018,1.0,0.748428,12,3,4.0,...,-0.427500,-1.000000,-0.187500,0.000000,0.000,0.500000,0.000,17700,lifestyle,wednesday
3,http://mashable.com/2014/08/27/homer-simpson-i...,134,12,171,0.722892,1.0,0.867925,9,5,0.0,...,-0.216667,-0.250000,-0.166667,0.400000,-0.250,0.100000,0.250,1500,bus,wednesday
4,http://mashable.com/2013/01/10/creepy-robotic-...,728,11,286,0.652632,1.0,0.800000,5,2,,...,-0.251786,-0.500000,-0.100000,0.200000,-0.100,0.300000,0.100,1400,tech,thursday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31710,http://mashable.com/2014/11/30/star-wars-guard...,37,11,440,0.564103,1.0,0.718978,10,2,,...,-0.209167,-0.316667,-0.050000,0.000000,0.000,0.500000,0.000,1000,world,tuesday
31711,http://mashable.com/2014/11/14/uk-floods/,52,14,0,0.000000,0.0,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.666667,-0.700,0.166667,0.700,11000,lifestyle,monday
31712,http://mashable.com/2014/09/08/paypal-bitcoin-...,121,9,969,0.489583,1.0,0.710623,6,5,2.0,...,-0.400000,-1.000000,-0.050000,0.000000,0.000,0.500000,0.000,2400,tech,tuesday
31713,http://mashable.com/2013/08/23/mashable-androi...,503,11,1976,0.412308,1.0,0.621080,21,3,1.0,...,-0.323413,-1.000000,-0.050000,0.700000,-0.400,0.200000,0.400,6000,lifestyle,friday


In [4]:
# to_be_scraped = np.array(data[data['num_imgs'].isna()])
# print(to_be_scraped)

# def split_array(arr):
#     # Determine the split length
#     split_length = len(arr) // 3

#     # Ensure the array can be evenly split into three parts
#     if len(arr) % 3 != 0:
#         raise ValueError("The array length is not divisible by 3.")

#     # Split the array into three equal parts
#     array_parts = np.array_split(arr, 3)

#     return array_parts[0], array_parts[1], array_parts[2]

# # Example usage

# result1, result2, result3 = split_array(np.array(to_be_scraped))

# # print(result1)  # [1 2 3 4]
# # print(result2)  # [5 6 7 8]
# # print(result3)  # [ 9 10 11 12]
# url_partition = []
# url_partition.append(result1)
# url_partition.append(result2)
# url_partition.append(result3)

In [5]:
np.array(data[data['num_imgs'].isna()])

array([['http://mashable.com/2013/07/25/3d-printed-rifle/', 532, 9, ...,
        1300, 'tech', 'thursday'],
       ['http://mashable.com/2013/01/10/creepy-robotic-spider-dress/',
        728, 11, ..., 1400, 'tech', 'thursday'],
       ['http://mashable.com/2014/04/08/childrens-book-morals-adulthood/',
        275, 12, ..., 17800, 'lifestyle', 'tuesday'],
       ...,
       ['http://mashable.com/2013/02/03/allure-magazine-linda-wells/',
        704, 10, ..., 1100, 'bus', 'sunday'],
       ['http://mashable.com/2014/10/14/taylor-swift-out-of-the-woods/',
        85, 12, ..., 464, 'entertainment', 'wednesday'],
       ['http://mashable.com/2014/11/30/star-wars-guardians-of-the-galaxy-video/',
        37, 11, ..., 1000, 'world', 'tuesday']], dtype=object)

In [4]:

from Scraper import Scraper

to_be_scraped = data[data['num_imgs'].isna()]
scrap = Scraper()
scrap.set_url(to_be_scraped['url'])
scrap.start_driver()


DRIVER ONLINE


In [5]:
# from datetime import datetime
# import tzlocal  # $ pip install tzlocal

# unix_timestamp = float("1685951662473")
# unix_timestamp /= 1000
# local_timezone = tzlocal.get_localzone() # get pytz timezone
# local_time = datetime.fromtimestamp(unix_timestamp, local_timezone)

# print(local_time.strftime("%Y-%m-%d %H:%M:%S"))
# print(local_time.strftime("%B %d %Y"))  # print date in your format

# KEEP TRACK OF WHEN THE SCRAPER FAILED (JUST IN CASE).


In [5]:
scrap.scrape()

Current URL: http://mashable.com/2013/07/25/3d-printed-rifle/

		zzz...zzz...zzz...

		       !!!!

from 2013	  HTML ACQUIRED!

		     Failure.

		zzz...zzz...zzz...

		       !!!!

from 2013	  HTML ACQUIRED!

		     Success!

		zzz...zzz...zzz...

		       !!!!

from 2014	  HTML ACQUIRED!

		     Success!

		zzz...zzz...zzz...

		       !!!!

from 2015	  HTML ACQUIRED!

		     Success!
Current URL: http://mashable.com/2013/01/10/creepy-robotic-spider-dress/

		zzz...zzz...zzz...

		       !!!!

from 2013	  HTML ACQUIRED!

		     Failure.

		zzz...zzz...zzz...

		       !!!!

from 2013	  HTML ACQUIRED!

		     Failure.

		zzz...zzz...zzz...

		       !!!!

from 2013	  HTML ACQUIRED!

		     Failure.

		zzz...zzz...zzz...

		       !!!!

from 2013	  HTML ACQUIRED!

		     Success!

		zzz...zzz...zzz...

		       !!!!

from 2014	  HTML ACQUIRED!

		     Failure.

		zzz...zzz...zzz...

		       !!!!

from 2014	  HTML ACQUIRED!

		     Failure.

		zzz...zzz...zzz...

		       !!!!

from 20

WebDriverException: Message: Failed to decode response from marionette


In [14]:
scrap.__url_html__['http://mashable.com/2013/07/25/3d-printed-rifle/']['2013']

<html lang="en"><head><style type="text/css">:root, :host {
  --fa-font-solid: normal 900 1em/1 "Font Awesome 6 Solid";
  --fa-font-regular: normal 400 1em/1 "Font Awesome 6 Regular";
  --fa-font-light: normal 300 1em/1 "Font Awesome 6 Light";
  --fa-font-thin: normal 100 1em/1 "Font Awesome 6 Thin";
  --fa-font-duotone: normal 900 1em/1 "Font Awesome 6 Duotone";
  --fa-font-brands: normal 400 1em/1 "Font Awesome 6 Brands";
}

svg:not(:root).svg-inline--fa, svg:not(:host).svg-inline--fa {
  overflow: visible;
  box-sizing: content-box;
}

.svg-inline--fa {
  display: var(--fa-display, inline-block);
  height: 1em;
  overflow: visible;
  vertical-align: -0.125em;
}
.svg-inline--fa.fa-2xs {
  vertical-align: 0.1em;
}
.svg-inline--fa.fa-xs {
  vertical-align: 0em;
}
.svg-inline--fa.fa-sm {
  vertical-align: -0.0714285705em;
}
.svg-inline--fa.fa-lg {
  vertical-align: -0.2em;
}
.svg-inline--fa.fa-xl {
  vertical-align: -0.25em;
}
.svg-inline--fa.fa-2xl {
  vertical-align: -0.3125em;
}
.svg

In [6]:
candidate_dates = scrap.get_snap_dates()
candidate_dates

{'http://mashable.com/2013/07/25/3d-printed-rifle/': [datetime.date(2013, 7, 28),
  datetime.date(2013, 7, 31),
  datetime.date(2013, 8, 1),
  datetime.date(2013, 8, 7),
  datetime.date(2013, 8, 9),
  datetime.date(2013, 8, 14),
  datetime.date(2013, 8, 21),
  datetime.date(2013, 8, 29),
  datetime.date(2013, 10, 24),
  datetime.date(2013, 10, 31),
  datetime.date(2014, 2, 8),
  datetime.date(2014, 3, 16),
  datetime.date(2014, 4, 10),
  datetime.date(2014, 5, 31),
  datetime.date(2014, 9, 13),
  datetime.date(2014, 10, 13),
  datetime.date(2014, 10, 14),
  datetime.date(2015, 1, 12)],
 'http://mashable.com/2013/01/10/creepy-robotic-spider-dress/': [datetime.date(2013, 1, 13),
  datetime.date(2013, 1, 14),
  datetime.date(2013, 1, 15),
  datetime.date(2013, 1, 20),
  datetime.date(2013, 1, 27),
  datetime.date(2013, 2, 3),
  datetime.date(2013, 2, 10),
  datetime.date(2013, 2, 17),
  datetime.date(2013, 3, 5),
  datetime.date(2013, 3, 17),
  datetime.date(2013, 4, 1),
  datetime.date(2

In [7]:
shifted_dates = scrap.shift_dates(data['url'], data['timedelta'])
shifted_dates

[datetime.date(2015, 1, 7),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 4),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 7),
 datetime.date(2015, 1, 7),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 7),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 7),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 7),
 datetime.date(2015, 1, 7),
 datetime.date(2015, 1, 6),
 datetime.date(2015, 1, 8),
 datetime.date(2015, 1, 8),
 datetime.date(2015,

In [8]:
closest = []
for i, key in zip(range(len(data['url'])), candidate_dates.keys()):
    closest.append(scrap.get_closest(shifted_dates[i], candidate_dates[key]))

scraping_dates = {k: v for k, v in zip(data['url'], closest)}
scraping_dates

{'http://mashable.com/2014/09/08/safest-cabbies-nyc/': datetime.date(2014, 10, 14),
 'http://mashable.com/2013/07/25/3d-printed-rifle/': datetime.date(2014, 12, 26),
 'http://mashable.com/2013/10/30/digital-dinosaur-movements/': datetime.date(2014, 12, 31),
 'http://mashable.com/2014/08/27/homer-simpson-ice-bucket/': datetime.date(2014, 9, 7),
 'http://mashable.com/2013/01/10/creepy-robotic-spider-dress/': datetime.date(2014, 2, 23),
 'http://mashable.com/2014/11/27/prank-cash-thanksgiving/': datetime.date(2014, 10, 13),
 'http://mashable.com/2013/01/17/pokki-windows-8-2/': datetime.date(2014, 12, 10),
 'http://mashable.com/2014/03/20/emma-stone-spice-girls/': datetime.date(2014, 12, 31),
 'http://mashable.com/2014/07/28/three-out-of-print-j-d-salinger-stories-republished/': datetime.date(2014, 12, 20),
 'http://mashable.com/2014/09/16/robot-cheetah-freed/': datetime.date(2014, 12, 31),
 'http://mashable.com/2013/02/14/tesla-vs-nyt/': datetime.date(2015, 1, 7),
 'http://mashable.com/20

In [9]:
import re

list_1 = [data['url'].iloc[0], data['url'].iloc[1], data['url'].iloc[2]]
list_2 = [str(scraping_dates['http://mashable.com/2014/09/08/safest-cabbies-nyc/']).replace("-", "")]

pattern = r"(https://web.archive.org/web/)\d{8}"
updated_list_1 = [re.sub(pattern, rf"\g<1>{list_2[0]}", url) for url in list_1]

print(updated_list_1)
list_2

['http://mashable.com/2014/09/08/safest-cabbies-nyc/', 'http://mashable.com/2013/07/25/3d-printed-rifle/', 'http://mashable.com/2013/10/30/digital-dinosaur-movements/']


['20141014']

In [9]:
switched = scrap.switch_date(to_be_scraped['url'], scraping_dates)
switched

{'http://mashable.com/2013/07/25/3d-printed-rifle/': 'https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/',
 'http://mashable.com/2013/01/10/creepy-robotic-spider-dress/': 'https://web.archive.org/web/20141226/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/',
 'http://mashable.com/2014/04/08/childrens-book-morals-adulthood/': 'https://web.archive.org/web/20141231/http://mashable.com/2014/04/08/childrens-book-morals-adulthood/',
 'http://mashable.com/2014/07/20/apollo-11-45th-anniversary/': 'https://web.archive.org/web/20140907/http://mashable.com/2014/07/20/apollo-11-45th-anniversary/',
 'http://mashable.com/2013/07/12/serval-project/': 'https://web.archive.org/web/20140223/http://mashable.com/2013/07/12/serval-project/',
 'http://mashable.com/2013/01/23/infrascanner-detects-brain-trauma/': 'https://web.archive.org/web/20141013/http://mashable.com/2013/01/23/infrascanner-detects-brain-trauma/',
 'http://mashable.com/2013/09/10/twitter-fight

In [81]:
!python3 scrape.py

Traceback (most recent call last):
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/scrape.py", line 88, in <module>
    scrap.start_driver()
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/Scraper2.py", line 41, in start_driver
    self.__driver__ = webdriver.Firefox()
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/venv/lib/python3.10/site-packages/selenium/webdriver/firefox/webdriver.py", line 201, in __init__
    super().__init__(command_executor=executor, options=options, keep_alive=True)
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/venv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 286, in __init__
    self.start_session(capabilities, browser_profile)
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/venv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 378, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "/home/mattizza/Documents

In [16]:
import pickle
with open('variables.pkl', 'rb') as file:
    url_html, candidate_dates, shifted_dates, closest, scraping_dates, switched = pickle.load(file)


FileNotFoundError: [Errno 2] No such file or directory: 'variables.pkl'

In [13]:
list(switched.values())

['https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/',
 'https://web.archive.org/web/20141226/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/',
 'https://web.archive.org/web/20141231/http://mashable.com/2014/04/08/childrens-book-morals-adulthood/']

In [17]:
# from Scraper import ScrapePast

# scrap_past = ScrapePast()
# scrap_past.set_url(list(switched.values()))
# scrap_past.start_driver()
# url_html, url_keywords = scrap_past.scrape()

DRIVER ONLINE
URL: 2

START SCRAPING -- EXPECTED TIME REQUIRED: 36s
URL: https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/
zzz...zzz...zzz...
HTML STORED!
URL: https://web.archive.org/web/20131211/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/
zzz...zzz...zzz...
HTML STORED!


In [10]:
from Scraper import ScrapePast
scrap_past = ScrapePast()
scrap_past.recall_past(switched.values())

URL: https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/

		 -- REQUEST SENT --

		-- HTML ACQUIRED! --

Time: 3.9779

URL: https://web.archive.org/web/20141226/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/

		 -- REQUEST SENT --

		-- HTML ACQUIRED! --

Time: 2.5946

URL: https://web.archive.org/web/20141231/http://mashable.com/2014/04/08/childrens-book-morals-adulthood/

		 -- REQUEST SENT --

		-- HTML ACQUIRED! --

Time: 1.6365

URL: https://web.archive.org/web/20140907/http://mashable.com/2014/07/20/apollo-11-45th-anniversary/

		 -- REQUEST SENT --

		-- HTML ACQUIRED! --

Time: 1.9129

URL: https://web.archive.org/web/20140223/http://mashable.com/2013/07/12/serval-project/

		 -- REQUEST SENT --

		-- HTML ACQUIRED! --

Time: 2.2057

URL: https://web.archive.org/web/20141013/http://mashable.com/2013/01/23/infrascanner-detects-brain-trauma/

		 -- REQUEST SENT --

		-- HTML ACQUIRED! --

Time: 1.8084

URL: https://web.archive.org/we

({'https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/': <!DOCTYPE html>
  
  <!--
  o o     o     +              o
  +   +     +             o     +       +
              +
  o  +    +        o  +           +        +
       __  __           _           _     _
  ~_,-|  \/  | __ _ ___| |__   __ _| |__ | | ___
      | |\/| |/ _` / __| '_ \ / _` | '_ \| |/ _ \,-~_,- - - ,
  ~_,-| |  | | (_| \__ \ | | | (_| | |_) | |  __/    |   /\_/\
      |_|  |_|\__,_|___/_| |_|\__,_|_.__/|_|\___|  ~=|__( ^ .^)
  ~_,-~_,-~_,-~_,-~_,-~_,-~_,-~_,-~_,-~_,-~_,-~_,-~_,""   ""
  o o     o     +              o
  +   +     +             o     +       +
              +
  o  +    +        o  +           +        +
  -->
  <html data-env="production" lang="en" xml:lang="en">
  <head><script charset="utf-8" src="/_static/js/bundle-playback.js?v=1WaXNDFE" type="text/javascript"></script>
  <script charset="utf-8" src="/_static/js/wombat.js?v=txqj7nKC" type="text/javascript"></scri

In [82]:
# Given a mashable home, retrieves all the channels that appear in the home. 

url = 'https://web.archive.org/web/20141229153334/http://mashable.com/'
html = requests.get(url)

soup = BeautifulSoup(html.text, 'html.parser')

import re

text = html.text

match = re.findall(r'(?<="channel":")[^"]*', text)

print(match[1:])


['home', 'Watercooler', 'World', 'World', 'Business', 'World', 'World', 'World', 'World', 'World', 'Business', 'Business', 'Lifestyle', 'World', 'World', 'Watercooler', 'Tech', 'Lifestyle', 'World', 'Watercooler', 'World', 'Business', 'Entertainment', 'World', 'Lifestyle', 'World', 'World', 'World', 'World', 'World', 'Watercooler', 'World', 'Business', 'World', 'Business', 'World', 'Business', 'World', 'Business']


In [13]:
scrap_past.__url_info__

{'https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/': {'keywords': ['3d-printers',
   'uncategorized',
   'video',
   'gadgets',
   'us-world',
   'rifle'],
  'imgs': 0,
  'videos': 0},
 'https://web.archive.org/web/20141226/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/': {'keywords': ['fashion',
   'robots',
   'robotics',
   'spiders',
   'uncategorized',
   'tech',
   'dev-design',
   'bc-video-lead'],
  'imgs': 1,
  'videos': 0},
 'https://web.archive.org/web/20141231/http://mashable.com/2014/04/08/childrens-book-morals-adulthood/': {'keywords': ['books',
   'lists',
   'children-s-books',
   'uncategorized',
   'nostalgia',
   'watercooler',
   'family-parenting'],
  'imgs': 10,
  'videos': 0},
 'https://web.archive.org/web/20140907/http://mashable.com/2014/07/20/apollo-11-45th-anniversary/': {'keywords': ['space',
   'nasa',
   'apollo',
   'mars',
   'apollo-11',
   'uncategorized',
   'us-world',
   'us'],
  'imgs': 23,
  'video

In [24]:
soup =  scrap_past.__old_url_html__['https://web.archive.org/web/20141226/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/']
article = soup.find('article', class_='full post')
section = article.find('section', class_ ='article-content')
img = section.find('img')
img

<img class="fullwidth" data-fragment="m!ced7" data-image="https://web.archive.org/web/20141226145235/http://rack.0.mshcdn.com/media/ZgkyMDEzLzAxLzExLzA5L1NjcmVlblNob3QyLmFiNTg4LnBuZwpwCXRodW1iCTEyMDB4OTYwMD4/18df6a4c/992/Screen-Shot-2013-01-11-at-1.04.43-PM.png" data-micro="1" data-width="626" src="https://web.archive.org/web/20141226145235im_/http://rack.0.mshcdn.com/media/ZgkyMDEzLzAxLzExLzA5L1NjcmVlblNob3QyLmFiNTg4LnBuZwpwCXRodW1iCTEyMDB4OTYwMD4/18df6a4c/992/Screen-Shot-2013-01-11-at-1.04.43-PM.png"/>

In [127]:
import requests
html = requests.get('https://web.archive.org/web/20150102221734/http://mashable.com/2014/12/03/instagram-sports-athletes-2014/', timeout=5)


In [128]:
soup = BeautifulSoup(html.text)
imgs = soup.select('figure', class_ = 'article-image')
print(len(imgs))
videos = soup.find_all('iframe')
print(len(videos))


# video in header. Every video has allowfullscreen=""
article_img = soup.select_one('div', class_ = 'article_image')
article_img.select('iframe')


1
1


[<iframe frameborder="0" id="donato-if" scrolling="no" src="https://archive.org/includes/donate.php?as_page=1&amp;platform=wb&amp;referer=https%3A//web.archive.org/web/20150102221734/http%3A//mashable.com/2014/12/03/instagram-sports-athletes-2014/" style="width:100%; height:100%">
 </iframe>]

In [None]:
# MAKE A DISTINCTION FOR PAGES FOR WHICH THE PAST IS NOT AVAILABLE
# CHECK HEADER
# LOOK FOR VIDEOS BY LOOKING AT IFRAME AND ALLOWFULLSCREEN=""
# CHECK THE REST OF THE ARTICLE


In [None]:
soup

In [104]:

def count_images(soup):
    
    image_tags = soup.find_all('img')
    return len(image_tags)

image_count = count_images(soup)
print("Number of images:", image_count - 2)

# Need to discriminate between pages in the past and page in the future, different structures. This code is good for present,
# for substract two.


Number of images: 5


In [59]:
html = requests.get('https://web.archive.org/web/20130829025639/http://mashable.com/2013/02/27/facebook-apple-google-same-sex-marriage/')

In [60]:
soup = BeautifulSoup(html.content, 'html.parser')
article = soup.select('figure', class_ = 'article-image')
len(article)

10

In [48]:
html = requests.get('https://web.archive.org/web/20141126084036/http://mashable.com/2014/04/10/saturday-night-live-time-lapse/')

In [70]:
# USE . TO GET CLASS, COUNT IMGS    
soup = BeautifulSoup(html.content, 'html.parser')
soup
# soup.select('.article-content iframe, img')
# article = soup.select('figure', class_ = 'article-image')
# len(article)
soup.select('#more-in-channel')[0]

[]

In [24]:
meta_tag = soup.find('meta', attrs={'name': 'keywords', 'data-page-subject': 'true'})

# Extract the content attribute value as a string
keywords_string = meta_tag['content']

# Split the keywords into a list
keywords_list = keywords_string.split(', ')
keywords_list

['3d-printers', 'uncategorized', 'video', 'gadgets', 'us-world', 'rifle']

In [12]:
url_keywords

{'https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/': ['3d-printers',
  'uncategorized',
  'video',
  'gadgets',
  'us-world',
  'rifle'],
 'https://web.archive.org/web/20131211/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/': ['fashion',
  'robots',
  'robotics',
  'spiders',
  'uncategorized',
  'tech',
  'dev-design',
  'bc-video-lead']}

In [13]:
soup = scrap_past.__url_html__['https://web.archive.org/web/20131211/http://mashable.com/2013/01/10/creepy-robotic-spider-dress/']

scrap_past.__url_html__
# css_selector = '[class="month"]'
# highlighted_month = soup.select(css_selector)

{'https://web.archive.org/web/20141014/http://mashable.com/2013/07/25/3d-printed-rifle/': <html class="js flexbox flexboxlegacy canvas canvastext no-touch postmessage hashchange history websockets rgba hsla multiplebgs backgroundsize borderimage cssanimations csscolumns cssgradients no-cssreflections csstransforms csstransforms3d csstransitions fontface generatedcontent video audio localstorage sessionstorage webworkers applicationcache overthrow-enabled" data-env="production" lang="en" style="" xml:lang="en"><head><script src="https://web.archive.org/web/20141014201706js_/http://srv-2014-10-14-20.config.parsely.com/config/mashable.com"></script><script async="" src="https://web.archive.org/web/20141014201706js_/http://aax.amazon-adsystem.com/e/dtb/bid?src=3158&amp;u=http%3A%2F%2Fmashable.com%2F2013%2F07%2F25%2F3d-printed-rifle%2F&amp;cb=3142018" type="text/javascript"></script><script async="" src="https://web.archive.org/web/20141014201706js_/https://apis.google.com/_/scs/apps-static

In [None]:
# RETRIEVE "WHAT'S HOT SECTION"

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import concurrent.futures


<module 'concurrent.futures' from '/usr/lib/python3.10/concurrent/futures/__init__.py'>

In [17]:
def scrape_website(url):
    driver = webdriver.Firefox()
    driver.get(url)
    # Perform any necessary interactions or waits using the driver
    # Extract the page content using BeautifulSoup
    page_content = driver.page_source
    soup = BeautifulSoup(page_content, 'html.parser')
    # Perform the desired scraping operations on the soup object
    # Close the driver
    driver.quit()
    return soup

urls = data['url'].iloc[0:3]

with concurrent.futures.ThreadPoolExecutor() as executor:
    # Submit the scraping tasks to the executor
    results = [executor.submit(scrape_website, url) for url in urls]
    # Retrieve the results as they complete
    for future in concurrent.futures.as_completed(results):
        scraped_data = future.result()
        # Process or store the scraped data as desired


NoSuchWindowException: Message: Browsing context has been discarded
Stacktrace:
RemoteError@chrome://remote/content/shared/RemoteError.sys.mjs:8:8
WebDriverError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:183:5
NoSuchWindowError@chrome://remote/content/shared/webdriver/Errors.sys.mjs:468:5
assert.that/<@chrome://remote/content/shared/webdriver/Assert.sys.mjs:485:13
assert.open@chrome://remote/content/shared/webdriver/Assert.sys.mjs:147:4
GeckoDriver.prototype.getPageSource@chrome://remote/content/marionette/driver.sys.mjs:920:15
despatch@chrome://remote/content/marionette/server.sys.mjs:304:40
execute@chrome://remote/content/marionette/server.sys.mjs:275:16
onPacket/<@chrome://remote/content/marionette/server.sys.mjs:248:20
onPacket@chrome://remote/content/marionette/server.sys.mjs:249:9
_onJSONObjectReady/<@chrome://remote/content/marionette/transport.sys.mjs:496:20
observe@resource://gre/modules/AsyncShutdown.sys.mjs:576:16
