In [28]:
#\-- IMPORT MODULES, CLASSES AND METHODS --/#

import zipfile                          #############################
import os                               # || FILE SYSTEM / UTILS || #
import copy                             #############################
from prettytable import PrettyTable

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import numpy as np                  ###################################
import pandas as pd                 # || EXPLORATIVE DATA ANALYSIS || #
import matplotlib.pyplot as plt     ###################################
import seaborn as sns
# Reference on model-based multiple imputation of missing data (libraries to look into: NumPyro, impyute):
# https://towardsdatascience.com/handling-missing-data-like-a-pro-part-3-model-based-multiple-imputation-methods-bdfe85f93087

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import sklearn
import re
import importlib
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from sklearn import naive_bayes                         #########################
from sklearn import neural_network                      #  |-----------------|  #
from sklearn import svm                                 # || MODEL SELECTION || #
from sklearn import tree                                #  |-----------------|  #
from sklearn import linear_model                        #########################

# from PrunedCV import PrunedCV

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.model_selection import StratifiedKFold     ##########################
from sklearn.model_selection import ParameterGrid       # || MODEL VALIDATION || #
                                                        ##########################

%matplotlib inline


from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import re
from datetime import datetime, timedelta


In [29]:
#\-- SET ENVIRONMENT --/#
# Before starting we need to store the data properly: everything lives in an ad-hoc
# 'data' folder inside the directory the notebook is run from.
#
# Every path is built explicitly and the working directory is NEVER changed, so an
# error raised while unzipping cannot leave the kernel stranded inside 'data'
# (which would break all later relative paths and a re-run of this cell).
main_PATH = os.getcwd()
data_PATH = os.path.join(main_PATH, 'data')
file_PATH = os.path.join(data_PATH, 'summer_project_dataset')

# We check whether we already have the data.
if not os.path.isdir(data_PATH):

    # Unzip files. The archive is opened first: if it is missing we fail here,
    # before an empty 'data' folder gets created.
    with zipfile.ZipFile(os.path.join(main_PATH, 'summer_project_dataset.zip')) as zip_ref:

        os.makedirs(data_PATH, exist_ok = True)   # We create the 'data' directory      #####################
        zip_ref.extractall(data_PATH)             # and we unzip there.                 # || FILE SYSTEM || #
                                                                                        #####################

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# Reproducibility: one fixed seed, applied to NumPy's global random generator.     #####################
SEED = 42                                                                          # || RANDOM SEED || #
np.random.seed(seed = SEED)                                                        #####################

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# Default size of every matplotlib figure.                                          ############################
plt.rcParams.update({'figure.figsize' : (12, 8)})                                   # || DEFAULT PARAMETERS || #
                                                                                    ############################
# Optional LaTeX style plots (currently disabled).
# plt.rcParams.update({'text.usetex' : True, 'font.family' : 'serif', 'font.size' : '10'})

# Maximum number of rows shown when a DataFrame is displayed (10 was the alternative tried).
pd.options.display.max_rows = 20

In [30]:
#\-- DATASET LOADING AND PREPROCESSING --/#
# Some variables are stored as float even though they are actually int. Two reasons why:
#       -) NaN values are considered as float --> first estimate them and then change the data type.
#       -) there are inconsistencies, especially in kw_max_min, where some int values appear as float instead.
# For the moment those columns are simply kept as float, but further inspections are needed.

# Columns grouped by the dtype they are read with; the groups follow the column order used so far.
_dtype_groups = [
    (str,   ['url']),
    (int,   ['timedelta', 'shares']),
    (str,   ['data_channel', 'weekday']),

    (int,   ['n_tokens_title', 'n_tokens_content']),
    (float, ['n_unique_tokens', 'n_non_stop_words', 'n_non_stop_unique_tokens', 'average_token_length']),

    (int,   ['num_hrefs', 'num_self_hrefs']),
    (float, [
             'num_imgs', 'num_videos',

             'kw_min_min', 'kw_max_min', 'kw_avg_min', 'kw_min_max', 'kw_max_max',
             'kw_avg_max', 'kw_min_avg', 'kw_max_avg', 'kw_avg_avg', 'num_keywords',

             'self_reference_min_shares', 'self_reference_max_shares', 'self_reference_avg_sharess',

             'LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04',

             'global_subjectivity', 'global_sentiment_polarity',
             'global_rate_positive_words', 'global_rate_negative_words',

             'rate_positive_words', 'rate_negative_words',

             'avg_positive_polarity', 'min_positive_polarity', 'max_positive_polarity',
             'avg_negative_polarity', 'min_negative_polarity', 'max_negative_polarity',

             'title_subjectivity', 'title_sentiment_polarity',
             'abs_title_subjectivity', 'abs_title_sentiment_polarity',
            ]),
]

# Flatten the groups into the {column name : dtype} mapping expected by pd.read_csv.
data_types = {column : dtype for dtype, columns in _dtype_groups for column in columns}
                                                                   
                                                                   
# Load the development set: the 'id' column is skipped and every other column
# is parsed with the dtype listed in data_types.
development_csv = file_PATH + '/development.csv'
data = pd.read_csv(
    development_csv,
    usecols = lambda name: name != 'id',
    dtype   = data_types,
)
data


Unnamed: 0,url,timedelta,n_tokens_title,n_tokens_content,n_unique_tokens,n_non_stop_words,n_non_stop_unique_tokens,num_hrefs,num_self_hrefs,num_imgs,...,avg_negative_polarity,min_negative_polarity,max_negative_polarity,title_subjectivity,title_sentiment_polarity,abs_title_subjectivity,abs_title_sentiment_polarity,shares,data_channel,weekday
0,http://mashable.com/2014/09/08/safest-cabbies-...,121,12,1015,0.422018,1.0,0.545031,10,6,33.0,...,-0.160714,-0.500000,-0.071429,0.000000,0.000,0.500000,0.000,2900,bus,tuesday
1,http://mashable.com/2013/07/25/3d-printed-rifle/,532,9,503,0.569697,1.0,0.737542,9,0,,...,-0.157500,-0.250000,-0.100000,0.000000,0.000,0.500000,0.000,1300,tech,thursday
2,http://mashable.com/2013/10/30/digital-dinosau...,435,9,232,0.646018,1.0,0.748428,12,3,4.0,...,-0.427500,-1.000000,-0.187500,0.000000,0.000,0.500000,0.000,17700,lifestyle,wednesday
3,http://mashable.com/2014/08/27/homer-simpson-i...,134,12,171,0.722892,1.0,0.867925,9,5,0.0,...,-0.216667,-0.250000,-0.166667,0.400000,-0.250,0.100000,0.250,1500,bus,wednesday
4,http://mashable.com/2013/01/10/creepy-robotic-...,728,11,286,0.652632,1.0,0.800000,5,2,,...,-0.251786,-0.500000,-0.100000,0.200000,-0.100,0.300000,0.100,1400,tech,thursday
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31710,http://mashable.com/2014/11/30/star-wars-guard...,37,11,440,0.564103,1.0,0.718978,10,2,,...,-0.209167,-0.316667,-0.050000,0.000000,0.000,0.500000,0.000,1000,world,tuesday
31711,http://mashable.com/2014/11/14/uk-floods/,52,14,0,0.000000,0.0,0.000000,0,0,0.0,...,0.000000,0.000000,0.000000,0.666667,-0.700,0.166667,0.700,11000,lifestyle,monday
31712,http://mashable.com/2014/09/08/paypal-bitcoin-...,121,9,969,0.489583,1.0,0.710623,6,5,2.0,...,-0.400000,-1.000000,-0.050000,0.000000,0.000,0.500000,0.000,2400,tech,tuesday
31713,http://mashable.com/2013/08/23/mashable-androi...,503,11,1976,0.412308,1.0,0.621080,21,3,1.0,...,-0.323413,-1.000000,-0.050000,0.700000,-0.400,0.200000,0.400,6000,lifestyle,friday


In [52]:
# from Scraper import Scraper

# scrap = Scraper()
# scrap.set_url(to_be_scraped['url'])
# scrap.start_driver()


DRIVER ONLINE


In [53]:
# scrap.scrape()

YEARS: ['2013', '2014', '2015']
URL: 6375

START SCRAPING -- EXPECTED TIME REQUIRED: 114750s
URL: http://mashable.com/2013/07/25/3d-printed-rifle/
CURRENT YEAR: 2013
http://mashable.com/2013/07/25/3d-printed-rifle/
SCRAPED 2013
zzz...zzz...zzz...
HTML STORED!
CURRENT YEAR: 2014
http://mashable.com/2013/07/25/3d-printed-rifle/
SCRAPED 2014
zzz...zzz...zzz...
HTML STORED!
CURRENT YEAR: 2015
http://mashable.com/2013/07/25/3d-printed-rifle/
SCRAPED 2015
zzz...zzz...zzz...
HTML STORED!
URL: http://mashable.com/2013/01/10/creepy-robotic-spider-dress/
CURRENT YEAR: 2013
http://mashable.com/2013/01/10/creepy-robotic-spider-dress/
SCRAPED 2013
zzz...zzz...zzz...
HTML STORED!
CURRENT YEAR: 2014
http://mashable.com/2013/01/10/creepy-robotic-spider-dress/
SCRAPED 2014
zzz...zzz...zzz...
HTML STORED!
CURRENT YEAR: 2015
http://mashable.com/2013/01/10/creepy-robotic-spider-dress/
SCRAPED 2015
zzz...zzz...zzz...
HTML STORED!
URL: http://mashable.com/2014/04/08/childrens-book-morals-adulthood/
CURRENT 

KeyboardInterrupt: 

In [54]:
# candidate_dates = scrap.get_snap_dates()

In [55]:
# shifted_dates = scrap.shift_dates(data['url'], data['timedelta'])
# shifted_dates[1]

datetime.date(2015, 1, 8)

In [56]:
# closest = []
# for i, key in zip(range(len(data['url'])), candidate_dates.keys()):
#     closest.append(scrap.get_closest(shifted_dates[i], candidate_dates[key]))

# scraping_dates = {k: v for k, v in zip(data['url'], closest)}
# scraping_dates

{'http://mashable.com/2014/09/08/safest-cabbies-nyc/': datetime.date(2014, 10, 14),
 'http://mashable.com/2013/07/25/3d-printed-rifle/': datetime.date(2014, 12, 26),
 'http://mashable.com/2013/10/30/digital-dinosaur-movements/': datetime.date(2014, 12, 31),
 'http://mashable.com/2014/08/27/homer-simpson-ice-bucket/': datetime.date(2014, 9, 7),
 'http://mashable.com/2013/01/10/creepy-robotic-spider-dress/': datetime.date(2014, 2, 23),
 'http://mashable.com/2014/11/27/prank-cash-thanksgiving/': datetime.date(2014, 10, 13),
 'http://mashable.com/2013/01/17/pokki-windows-8-2/': datetime.date(2014, 12, 10),
 'http://mashable.com/2014/03/20/emma-stone-spice-girls/': datetime.date(2014, 12, 31),
 'http://mashable.com/2014/07/28/three-out-of-print-j-d-salinger-stories-republished/': datetime.date(2014, 12, 20),
 'http://mashable.com/2014/09/16/robot-cheetah-freed/': datetime.date(2014, 12, 31),
 'http://mashable.com/2013/02/14/tesla-vs-nyt/': datetime.date(2015, 1, 7),
 'http://mashable.com/20

In [57]:
# import re

# list_1 = [data['url'].iloc[0], data['url'].iloc[1], data['url'].iloc[2]]
# list_2 = [str(scraping_dates['http://mashable.com/2014/09/08/safest-cabbies-nyc/']).replace("-", "")]

# pattern = r"(https://web.archive.org/web/)\d{8}"
# updated_list_1 = [re.sub(pattern, rf"\g<1>{list_2[0]}", url) for url in list_1]

# print(updated_list_1)
# list_2

['http://mashable.com/2014/09/08/safest-cabbies-nyc/', 'http://mashable.com/2013/07/25/3d-printed-rifle/', 'http://mashable.com/2013/10/30/digital-dinosaur-movements/']


['20141014']

In [None]:
# scrap.switch_date(to_be_scraped['url'], scraping_dates)

In [81]:
# Run the standalone scraping script (scrape.py) in a separate Python process through the shell.
# NOTE(review): the output recorded for this cell is a traceback raised while scrape.py was
# starting webdriver.Firefox() -- presumably the browser/driver could not be launched; confirm
# that it is available before re-running.
!python3 scrape.py

Traceback (most recent call last):
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/scrape.py", line 88, in <module>
    scrap.start_driver()
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/Scraper2.py", line 41, in start_driver
    self.__driver__ = webdriver.Firefox()
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/venv/lib/python3.10/site-packages/selenium/webdriver/firefox/webdriver.py", line 201, in __init__
    super().__init__(command_executor=executor, options=options, keep_alive=True)
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/venv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 286, in __init__
    self.start_session(capabilities, browser_profile)
  File "/home/mattizza/Documents/DSL-Online-News-Popularity/venv/lib/python3.10/site-packages/selenium/webdriver/remote/webdriver.py", line 378, in start_session
    response = self.execute(Command.NEW_SESSION, parameters)
  File "/home/mattizza/Documents

In [77]:
import pickle

# Restore the six scraping-related objects stored in 'variables.pkl'
# (presumably dumped by an earlier scraping run -- TODO confirm where the file is written).
# NOTE: pickle.load can execute arbitrary code, so only load a file that comes from a trusted source.
with open('variables.pkl', 'rb') as pickle_file:
    restored = pickle.load(pickle_file)

# The pickled object must unpack into exactly these six names, in this order.
(url_html, candidate_dates, shifted_dates,
 closest, scraping_dates, switched) = restored
