In [1]:
#\-- IMPORT MODULES, CLASSES AND METHODS --/#

import zipfile                          #############################
import os                               # || FILE SYSTEM / UTILS || #
import copy                             #############################
from prettytable import PrettyTable
import copy

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import numpy as np                  ###################################
import pandas as pd                 # || EXPLORATIVE DATA ANALYSIS || #
import matplotlib.pyplot as plt     ###################################
import seaborn as sns
import matplotlib
# https://towardsdatascience.com/handling-missing-data-like-a-pro-part-3-model-based-multiple-imputation-methods-bdfe85f93087 NumPyro, impyute,

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

import sklearn
import re
import importlib
import time

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score

from sklearn import naive_bayes                         #########################
from sklearn import neural_network                      #  |-----------------|  #
from sklearn import svm                                 # || MODEL SELECTION || #
from sklearn import tree                                #  |-----------------|  #
from sklearn import linear_model                        #########################

# from PrunedCV import PrunedCV

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

from sklearn.model_selection import StratifiedKFold     ##########################
from sklearn.model_selection import ParameterGrid       # || MODEL VALIDATION || #
                                                        ##########################
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.datasets import make_classification

%matplotlib inline


from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import re
from datetime import datetime, timedelta

In [2]:
#\-- SET ENVIRONMENT --/#
# Before starting we need to store the data properly. We define an ad-hoc folder where we will store everything.
main_PATH = os.getcwd()

# We check whether we already have the data.                        
if 'data' not in os.listdir():                                      
                                                                    
    # Unzip files.
    with zipfile.ZipFile(r'summer_project_dataset.zip') as zip_ref:

        os.mkdir(main_PATH + '/data')   # We create the 'data' directory,
        os.chdir(main_PATH + '/data')   # we change directory,
    
        data_PATH = os.getcwd()         # we get the data path
        zip_ref.extractall(data_PATH)   # and we unzip there.       #####################
                                                                    # || FILE SYSTEM || #    
    file_PATH = data_PATH + '/summer_project_dataset'               #####################

else:

    # We just build the paths.
    data_PATH = main_PATH + '/data'
    file_PATH = data_PATH + '/summer_project_dataset'

# Finally, we go back to our main path.
os.chdir(main_PATH)

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# We also set a seed for reproducibility purposes.      #####################
SEED = 42                                               # || RANDOM SEED || #
np.random.seed(SEED)                                    #####################

# ////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

# LaTeX style plots.
plt.rcParams['figure.figsize'] = (12, 8)
# plt.rcParams['text.usetex']    = True         ############################
# plt.rcParams['font.family']    = 'serif'      # || DEFAULT PARAMETERS || #
# plt.rcParams['font.size']      = '10'         ############################

pd.set_option('display.max_rows', 20)
# pd.set_option('display.max_rows', 10)

In [3]:
#\-- DATASET LOADING AND PREPROCESSING --/#
# Aome variables are stored as float, but they are actually int. Two reasons why:
#       -) nan values are considered as float --> first estimate them and then change the data type.
#       -) there are inconsistencies, especially in kw_max_min where some int values are float instead.
# for the moment just let's store everything as float, but further inspections are needed.

data_types = {
              'url' : str, 'timedelta' : int, 'shares' : int, 'data_channel' : str, 'weekday' : str, 
              
              'n_tokens_title'          : int, 'n_tokens_content'       : int, 'n_unique_tokens' : float, 'n_non_stop_words' : float,
              'n_non_stop_unique_tokens': float, 'average_token_length' : float,

              'num_hrefs' : int, 'num_self_hrefs' : int, 'num_imgs' : float, 'num_videos' : float,
              
              'kw_min_min' : float, 'kw_max_min' : float, 'kw_avg_min' : float, 'kw_min_max' : float, 'kw_max_max'   : float,
              'kw_avg_max' : float, 'kw_min_avg' : float, 'kw_max_avg' : float, 'kw_avg_avg' : float, 'num_keywords' : float,
              
              'self_reference_min_shares' : float, 'self_reference_max_shares' : float, 'self_reference_avg_sharess' : float,
              
              'LDA_00' : float, 'LDA_01' : float, 'LDA_02' : float, 'LDA_03' : float, 'LDA_04' : float,
              
              'global_subjectivity' : float, 'global_sentiment_polarity' : float, 'global_rate_positive_words' : float, 'global_rate_negative_words' : float,
              
              'rate_positive_words' : float, 'rate_negative_words' : float,
              
              'avg_positive_polarity' : float, 'min_positive_polarity' : float, 'max_positive_polarity' : float, 'avg_negative_polarity' : float,
              'min_negative_polarity' : float, 'max_negative_polarity' : float,

              'title_subjectivity' : float, 'title_sentiment_polarity' : float, 'abs_title_subjectivity' : float, 'abs_title_sentiment_polarity' : float,
              }                                                    
                                                                   
                                                                   
df = pd.read_csv(file_PATH + r'/development.csv',                 
                   usecols = lambda column: column != 'id', dtype = data_types)

# Drop columns

We drop all the correlated or useless features and we perform the train/test split.

In [4]:

X = df.drop(['shares'], axis=1)
y = df['shares']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 3000, random_state = 42)


We keep the test aside and we never touch it. We then perform a train/validation split.

In [5]:
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, test_size = 2000, random_state = 42)

In [6]:
print(f'Initial data: {df.shape[0]}\nTrain set size: {X_train.shape[0]}\t\t({(X_train.shape[0] / df.shape[0]) * 100:.2f}%)\n\
Validation set size: {X_validation.shape[0]}\t({(X_validation.shape[0] / df.shape[0]) * 100:.2f}%)\n\
Test set size: {X_test.shape[0]}\t\t({(X_test.shape[0] / df.shape[0]) * 100:.2f}%)')

Initial data: 31715
Train set size: 26715		(84.23%)
Validation set size: 2000	(6.31%)
Test set size: 3000		(9.46%)


We choose these proportions for three important reasons:
* More training data for more complex models
* The validation data are not that important, we need them just to tune the hyperparameters.
* The test data are more important in order to evaluate the model, so we need more of them.

In [7]:
mapping = {value: index for index, value in enumerate(df.columns)}
mapping

class columnDropperTransformer():
    def __init__(self,columns):
        self.columns=columns

    def transform(self,X,y=None):
        return X.drop(self.columns,axis=1)

    def fit(self, X, y=None):
        return self 


In [9]:

# DROP FEATURES
drop_features = ['kw_max_min', 'kw_max_avg', 'kw_min_min', 'url', 'timedelta', 'n_non_stop_words',
                'n_tokens_content', 'n_non_stop_unique_tokens', 'self_reference_max_shares',
                'self_reference_min_shares', 'rate_positive_words', 'rate_negative_words',
                'max_positive_polarity', 'min_positive_polarity', 'min_negative_polarity',
                'max_negative_polarity', 'abs_title_subjectivity', 'abs_title_sentiment_polarity',
                'kw_min_max', 'kw_max_max', 'kw_min_avg']
drop_indices = [mapping[value] for value in drop_features]

# -------------------------------------------------------------------------------------------------------------------------------------------

# APPLY LOGARITHMS
log1p_features = ['num_hrefs', 'num_self_hrefs', 'num_imgs', 'num_videos', 'kw_avg_min', 'kw_avg_avg', 'self_reference_avg_sharess']
log1p_indices = [mapping[value] for value in log1p_features]

log_feature = ['shares']
log_index = [mapping[value] for value in log_feature]

# -------------------------------------------------------------------------------------------------------------------------------------------

# DISCRETIZE KW_AVG_MAX
conditions = [
              (np.log1p(df['kw_avg_max']) < 2),
              ((np.log1p(df['kw_avg_max']) < 11) & (np.log1p(df['kw_avg_max']) > 0)),
              (np.log1p(df['kw_avg_max']) > 11)
]

labels = ['kw_avg_max_none', 'kw_avg_max_medium', 'kw_avg_max_high']

df['kw_avg_max'] = np.where(conditions[0], labels[0],
                            np.where(conditions[1], labels[1],
                                     np.where(conditions[2], labels[2], None)))

# ------------------------------------------------------------------------------------------------------------------------------------------

# DISCRETIZE TITLE_SUBJECTIVITY AND TITLE_SENTIMENT_POLARITY
subjectivity_bin_values = {
                           1 : 'no_subjectivity',
                           2 : 'low_subjectivity',
                           3 : 'medium_subjectvity',
                           4 : 'high_subjectivity'
                          }

subjectivity_bins = [-0.000001, 0.000001, 0.334, 0.667, 1]


df['title_sentiment_polarity'] = pd.cut(df['title_sentiment_polarity'],
                                        bins=[-1.00001, -0.5, -0.000000001, +0.000000001, 0.5, 1],
                                        labels=['high_negative_polarity', 'low_negative_polarity', 'neutral_polarity', 'low_positive_polarity', 'high_positive_polarity'],
                                        right = True)

polarity_bin_values = {
                           1 : 'high_negative_polarity',
                           2 : 'low_negative_polarity',
                           3 : 'neutral_polarity',
                           4 : 'low_positive_polarity',
                           5 : 'high_positive_polarity'
                          }

polarity_bins = [-1.00001, -0.5, -0.000000001, +0.000000001, 0.5, 1]

# Define a function to assign values to bins
def discretize_title_subjectivity(X):
    bin_indices = np.digitize(X[:, 0], subjectivity_bins, right = True)
    bin_labels = np.array([subjectivity_bin_values[i] for i in bin_indices]).reshape(-1, 1)
    return bin_labels

def discretize_title_sentiment_polarity(X):
    bin_indices = np.digitize(X[:, 0], polarity_bins, right = True)
    bin_labels = np.array([polarity_bin_values[i] for i in bin_indices]).reshape(-1, 1)
    return bin_labels

# ------------------------------------------------------------------------------------------------------------------------------------------

# DISCRETIZE WEEKDAY
df['weekday'] = np.where(df['weekday'].isin(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']), 'Not Weekend', 'Weekend')

# ------------------------------------------------------------------------------------------------------------------------------------------

# APPLY PIPLINE
preprocessing_pipeline = Pipeline([
    ('preprocessing', ColumnTransformer([
        ("columnDropper", columnDropperTransformer, drop_features),                                                                             # Drop useless/correlated features.
        ('log1p', FunctionTransformer(func = np.log1p, inverse_func = np.expm1, validate = False), log1p_indices),  # Apply log1p.
        ('log', FunctionTransformer(func = np.log, inverse_func = np.exp, validate = False), log_index),            # Apply logp.
        ('discretize_subjectivity', FunctionTransformer(func = discretize_title_subjectivity, validate = False), [42]),
        ('discretize_sentiment_polarity', FunctionTransformer(func = discretize_title_sentiment_polarity, validate = False), [43]),
        ('onehot', OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore'), [18, 42, 43, 47, 48]),
        ('regressor', GradientBoostingRegressor())],
    remainder = 'passthrough'))
    ])



In [11]:
# preprocessed_data = preprocessing_pipeline.fit(X_train, y_train)

ValueError: not enough values to unpack (expected 3, got 2)