In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the development set
devp_df = pd.read_csv('development.csv')

# Load the evaluation set
eval_df = pd.read_csv('evaluation.csv')

# 1. Data Exploration

In [23]:
# Check the data structure and dimensions
#devp_df = devp_df.drop_duplicates()
print(devp_df.shape)

(31715, 50)


In [24]:
print(devp_df.head())

   id                                                url  timedelta  \
0   0  http://mashable.com/2014/09/08/safest-cabbies-...      121.0   
1   1   http://mashable.com/2013/07/25/3d-printed-rifle/      532.0   
2   2  http://mashable.com/2013/10/30/digital-dinosau...      435.0   
3   3  http://mashable.com/2014/08/27/homer-simpson-i...      134.0   
4   4  http://mashable.com/2013/01/10/creepy-robotic-...      728.0   

   n_tokens_title  n_tokens_content  n_unique_tokens  n_non_stop_words  \
0            12.0            1015.0         0.422018               1.0   
1             9.0             503.0         0.569697               1.0   
2             9.0             232.0         0.646018               1.0   
3            12.0             171.0         0.722892               1.0   
4            11.0             286.0         0.652632               1.0   

   n_non_stop_unique_tokens  num_hrefs  num_self_hrefs  ...  \
0                  0.545031       10.0             6.0  ...   
1 

In [25]:
# Compute basic statistics
print(devp_df.describe())

                id     timedelta  n_tokens_title  n_tokens_content  \
count  31715.00000  31715.000000    31715.000000      31715.000000   
mean   15857.00000    354.058206       10.390730        544.048179   
std     9155.47623    214.314223        2.115643        467.730448   
min        0.00000      8.000000        2.000000          0.000000   
25%     7928.50000    163.000000        9.000000        246.000000   
50%    15857.00000    338.000000       10.000000        409.000000   
75%    23785.50000    542.000000       12.000000        713.000000   
max    31714.00000    731.000000       23.000000       8474.000000   

       n_unique_tokens  n_non_stop_words  n_non_stop_unique_tokens  \
count     31715.000000      31715.000000              31715.000000   
mean          0.530754          0.970140                  0.672973   
std           0.137106          0.170203                  0.154128   
min           0.000000          0.000000                  0.000000   
25%           0.471

# 1. Preprocessing

Feature selection is performed to remove non-predictive features and those that have the lowest importance in the random forest regression algorithm which will be used later.

Removing outliers using the IQR methods. Some tuning is necessary in veryfing the range of values to keep, to avoid removing too much data.

In [26]:
devp_df = devp_df.drop(["id", "url", "kw_max_max", "kw_min_min",'abs_title_sentiment_polarity', 'max_positive_polarity'], axis = 1)
Q1 = devp_df.quantile(0.25, numeric_only=True)
Q3 = devp_df.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

# # Align DataFrame and Series before comparing
devp_df, IQR = devp_df.align(IQR, axis=1)

# # Only keep rows in dataframe that have values within 7*IQR of Q1 and Q3
devp_df =devp_df[~((devp_df < (Q1 - 7 * IQR)) | (devp_df > (Q3 + 7 * IQR))).any(axis=1)]


In [27]:
# Check the data structure and dimensions
print(devp_df.shape)

(24455, 44)


In [28]:
# Get the names of all columns in the dataset
all_columns = devp_df.columns

# Identify the numeric columns by excluding any non-numeric columns
numeric_columns = [col for col in all_columns if devp_df[col].dtype in ['int64', 'float64']]
print (numeric_columns)

['LDA_00', 'LDA_01', 'LDA_02', 'LDA_03', 'LDA_04', 'abs_title_subjectivity', 'average_token_length', 'avg_negative_polarity', 'avg_positive_polarity', 'global_rate_negative_words', 'global_rate_positive_words', 'global_sentiment_polarity', 'global_subjectivity', 'kw_avg_avg', 'kw_avg_max', 'kw_avg_min', 'kw_max_avg', 'kw_max_min', 'kw_min_avg', 'kw_min_max', 'max_negative_polarity', 'min_negative_polarity', 'min_positive_polarity', 'n_non_stop_unique_tokens', 'n_non_stop_words', 'n_tokens_content', 'n_tokens_title', 'n_unique_tokens', 'num_hrefs', 'num_imgs', 'num_keywords', 'num_self_hrefs', 'num_videos', 'rate_negative_words', 'rate_positive_words', 'self_reference_avg_sharess', 'self_reference_max_shares', 'self_reference_min_shares', 'shares', 'timedelta', 'title_sentiment_polarity', 'title_subjectivity']


Check the number of missing values and which features contain missing values for both sets separately

In [29]:
# Check for missing values for both sets
missing_values = (devp_df.isnull().sum())
print (missing_values)

missing_values_eval = (eval_df.isnull().sum())

print(missing_values_eval)

LDA_00                           0
LDA_01                           0
LDA_02                           0
LDA_03                           0
LDA_04                           0
abs_title_subjectivity           0
average_token_length             0
avg_negative_polarity            0
avg_positive_polarity            0
data_channel                     0
global_rate_negative_words       0
global_rate_positive_words       0
global_sentiment_polarity        0
global_subjectivity              0
kw_avg_avg                       0
kw_avg_max                       0
kw_avg_min                       0
kw_max_avg                       0
kw_max_min                       0
kw_min_avg                       0
kw_min_max                       0
max_negative_polarity            0
min_negative_polarity            0
min_positive_polarity            0
n_non_stop_unique_tokens         0
n_non_stop_words                 0
n_tokens_content                 0
n_tokens_title                   0
n_unique_tokens     

Feature selection on the evaluation set. The 'id' column will be reused later to return the final dataset: ('Id', 'Predicted')

In [30]:
# removing the non-predictive column i.e. url
to_remove_columns_eval = ["id", "url", "kw_max_max", "kw_min_min",'abs_title_sentiment_polarity', 'max_positive_polarity']
#devp_df = devp_df.drop(non_predictive_cols, axis=1)
id_col = eval_df['id']
eval_df = eval_df.drop(to_remove_columns_eval, axis = 1)


Merging the two dataset for preprocessing before splitting them again later for training and testing

In [31]:
# Remember the number of rows in each dataset
n_dev = devp_df.shape[0]
n_eval = eval_df.shape[0]

# Concatenate the two datasets
full_df = pd.concat([devp_df, eval_df])

Text processing on the 'url' feature (attempted, not used in the model)

In [32]:
#from urllib.parse import urlparse
#import datetime
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.feature_extraction.text import TfidfTransformer
#from sklearn.pipeline import Pipeline

# Parse the URLs to extract the domain and path
#full_df['parsed_url'] = full_df['url'].apply(lambda x: urlparse(x))
#full_df['domain'] = full_df['parsed_url'].apply(lambda x: x.netloc)
#full_df['path'] = full_df['parsed_url'].apply(lambda x: x.path)

# Extract the publication year, month, and day from the path
#full_df['year'] = full_df['path'].apply(lambda x: x.split('/')[1])
#full_df['month'] = full_df['path'].apply(lambda x: x.split('/')[2])
#full_df['day'] = full_df['path'].apply(lambda x: x.split('/')[3])

# Extract the topic from the path
#full_df['topic'] = full_df['path'].apply(lambda x: x.split('/')[4])

# Convert the year, month, and day to a datetime object
#full_df['date'] = pd.to_datetime(full_df[['year', 'month', 'day']], errors='coerce')

# Extract the day of the week from the date
#full_df['day_of_week'] = full_df['date'].dt.dayofweek

# Drop the original 'url' and 'parsed_url' columns, as well as the 'date' column
#full_df = full_df.drop(['url', 'parsed_url', 'date', 'domain', 'day_of_week', 'path', 'year'], axis=1)

# Create a pipeline to transform the 'topic' feature
#pipeline = Pipeline([
#    ('vect', CountVectorizer()),  # Convert the text into a matrix of token counts
#    ('tfidf', TfidfTransformer())  # Convert the counts into TF-IDF features
#])

# Fit the pipeline to the 'topic' feature and transform the data
#topic_features = pipeline.fit_transform(full_df['topic'])

# Reset the index of full_df
#full_df.reset_index(drop=True, inplace=True)

# Convert the sparse matrix of features into a DataFrame
#topic_features_df = pd.DataFrame(topic_features.toarray(), columns=pipeline.named_steps['vect'].get_feature_names_out())

# Concatenate the original DataFrame with the new features
#full_df = pd.concat([full_df, topic_features_df], axis=1)

# Drop the original 'topic' column
#full_df = full_df.drop('topic', axis=1)


Handling missing values

In [33]:

# Replace missing values in the development set with the mean for each feature
full_df = full_df.fillna(full_df.mean(numeric_only = True))


  full_df = full_df.fillna(full_df.mean())


In [34]:
import pandas as pd

print("Total number of features: ", full_df.shape[1])

# Get the data types of each column
data_types = full_df.dtypes

# Filter columns with object or string data types
categorical_cols = data_types[data_types == "object"].index.tolist()

# Print the number of categorical features
print("Number of categorical features: ", categorical_cols)


Total number of features:  44
Number of categorical features:  ['data_channel', 'weekday']


Separate the target features for the stacked dataset

In [35]:
# Separate the target variable (shares)
y_full = full_df["shares"]
X_full = full_df.drop("shares", axis=1)

print(X_full.head)


<bound method NDFrame.head of         LDA_00    LDA_01    LDA_02    LDA_03    LDA_04  \
1     0.020007  0.020008  0.325602  0.020004  0.614379   
4     0.214708  0.025062  0.025016  0.025187  0.710028   
5     0.020002  0.278532  0.020001  0.661337  0.020127   
6     0.040032  0.040014  0.040015  0.040008  0.839932   
7     0.025036  0.268234  0.025006  0.155403  0.526322   
...        ...       ...       ...       ...       ...   
7912  0.033752  0.033453  0.033383  0.033750  0.865662   
7913  0.028572  0.529119  0.028572  0.385166  0.028571   
7914  0.025001  0.899943  0.025001  0.025055  0.025000   
7915  0.025039  0.025068  0.278784  0.646107  0.025003   
7916  0.033337  0.531415  0.202306  0.033746  0.199197   

      abs_title_subjectivity  average_token_length  avg_negative_polarity  \
1                   0.500000              4.576541              -0.157500   
4                   0.300000              5.006993              -0.251786   
5                   0.500000              

Encoding of categorical features with one-hot encoding

In [36]:
#import pandas as pd

# Assuming df is your DataFrame
#X_full['year'] = X_full['year'].astype(int)
#X_full['month'] = X_full['month'].astype(int)
#X_full['day'] = X_full['day'].astype(int)


# Select only the categorical columns
categorical_columns = X_full.select_dtypes(include=['object']).columns
print(categorical_columns)
# Perform one-hot encoding on the categorical columns
X_full = pd.get_dummies(X_full, columns=categorical_columns)

print("Total number of features: ", X_full.shape[1])


Index(['data_channel', 'weekday'], dtype='object')
Total number of features:  54


Split the dataset again

In [37]:
y_dev = y_full.iloc[:n_dev]

X_dev_processed = X_full.iloc[:n_dev]

X_eval_processed = X_full.iloc[n_dev:]

print(X_eval_processed.shape)
print(X_dev_processed.shape)


(7917, 54)
(24455, 54)


# 2 Regression

In [38]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler


# Create the scaler
scaler = StandardScaler()

# Split the data into training and validation set
X_train, X_val, y_train, y_val = train_test_split(X_dev_processed, y_dev, test_size=0.2, random_state=42)

# Fit the scaler to the training data and transform the training data
X_train = scaler.fit_transform(X_train)
# Transform the validation data using the same scaler
X_val = scaler.transform(X_val)

# dbscan = DBSCAN(eps=0.5, min_samples=5)
# clusters = dbscan.fit_predict(X_train)

# # Identify the core samples
# core_samples_mask = np.zeros_like(dbscan.labels_, dtype=bool)
# core_samples_mask[dbscan.core_sample_indices_] = True

# Remove the outliers from the training data
#X_train= X_train[core_samples_mask]
#y_train = y_train[core_samples_mask]


X_eval_processed = scaler.transform(X_eval_processed)

# pca = PCA(n_components=0.95)
# X_train = pca.fit_transform(X_train)
# X_val = pca.transform(X_val)
# X_eval_processed = pca.transform(X_eval_processed)

#print(X_train.shape)
# Define the model
model = RandomForestRegressor(random_state=42)

# Define the parameters to tune
param_grid = {
    #the commented paramters have been used for parameter tuning
    'n_estimators': [50], #62 and 100 other parameters used
    'max_depth': [None] #10, 20 other parameters used
}


# Create a GridSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5 , n_jobs=-1, verbose=2) #5 folds have been used turing tuning, but reduced to 2 for speed in the final evaluation

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Get the best estimator
best_model = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model.predict(X_val)

# Evaluate the model using RMSE on validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Root Mean Squared Error on Validation set: ", rmse_val)

#The random foret regressor we have trained can help us determine how useful each feature is for
#the model. We can extract this feature importance and sort them in descending order.
#print(sorted(zip(X_dev_processed.columns, best_model.feature_importances_), key=lambda x: x[1],reverse=True))

Fitting 2 folds for each of 1 candidates, totalling 2 fits
Root Mean Squared Error on Validation set:  2297.8687025597997
[('kw_avg_avg', 0.0825497363469677), ('kw_max_avg', 0.03473059391288893), ('average_token_length', 0.03317319289502724), ('kw_avg_max', 0.03230908039719175), ('timedelta', 0.03228941626840139), ('global_subjectivity', 0.031013968533578778), ('kw_avg_min', 0.030302681216337158), ('LDA_00', 0.03018356545398294), ('LDA_01', 0.029592102885157314), ('self_reference_min_shares', 0.02945272636832846), ('LDA_04', 0.029402415094702688), ('global_rate_positive_words', 0.028176054748499314), ('LDA_03', 0.027727243406407316), ('avg_positive_polarity', 0.0273123671610327), ('n_unique_tokens', 0.0271617184551294), ('n_non_stop_unique_tokens', 0.026909558866630608), ('global_sentiment_polarity', 0.02634781993350155), ('LDA_02', 0.025667528967522602), ('self_reference_avg_sharess', 0.024748698719953676), ('num_hrefs', 0.02437358620062374), ('kw_max_min', 0.024329095158616268), ('av

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
import numpy as np

# Create the scaler
scaler = StandardScaler()

# Create the PolynomialFeatures transformer
#poly = PolynomialFeatures(degree=2)

# Split the data into training and validation set
X_train, X_val, y_train, y_val = train_test_split(X_dev_processed, y_dev, test_size=0.2, random_state=42)

# Fit the scaler to the training data and transform the training data
X_train = scaler.fit_transform(X_train)

# Transform the training data to polynomial features
#X_train = poly.fit_transform(X_train)

# Transform the validation data using the same scaler and PolynomialFeatures transformer
X_val = scaler.transform(X_val)
#X_val = poly.transform(X_val)

X_eval_processed = scaler.transform(X_eval_processed)
#X_eval_processed = poly.transform(X_eval_processed)

# Define the model
model = Ridge(random_state=42)

# Define the parameters to tune
param_grid = {'alpha': np.logspace(-4, 4, 20)}

# Create a RandomizedSearchCV object
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv = 5, n_jobs=-1, verbose=2)

# Fit the RandomizedSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
best_params = grid_search.best_params_

# Get the best estimator
best_model_ridge = grid_search.best_estimator_

# Make predictions on the validation set
y_val_pred = best_model_ridge.predict(X_val)

# Evaluate the model using RMSE on validation set
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Root Mean Squared Error on Validation set: ", rmse_val)
print("Best value of alpha: ", best_params['alpha'])


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Root Mean Squared Error on Validation set:  2250.080846755386
Best value of alpha:  206.913808111479


In [40]:
# Use the model to make predictions
shares_pred = best_model.predict(X_eval_processed)
print(shares_pred.shape)
# Add the predicted values as a new 'shares' column in the new data
eval_df['Predicted'] = shares_pred
eval_df['Id'] = id_col
# Assuming new_data is your DataFrame
columns_to_keep = ['Id', 'Predicted']
result = eval_df[columns_to_keep]

# Save the new data with the predicted values to a new CSV file
result.to_csv('result.csv', index=False)

(7917,)
