### Facebook post comets prediction based on post features.

#### Dowload the data zip from : https://archive.ics.uci.edu/ml/datasets/Facebook+Comment+Volume+Dataset

In [None]:
import pandas as pd

In [2]:
file = "C:/Users/inbarg/Desktop/Msc Intelligence Systems new/xgboost opt/Dataset/Training/Features_Variant_1.csv"
df = pd.read_csv(file, header=None)
df.sample(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,44,45,46,47,48,49,50,51,52,53
7423,8413,30646,355,32,0.0,90.0,8.315789,0.0,20.463196,0.0,...,0,0,0,0,0,0,1,0,0,0
7578,23985,178,2773,36,0.0,130.0,16.557018,10.0,19.253191,0.0,...,0,0,0,0,0,0,1,0,0,0
38483,23055,0,67,36,0.0,455.0,19.818182,12.0,38.153958,0.0,...,0,1,0,0,0,0,0,0,1,9
22187,3626570,0,41233,8,29.0,455.0,238.647059,252.0,142.095221,1.0,...,1,0,0,0,0,0,0,0,1,32
29039,812001,0,95068,24,12.0,832.0,189.3,95.5,209.942635,1.0,...,0,0,0,0,0,1,0,0,0,6


In [4]:
#Check size : 

print("Dataset has {} entries and {} features".format(*df.shape))

Dataset has 40949 entries and 54 features


#### Spliting the data set

In [5]:
X, y = df.loc[:,:52].values, df.loc[:,53].values

In [6]:
# 90 % training. 
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.1, random_state=42)

#### Loading data into DMatrices

In [7]:
import xgboost as xgb
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

#### Building a baseline model ( using MAE )

In [8]:
from sklearn.metrics import mean_absolute_error


In [11]:
#For our baseline, we will keep things simple and predict that each new post will get the mean number of comments that we observed in the training set.
import numpy as np
# "Learn" the mean from the training data
mean_train = np.mean(y_train)
# Get predictions on the test set
baseline_predictions = np.ones(y_test.shape) * mean_train
# Compute MAE
mae_baseline = mean_absolute_error(y_test, baseline_predictions)
print("Baseline MAE is {:.2f}".format(mae_baseline))


Baseline MAE is 11.31


### Training and Tuning an XGBoost model

#### The params dictionary

In [12]:
params = {
    # Parameters that we are going to tune.
    'max_depth':6,
    'min_child_weight': 1,
    'eta':.3,
    'subsample': 1,
    'colsample_bytree': 1,
    # Other parameters
    'objective':'reg:linear',
}

#### Parameters num_boost_round and early_stopping_rounds

In [13]:
params['eval_metric'] = "mae"

In [14]:
num_boost_round = 999

In [15]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:5.97478
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:5.03359
[2]	Test-mae:4.64572
[3]	Test-mae:4.42331
[4]	Test-mae:4.39328
[5]	Test-mae:4.35544
[6]	Test-mae:4.31315
[7]	Test-mae:4.33087
[8]	Test-mae:4.37164
[9]	Test-mae:4.38774
[10]	Test-mae:4.39443
[11]	Test-mae:4.40661
[12]	Test-mae:4.39124
[13]	Test-mae:4.39088
[14]	Test-mae:4.39827
[15]	Test-mae:4.39104
[16]	Test-mae:4.40307
Stopping. Best iteration:
[6]	Test-mae:4.31315



In [16]:
print("Best MAE: {:.2f} with {} rounds".format(
                 model.best_score,
                 model.best_iteration+1))

Best MAE: 4.31 with 7 rounds


#### Using XGBoost’s CV 
#### to tune : num_boost_round + early_stopping_rounds 

In [17]:
cv_results = xgb.cv(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    seed=42,
    nfold=5,
    metrics={'mae'},
    early_stopping_rounds=10
)
cv_results



Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,5.604765,0.064495,5.689189,0.270149
1,4.622477,0.065106,4.849525,0.271883
2,4.05971,0.065772,4.468342,0.239475
3,3.722983,0.06082,4.268584,0.224462
4,3.510303,0.061203,4.192448,0.189762
5,3.367213,0.061021,4.172856,0.189612
6,3.245549,0.060276,4.15786,0.192572
7,3.151495,0.062612,4.143254,0.19444
8,3.082321,0.05902,4.147843,0.196197
9,3.016803,0.057321,4.144657,0.189785


In [18]:
cv_results['test-mae-mean'].min()

4.0827876000000005

#### Parameters max_depth and min_child_weight

In [19]:
#Let’s make a list containing all the combinations max_depth/min_child_weight that we want to tr:

# You can try wider intervals with a larger step between
# each value and then narrow it down. Here after several
# iteration I found that the optimal value was in the
# following ranges.
gridsearch_params = [
    (max_depth, min_child_weight)
    for max_depth in range(9,12)
    for min_child_weight in range(5,8)
]

In [20]:
#Let’s run cross validation on each of those pairs: 

# Define initial best params and MAE 
min_mae = float("Inf")
best_params = None
for max_depth, min_child_weight in gridsearch_params:
    print("CV with max_depth={}, min_child_weight={}".format(
                             max_depth,
                             min_child_weight))
    # Update our parameters
    params['max_depth'] = max_depth
    params['min_child_weight'] = min_child_weight
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best MAE
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (max_depth,min_child_weight)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with max_depth=9, min_child_weight=5


The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.


	MAE 4.04524 for 6 rounds
CV with max_depth=9, min_child_weight=6
	MAE 4.0764622 for 5 rounds
CV with max_depth=9, min_child_weight=7
	MAE 4.0753928 for 5 rounds
CV with max_depth=10, min_child_weight=5
	MAE 4.0805826000000005 for 5 rounds
CV with max_depth=10, min_child_weight=6
	MAE 4.035100600000001 for 5 rounds
CV with max_depth=10, min_child_weight=7
	MAE 4.0872416000000005 for 5 rounds
CV with max_depth=11, min_child_weight=5
	MAE 4.062633 for 5 rounds
CV with max_depth=11, min_child_weight=6
	MAE 4.054831999999999 for 5 rounds
CV with max_depth=11, min_child_weight=7
	MAE 4.0581036 for 5 rounds
Best params: 10, 6, MAE: 4.035100600000001


In [21]:
# Lets update our parameters : 
params['max_depth'] = 10
params['min_child_weight'] = 6

#### Parameters subsample and colsample_bytree

In [22]:
gridsearch_params = [
    (subsample, colsample)
    for subsample in [i/10. for i in range(7,11)]
    for colsample in [i/10. for i in range(7,11)]
]

In [23]:
# Run the optmizer : 
min_mae = float("Inf")
best_params = None
# We start by the largest values and go down to the smallest
for subsample, colsample in reversed(gridsearch_params):
    print("CV with subsample={}, colsample={}".format(
                             subsample,
                             colsample))
    # We update our parameters
    params['subsample'] = subsample
    params['colsample_bytree'] = colsample
    # Run CV
    cv_results = xgb.cv(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        seed=42,
        nfold=5,
        metrics={'mae'},
        early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = (subsample,colsample)
print("Best params: {}, {}, MAE: {}".format(best_params[0], best_params[1], min_mae))

CV with subsample=1.0, colsample=1.0


The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.


	MAE 4.035100600000001 for 5 rounds
CV with subsample=1.0, colsample=0.9
	MAE 4.0827408 for 6 rounds
CV with subsample=1.0, colsample=0.8
	MAE 4.1105364 for 5 rounds
CV with subsample=1.0, colsample=0.7
	MAE 4.120717999999999 for 6 rounds
CV with subsample=0.9, colsample=1.0
	MAE 4.030346199999999 for 5 rounds
CV with subsample=0.9, colsample=0.9
	MAE 4.1151646 for 7 rounds
CV with subsample=0.9, colsample=0.8
	MAE 4.1586564 for 5 rounds
CV with subsample=0.9, colsample=0.7
	MAE 4.2172154 for 6 rounds
CV with subsample=0.8, colsample=1.0
	MAE 4.1490338 for 5 rounds
CV with subsample=0.8, colsample=0.9
	MAE 4.1830254 for 6 rounds
CV with subsample=0.8, colsample=0.8


	MAE 4.1945664 for 6 rounds
CV with subsample=0.8, colsample=0.7
	MAE 4.3428186 for 6 rounds
CV with subsample=0.7, colsample=1.0
	MAE 4.0902778 for 5 rounds
CV with subsample=0.7, colsample=0.9
	MAE 4.206033199999999 for 6 rounds
CV with subsample=0.7, colsample=0.8
	MAE 4.216972 for 7 rounds
CV with subsample=0.7, colsample=0.7
	MAE 4.3464134 for 6 rounds
Best params: 0.9, 1.0, MAE: 4.030346199999999


In [24]:
# Updating : 
params['subsample'] = .8
params['colsample_bytree'] = 1.

#### Parameters ETA ( learning rate )

In [35]:
# The ETA parameter controls the learning rate. It corresponds to the shrinkage of the weights associated to features after each round, in other words it defines the amount of "correction" we make at each step
%time

# This can take some time…
min_mae = float("Inf")
best_params = None
for eta in [.3, .2, .1, .05, .01, .005]:
    print("CV with eta={}".format(eta))
    # We update our parameters
    params['eta'] = eta
    # Run and time CV
    %time
    cv_results = xgb.cv(
            params,
            dtrain,
            num_boost_round=num_boost_round,
            seed=42,
            nfold=5,
            metrics=['mae'],
            early_stopping_rounds=10
    )
    # Update best score
    mean_mae = cv_results['test-mae-mean'].min()
    boost_rounds = cv_results['test-mae-mean'].argmin()
    print("\tMAE {} for {} rounds\n".format(mean_mae, boost_rounds))
    if mean_mae < min_mae:
        min_mae = mean_mae
        best_params = eta
        
print("Best params: {}, MAE: {}".format(best_params, min_mae))

Wall time: 0 ns
CV with eta=0.3
Wall time: 0 ns


The current behaviour of 'Series.argmin' is deprecated, use 'idxmin'
instead.
The behavior of 'argmin' will be corrected to return the positional
minimum in the future. For now, use 'series.values.argmin' or
'np.argmin(np.array(values))' to get the position of the minimum
row.


	MAE 4.1490338 for 5 rounds

CV with eta=0.2
Wall time: 0 ns
	MAE 4.0048612 for 10 rounds

CV with eta=0.1
Wall time: 0 ns
	MAE 3.924338 for 19 rounds

CV with eta=0.05
Wall time: 0 ns
	MAE 3.8693964000000003 for 46 rounds

CV with eta=0.01
Wall time: 0 ns
	MAE 3.833641 for 235 rounds

CV with eta=0.005
Wall time: 0 ns
	MAE 3.8281052000000004 for 479 rounds

Best params: 0.005, MAE: 3.8281052000000004


In [36]:
#update parameter ( we select .01 because .005 take much more time and just slightly improve our MAE)
params['eta'] = .01

### Results

In [37]:
params
{'colsample_bytree': 1.0,
 'eta': 0.01,
 'eval_metric': 'mae',
 'max_depth': 10,
 'min_child_weight': 6,
 'objective': 'reg:linear',
 'subsample': 0.8}

{'colsample_bytree': 1.0,
 'eta': 0.01,
 'eval_metric': 'mae',
 'max_depth': 10,
 'min_child_weight': 6,
 'objective': 'reg:linear',
 'subsample': 0.8}

#### Train a model with results

In [38]:
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")],
    early_stopping_rounds=10
)

[0]	Test-mae:7.68841
Will train until Test-mae hasn't improved in 10 rounds.
[1]	Test-mae:7.61732
[2]	Test-mae:7.55078
[3]	Test-mae:7.48079
[4]	Test-mae:7.41113
[5]	Test-mae:7.34253
[6]	Test-mae:7.27393
[7]	Test-mae:7.21039
[8]	Test-mae:7.14517
[9]	Test-mae:7.08459
[10]	Test-mae:7.02052
[11]	Test-mae:6.95744
[12]	Test-mae:6.89505
[13]	Test-mae:6.83499
[14]	Test-mae:6.77818
[15]	Test-mae:6.72115
[16]	Test-mae:6.66352
[17]	Test-mae:6.60645
[18]	Test-mae:6.55216
[19]	Test-mae:6.49965
[20]	Test-mae:6.44402
[21]	Test-mae:6.39312
[22]	Test-mae:6.34359
[23]	Test-mae:6.29254
[24]	Test-mae:6.24314
[25]	Test-mae:6.19703
[26]	Test-mae:6.15496
[27]	Test-mae:6.11418
[28]	Test-mae:6.07282
[29]	Test-mae:6.02809
[30]	Test-mae:5.98644
[31]	Test-mae:5.94343
[32]	Test-mae:5.89988
[33]	Test-mae:5.86016
[34]	Test-mae:5.81864
[35]	Test-mae:5.77842
[36]	Test-mae:5.74007
[37]	Test-mae:5.70285
[38]	Test-mae:5.66454
[39]	Test-mae:5.62736
[40]	Test-mae:5.5892
[41]	Test-mae:5.55657
[42]	Test-mae:5.52424
[43]	Test

### Saving your model

In [39]:
num_boost_round = model.best_iteration + 1
best_model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=[(dtest, "Test")]
)

[0]	Test-mae:7.68841
[1]	Test-mae:7.61732
[2]	Test-mae:7.55078
[3]	Test-mae:7.48079
[4]	Test-mae:7.41113
[5]	Test-mae:7.34253
[6]	Test-mae:7.27393
[7]	Test-mae:7.21039
[8]	Test-mae:7.14517
[9]	Test-mae:7.08459
[10]	Test-mae:7.02052
[11]	Test-mae:6.95744
[12]	Test-mae:6.89505
[13]	Test-mae:6.83499
[14]	Test-mae:6.77818
[15]	Test-mae:6.72115
[16]	Test-mae:6.66352
[17]	Test-mae:6.60645
[18]	Test-mae:6.55216
[19]	Test-mae:6.49965
[20]	Test-mae:6.44402
[21]	Test-mae:6.39312
[22]	Test-mae:6.34359
[23]	Test-mae:6.29254
[24]	Test-mae:6.24314
[25]	Test-mae:6.19703
[26]	Test-mae:6.15496
[27]	Test-mae:6.11418
[28]	Test-mae:6.07282
[29]	Test-mae:6.02809
[30]	Test-mae:5.98644
[31]	Test-mae:5.94343
[32]	Test-mae:5.89988
[33]	Test-mae:5.86016
[34]	Test-mae:5.81864
[35]	Test-mae:5.77842
[36]	Test-mae:5.74007
[37]	Test-mae:5.70285
[38]	Test-mae:5.66454
[39]	Test-mae:5.62736
[40]	Test-mae:5.5892
[41]	Test-mae:5.55657
[42]	Test-mae:5.52424
[43]	Test-mae:5.49026
[44]	Test-mae:5.45495
[45]	Test-mae:5.42271

#### Test the best model : 

In [40]:
mean_absolute_error(best_model.predict(dtest), y_test)

3.9386062708402436

In [41]:
best_model.save_model("my_model.model")

### Load the Model: 

In [42]:
#### You can then load the model later with:
loaded_model = xgb.Booster()
loaded_model.load_model("my_model.model")
# And use it for predictions.
loaded_model.predict(dtest)



array([4.149846  , 0.34182256, 1.962074  , ..., 3.668127  , 0.11346319,
       3.47246   ], dtype=float32)