# Baseline training on full features including content

In [1]:
import numpy as np
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

from sklearn.preprocessing import Normalizer

# Data loading

In [2]:
# Locations of the prepared train / validation / test pickles, relative to this notebook.
PREPARED_DIR = "../../data/prepared"
TRAIN = f"{PREPARED_DIR}/full_train_df.pkl"
VAL = f"{PREPARED_DIR}/full_val_df.pkl"
TEST = f"{PREPARED_DIR}/full_test_df.pkl"

In [3]:
# Load the prepared training split and print its schema summary.
# NOTE(review): unpickling can execute arbitrary code; only read pickles produced by this project.
train_df = pd.read_pickle(TRAIN)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159544 entries, 0 to 173184
Columns: 819 entries, id_str to 767
dtypes: float32(768), float64(43), int64(7), object(1)
memory usage: 530.7+ MB


In [4]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,id_str,entities.urls,entities.media,user_in_net,has_covid_keyword,tweets_keywords_3_in_degree,tweets_keywords_3_out_degree,tweets_keywords_3_in_strength,tweets_keywords_3_out_strength,tweets_keywords_3_eigenvector_in,...,758,759,760,761,762,763,764,765,766,767
0,1339076855064915968,0,0,0,0,-0.206248,-0.375587,0.994688,0.944173,-0.180504,...,-0.065715,-0.392403,-0.265053,-0.388531,-0.226297,-0.222182,0.781448,-0.242079,-0.309126,0.04621
1,1337506645904142336,1,0,0,0,1.693304,2.200119,0.948712,0.954298,-0.180504,...,-0.186057,-0.259318,-0.00666,-0.345143,-0.380387,-0.266276,0.508922,-0.029825,-0.168484,-0.183772
2,1335934602532298752,1,0,0,1,-0.647823,-0.651175,-0.967028,-0.954585,-0.180504,...,-0.377129,-0.029135,0.307227,-0.189083,-0.16638,-0.324002,0.719398,-0.213774,-0.615862,-0.176414
3,1334842452096786432,0,1,0,0,-0.647823,-0.651175,-0.967028,-0.954585,-0.180504,...,-0.631918,0.676647,0.148769,-0.04723,-0.288221,-0.55141,0.326829,0.00776,-0.007832,-0.264898
5,1333647712064057088,0,1,0,0,-0.647823,-0.651175,-0.967028,-0.954585,-0.180504,...,-0.927304,0.65523,0.536625,-0.184766,-0.001629,-0.590369,0.995412,-0.092071,-0.069974,-0.18276


In [5]:
# Split the target column off the training frame; id_str (presumably the tweet id) is
# removed too so that only model features remain in train_df.
train_df_labels = train_df["retweet_label"]
train_df = train_df.drop(columns=["retweet_label", "id_str"])

In [6]:
# Load the prepared validation split and print its schema summary.
# NOTE(review): unpickling can execute arbitrary code; only read pickles produced by this project.
val_df = pd.read_pickle(VAL)
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19943 entries, 173185 to 194714
Columns: 819 entries, id_str to 767
dtypes: float32(768), float64(43), int64(7), object(1)
memory usage: 66.3+ MB


In [7]:
# Split the target column off the validation frame; id_str (presumably the tweet id) is
# removed too so that only model features remain in val_df.
val_df_labels = val_df["retweet_label"]
val_df = val_df.drop(columns=["retweet_label", "id_str"])

In [8]:
# Load the prepared test split and print its schema summary.
# NOTE(review): unpickling can execute arbitrary code; only read pickles produced by this project.
test_df = pd.read_pickle(TEST)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19944 entries, 194715 to 214669
Columns: 819 entries, id_str to 767
dtypes: float32(768), float64(43), int64(7), object(1)
memory usage: 66.3+ MB


In [9]:
# Split the target column off the test frame; id_str (presumably the tweet id) is
# removed too so that only model features remain in test_df.
test_df_labels = test_df["retweet_label"]
test_df = test_df.drop(columns=["retweet_label", "id_str"])

# Content vector normalization (per-row L2)

In [10]:
# The content vector lives in the integer-labelled columns 0..767.
N_CONTENT_DIMS = 768
vec_cols = [*range(N_CONTENT_DIMS)]
# Per-column summary statistics before normalization.
train_df[vec_cols].describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
count,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,...,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0
mean,0.19536,0.09522,-0.35285,0.146493,0.079543,-0.268018,-0.15266,-0.399202,0.242943,0.348565,...,-0.333801,0.024535,0.04096,-0.213138,-0.340657,-0.283257,0.437981,-0.097761,-0.250533,-0.167678
std,0.293173,0.214114,0.233547,0.254018,0.187463,0.336944,0.167259,0.312014,0.318916,0.315775,...,0.33748,0.242701,0.37964,0.211717,0.230099,0.23149,0.516731,0.362651,0.217527,0.292303
min,-1.193549,-0.862558,-1.433369,-1.287081,-0.862269,-2.027415,-1.076601,-1.6307,-1.594843,-1.052173,...,-1.922168,-1.079849,-1.99762,-1.54253,-1.496642,-1.311073,-3.092209,-2.276905,-1.25556,-1.746912
25%,-0.000639,-0.042778,-0.511164,-0.014581,-0.040123,-0.491702,-0.260796,-0.627653,0.045554,0.145942,...,-0.56573,-0.134558,-0.192809,-0.353925,-0.490374,-0.44642,0.103474,-0.325314,-0.394385,-0.358993
50%,0.178249,0.092149,-0.354339,0.156912,0.081831,-0.279836,-0.155351,-0.431011,0.26441,0.358412,...,-0.32865,0.01123,0.073178,-0.209669,-0.327011,-0.286031,0.392544,-0.078316,-0.251108,-0.163843
75%,0.37691,0.240091,-0.196517,0.321197,0.194621,-0.063111,-0.049406,-0.196222,0.460196,0.557124,...,-0.105623,0.166766,0.295857,-0.066293,-0.183453,-0.126617,0.730486,0.142456,-0.105713,0.031476
max,1.780094,0.975826,1.108052,1.300397,1.140064,1.299382,1.04101,1.164399,1.753418,2.136066,...,1.218108,1.140285,2.073954,0.858104,0.673883,0.758497,2.970299,1.877951,0.836245,1.073741


In [11]:
# Normalizer (default norm="l2") rescales each row, i.e. each content vector, to unit
# length. It is stateless, so fitting on the training rows learns no parameters; the
# fit call is kept so the estimator is used the same way as before.
scaler = Normalizer()
scaler.fit(train_df[vec_cols].values)

# Apply the identical row-wise transform to every split, writing the result back in place.
for split_df in (train_df, val_df, test_df):
    split_df[vec_cols] = scaler.transform(split_df[vec_cols].values)

In [12]:
# Per-column summary statistics of the content columns after the Normalizer transform.
train_df[vec_cols].describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
count,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,...,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0,159544.0
mean,0.0152,0.007507,-0.02858,0.011676,0.006295,-0.022028,-0.012424,-0.032978,0.020101,0.02857,...,-0.026178,0.00143,0.003647,-0.017256,-0.027182,-0.022219,0.035228,-0.007656,-0.02029,-0.012839
std,0.022639,0.016913,0.018773,0.020016,0.01501,0.026821,0.013482,0.025655,0.025218,0.02549,...,0.025914,0.018903,0.029669,0.016981,0.018152,0.017773,0.040105,0.028283,0.017467,0.022753
min,-0.080297,-0.065725,-0.109652,-0.092831,-0.06293,-0.143057,-0.080418,-0.123465,-0.102206,-0.07559,...,-0.125459,-0.083214,-0.133192,-0.101022,-0.109637,-0.090948,-0.175592,-0.149472,-0.100577,-0.115395
25%,-5.3e-05,-0.003482,-0.041618,-0.001186,-0.003256,-0.039989,-0.021262,-0.052149,0.003675,0.01166,...,-0.043702,-0.011015,-0.015577,-0.028709,-0.039282,-0.035115,0.008446,-0.026308,-0.032005,-0.02831
50%,0.014484,0.007532,-0.028875,0.012569,0.006594,-0.022661,-0.012449,-0.035135,0.021636,0.029054,...,-0.026761,0.000913,0.005967,-0.01703,-0.026397,-0.023177,0.032194,-0.006395,-0.020329,-0.013188
75%,0.030071,0.019347,-0.01576,0.025681,0.015603,-0.005066,-0.003966,-0.015423,0.037736,0.045743,...,-0.008714,0.013302,0.024081,-0.005349,-0.014919,-0.01047,0.059403,0.011611,-0.008487,0.002606
max,0.124338,0.069053,0.066742,0.091264,0.08313,0.091769,0.077764,0.081185,0.117348,0.164405,...,0.085552,0.074494,0.123961,0.064772,0.053168,0.053977,0.210355,0.127449,0.065561,0.090006


# MLP Training

In [13]:
# First MLP baseline: one hidden layer of 512 units.
mlp_params = dict(
    # architecture
    hidden_layer_sizes=(512,),
    # optimisation
    learning_rate_init=0.001,
    batch_size=256,
    alpha=0.001,
    max_iter=50,
    # early stopping on an internal hold-out taken from the training rows
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    # reproducibility / logging
    random_state=1,
    verbose=True,
)
clf = MLPClassifier(**mlp_params)
clf.fit(train_df.to_numpy(), train_df_labels.to_numpy())

Iteration 1, loss = 0.60246838
Validation score: 0.692510
Iteration 2, loss = 0.58407648
Validation score: 0.691570
Iteration 3, loss = 0.57670078
Validation score: 0.691069
Iteration 4, loss = 0.57134825
Validation score: 0.696521
Iteration 5, loss = 0.56624809
Validation score: 0.693012
Iteration 6, loss = 0.56291106
Validation score: 0.697462
Iteration 7, loss = 0.55870844
Validation score: 0.697399
Iteration 8, loss = 0.55490506
Validation score: 0.692009
Iteration 9, loss = 0.55175634
Validation score: 0.696396
Iteration 10, loss = 0.54786775
Validation score: 0.699091
Iteration 11, loss = 0.54371448
Validation score: 0.694202
Iteration 12, loss = 0.54055150
Validation score: 0.692636
Iteration 13, loss = 0.53717560
Validation score: 0.695080
Iteration 14, loss = 0.53317858
Validation score: 0.697587
Iteration 15, loss = 0.52958391
Validation score: 0.693952
Iteration 16, loss = 0.52558038
Validation score: 0.693137
Validation score did not improve more than tol=0.000100 for 5 con

MLPClassifier(alpha=0.001, batch_size=256, early_stopping=True,
              hidden_layer_sizes=(512,), max_iter=50, n_iter_no_change=5,
              random_state=1, verbose=True)

In [14]:
# Best score reached on the internal early-stopping hold-out during training.
clf.best_validation_score_

0.6990911939830774

In [15]:
# Mean accuracy of the first MLP on the training split.
clf.score(X=train_df.to_numpy(), y=train_df_labels.to_numpy())

0.7266835481121195

In [16]:
# Mean accuracy of the first MLP on the validation split.
clf.score(X=val_df.to_numpy(), y=val_df_labels.to_numpy())

0.6893646893646893

In [17]:
# Class predictions of the first MLP for every validation row.
val_predictions = clf.predict(X=val_df.to_numpy())
val_predictions.shape

(19943,)

In [18]:
# Per-class precision / recall / F1 of the first MLP on the validation split (text report).
out = classification_report(y_true=val_df_labels.values, y_pred=val_predictions)
print(out)

              precision    recall  f1-score   support

           0       0.70      0.77      0.73     10954
           1       0.68      0.59      0.63      8989

    accuracy                           0.69     19943
   macro avg       0.69      0.68      0.68     19943
weighted avg       0.69      0.69      0.69     19943



## MLP Classifier (2)

In [19]:
# Second MLP baseline: three hidden layers (1000, 50, 30) trained with smaller batches.
mlp_2_params = dict(
    # architecture
    hidden_layer_sizes=(1000, 50, 30),
    # optimisation
    learning_rate_init=0.001,
    batch_size=32,
    alpha=0.001,
    max_iter=50,
    # early stopping on an internal hold-out taken from the training rows
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    # reproducibility / logging
    random_state=1,
    verbose=True,
)
clf_2 = MLPClassifier(**mlp_2_params)
clf_2.fit(train_df.to_numpy(), train_df_labels.to_numpy())

Iteration 1, loss = 0.61317215
Validation score: 0.688750
Iteration 2, loss = 0.59969468
Validation score: 0.687120
Iteration 3, loss = 0.59409519
Validation score: 0.690505
Iteration 4, loss = 0.59036136
Validation score: 0.691319
Iteration 5, loss = 0.58741700
Validation score: 0.689251
Iteration 6, loss = 0.58474160
Validation score: 0.687935
Iteration 7, loss = 0.58276091
Validation score: 0.692761
Iteration 8, loss = 0.58098337
Validation score: 0.692259
Iteration 9, loss = 0.57954971
Validation score: 0.696396
Iteration 10, loss = 0.57838327
Validation score: 0.693137
Iteration 11, loss = 0.57710035
Validation score: 0.693012
Iteration 12, loss = 0.57605357
Validation score: 0.697211
Iteration 13, loss = 0.57521887
Validation score: 0.693137
Iteration 14, loss = 0.57445013
Validation score: 0.697336
Iteration 15, loss = 0.57356258
Validation score: 0.696960
Iteration 16, loss = 0.57316001
Validation score: 0.696772
Iteration 17, loss = 0.57201924
Validation score: 0.694328
Iterat

MLPClassifier(alpha=0.001, batch_size=32, early_stopping=True,
              hidden_layer_sizes=(1000, 50, 30), max_iter=50,
              n_iter_no_change=5, random_state=1, verbose=True)

In [20]:
# Best score reached on the internal early-stopping hold-out during training.
clf_2.best_validation_score_

0.6991538702601066

In [21]:
# Mean accuracy of the second MLP on the training split.
clf_2.score(X=train_df.to_numpy(), y=train_df_labels.to_numpy())

0.7145426966855538

In [22]:
# Mean accuracy of the second MLP on the validation split.
clf_2.score(X=val_df.to_numpy(), y=val_df_labels.to_numpy())

0.6861054003911147

In [23]:
# Class predictions of the second MLP for every validation row.
# NOTE: this rebinds val_predictions, replacing the first MLP's predictions.
val_predictions = clf_2.predict(X=val_df.to_numpy())
val_predictions.shape

(19943,)

In [24]:
# Per-class precision / recall / F1 on the validation split for the predictions
# currently held in val_predictions (text report).
out = classification_report(y_true=val_df_labels.values, y_pred=val_predictions)
print(out)

              precision    recall  f1-score   support

           0       0.69      0.78      0.73     10954
           1       0.68      0.58      0.62      8989

    accuracy                           0.69     19943
   macro avg       0.68      0.68      0.68     19943
weighted avg       0.69      0.69      0.68     19943



# Random Forest

In [26]:
# Random-forest baseline trained on the same feature matrix as the MLPs.
rf_params = dict(
    # ensemble size and per-tree sampling (fractions of rows / features)
    n_estimators=200,
    max_samples=0.4,
    max_features=0.2,
    # tree growth limits; the float min_samples_* values are fractions of the samples
    max_depth=25,
    min_samples_split=0.00002,
    min_samples_leaf=0.00002,
    min_impurity_decrease=0.00008,
    # class handling and out-of-bag estimate
    class_weight="balanced",
    oob_score=True,
    # reproducibility / execution
    random_state=1,
    n_jobs=-1,
    verbose=True,
)
rf = RandomForestClassifier(**rf_params)
rf.fit(train_df.to_numpy(), train_df_labels.to_numpy())

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.0min finished


RandomForestClassifier(class_weight='balanced', max_depth=25, max_features=0.2,
                       max_samples=0.4, min_impurity_decrease=8e-05,
                       min_samples_leaf=2e-05, min_samples_split=2e-05,
                       n_estimators=200, n_jobs=-1, oob_score=True,
                       random_state=1, verbose=True)

In [27]:
# Mean accuracy of the random forest on the training split.
rf.score(X=train_df.to_numpy(), y=train_df_labels.to_numpy())

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.4s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.5s finished


0.8533696033696033

In [28]:
# Mean accuracy of the random forest on the validation split.
rf.score(X=val_df.to_numpy(), y=val_df_labels.to_numpy())

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.1s finished


0.6843503986361129

In [29]:
# Class predictions of the random forest for every validation row.
rf_val_predictions = rf.predict(X=val_df.to_numpy())

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.1s finished


In [30]:
# Per-class precision / recall / F1 of the random forest on the validation split,
# printed with three decimals.
out = classification_report(
    y_true=val_df_labels.values,
    y_pred=rf_val_predictions,
    digits=3,
)
print(out)

              precision    recall  f1-score   support

           0      0.698     0.749     0.723     10954
           1      0.664     0.606     0.634      8989

    accuracy                          0.684     19943
   macro avg      0.681     0.677     0.678     19943
weighted avg      0.683     0.684     0.683     19943



# Dummy classifier

In [31]:
# Chance-level reference: the "stratified" strategy draws predictions at random
# according to the class distribution seen in the training labels.
s_dummy = DummyClassifier(random_state=1, strategy="stratified")
s_dummy.fit(X=train_df.to_numpy(), y=train_df_labels.to_numpy())

DummyClassifier(random_state=1, strategy='stratified')

In [32]:
# Dummy-classifier predictions for every validation row.
dummy_val_predictions = s_dummy.predict(X=val_df.to_numpy())

In [33]:
# Per-class precision / recall / F1 of the dummy baseline on the validation split (text report).
out = classification_report(y_true=val_df_labels.values, y_pred=dummy_val_predictions)
print(out)

              precision    recall  f1-score   support

           0       0.55      0.58      0.56     10954
           1       0.45      0.41      0.43      8989

    accuracy                           0.51     19943
   macro avg       0.50      0.50      0.50     19943
weighted avg       0.50      0.51      0.50     19943

