# Baseline training on network and other non-text features

In [1]:
import numpy as np
import pandas as pd

from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier

# Data loading

In [2]:
# Relative paths to the pickled train / validation / test feature files.
TRAIN, VAL, TEST = (
    "../../data/prepared/train_features.pkl",
    "../../data/prepared/val_features.pkl",
    "../../data/prepared/test_features.pkl",
)

In [3]:
# Load the training split into a DataFrame and print its column/dtype summary.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so TRAIN
# must only ever point at a file from a trusted source.
train_df = pd.read_pickle(TRAIN)
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 159544 entries, 0 to 173184
Data columns (total 51 columns):
 #   Column                                Non-Null Count   Dtype  
---  ------                                --------------   -----  
 0   id_str                                159544 non-null  object 
 1   entities.urls                         159544 non-null  int64  
 2   entities.media                        159544 non-null  int64  
 3   user_in_net                           159544 non-null  int64  
 4   has_covid_keyword                     159544 non-null  int64  
 5   tweets_keywords_3_in_degree           159544 non-null  float64
 6   tweets_keywords_3_out_degree          159544 non-null  float64
 7   tweets_keywords_3_in_strength         159544 non-null  float64
 8   tweets_keywords_3_out_strength        159544 non-null  float64
 9   tweets_keywords_3_eigenvector_in      159544 non-null  float64
 10  tweets_keywords_3_eigenvector_out     159544 non-null  float64
 11  

In [4]:
# Keep the `retweet_label` target as its own Series, then remove it together
# with the `id_str` column so that `train_df` holds model inputs only.
# NOTE: not re-runnable on its own - once the columns are gone, re-execute the
# cell that loads `train_df` before running this one again.
train_df_labels = train_df["retweet_label"]
train_df = train_df.drop(columns=["retweet_label", "id_str"])

In [5]:
# Load the validation split into a DataFrame and print its column/dtype summary.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so VAL
# must only ever point at a file from a trusted source.
val_df = pd.read_pickle(VAL)
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19943 entries, 173185 to 194714
Data columns (total 51 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id_str                                19943 non-null  object 
 1   entities.urls                         19943 non-null  int64  
 2   entities.media                        19943 non-null  int64  
 3   user_in_net                           19943 non-null  int64  
 4   has_covid_keyword                     19943 non-null  int64  
 5   tweets_keywords_3_in_degree           19943 non-null  float64
 6   tweets_keywords_3_out_degree          19943 non-null  float64
 7   tweets_keywords_3_in_strength         19943 non-null  float64
 8   tweets_keywords_3_out_strength        19943 non-null  float64
 9   tweets_keywords_3_eigenvector_in      19943 non-null  float64
 10  tweets_keywords_3_eigenvector_out     19943 non-null  float64
 11  tweets_ke

In [6]:
# Keep the `retweet_label` target as its own Series, then remove it together
# with the `id_str` column so that `val_df` holds model inputs only.
# NOTE: not re-runnable on its own - once the columns are gone, re-execute the
# cell that loads `val_df` before running this one again.
val_df_labels = val_df["retweet_label"]
val_df = val_df.drop(columns=["retweet_label", "id_str"])

In [7]:
# Load the test split into a DataFrame and print its column/dtype summary.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling, so TEST
# must only ever point at a file from a trusted source.
# NOTE(review): none of the cells visible in this notebook evaluates a model
# on the test split - confirm whether that is intentional.
test_df = pd.read_pickle(TEST)
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19944 entries, 194715 to 214669
Data columns (total 51 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   id_str                                19944 non-null  object 
 1   entities.urls                         19944 non-null  int64  
 2   entities.media                        19944 non-null  int64  
 3   user_in_net                           19944 non-null  int64  
 4   has_covid_keyword                     19944 non-null  int64  
 5   tweets_keywords_3_in_degree           19944 non-null  float64
 6   tweets_keywords_3_out_degree          19944 non-null  float64
 7   tweets_keywords_3_in_strength         19944 non-null  float64
 8   tweets_keywords_3_out_strength        19944 non-null  float64
 9   tweets_keywords_3_eigenvector_in      19944 non-null  float64
 10  tweets_keywords_3_eigenvector_out     19944 non-null  float64
 11  tweets_ke

In [8]:
# Keep the `retweet_label` target as its own Series, then remove it together
# with the `id_str` column so that `test_df` holds model inputs only.
# NOTE: not re-runnable on its own - once the columns are gone, re-execute the
# cell that loads `test_df` before running this one again.
test_df_labels = test_df["retweet_label"]
test_df = test_df.drop(columns=["retweet_label", "id_str"])

# MLP Training

In [9]:
# First MLP baseline: one hidden layer of 512 units, mini-batches of 256.
# Early stopping sets aside 10% of the training rows and halts once the score
# on that hold-out has failed to improve for 5 consecutive epochs (capped at
# 50 epochs overall).
# NOTE(review): no feature scaling is applied in this notebook; if the prepared
# features are not already standardised, scikit-learn recommends scaling the
# inputs of an MLP - worth trying.
clf = MLPClassifier(
    # architecture / optimisation
    hidden_layer_sizes=(512,),
    batch_size=256,
    learning_rate_init=0.001,
    max_iter=50,
    # early stopping
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    # reproducibility / logging
    random_state=1,
    verbose=True,
).fit(train_df.values, train_df_labels.values)
clf  # display the fitted estimator

Iteration 1, loss = 0.61166617
Validation score: 0.664933
Iteration 2, loss = 0.59752586
Validation score: 0.671388
Iteration 3, loss = 0.59348806
Validation score: 0.667941
Iteration 4, loss = 0.58964721
Validation score: 0.679160
Iteration 5, loss = 0.58725558
Validation score: 0.671890
Iteration 6, loss = 0.58548920
Validation score: 0.677907
Iteration 7, loss = 0.58373419
Validation score: 0.681416
Iteration 8, loss = 0.58216362
Validation score: 0.682482
Iteration 9, loss = 0.58083020
Validation score: 0.673394
Iteration 10, loss = 0.57973822
Validation score: 0.680100
Iteration 11, loss = 0.57859947
Validation score: 0.681354
Iteration 12, loss = 0.57749225
Validation score: 0.683924
Iteration 13, loss = 0.57645485
Validation score: 0.684425
Iteration 14, loss = 0.57565323
Validation score: 0.683297
Iteration 15, loss = 0.57460824
Validation score: 0.683046
Iteration 16, loss = 0.57398114
Validation score: 0.684049
Iteration 17, loss = 0.57308703
Validation score: 0.687183
Iterat

MLPClassifier(batch_size=256, early_stopping=True, hidden_layer_sizes=(512,),
              max_iter=50, n_iter_no_change=5, random_state=1, verbose=True)

In [10]:
# Best accuracy reached on the internal early-stopping hold-out, i.e. the 10%
# of the training rows set aside through validation_fraction=0.1 (this is not
# the separate validation split loaded from VAL).
clf.best_validation_score_

0.68718270134754

In [11]:
# Mean accuracy of the first MLP on the training split.
clf.score(train_df.values, train_df_labels.values)

0.6984405555834128

In [12]:
# Mean accuracy of the first MLP on the separate validation split.
clf.score(val_df.values, val_df_labels.values)

0.6815423958281102

In [13]:
# Hard class predictions of the first MLP for every validation row; the shape
# is displayed as a quick check that there is one prediction per row.
val_predictions = clf.predict(val_df.values)
val_predictions.shape

(19943,)

In [14]:
# Per-class precision / recall / F1 of the first MLP on the validation split,
# rendered as the default plain-text table.
out = classification_report(
    y_true=val_df_labels.values,
    y_pred=val_predictions,
)
print(out)

              precision    recall  f1-score   support

           0       0.69      0.76      0.72     10954
           1       0.67      0.58      0.62      8989

    accuracy                           0.68     19943
   macro avg       0.68      0.67      0.67     19943
weighted avg       0.68      0.68      0.68     19943



## MLP Classifier (2)

In [15]:
# Second MLP variant: three hidden layers (1000 -> 50 -> 30 units) trained
# with smaller mini-batches of 32. The early-stopping setup matches the first
# MLP: 10% of the training rows are held out and training halts after 5 epochs
# without improvement (capped at 50 epochs overall).
clf_2 = MLPClassifier(
    # architecture / optimisation
    hidden_layer_sizes=(1000, 50, 30),
    batch_size=32,
    learning_rate_init=0.001,
    max_iter=50,
    # early stopping
    early_stopping=True,
    validation_fraction=0.1,
    n_iter_no_change=5,
    # reproducibility / logging
    random_state=1,
    verbose=True,
).fit(train_df.values, train_df_labels.values)
clf_2  # display the fitted estimator

Iteration 1, loss = 0.61074513
Validation score: 0.681605
Iteration 2, loss = 0.59739695
Validation score: 0.680414
Iteration 3, loss = 0.59217806
Validation score: 0.687183
Iteration 4, loss = 0.58874952
Validation score: 0.685553
Iteration 5, loss = 0.58599592
Validation score: 0.685553
Iteration 6, loss = 0.58403848
Validation score: 0.685553
Iteration 7, loss = 0.58234483
Validation score: 0.687997
Iteration 8, loss = 0.58047310
Validation score: 0.687496
Iteration 9, loss = 0.57881630
Validation score: 0.691257
Iteration 10, loss = 0.57701676
Validation score: 0.688812
Iteration 11, loss = 0.57592531
Validation score: 0.690128
Iteration 12, loss = 0.57453660
Validation score: 0.685428
Iteration 13, loss = 0.57346691
Validation score: 0.691257
Iteration 14, loss = 0.57216164
Validation score: 0.688938
Iteration 15, loss = 0.57178475
Validation score: 0.685992
Validation score did not improve more than tol=0.000100 for 5 consecutive epochs. Stopping.


MLPClassifier(batch_size=32, early_stopping=True,
              hidden_layer_sizes=(1000, 50, 30), max_iter=50,
              n_iter_no_change=5, random_state=1, verbose=True)

In [16]:
# Best accuracy reached on the internal early-stopping hold-out (10% of the
# training rows), not on the separate validation split loaded from VAL.
clf_2.best_validation_score_

0.6912566593544344

In [17]:
# Mean accuracy of the second MLP on the training split.
clf_2.score(train_df.values, train_df_labels.values)

0.6986912701198416

In [18]:
# Mean accuracy of the second MLP on the separate validation split.
clf_2.score(val_df.values, val_df_labels.values)

0.6793361079075365

In [19]:
# Hard class predictions of the second MLP for every validation row.
# NOTE(review): this rebinds `val_predictions`, overwriting the first MLP's
# predictions - anything evaluated after this cell sees clf_2's output.
val_predictions = clf_2.predict(val_df.values)
val_predictions.shape

(19943,)

In [20]:
# Per-class precision / recall / F1 for the predictions currently stored in
# `val_predictions`, rendered as the default plain-text table.
out = classification_report(
    y_true=val_df_labels.values,
    y_pred=val_predictions,
)
print(out)

              precision    recall  f1-score   support

           0       0.69      0.77      0.72     10954
           1       0.67      0.57      0.62      8989

    accuracy                           0.68     19943
   macro avg       0.68      0.67      0.67     19943
weighted avg       0.68      0.68      0.68     19943



# Random Forest

In [22]:
# Random-forest baseline: 200 trees with depth capped at 25. Each tree is grown
# on a sample of 40% of the training rows and considers 20% of the features at
# every split.
# The float values given for min_samples_split / min_samples_leaf are read by
# scikit-learn as fractions of the number of training rows, not as counts.
# class_weight="balanced" re-weights the classes inversely to their frequency.
# NOTE(review): oob_score=True computes an out-of-bag estimate, but no visible
# cell reads `rf.oob_score_`.
rf = RandomForestClassifier(
    # ensemble size and per-tree sampling
    n_estimators=200,
    max_samples=0.4,
    max_features=0.2,
    # tree growth limits
    max_depth=25,
    min_samples_split=0.00002,
    min_samples_leaf=0.00002,
    min_impurity_decrease=0.00008,
    # class handling and diagnostics
    class_weight="balanced",
    oob_score=True,
    # reproducibility / execution
    random_state=1,
    n_jobs=-1,
    verbose=True,
).fit(train_df.values, train_df_labels.values)
rf  # display the fitted estimator

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.7s
[Parallel(n_jobs=-1)]: Done 152 tasks      | elapsed:    4.3s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    5.4s finished


RandomForestClassifier(class_weight='balanced', max_depth=25, max_features=0.2,
                       max_samples=0.4, min_impurity_decrease=8e-05,
                       min_samples_leaf=2e-05, min_samples_split=2e-05,
                       n_estimators=200, n_jobs=-1, oob_score=True,
                       random_state=1, verbose=True)

In [23]:
# Mean accuracy of the random forest on the training split.
rf.score(train_df.values, train_df_labels.values)

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.6s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.8s finished


0.6902609938324225

In [24]:
# Mean accuracy of the random forest on the separate validation split.
rf.score(val_df.values, val_df_labels.values)

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


0.6780323923181066

In [25]:
# Hard class predictions of the random forest for every validation row.
rf_val_predictions = rf.predict(val_df.values)

[Parallel(n_jobs=24)]: Using backend ThreadingBackend with 24 concurrent workers.
[Parallel(n_jobs=24)]: Done   2 tasks      | elapsed:    0.1s
[Parallel(n_jobs=24)]: Done 152 tasks      | elapsed:    0.2s
[Parallel(n_jobs=24)]: Done 200 out of 200 | elapsed:    0.2s finished


In [26]:
# Per-class precision / recall / F1 of the random forest on the validation
# split, printed with three decimals.
out = classification_report(
    y_true=val_df_labels.values,
    y_pred=rf_val_predictions,
    digits=3,
)
print(out)

              precision    recall  f1-score   support

           0      0.723     0.671     0.696     10954
           1      0.631     0.687     0.658      8989

    accuracy                          0.678     19943
   macro avg      0.677     0.679     0.677     19943
weighted avg      0.682     0.678     0.679     19943



# Dummy classifier

In [27]:
# Chance-level reference model: the "stratified" strategy draws predictions at
# random according to the class frequencies of the training labels and ignores
# the feature values entirely.
s_dummy = DummyClassifier(strategy="stratified", random_state=1).fit(
    train_df.values, train_df_labels.values
)
s_dummy  # display the fitted estimator

DummyClassifier(random_state=1, strategy='stratified')

In [28]:
# Random (stratified) predictions for every validation row, used as the
# chance-level reference for the models above.
dummy_val_predictions = s_dummy.predict(val_df.values)

In [29]:
# Per-class precision / recall / F1 of the dummy classifier on the validation
# split, rendered as the default plain-text table.
out = classification_report(
    y_true=val_df_labels.values,
    y_pred=dummy_val_predictions,
)
print(out)

              precision    recall  f1-score   support

           0       0.55      0.58      0.56     10954
           1       0.45      0.41      0.43      8989

    accuracy                           0.51     19943
   macro avg       0.50      0.50      0.50     19943
weighted avg       0.50      0.51      0.50     19943

