In [1]:
features = [
    "text_tokens",
    "hashtags",
    "tweet_id",
    "present_media",
    "present_links",
    "present_domains",
    "tweet_type",
    "language",
    "tweet_timestamp",
    "engaged_with_user_id",
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    "engaging_user_id",
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_is_verified",
    "engaging_user_account_creation",
    "engagee_follows_engager",
]

features_idx = dict(zip(features, range(len(features))))

labels_idx = {
    "reply_timestamp": 20,
    "retweet_timestamp": 21,
    "retweet_with_comment_timestamp": 22,
    "like_timestamp": 23,
}
names = features + list(labels_idx)

In [330]:
from sklearn.metrics import average_precision_score, log_loss

def calculate_ctr(gt):
    positive = len([x for x in gt if x == 1])
    ctr = positive/float(len(gt))
    return ctr

def relative_cross_entropy_score(gt, pred):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

def compute_score(y_true, y_score):
    ap = average_precision_score(y_true, y_score)
    rce = relative_cross_entropy_score(y_true, y_score)
    return ap, rce

def pretty_evaluation(results):
    mAP = 0.25 * (
        results['retweet_AP']
        +results['reply_AP']
        +results['like_AP']
        +results['retweet_with_comment_AP']
    )
    mRCE = 0.25 * (
        results['retweet_RCE']
        +results['reply_RCE']
        +results['like_RCE']
        +results['retweet_with_comment_RCE']
    )
    return f"""
---------------------------------
AP Retweet:                {results['retweet_AP']:.4f}
RCE Retweet:               {results['retweet_RCE']:.4f}
---------------------------------
AP Reply:                  {results['reply_AP']:.4f}
RCE Reply:                 {results['reply_RCE']:.4f}
---------------------------------
AP Like:                   {results['like_AP']:.4f}
RCE Like:                  {results['like_RCE']:.4f}
---------------------------------
AP RT with comment:        {results['retweet_with_comment_AP']:.4f}
RCE RT with comment:       {results['retweet_with_comment_RCE']:.4f}

---------------------------------

mAP                        {mAP:.4f}
mRCE                       {mRCE:.4f}

""".strip()

In [3]:
import xgboost as xgb
import pandas as pd

In [4]:
df = pd.read_csv('../data/raw/sample_200k_rows', names=names, sep='\x01')
df = df.assign(**{
    "reply": df["reply_timestamp"].notna(),
    "retweet": df["retweet_timestamp"].notna(),
    "retweet_with_comment": df["retweet_with_comment_timestamp"].notna(),
    "like": df["like_timestamp"].notna()
})

In [5]:
targets = ["reply", "retweet", "retweet_with_comment", "like"]
enabled_features = [
    # Tweet features
    "tweet_type",
    "language",
    "tweet_timestamp",
    # Engaged-with User (i.e., Engagee) Features
    "engaged_with_user_follower_count",
    "engaged_with_user_following_count",
    "engaged_with_user_is_verified",
    "engaged_with_user_account_creation",
    # Engaging User (i.e., Engager) Features
    "engaging_user_follower_count",
    "engaging_user_following_count",
    "engaging_user_is_verified",
    "engaging_user_account_creation",
    # Engagement features
    "engagee_follows_engager"
]

In [235]:
df

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,...,engaging_user_account_creation,engagee_follows_engager,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp,reply,retweet,retweet_with_comment,like
0,101\t10289\t19348\t18519\t784\t59269\t16849\t16498\t14634\t23523\t...,,D798864A80C4B28B7A75B8E411C24728,,,,Quote,9A78FC330083E72BE0DD1EA92656F3B5,1613912916,044FED4C9797C84D59C33585CF6EC690,...,1548003258,False,,,,1.613913e+09,False,False,False,True
1,101\t56898\t137\t171\t20246\t168\t194\t46671\t131\t1981\t22946\t22...,,3F3153D1EEE6C4B3C1546BA9997AC1D7,Photo,,,Retweet,E7F038DE3EAD397AEC9193686C911677,1613783112,999C0EEFEDB7E7C3585CC33EB0FE65DC,...,1458890236,False,,,,,False,False,False,False
2,101\t14535\t189\t58768\t82856\t191\t10911\t11371\t13677\t169\t175\...,,26DC4B642DA4DA97E469305A03D20829,,,,TopLevel,488B32D24BD4BB44172EB981C1BCA6FA,1613290787,5F1CAD78B474A0F5888552ECD92A3DCC,...,1246210670,False,,,,1.613321e+09,False,False,False,True
3,101\t150\t86888\t41275\t110909\t10738\t142\t39774\t11403\t19282\t6...,,FB828E786FDDDEFB5827832967496B61,Photo,72D7397E286C3CD4BD41D0CF2C76A835,F595B7DE8992A3D8C7948B4E81419D78,TopLevel,B8B04128918BBF54E2E178BFF1ABA833,1612471288,4273D91B4A3FA9464C2CFBA738D4080D,...,1292185341,False,,,,,False,False,False,False
4,101\t1919\t27843\t101955\t18226\t12230\t22396\t10634\t22820\t47155...,,ADC4085E3313C32E2F9151134FE37674,,,,Quote,E7F038DE3EAD397AEC9193686C911677,1613998385,1A89120910A5DC201E746DFB2E43F773,...,1529061495,False,,,,1.614003e+09,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
224094,101\t56898\t137\t74004\t83779\t168\t49409\t11010\t131\t29162\t1435...,,9E85CCA16A556C2599F93FFA477C1C28,Video,,,Retweet,B8B04128918BBF54E2E178BFF1ABA833,1613245388,70566B1733BB045EFCAEBB9B8B372CF0,...,1580507486,False,,,,,False,False,False,False
224095,101\t97593\t24512\t761\t16498\t34414\t10461\t59901\t20109\t787\t36...,,7D52A1609D717EA16C7B7C35A5C9FC72,,,,TopLevel,9A78FC330083E72BE0DD1EA92656F3B5,1612827555,B7EDB18056E3C2388D0BD8F062ADD649,...,1564593210,False,,,,,False,False,False,False
224096,101\t31301\t183\t119\t91327\t10107\t100\t108\t77603\t10288\t10731\...,F6EBCDE6330715CA874FCB6A075F9ABA,6DC57A2E891088D7FACE14A59DBBA19F,,,,Quote,488B32D24BD4BB44172EB981C1BCA6FA,1612531477,5D36A0CC44532FD3DAABC3636548A712,...,1510873142,False,,,,,False,False,False,False
224097,101\t56898\t137\t13518\t11517\t11889\t50555\t131\t59901\t46310\t53...,,CA9B0D5A77F18111DF18BCCE1112E682,,,,Retweet,9A78FC330083E72BE0DD1EA92656F3B5,1613128148,22132BE53D4E6224E7329F1D11656BFF,...,1503746706,True,,,,,False,False,False,False


In [383]:
%%time
# Holdout split
df_train = df.iloc[0                 :int(len(df) * 0.6)]
df_valid = df.iloc[int(len(df) * 0.6):int(len(df) * 0.8)]
df_test = df.iloc [int(len(df) * 0.8):                  ]

# Feature Selection
df_train_features = df_train[enabled_features]
df_valid_features = df_valid[enabled_features]
df_test_features = df_test[enabled_features]

assert df_train_features.notna().all().all(), "no nulls in any column"

# Feature Engineering
for tmp_df in [df_train_features, df_valid_features, df_test_features]:
    tmp_df['tweet_type'] = tmp_df['tweet_type'].astype('category').cat.codes
    tmp_df['language'] = tmp_df['language'].astype('category').cat.codes

assert df_train_features.shape[1] == df_valid_features.shape[1] == df_test_features.shape[1], \
    "train, valid, test should have the same number of columns"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  app.launch_new_instance()


CPU times: user 69.6 ms, sys: 0 ns, total: 69.6 ms
Wall time: 68 ms


In [None]:
# df_train_features = pd.DataFrame.sparse.from_spmatrix(
#     ohenc.transform(df_train_features[categorical_features])
# ).join(df_train_features.drop(categorical_features, axis=1))
# df_valid_features = pd.DataFrame.sparse.from_spmatrix(
#     ohenc.transform(df_valid_features[categorical_features])
# ).join(df_valid_features.drop(categorical_features, axis=1))
# df_test_features = pd.DataFrame.sparse.from_spmatrix(
#     ohenc.transform(df_test_features[categorical_features])
# ).join(df_test_features.drop(categorical_features, axis=1))

---

## What I should be aiming for: H2O XGBoost
```
---------------------------------
AP Retweet:                0.2123
RCE Retweet:               7.2745
---------------------------------
AP Reply:                  0.0787
RCE Reply:                 9.2055
---------------------------------
AP Like:                   0.5615
RCE Like:                  7.1613
---------------------------------
AP RT with comment:        0.0091
RCE RT with comment:       -1.7001

---------------------------------

mAP                        0.2154
mRCE                       5.4853
```

In [387]:
%%time
# Training and Evaluation
dtest = xgb.DMatrix(df_test_features)
results = {}
models = {}
for target in targets:
    dtrain = xgb.DMatrix(df_train_features, df_train[target])
    dvalid = xgb.DMatrix(df_valid_features, df_valid[target])
    model = xgb.train(params={
        'objective': 'binary:logistic',
        'eval_metric': 'logloss',
        },
        evals=[(dvalid, "validation")],
        early_stopping_rounds=10,
        dtrain=dtrain,
        num_boost_round=50
    )
    AP, RCE = compute_score(df_test[target], model.predict(dtest))
    results[f"{target}_AP"] = AP
    results[f"{target}_RCE"] = RCE
print(pretty_evaluation(results))

[0]	validation-logloss:0.46627
[1]	validation-logloss:0.34237
[2]	validation-logloss:0.26626
[3]	validation-logloss:0.21682
[4]	validation-logloss:0.18415
[5]	validation-logloss:0.16216
[6]	validation-logloss:0.14746
[7]	validation-logloss:0.13742
[8]	validation-logloss:0.13084
[9]	validation-logloss:0.12640
[10]	validation-logloss:0.12342
[11]	validation-logloss:0.12172
[12]	validation-logloss:0.12059
[13]	validation-logloss:0.11998
[14]	validation-logloss:0.11974
[15]	validation-logloss:0.11954
[16]	validation-logloss:0.11942
[17]	validation-logloss:0.11943
[18]	validation-logloss:0.11941
[19]	validation-logloss:0.11950
[20]	validation-logloss:0.11943
[21]	validation-logloss:0.11940
[22]	validation-logloss:0.11943
[23]	validation-logloss:0.11994
[24]	validation-logloss:0.11999
[25]	validation-logloss:0.11997
[26]	validation-logloss:0.12004
[27]	validation-logloss:0.12005
[28]	validation-logloss:0.12008
[29]	validation-logloss:0.12021
[30]	validation-logloss:0.12031
[0]	validation-log

### experiment 9

- added validation and early stopping

```
model = xgb.train(params={
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    },
    evals=[(dvalid, "validation")],
    early_stopping_rounds=10,
    dtrain=dtrain,
    num_boost_round=50
)
---------------------------------
AP Retweet:                0.1408
RCE Retweet:               3.4300
---------------------------------
AP Reply:                  0.0702
RCE Reply:                 8.0398
---------------------------------
AP Like:                   0.5582
RCE Like:                  6.8853
---------------------------------
AP RT with comment:        0.0093
RCE RT with comment:       -0.1598

---------------------------------

mAP                        0.1946
mRCE                       4.5488
```

### experiment 8

```
model = xgb.train(params={
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    },
    dtrain=dtrain,
    num_boost_round=50
)
---------------------------------
AP Retweet:                0.1364
RCE Retweet:               3.0463
---------------------------------
AP Reply:                  0.0672
RCE Reply:                 6.9264
---------------------------------
AP Like:                   0.5580
RCE Like:                  6.8670
---------------------------------
AP RT with comment:        0.0089
RCE RT with comment:       -1.9622

---------------------------------

mAP                        0.1926
mRCE                       3.7193
```

### experiment 7
- all categoricals are one-hot encoded

```
model = xgb.train(params={
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    },
    dtrain=dtrain,
)
---------------------------------
AP Retweet:                0.1131
RCE Retweet:               -6.3097
---------------------------------
AP Reply:                  0.0278
RCE Reply:                 -46.4546
---------------------------------
AP Like:                   0.4905
RCE Like:                  2.4500
---------------------------------
AP RT with comment:        0.0072
RCE RT with comment:       -66.6947
```

### experiment 6
```
model = xgb.train(params={
    'tree_method': 'gpu_hist',
    'eta': 0.3,
    'objective': 'binary:logistic',
    'min_child_weight': 1.0,
    'nthread': 4,
    'seed': 519928949,
    'max_bin': 256,
    'max_depth': 6,
    'colsample_bytree': 1.0,
    'eval_metric': 'logloss',
    'lambda': 1.0,
    'gamma': 0.0,
    'gpu_id': 0,
    'alpha': 0.0,
    'booster': 'gbtree',
    'grow_policy': 'depthwise',
    'subsample': 1.0,
    'max_delta_step': 0.0,
    'colsample_bylevel': 1.0
    },
    dtrain=dtrain,
    num_boost_round=50
)
---------------------------------
AP Retweet:                0.1372
RCE Retweet:               3.0160
---------------------------------
AP Reply:                  0.0681
RCE Reply:                 6.4550
---------------------------------
AP Like:                   0.5573
RCE Like:                  6.7681
---------------------------------
AP RT with comment:        0.0089
RCE RT with comment:       -2.0667

---------------------------------

mAP                        0.1929
mRCE                       3.5431
```


### experiments 2,3,4,5
```
params={
        'objective': 'binary:logistic'
        'eval_metric': 'logloss' # same for 'map', same for 'auc', same for 'aucpr'
    }, dtrain=dtrain, num_boost_round=10, evals=()

---------------------------------
AP Retweet:                0.1403
RCE Retweet:               2.5160
---------------------------------
AP Reply:                  0.0708
RCE Reply:                 1.5804
---------------------------------
AP Like:                   0.5508
RCE Like:                  6.4905
---------------------------------
AP RT with comment:        0.0103
RCE RT with comment:       -31.8315
CPU times: user 9.41 s, sys: 28.5 ms, total: 9.44 s
Wall time: 3.22 s
```

### experiment 1
```
params={}, dtrain=dtrain, num_boost_round=10, evals=()

---------------------------------
AP Retweet:                0.1426
RCE Retweet:               2.9966
---------------------------------
AP Reply:                  0.0715
RCE Reply:                 4.2641
---------------------------------
AP Like:                   0.5517
RCE Like:                  6.4758
---------------------------------
AP RT with comment:        0.0129
RCE RT with comment:       -18.5040

---------------------------------

mAP                        0.1947
mRCE                       -1.1919
```

---

---

# Replicate H2O

In [46]:
import h2o

In [187]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,1 hour 44 mins
H2O_cluster_timezone:,Europe/Rome
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.1.2
H2O_cluster_version_age:,"7 days, 21 hours and 52 minutes"
H2O_cluster_name:,H2O_from_python_andrea_6bdrsu
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.200 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [188]:
h2o_retweet_frame_train = h2o.H2OFrame(df_train[enabled_features+['retweet']])
h2o_retweet_frame_test = h2o.H2OFrame(df_test[enabled_features+['retweet']])

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [189]:
h2o_retweet_model = h2o.estimators.H2OXGBoostEstimator()
h2o_retweet_model.train(training_frame=h2o_retweet_frame_train, y='retweet')

xgboost Model Build progress: |███████████████████████████████████████████| 100%


In [190]:
compute_score(
    df_test['retweet'],
    h2o_retweet_model.predict(test_data=h2o_retweet_frame_test).as_data_frame()['True']
)

xgboost prediction progress: |████████████████████████████████████████████| 100%




(0.21230837947411613, 7.2744628175698045)

In [191]:
h2o_retweet_xgboost_params, h2o_retweet_xgboost_num_boost_round = \
    h2o_retweet_model.convert_H2OXGBoostParams_2_XGBoostParams()

Copied from [H2O's source code](http://docs.h2o.ai/h2o/latest-stable/h2o-py/docs/_modules/h2o/frame.html#H2OFrame.convert_H2OFrame_2_DMatrix) to fix the `as_matrix` issues.


In [193]:
h2o_retweet_dtrain = daw(h2o_retweet_frame_train, enabled_features, 'retweet', h2o_retweet_model)
h2o_retweet_dtest = daw(h2o_retweet_frame_test, enabled_features, 'retweet', h2o_retweet_model)
h2o_retweet_dtrain, h2o_retweet_dtest

(<xgboost.core.DMatrix at 0x7f2470bcf310>,
 <xgboost.core.DMatrix at 0x7f2470bcf610>)

In [194]:
h2o_retweet_xgboost_model = xgb.train(
    params=h2o_retweet_xgboost_params,
    num_boost_round=h2o_retweet_xgboost_num_boost_round,
    dtrain=h2o_retweet_dtrain
)

Parameters: { "nround", "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [195]:
compute_score(
    df_test['retweet'],
    h2o_retweet_xgboost_model.predict(
        data=h2o_retweet_dtest,
        ntree_limit=h2o_retweet_xgboost_num_boost_round
    )
)



(0.0764872810964932, -24.017710164543658)

In [196]:
def daw(zelf, predictors, yresp, h2oXGBoostModel):
    def generatePandaEnumCols(pandaFtrain, cname, nrows, domainL):
        """
        For an H2O Enum column, we perform one-hot-encoding here and add one more column, "missing(NA)" to it.

        :param pandaFtrain: panda frame derived from H2OFrame
        :param cname: column name of enum col
        :param nrows: number of rows of enum col
        :return: panda frame with enum col encoded correctly for native XGBoost
        """
        import numpy as np
        import pandas as pd

        cmissingNames=[cname+".missing(NA)"]
        tempnp = np.zeros((nrows,1), dtype=np.int)
        # check for nan and assign it correct value
        colVals = pandaFtrain[cname]
        for ind in range(nrows):
            try:
                if not(colVals[ind] in domainL):
                    tempnp[ind]=1
            except ValueError:
                pass
        zeroFrame = pd.DataFrame(tempnp)
        zeroFrame.columns=cmissingNames
        temp = pd.get_dummies(pandaFtrain[cname], prefix=cname, drop_first=False)
        tempNames = list(temp)  # get column names
        colLength = len(tempNames)
        newNames = ['a']*colLength

        for ind in range(0,colLength):
            newNames[ind]=cname+"_"+domainL[ind]
        ftemp = temp[newNames]
        ctemp = pd.concat([ftemp, zeroFrame], axis=1)
        return ctemp
    

    import xgboost as xgb
    import pandas as pd
    import numpy as np
    from scipy.sparse import csr_matrix

    assert isinstance(predictors, list) or isinstance(predictors, tuple)
    assert h2oXGBoostModel._model_json['algo'] == 'xgboost', \
        "convert_H2OFrame_2_DMatrix is used for H2OXGBoost model only."

    tempFrame = zelf[predictors].cbind(zelf[yresp])
    colnames = tempFrame.names
    if type(predictors[0])==type(1): # convert integer indices to column names
        temp = []
        for colInd in predictors:
            temp.append(colnames[colInd])
        predictors = temp

    if (type(yresp) == type(1)):
        tempy = colnames[yresp]
        yresp = tempy # column name of response column

    enumCols = [] # extract enum columns out to process them
    enumColsIndices = []     # store enum column indices
    typeDict = zelf.types
    for predName in predictors:
        if str(typeDict[predName])=='enum':
            enumCols.append(predName)
            enumColsIndices.append(colnames.index(predName))

    pandaFtrain = tempFrame.as_data_frame(use_pandas=True, header=True)
    nrows = tempFrame.nrow

    # convert H2OFrame to DMatrix starts here
    if len(enumCols) > 0:   # enumCols contain all enum column names
        allDomain = tempFrame.levels() # list all domain levels with column indices
        domainLen = []
        for enumIndex in enumColsIndices:
            if len(allDomain[enumIndex])>0:
                domainLen.append(len(allDomain[enumIndex])*-1)
        incLevel = np.argsort(domainLen) # indices of enum column indices with decreasing domain length

        # need to move enum columns to the front, highest level first
        c2 = tempFrame[enumCols[incLevel[0]]]
        tempFrame = tempFrame.drop(enumCols[incLevel[0]])
        for index in range(1, len(incLevel)):
            c2 = c2.cbind(tempFrame[enumCols[incLevel[index]]])
            tempFrame = tempFrame.drop(enumCols[incLevel[index]])

        enumCols = c2.names
        tempFrame = c2.cbind(tempFrame)
        pandaFtrain = tempFrame.as_data_frame(use_pandas=True, header=True) # redo translation from H2O to panda

        pandaTrainPart = generatePandaEnumCols(pandaFtrain, enumCols[0], nrows, tempFrame[enumCols[0]].categories())
        pandaFtrain.drop([enumCols[0]], axis=1, inplace=True)

        for colInd in range(1, len(enumCols)):
            cname=enumCols[colInd]
            ctemp = generatePandaEnumCols(pandaFtrain, cname,  nrows, tempFrame[enumCols[colInd]].categories())
            pandaTrainPart=pd.concat([pandaTrainPart, ctemp], axis=1)
            pandaFtrain.drop([cname], axis=1, inplace=True)

        pandaFtrain = pd.concat([pandaTrainPart, pandaFtrain], axis=1)

    c0= tempFrame[yresp].asnumeric().as_data_frame(use_pandas=True, header=True)
    pandaFtrain.drop([yresp], axis=1, inplace=True)
    pandaF = pd.concat([c0, pandaFtrain], axis=1)
    pandaF.rename(columns={c0.columns[0]:yresp}, inplace=True)
    newX = list(pandaFtrain.columns.values)
    data = pandaF[newX]
    label = pandaF[[yresp]]

#     return xgb.DMatrix(data=csr_matrix(data), label=label) \
#         if h2oXGBoostModel._model_json['output']['sparse'] else xgb.DMatrix(data=data, label=label)
    return data, label

In [209]:
data, label = daw(h2o_retweet_frame_train, enabled_features, 'retweet', h2o_retweet_model)

In [210]:
data

Unnamed: 0,language_00304D7356D6C64481190D708D8F739C,language_0BB2C843174730BA7D958C98B763A797,language_105008E45831ADE8AF1DB888319F422A,language_10C6C994C2AD434F9D49D4BE9CFBC613,language_159541FA269CA8A9CDB93658CAEC4CA2,language_1F73BB863A39DB62B4A55B7E558DB1E8,language_23686A079CA538645BF6118A1EF51C8B,language_2573A3CF633EBE6932A1E1010D5CD213,language_2F548E5BE0D7F678E72DDE31DFBEF8E7,language_310ECD7D1E42216E3C1B31EFDDFC72A7,...,engagee_follows_engager_False,engagee_follows_engager_True,engagee_follows_engager.missing(NA),tweet_timestamp,engaged_with_user_follower_count,engaged_with_user_following_count,engaged_with_user_account_creation,engaging_user_follower_count,engaging_user_following_count,engaging_user_account_creation
0,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1613912916,1062,2498,1593549601,50,335,1548003258
1,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1613783112,34662,281,1256004696,38,638,1458890236
2,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1613290787,68605,1056,1488728385,404,178,1246210670
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1612471288,28392,353,1549245787,113,264,1292185341
4,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1613998385,4490,27,1580081977,1123,1220,1529061495
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
134454,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1613122944,14288,13860,1441602051,1185,1549,1558350781
134455,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1613253986,453,404,1513387085,452,566,1421096937
134456,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1613680088,232387,16,1422910513,12,93,1606191663
134457,0,0,0,0,0,0,0,0,0,0,...,1,0,1,1613391470,5594,493,1302347011,688,264,1464642092


In [211]:
[c for c in data.columns]

['language_00304D7356D6C64481190D708D8F739C',
 'language_0BB2C843174730BA7D958C98B763A797',
 'language_105008E45831ADE8AF1DB888319F422A',
 'language_10C6C994C2AD434F9D49D4BE9CFBC613',
 'language_159541FA269CA8A9CDB93658CAEC4CA2',
 'language_1F73BB863A39DB62B4A55B7E558DB1E8',
 'language_23686A079CA538645BF6118A1EF51C8B',
 'language_2573A3CF633EBE6932A1E1010D5CD213',
 'language_2F548E5BE0D7F678E72DDE31DFBEF8E7',
 'language_310ECD7D1E42216E3C1B31EFDDFC72A7',
 'language_313ECD3A1E5BB07406E4249475C2D6D6',
 'language_3228B1FB4BC92E81EF2FE35BDA86C540',
 'language_37342508F52BF4B62CCE3BA25460F9EB',
 'language_3AB05D6A4045A6C37D3E4566CFDFFE26',
 'language_3DF931B225B690508A63FD24133FA0E2',
 'language_3EA57373381A56822CBBC736169D0145',
 'language_41776FB50B812A6775C2F8DEC92A9779',
 'language_440116720BC3A7957E216A77EE5C18CF',
 'language_477ED2ED930405BF1DBF13F9BF973434',
 'language_488B32D24BD4BB44172EB981C1BCA6FA',
 'language_4B55C45CD308068E4D0913DEF1043AD6',
 'language_4CA37504EF8BA4352B03DCB