In [24]:
import numpy as np
import pandas as pd
import re
from utils import split_all_data, prep_data_explode
# Seed passed as random_state to the data split and to the .sample() previews
# below, so the displayed rows are the same on every run.
RANDOM_SEED = 1835

In [25]:
# Take the 'train' entry of the seeded split. The first argument (.8) is
# presumably the train fraction -- confirm against utils.split_all_data.
data = split_all_data(.8, random_state=RANDOM_SEED)['train']
# prep_data_explode lives in utils; judging by its name and the preview below
# it presumably yields one row per tweet -- TODO confirm.
tweets_df = prep_data_explode(data[['ID', 'tweet', 'label']]).reset_index(drop=True)
# NOTE: this shape is printed before rows with missing values are dropped.
print(tweets_df.shape)
# The index was reset above, so any rows dropped here leave gaps in the index
# labels rather than renumbering them; later label-based lookups
# (e.g. .loc[1310248, ...]) depend on these labels staying as they are.
tweets_df.dropna(inplace=True)
# Seeded preview of ten rows.
tweets_df.sample(10, random_state=RANDOM_SEED)

Loading the data...
Splitting the data...
(1598176, 3)


Unnamed: 0,ID,tweet,label
825172,280830358,RT @BrookingsFP: PODCAST: Bruce Riedel and @na...,1
267749,3317294406,Ribose is good for the heart https://t.co/IWp7...,1
1060772,161088577,The min. I tweeted @justinbieber you should fo...,1
716544,9625592,September’s full moon comes early in the month...,0
285091,1186384930147835904,RT @Eric_Bernard94: Wale Mnaovaaga Vitambulish...,0
610843,2965383022,@nsumida @axelboada\n,1
1027494,455764741,.@UMich students said using police to enforce ...,0
1286931,448998506,RT @BlueJays: W E\nP L A Y\nB A S E B A L L\nT...,0
12192,231726733,Is Deep Learning overhyped? Keras author @fcho...,1
995192,1000830077674840064,RT @SSegayo: DAGA BAH TE RU? 🤣🤣\n\n@ranabelz\n...,1


In [26]:
def extract_num_mentions(tweet, unique=False):
    """Count the @-mentions in a tweet.

    A mention is an ``@`` followed by one or more word characters, where the
    ``@`` is not directly preceded by a word character. That restriction
    keeps e-mail addresses such as ``bob@example.com`` from being counted.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    unique : bool, default False
        If True, count each mentioned handle once. Handles are compared
        case-insensitively, so ``@NASA`` and ``@nasa`` count as one.

    Returns
    -------
    int
        Number of mentions (or of distinct handles when ``unique`` is True).
    """
    # The negative look-behind rejects an '@' glued to a preceding word
    # character (the local part of an e-mail address); the group captures the
    # handle without the '@' so it can be normalised below.
    handles = re.findall(r'(?<!\w)@(\w+)', tweet)
    if unique:
        return len({handle.lower() for handle in handles})
    return len(handles)

In [27]:
def is_retweet(tweet):
    """Flag whether a tweet is a retweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    int
        1 if the text begins with the standalone token ``RT``
        (e.g. ``'RT @user: ...'``), otherwise 0.
    """
    # Require a word boundary after 'RT' so that tweets which merely start
    # with a longer word such as 'RTX' or 'RTs' are not flagged.
    return 1 if re.match(r'RT\b', tweet) else 0

In [28]:
# URL pattern adapted from https://www.geeksforgeeks.org/extract-urls-present-in-a-given-string/
def extract_num_links(tweet, unique=False):
    """Count the URLs (http, https, ftp or file scheme) in a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    unique : bool, default False
        If True, count each distinct URL once.

    Returns
    -------
    int
        Number of URLs found (or of distinct URLs when ``unique`` is True).
    """
    # The character classes deliberately contain no whitespace, so every URL
    # is its own match and ordinary words after a link are never swallowed.
    # The final class excludes punctuation such as '.', ',' and ';' so that a
    # trailing sentence mark is not treated as part of the URL.
    links = re.findall(
        r'\b(?:https?|ftp|file)://[-a-zA-Z0-9+&@#/%?=~_|!:,.;]*[-a-zA-Z0-9+&@#/%=~_|]',
        tweet,
    )
    if unique:
        return len(set(links))
    return len(links)

In [29]:
# Spot-check the link counter on a seeded sample of ten tweets.
sample_tweets = tweets_df['tweet'].sample(10, random_state=RANDOM_SEED)
for tweet in sample_tweets:
    print(extract_num_links(tweet))

0
1
0
2
0
0
1
1
1
1


In [30]:
# Derive the per-tweet features: one new column per extractor, each computed
# from the raw tweet text.
tweet_features = {
    'num_mentions': extract_num_mentions,
    'num_links': extract_num_links,
    'is_retweet': is_retweet,
}
for feature_name, extractor in tweet_features.items():
    tweets_df.loc[:, feature_name] = tweets_df['tweet'].apply(extractor)

In [31]:
# Seeded preview of ten rows, now including the derived feature columns.
tweets_df.sample(10, random_state=RANDOM_SEED)

Unnamed: 0,ID,tweet,label,num_mentions,num_links,is_retweet
825172,280830358,RT @BrookingsFP: PODCAST: Bruce Riedel and @na...,1,3,0,1
267749,3317294406,Ribose is good for the heart https://t.co/IWp7...,1,0,1,0
1060772,161088577,The min. I tweeted @justinbieber you should fo...,1,2,0,0
716544,9625592,September’s full moon comes early in the month...,0,0,2,0
285091,1186384930147835904,RT @Eric_Bernard94: Wale Mnaovaaga Vitambulish...,0,1,0,1
610843,2965383022,@nsumida @axelboada\n,1,2,0,0
1027494,455764741,.@UMich students said using police to enforce ...,0,2,1,0
1286931,448998506,RT @BlueJays: W E\nP L A Y\nB A S E B A L L\nT...,0,1,1,1
12192,231726733,Is Deep Learning overhyped? Keras author @fcho...,1,1,1,0
995192,1000830077674840064,RT @SSegayo: DAGA BAH TE RU? 🤣🤣\n\n@ranabelz\n...,1,3,1,1


In [32]:
# Rank tweets by link count, most links first, to inspect the extremes.
tweets_df.sort_values(by='num_links', ascending=False)

Unnamed: 0,ID,tweet,label,num_mentions,num_links,is_retweet
1310248,14506253,The 5th ep of #TheDarkestTimelinePodcast calle...,0,1,5,0
1412616,1021655155,GOING LIVE AT 6:30pm! Who wants to squad up to...,1,0,5,0
521298,1142942118056357888,This is why for securing data and email and me...,1,0,5,0
287536,3518971516,RT @AliceeBrookees: https://t.co/YIyGvVMR9r ht...,1,1,5,1
516847,42226885,Other possibilities: https://t.co/keN0N7ZXGQ o...,0,0,5,0
...,...,...,...,...,...,...
923090,1117931702,@FallGuysGame 🤟🏻🖤\n,0,1,0,0
327420,817376852566126592,@donthoodwinkme I imagine😂😂😂\n,1,1,0,0
327419,817376852566126592,@PugMafia2 @RobinFulghum1 Right?! I have &lt;4...,1,2,0,0
923093,1117931702,@_AJayP_ LOVE IT!!!!\n,0,1,0,0


In [33]:
# Look at the tweet at the top of the num_links ranking: print its link count,
# then display the raw text to check the count by eye.
example_tweet = tweets_df.loc[1310248, 'tweet']
print(extract_num_links(example_tweet))
example_tweet

5


'The 5th ep of #TheDarkestTimelinePodcast called #DeathMechanic  We get intense about #covid_19 &amp; then @alisonbrie joins &amp; is as funny as there ever was or will be. Links galore! https://t.co/URAL6rQ2Ju    https://t.co/E1zQRLw0qf    https://t.co/xSZ4gxPkUY https://t.co/s9RjsHY48q https://t.co/kwjxbizl9S\n'

In [34]:
# Constant 1 per row: summing it per user yields that user's tweet count.
tweets_df['num_tweets'] = 1
summed_columns = ['num_mentions', 'num_links', 'is_retweet', 'num_tweets']
# One row per user ID holding the per-user totals of each feature.
user_level = tweets_df.groupby('ID')[summed_columns].sum().reset_index()
user_level.sample(10, random_state=RANDOM_SEED)

Unnamed: 0,ID,num_mentions,num_links,is_retweet,num_tweets
4637,739498500,59,52,18,200
9219,1289004691079274496,42,9,24,33
8421,1158909522141949952,257,63,110,197
2476,118293224,13,180,2,187
8731,1223612480141910016,89,18,83,93
2421,112805276,132,94,40,200
623,18905975,235,189,16,200
2601,137129117,107,174,55,200
4128,435499960,208,91,78,186
4411,552215716,278,61,13,200


In [35]:
# Turn the per-user totals into per-tweet rates: every column between the
# first (ID) and the last (num_tweets) is divided by the last column.
# NOTE: this overwrites user_level in place, so running the cell a second
# time divides the values again.
tweet_counts = user_level.iloc[:, -1]
for col in user_level.columns[1:-1]:
    user_level[col] = user_level[col].div(tweet_counts)
user_level.sample(10, random_state=RANDOM_SEED)

Unnamed: 0,ID,num_mentions,num_links,is_retweet,num_tweets
4637,739498500,0.295,0.26,0.09,200
9219,1289004691079274496,1.272727,0.272727,0.727273,33
8421,1158909522141949952,1.304569,0.319797,0.558376,197
2476,118293224,0.069519,0.962567,0.010695,187
8731,1223612480141910016,0.956989,0.193548,0.892473,93
2421,112805276,0.66,0.47,0.2,200
623,18905975,1.175,0.945,0.08,200
2601,137129117,0.535,0.87,0.275,200
4128,435499960,1.11828,0.489247,0.419355,186
4411,552215716,1.39,0.305,0.065,200
