In [103]:
import pandas as pd
import random as rnd
import numpy as np
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow import keras
from sklearn.feature_extraction.text import TfidfVectorizer

In [104]:
# Read in the dataframe of labeled tweets

tweet_df = pd.read_json('Twibot-20/tweet.json')

# Read in the dataframe of profile information

profile_df = pd.read_json('Twibot-20/profile.json')


In [53]:
# Remerge the profile, tweet data, copy

tw_features_df = tweet_df.merge(profile_df, on=['ID', 'label']).dropna()


In [99]:
# Adding features to the features dataframe

# Min, max, average, and standard deviation of lengths of tweets
tw_features_df['tweet_min_len'] = tw_features_df['tweet'].apply(lambda x: min([ len(t) for t in x ]))
tw_features_df['tweet_max_len'] = tw_features_df['tweet'].apply(lambda x: max([ len(t) for t in x ]))
tw_features_df['tweet_av_len'] = tw_features_df['tweet'].apply(lambda x: np.mean([ len(t) for t in x ]))
tw_features_df['tweet_len_std'] = tw_features_df['tweet'].apply(lambda x: np.std([ len(t) for t in x ]))

# Lengths of user and screen names
tw_features_df['user_name_len'] = tw_features_df['profile'].apply(lambda x: len(x['name']))
tw_features_df['screen_name_len'] = tw_features_df['profile'].apply(lambda x: len(x['screen_name']))

# Number of distinct characters in user name
tw_features_df['user_name_chars'] = tw_features_df['profile'].apply(lambda x: len(set(x['name'])))

# Protected and verified status
# ****** Only two protected accounts!
tw_features_df['protected'] = tw_features_df['profile'].apply(lambda x: int(x['protected'] == 'True '))
tw_features_df['verified'] = tw_features_df['profile'].apply(lambda x: int(x['verified'] == 'True '))

# Is a URL associated with the account
tw_features_df['has_url'] = tw_features_df['profile'].apply(lambda x: int(x['url'] != 'None '))

# Social counts
tw_features_df['followers_count'] = tw_features_df['profile'].apply(lambda x: int(x['followers_count']))
tw_features_df['friends_count'] = tw_features_df['profile'].apply(lambda x: int(x['friends_count']))
tw_features_df['favourites_count'] = tw_features_df['profile'].apply(lambda x: int(x['favourites_count']))

ref_date = pd.to_datetime('May 01 2022')

# How many days before May 1 2022 was the account created
tw_features_df['days_old'] = tw_features_df['profile'].apply(lambda x: (ref_date - pd.to_datetime(x['created_at']).replace(tzinfo=None)).days)


In [102]:
tw_features_df.shape

(11746, 18)

In [105]:
tw_features_df.sample(11)

Unnamed: 0,ID,tweet,label,profile,tweet_min_len,tweet_max_len,tweet_av_len,tweet_len_std,user_name_len,screen_name_len,user_name_chars,protected,verified,has_url,followers_count,friends_count,favourites_count,days_old
4926,1264353400784904192,[RT @NoTurnUnstonedd: God damn what an end to ...,1,"{'id': '1264353400784904199 ', 'id_str': '1264...",21,188,89.709184,37.683999,7,8,5,0,0,0,898,1951,779,706
7258,1259102977052291072,[@AlyTafoyaC JAJAJAJAJA sin miedo al éxito ❤️\...,0,"{'id': '1259102977052291074 ', 'id_str': '1259...",19,179,69.794393,38.641109,15,12,12,0,0,0,31,164,318,721
8847,20609518,[Education today tends to focus on material go...,0,"{'id': '20609518 ', 'id_str': '20609518 ', 'na...",24,285,243.925,46.8776,11,10,7,0,1,1,19363220,0,1,4826
620,26703925,"[Follow @Efficcial he good people.\n, ""Bring y...",1,"{'id': '26703925 ', 'id_str': '26703925 ', 'na...",13,141,55.31,31.248102,16,7,5,0,0,1,1530,318,22,4783
10429,944325809094975488,[RT @TrinityResists: @FLOTUS @TheAtlantic Says...,1,"{'id': '944325809094975488 ', 'id_str': '94432...",25,309,119.620513,43.542724,11,6,7,0,0,0,1805,1757,69802,1590
10624,31191938,[ヒバクシャ国際署名に署名しました。 https://t.co/znf7p3Xcrw #Hi...,1,"{'id': '31191938 ', 'id_str': '31191938 ', 'na...",12,309,124.704545,81.424136,12,9,9,0,0,1,400,266,144,4764
2868,15000261,[@LasVegasUK @TravelZork @VitalVegas @MeltzVeg...,1,"{'id': '15000261 ', 'id_str': '15000261 ', 'na...",10,282,79.315,56.592542,14,13,11,0,0,1,32558,15006,32014,5078
10543,378017431,"[@kingo_white91 Jealous\n, RT @Jinks_7: A son,...",1,"{'id': '378017431 ', 'id_str': '378017431 ', '...",14,296,81.361809,50.341045,3,10,3,0,0,0,464,345,3686,3873
5622,1611378990,[@Lazarus whom do we contact for being a possi...,1,"{'id': '1611378990 ', 'id_str': '1611378990 ',...",20,304,146.4,85.923163,13,12,13,0,0,1,89,663,111,3205
8172,34130295,"[RT @mistergeezy: Excuse you, NuQuil Chicken? ...",1,"{'id': '34130295 ', 'id_str': '34130295 ', 'na...",10,287,100.81,40.49832,10,10,8,0,0,0,179,2028,4433,4756
