In [1]:
import numpy as np
import pandas as pd
import gzip

pd.set_option('display.max_columns', None)

In [2]:
train_path = 'Twibot-20/train.json'
dev_path = 'Twibot-20/dev.json'
test_path = 'Twibot-20/test.json'

In [3]:
test_df = pd.read_json(test_path)

train_df = pd.read_json(train_path)

dev_df = pd.read_json(dev_path)

In [4]:
print(test_df.shape)
print(dev_df.shape)
print(train_df.shape)

(1183, 6)
(2365, 6)
(8278, 6)


In [5]:
twibot_df = pd.concat([train_df, dev_df, test_df], axis=0).reset_index(drop=True)
# shuffle all entries not sure if its necessary
twibot_df = twibot_df.sample(twibot_df.shape[0])

In [6]:
twibot_df.shape

(11826, 6)

In [7]:
twibot_df.sample(5)

Unnamed: 0,ID,profile,tweet,neighbor,domain,label
7325,21754872,"{'id': '21754872 ', 'id_str': '21754872 ', 'na...","[@cl_bloomfield @ECAEurope Congrats Claire\n, ...","{'following': ['1250701602597613568', '8052247...",[Sports],0
8004,247836899,"{'id': '247836899 ', 'id_str': '247836899 ', '...","[@Samanthaaaa_x me your not alone on that\n, @...","{'following': ['7217902', '237873825', '210951...",[Sports],1
4278,2209843016,"{'id': '2209843016 ', 'id_str': '2209843016 ',...",[@VegasBigBoss What the picks boss let me know...,"{'following': ['114870386', '94405288002528460...",[Entertainment],1
11243,51018206,"{'id': '51018206 ', 'id_str': '51018206 ', 'na...","[@seanhannity Not all Republicans, but she’s n...","{'following': ['48873513', '32940704', '190906...",[Business],0
3551,2435905878,"{'id': '2435905878 ', 'id_str': '2435905878 ',...",[RT @VMVMedia: No ordinary #Monday. 🏴 Features...,"{'following': ['127188558', '32501062', '10350...",[Business],1


In [8]:
users_df = twibot_df[['ID', 'profile', 'neighbor', 'domain', 'label']]
tweets_df = twibot_df[['ID', 'tweet', 'label']]
tweets_df.shape

(11826, 3)

### join all of the tweets of a user into a document

In [9]:
def change_tweets_to_doc(row):
    if row is not None:
        return ' '.join(row)
    else:
        return np.nan

In [10]:
tweets_doc = tweets_df.copy()
tweets_doc.loc[:,'tweet'] = tweets_df['tweet'].apply(change_tweets_to_doc)
tweets_doc.shape

(11826, 3)

### Explode all tweets so there is one tweet per line

In [12]:
tweets_explode = tweets_df.copy()
tweets_explode = tweets_explode.explode('tweet')
tweets_explode.shape

(1999868, 3)

### Save all tweet dataframes origional, document, exploded

In [13]:
tweets_df.to_csv('Twibot-20/tweets.csv', index=False)
tweets_doc.to_csv('Twibot-20/tweets_doc.csv', index=False)
tweets_explode.to_csv('Twibot-20/tweets_explode.csv', index=False)

In [14]:
# def expand_profile(row):
#     df_ = pd.DataFrame(row, index=[0])
#     return df_


In [15]:
# exp_user_profile = expand_profile(users_df['profile'][0])
# for row in users_df['profile'].iloc[1:]:
#     exp_user_profile = pd.concat([exp_user_profile, expand_profile(row)], axis=0)

In [16]:
# display(exp_user_profile.shape)

In [17]:
# exp_user_profile.loc[:,'id'] = exp_user_profile['id'].astype('int64')
# users_df.loc['ID'] = users_df['ID'].astype('int64')
# users_df_exp = exp_user_profile.dropna().merge(users_df[['ID', 'neighbor', 'domain', 'label']], how='inner', left_on='id', right_on='ID')
# users_df_exp.head()

In [18]:
# users_df.to_csv('Twibot-20/users.csv', index=False)

In [19]:
support = pd.read_json('Twibot-20/support.json')

In [20]:
support.shape

(217754, 5)

In [21]:
support.head()

Unnamed: 0,ID,profile,tweet,neighbor,domain
0,1082775333336768512,"{'id': '1082775333336768517 ', 'id_str': '1082...",[RT @RandyRRQuaid: #RNC KICKOFF with Randy’s N...,,[Politics]
1,1076983321438142464,"{'id': '1076983321438142464 ', 'id_str': '1076...",[RT @yogagenie: 02/23/20 ~ @GeneStump1 former...,,[Politics]
2,1166391878264246272,"{'id': '1166391878264246272 ', 'id_str': '1166...","[@joaocaetano aí sim ⚡\n, RT @loud_victor: Mai...",,[Politics]
3,103593224,"{'id': '103593224 ', 'id_str': '103593224 ', '...",[RT @TheDemCoalition: FACT: Seven former Trump...,,[Politics]
4,1274010352683016192,"{'id': '1274010352683016196 ', 'id_str': '1274...",[Man Ellen isn’t even funny\nShe literally smi...,,[Politics]


### Convert support tweets to document as above

In [22]:
all_tweets_doc = support[['ID', 'tweet']]
all_tweets_doc.loc[:, 'tweet'] = all_tweets_doc['tweet'].apply(change_tweets_to_doc)
all_tweets_doc.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  all_tweets_doc['tweet'] = all_tweets_doc['tweet'].apply(change_tweets_to_doc)


(217754, 2)

In [24]:
# save using gzip, when loading with pandas.read_csv add compression argument
all_tweets_doc.to_csv('Twibot-20/all_tweets_doc.csv.gz', compression='gzip')

### Exploding all tweets

In [23]:
all_tweets_explode = support[['ID', 'tweet']]
all_tweets_explode = all_tweets_explode.explode('tweet')
all_tweets_explode.shape

(31511861, 2)

In [25]:
all_tweets_explode.to_csv('Twibot-20/all_tweets_explode.csv.gz', compression='gzip')

In [2]:
from utils import split_all_data

In [8]:
results_dict = split_all_data(.8)

Loading the data...
Splitting the data...
Removing from support...


In [9]:
for s in results_dict:
    try:
        print(results_dict[s].shape)
    except AttributeError:
        print(s, ':None')

(9460, 6)
(1183, 6)
(1183, 6)
(217754, 5)
