In [2]:
import torch
import numpy as np
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

In [3]:

# Load the Twibot-20 dev split; the path is relative to the notebook's working directory.
dev_json_path = './Twibot-20/dev.json'
df_dev = pd.read_json(dev_json_path)

# Confirm the load and report the dimensions of the frame.
print('loaded df_dev')
print('df_dev shape', df_dev.shape)

loaded df_dev
df_dev shape (2365, 6)


In [4]:
import re

# Sample tweet containing one retweet marker ("RT @...") and two plain mentions;
# used to try out the username-extraction regexes.
example_tweet = (
    "RT @andersoncooper Today on @Anderson @PAAK: "
    "the debate over #ADHD -- are kids being overmedicated? "
    "#Thursday #Anderson\n"
)

In [5]:
# Retweet source: the handle that directly follows a standalone "RT @" token.
# The \b stops words that merely end in "RT" (e.g. "ART @x") from being read
# as retweets.
retweetUnameRegex = re.compile(r'(?<=\bRT @)(\w{1,15})')

# Plain mentions: a handle after "@" that is neither the retweet source nor
# glued to a preceding word character, so the domain part of an e-mail address
# such as "joe@example.com" is not reported as a mention.
mentionsUnameRegex = re.compile(r'(?<!\bRT @)(?<!\w@)(?<=@)(\w{1,15})')

# Try both patterns on the sample tweet.
print(retweetUnameRegex.findall(example_tweet))
print(mentionsUnameRegex.findall(example_tweet))

['andersoncooper']
['Anderson', 'PAAK']


In [6]:
# Extracting the tweets dataframe from df_dev.
# Each tweet has a body (text), an id (T-userID-tweetNumberByUser) and neighbours;
# neighbourhoods can be derived from the userID as well as from "RT @uname"
# (retweets) and "@uname" (mentions).
## REGULAR EXPRESSIONS
# Retweet uname:                    (?<=\bRT @)(\w{1,15})
# Mention unames (all inclusive):   (?<=@)(\w{1,15})
# Mention unames WITHOUT retweets:  (?<!\bRT @)(?<!\w@)(?<=@)(\w{1,15})

# The neighbour sets are extracted separately so that the different versions
# of the dataset can use them freely.
import re

# Compiled once at module level because the extractor is called for every tweet.
# \b requires "RT" to be a standalone token, so "ART @x" is not a retweet.
_RETWEET_UNAME_RE = re.compile(r'(?<=\bRT @)(\w{1,15})')
# (?<!\w@) rejects an "@" glued to a preceding word character, so the domain
# part of an e-mail address ("joe@example.com") is not a mention.
_MENTION_UNAME_RE = re.compile(r'(?<!\bRT @)(?<!\w@)(?<=@)(\w{1,15})')


def extractNeighborUnameDict(tweet):
    """Extract the usernames referenced by a tweet.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    dict
        ``{"retweeted": [...], "mentions": [...]}``. ``"retweeted"`` lists
        the handles that directly follow a standalone ``RT @`` token;
        ``"mentions"`` lists every other ``@handle`` that is not attached
        to a preceding word character. Handles are captured without the
        ``@`` and are cut off after 15 word characters. Both lists keep
        the order of appearance and may be empty.
    """
    return {
        "retweeted": _RETWEET_UNAME_RE.findall(tweet),
        "mentions": _MENTION_UNAME_RE.findall(tweet),
    }

def _tweets_with_nested_neighbors(user_row):
    """Return a Series holding one record dict per tweet of ``user_row``.

    Each record carries the tweet id ("T<userID>-<tweetNumber>"), the tweet
    body, the neighbour usernames (nested under "neighborUsernames") and the
    id of the tweeting user.
    """
    records = []
    for tweet_number, tweet in enumerate(user_row['tweet']):
        records.append({
            "ID": "T" + str(user_row['ID']) + "-" + str(tweet_number),
            "Body": tweet,
            "neighborUsernames": extractNeighborUnameDict(tweet),
            "tweeterId": user_row['ID'],
        })
    return pd.Series(records)


# Only users that actually have tweets; each row expands into one column per tweet position.
users_with_tweets = df_dev[df_dev.tweet.notnull()]
df_tweet = users_with_tweets.apply(_tweets_with_nested_neighbors, axis=1)

print(type(df_tweet))



<class 'pandas.core.frame.DataFrame'>


In [7]:
# Column labels, full shape, and the shape of the first five rows of the wide frame.
for frame_summary in (df_tweet.columns, df_tweet.shape, df_tweet.head().shape):
    print(frame_summary)

RangeIndex(start=0, stop=200, step=1)
(2350, 200)
(5, 200)


In [8]:
# Stack the wide frame once and look at the resulting Series from three angles.
stacked_preview = df_tweet.stack()
print(stacked_preview)
print(stacked_preview.shape)
print(type(stacked_preview))

0     0      {'ID': 'T1224667050301255680-0', 'Body': '@Spa...
      1      {'ID': 'T1224667050301255680-1', 'Body': '@Bar...
      2      {'ID': 'T1224667050301255680-2', 'Body': 'সেদি...
      3      {'ID': 'T1224667050301255680-3', 'Body': 'নিজে...
      4      {'ID': 'T1224667050301255680-4', 'Body': 'ফোন ...
                                   ...                        
2364  195    {'ID': 'T412642667-195', 'Body': 'RT @JaylaCym...
      196    {'ID': 'T412642667-196', 'Body': 'RT @WWE: The...
      197    {'ID': 'T412642667-197', 'Body': 'Cooking, Cla...
      198    {'ID': 'T412642667-198', 'Body': 'RT @TheYBF: ...
      199    {'ID': 'T412642667-199', 'Body': 'RT @TheYBF: ...
Length: 401525, dtype: object
(401525,)
<class 'pandas.core.series.Series'>


In [9]:
# Turn the stacked record dicts into a frame with one row per tweet.
stacked_records = df_tweet.stack().tolist()
df_tweet_alt = pd.DataFrame(stacked_records)
print(df_tweet_alt.shape)

(401525, 4)


In [10]:
# The record keys become the columns of the per-tweet frame.
tweet_alt_columns = df_tweet_alt.columns
print(tweet_alt_columns)

Index(['ID', 'Body', 'neighborUsernames', 'tweeterId'], dtype='object')


In [11]:
## Final code for extracting the tweets dataframe from df_dev

def _user_tweets_as_flat_records(user_row):
    """Return a Series holding one flat record dict per tweet of ``user_row``.

    Unlike the exploratory version, the "retweeted" and "mentions" lists are
    spread directly into the record instead of being nested.
    """
    user_id = user_row['ID']
    return pd.Series([
        {
            "ID": "T" + str(user_id) + "-" + str(tweet_number),
            "Body": tweet,
            **extractNeighborUnameDict(tweet),
            "tweeterId": user_id,
        }
        for tweet_number, tweet in enumerate(user_row['tweet'])
    ])


# Wide frame: one row per user with tweets, one column per tweet position.
df_tweet = df_dev[df_dev.tweet.notnull()].apply(_user_tweets_as_flat_records, axis=1)
# Flatten the wide frame into a single Series of record dicts.
stacked_df_tweet = df_tweet.stack()

# Tweet rows are labelled starting right after the last user row, i.e. the
# index runs from len(df_dev) to len(df_dev) + number_of_tweets.
user_count = df_dev.shape[0]
tweet_count = stacked_df_tweet.shape[0]
df_tweet = pd.DataFrame(
    list(stacked_df_tweet),
    index=pd.RangeIndex(start=user_count, stop=tweet_count + user_count, step=1),
)

In [12]:
# Columns and shape of the final per-tweet frame.
for frame_summary in (df_tweet.columns, df_tweet.shape):
    print(frame_summary)

Index(['ID', 'Body', 'retweeted', 'mentions', 'tweeterId'], dtype='object')
(401525, 5)


In [13]:
# Lookup table: row label -> {column: value} record for every tweet.
# orient='index' produces the same {row_label: {column: value}} mapping as
# transposing first and calling to_dict(), without building a transposed copy
# of the whole frame.
df_tweet_dict = df_tweet.to_dict(orient='index')

In [14]:
# 2365 is the first tweet label: the tweet index starts at df_dev.shape[0],
# which the recorded load output reports as 2365.
first_tweet_record = df_tweet_dict[2365]
print(first_tweet_record)

{'ID': 'T1224667050301255680-0', 'Body': '@SparklesOnlyme পুরোনো এইদিনের কথা\n', 'retweeted': [], 'mentions': ['SparklesOnlyme'], 'tweeterId': 1224667050301255680}


In [22]:
# Expose the row label as an ordinary column as well (adds a column to df_tweet in place).
df_tweet['rowIndex'] = df_tweet.index

for frame_summary in (df_tweet.shape, df_tweet.columns):
    print(frame_summary)



(401525, 6)
Index(['ID', 'Body', 'retweeted', 'mentions', 'tweeterId', 'rowIndex'], dtype='object')


In [15]:
# Attempting to concatenate to df_dev (without the tweets column now).
# The user-level columns are selected by name rather than by position, so the
# selection does not silently change if the column order of df_dev differs.
df_dev_no_tweets = df_dev[['ID', 'profile', 'neighbor', 'label']]

print(df_dev_no_tweets.columns)

Index(['ID', 'profile', 'neighbor', 'label'], dtype='object')


In [16]:
# Side-by-side concat of the user frame and the tweet frame, aligned on the
# row labels (the original labels are kept, not renumbered).
concat_keep_index = pd.concat(
    objs=[df_dev_no_tweets, df_tweet],
    axis='columns',
    ignore_index=False,
)

In [17]:

# Columns and shape of the concatenated frame.
for concat_summary in (concat_keep_index.columns, concat_keep_index.shape):
    print(concat_summary)

# actually, let's NOT concatenate the tweets dataframe to the users dataframe.

Index(['ID', 'profile', 'neighbor', 'label', 'ID', 'Body', 'retweeted',
       'mentions', 'tweeterId'],
      dtype='object')
(403890, 9)


In [18]:
def _expand_profile(user_row):
    """Turn the ``profile`` entry of one user row into a Series (one item per profile field)."""
    return pd.Series(user_row.profile)


def _clean_screen_name(profile_row):
    """Return the row's ``screen_name`` with surrounding whitespace removed."""
    return profile_row.screen_name.strip()


# One row per user that has a profile, one column per profile field.
users_with_profile = df_dev[df_dev.profile.notnull()]
df_dev_profiles = users_with_profile.apply(_expand_profile, axis=1)
print(df_dev_profiles.columns)

# Whitespace-stripped screen names, one per profile row.
screen_names = df_dev_profiles.apply(_clean_screen_name, axis=1)
print(screen_names.head())




Index(['id', 'id_str', 'name', 'screen_name', 'location', 'profile_location',
       'description', 'url', 'entities', 'protected', 'followers_count',
       'friends_count', 'listed_count', 'created_at', 'favourites_count',
       'utc_offset', 'time_zone', 'geo_enabled', 'verified', 'statuses_count',
       'lang', 'contributors_enabled', 'is_translator',
       'is_translation_enabled', 'profile_background_color',
       'profile_background_image_url', 'profile_background_image_url_https',
       'profile_background_tile', 'profile_image_url',
       'profile_image_url_https', 'profile_link_color',
       'profile_sidebar_border_color', 'profile_sidebar_fill_color',
       'profile_text_color', 'profile_use_background_image',
       'has_extended_profile', 'default_profile', 'default_profile_image'],
      dtype='object')
0     sunnyhowlader5
1      Maebha_Racing
2    thepennyhoarder
3           momlogic
4            Variety
dtype: object


In [19]:
# One row per user that has a profile, one column per profile field.
df_dev_profiles = df_dev[df_dev.profile.notnull()].apply(lambda x: pd.Series(x.profile), axis=1)

# Map whitespace-stripped screen name -> whitespace-stripped profile id.
# The two columns are addressed by name instead of by position, and paired
# with zip instead of transposing the frame into a dict of row dicts.
# NOTE(review): the ids stored here are strings (they are .strip()-ed), while
# the printed df_tweet record shows 'tweeterId' as an integer — convert one
# side before joining on these ids; confirm which type downstream code expects.
uname2id_dict = {
    screen_name.strip(): profile_id.strip()
    for screen_name, profile_id in zip(df_dev_profiles['screen_name'], df_dev_profiles['id'])
}
print('dimensionality of uname2id dict', len(uname2id_dict))

dimensionality of uname2id dict 2365


In [25]:
from TwibotSmallAugmentedTSVDHomogeneous import TwibotSmallAugmentedTSVDHomogeneous

# Use the GPU when torch can see one; otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
homogeneous = TwibotSmallAugmentedTSVDHomogeneous(dev=False, device=device)

# NOTE(review): the recorded run of this cell ended with
# "AttributeError: 'str' object has no attribute 'type'". The cause is not
# visible in this notebook — check the dataset class before relying on the
# values unpacked below.
(
    des_tensor,
    tweets_tensor,
    num_prop,
    category_prop,
    edge_index,
    edge_type,
    labels,
    train_idx,
    val_idx,
    test_idx,
) = homogeneous.dataloader()

AttributeError: 'str' object has no attribute 'type'