In [1]:
import pandas as pd

In [2]:
# Load the source tweets: a header-less, tab-separated file with two fields per line
# (tweet id, tweet text). The separator is written as the escape "\t" rather than a
# literal tab character, which is invisible and easily converted to spaces by editors.
data = pd.read_csv('source_tweets.txt', sep="\t", header=None)
data.columns = ["id", "content"]

In [3]:
# Preview the first rows to confirm that the id and content columns parsed as expected.
data.head()

Unnamed: 0,id,content
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...
1,714598641827246081,an open letter to trump voters from his top st...
2,691809004356501505,america is a nation of second chances —@potus ...
3,693204708933160960,"brandon marshall visits and offers advice, sup..."
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...


In [4]:
# (number of source tweets, number of columns)
data.shape

(1490, 2)

In [5]:
# Read the header-less "<label>:<tweet id>" file and name its two columns in one step.
labels = pd.read_csv('label.txt', header=None, sep=":").set_axis(["label", "id"], axis=1)

In [6]:
# Preview the first label rows (label, tweet id).
labels.head()

Unnamed: 0,label,id
0,unverified,731166399389962242
1,unverified,714598641827246081
2,non-rumor,691809004356501505
3,non-rumor,693204708933160960
4,true,551099691702956032


In [7]:
# (number of labelled tweets, number of columns)
labels.shape

(1490, 2)

In [8]:
# Attach each tweet's label by matching on the tweet id. DataFrame.join is a left join by
# default, so every row of `data` is kept.
# A 'label' column left over from an earlier run of this cell is dropped first; without
# that, re-running the cell fails because the joined frames would share a column name.
data = data.drop(columns='label', errors='ignore').join(labels.set_index('id'), on='id')

In [9]:
# Check that the label column was attached to the tweets.
data.head()

Unnamed: 0,id,content,label
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified
1,714598641827246081,an open letter to trump voters from his top st...,unverified
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true


In [10]:
# Distinct label values present after the join.
data["label"].unique()

array(['unverified', 'non-rumor', 'true', 'false'], dtype=object)

In [11]:
def convert_tweet_tree_to_df(tweet_id, tree_dir='tree'):
    """Load one propagation-tree file into a DataFrame with one row per edge.

    Reads ``<tree_dir>/<tweet_id>.txt``. Each line is expected to look like
    ``['P1', 'Tw1', 'T1']->['P2', 'Tw2', 'T2']`` (parent node -> child node).
    Presumably P is a user id, Tw a tweet id and T an elapsed time -- TODO
    confirm against the dataset description.

    Parameters
    ----------
    tweet_id : int or str
        Id of the source tweet; used to build the file name.
    tree_dir : str, default 'tree'
        Directory that holds the ``<tweet_id>.txt`` files.

    Returns
    -------
    pandas.DataFrame
        Columns ``P1, Tw1, T1, P2, Tw2, T2``, all strings. Only the square
        brackets and the ``]->[`` arrow are removed: the quote characters and
        any whitespace that followed the comma separators are kept, so callers
        strip those themselves.
    """
    tree_path = tree_dir + '/' + str(tweet_id) + '.txt'
    # Splitting on ',' yields five raw fields; the third one straddles the arrow
    # and still contains both T1 and P2.
    raw = pd.read_csv(tree_path, header=None, sep=',', engine='python')
    raw.columns = ["P1", "Tw1", "Udf", "Tw2", "T2"]

    # Raw string: the pattern is a regular expression and the brackets must be escaped.
    arrow_parts = raw['Udf'].str.split(r"\]->\[", n=1, expand=True)

    # Build a fresh frame instead of assigning into a column subset of `raw`,
    # which is what used to trigger SettingWithCopyWarning.
    return pd.DataFrame({
        'P1': raw['P1'].str[1:],    # drop the leading '['
        'Tw1': raw['Tw1'],
        'T1': arrow_parts[0],
        'P2': arrow_parts[1],
        'Tw2': raw['Tw2'],
        'T2': raw['T2'].str[:-1],   # drop the trailing ']'
    })

In [151]:
# Spot-check the parser on the propagation tree of the first source tweet (row 0 of `data`).
convert_tweet_tree_to_df(731166399389962242)

Unnamed: 0,P1,Tw1,T1,P2,Tw2,T2
0,'ROOT','ROOT','0.0','431917957','731166399389962242','0.0'
1,'431917957','731166399389962242','0.0','21107092','731168205788471297','7.18'
2,'431917957','731166399389962242','0.0','4318345394','731166399389962242','15.25'
3,'431917957','731166399389962242','0.0','26538104','731166399389962242','15.25'
4,'431917957','731166399389962242','0.0','2906655619','731166399389962242','15.25'
...,...,...,...,...,...,...
143,'431917957','731166399389962242','0.0','4828199628','731166399389962242','171548.17'
144,'431917957','731166399389962242','0.0','228736760','731166399389962242','171573.12'
145,'431917957','731166399389962242','0.0','2401319821','731166399389962242','171628.83'
146,'431917957','731166399389962242','0.0','1305192181','731166399389962242','171756.85'


In [12]:
def get_tweet_publisher(tweet):
    """Return the child user field of the first edge in the tweet's tree, without quotes.

    `tweet` must support ``tweet['id']``. The value comes from column ``P2`` of
    the row labelled 0 in the parsed propagation tree, with its first and last
    characters (the quote marks) removed. In the sample tree shown in this
    notebook that first edge runs from ROOT to the source tweet, so the value
    is the id of the user who posted it.
    """
    first_edge = convert_tweet_tree_to_df(tweet['id']).loc[0]
    quoted_publisher = first_edge['P2']
    return quoted_publisher[1:-1]

In [13]:
# Row by row, look up the publisher of each source tweet from its propagation-tree file.
data['publisher'] = data.apply(get_tweet_publisher, axis="columns")

In [14]:
# Check the new publisher column.
data.head()

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841


In [18]:
def get_users(tweet, users):
    """Append the users of one propagation tree to an accumulator frame.

    The distinct values of the tree's ``P1`` column, followed by the distinct
    values of its ``P2`` column, are appended below the rows of `users`.
    Values are not de-duplicated across the two columns or against `users`.

    Parameters
    ----------
    tweet : mapping or pandas.Series
        Must provide ``tweet['id']``.
    users : pandas.DataFrame
        Accumulator built by earlier calls (or an empty frame).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; `users` itself is not modified.
    """
    tree = convert_tweet_tree_to_df(tweet['id'])
    unique_per_column = [pd.DataFrame(tree[column].unique()) for column in ("P1", "P2")]
    return pd.concat([users, *unique_per_column], ignore_index=True)

In [152]:
# Collect every value that appears in the P1 or P2 column of any propagation tree and
# keep the distinct ones (in order of first appearance).
# Each tweet's users are gathered into their own small frame and everything is
# concatenated once at the end; concatenating onto a growing frame inside the loop
# re-copies all previously collected rows on every iteration.
size = data.shape[0]
print(size)
per_tweet_users = [get_users(data.iloc[i], pd.DataFrame([])) for i in range(size)]
users = pd.concat(per_tweet_users, ignore_index=True)[0].unique()

1490


In [153]:
# Number of distinct user values collected across all propagation trees.
print(users.shape)

(480988,)


In [21]:
def get_unique_users(tweet, users):
    """Append the users of one propagation tree to `users`, keeping distinct values only.

    Same collection step as appending the distinct ``P1`` and ``P2`` values of
    the tree below `users`, but the combined column is de-duplicated before it
    is returned, so the accumulator never holds a value twice.

    Parameters
    ----------
    tweet : mapping or pandas.Series
        Must provide ``tweet['id']``.
    users : pandas.DataFrame
        Accumulator built by earlier calls (or an empty frame).

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column ``0``) of distinct values in order of
        first appearance.
    """
    tree = convert_tweet_tree_to_df(tweet['id'])
    parents = pd.DataFrame(tree["P1"].unique())
    children = pd.DataFrame(tree["P2"].unique())
    combined = pd.concat([users, parents, children], ignore_index=True)
    return pd.DataFrame(combined[0].unique())

In [22]:
# Same collection as before, but de-duplicating after every tweet, so the accumulator
# only ever holds distinct values. The two printed shapes should report the same count.
users1 = pd.DataFrame([])
size = len(data.index)
print(size)
for i in range(size):
    users1 = get_unique_users(data.iloc[i], users1)
print(users1.shape, users1[0].unique().shape, sep="\n")

1490
(480988, 1)
(480988,)


Reshares and retweets?

In [149]:
# Only considering re-sharing users
def get_reshare_users(tweet, users):
    """Append the users on edges that carry the source tweet id at both ends.

    An edge of the propagation tree is kept when both its parent tweet id
    (``Tw1``) and its child tweet id (``Tw2``) equal ``tweet['id']``;
    presumably these are direct retweets of the source tweet -- TODO confirm
    against the dataset description. The distinct ``P1`` values of the kept
    edges, followed by their distinct ``P2`` values, are appended to `users`.

    Parameters
    ----------
    tweet : mapping or pandas.Series
        Must provide ``tweet['id']`` (convertible with ``int``).
    users : pandas.DataFrame
        Accumulator built by earlier calls (or an empty frame).

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; `users` itself is not modified.
    """
    tweet_df = convert_tweet_tree_to_df(tweet['id'])
    source_id = int(tweet['id'])

    # The root edge carries 'ROOT' instead of a parent tweet id; map it to 0 so the
    # whole column parses as integers. [2:-1] strips the leading space + quote and
    # the trailing quote. Tweet ids need 64 bits, so the width is stated explicitly
    # rather than relying on the platform's default integer.
    parent_ids = (
        tweet_df["Tw1"]
        .str.replace("'ROOT'", "'0'", regex=False)
        .str[2:-1]
        .astype("int64")
    )
    child_ids = tweet_df["Tw2"].str[2:-1].astype("int64")

    direct_edges = tweet_df.loc[(child_ids == source_id) & (parent_ids == source_id)]
    u1 = pd.DataFrame(direct_edges["P1"].unique())
    u2 = pd.DataFrame(direct_edges["P2"].unique())
    return pd.concat([users, u1, u2], ignore_index=True)

In [150]:
size = len(data.index)
print(size)
# Trial run on the first source tweet only; iterate over range(size) and keep feeding
# the accumulator back in to cover every tweet.
users2 = get_reshare_users(data.iloc[0], pd.DataFrame([]))[0].unique()
print(users2, users2.shape, sep="\n")

1490
["'431917957'" "'2906655619'" "'88550047'" "'484477723'" "'3308560736'"
 "'712976695'" "'3709245494'" "'2319339266'" "'1032818844'" "'33270656'"
 "'3353724259'" "'4318345394'" "'26538104'" "'976840800'"
 "'710545371978670081'" "'4006650737'" "'1426967479'" "'3356662759'"
 "'588192356'" "'3248410062'" "'1656400596'" "'4835421670'" "'4176001696'"
 "'721104775689863168'" "'728588534'" "'716286508542652416'"
 "'4769688253'" "'178885682'" "'4551460812'" "'2586878275'" "'993896732'"
 "'319432175'" "'3661479505'" "'726232124798472192'" "'62070111'"
 "'72841205'" "'2327376091'" "'2949051936'" "'723539075697926144'"
 "'702149906119712768'" "'2926894777'" "'1902314210'" "'236182540'"
 "'67595149'" "'3020425427'" "'444994317'" "'107161788'" "'3185264267'"
 "'315994858'" "'4855439824'" "'2580296111'" "'710247867600887808'"
 "'174025101'" "'1849839132'" "'2884079135'" "'4872648885'" "'4618950578'"
 "'33247227'" "'2284614762'" "'3296734251'" "'2477028826'" "'107033885'"
 "'3310686488'" "'997878

In [140]:
# Source tweet ids that have at least one edge carrying that id at both ends.
# NOTE(review): the original note on this cell read "number of tweets without retweet";
# the code records the source id only when such an edge EXISTS -- confirm the intent.
def get_original_tweets(tweet, users):
    """Append the tweet ids found on edges that carry the source tweet id at both ends.

    An edge of the propagation tree is kept when both its parent tweet id
    (``Tw1``) and its child tweet id (``Tw2``) equal ``tweet['id']``. The
    distinct ``Tw1`` values of the kept edges, followed by their distinct
    ``Tw2`` values, are appended to `users`. Because of the filter these can
    only be the source id itself, so the call adds that id twice when at least
    one such edge exists and adds nothing otherwise.

    Parameters
    ----------
    tweet : mapping or pandas.Series
        Must provide ``tweet['id']`` (convertible with ``int``).
    users : pandas.DataFrame
        Accumulator built by earlier calls (or an empty frame). Despite the
        name it holds tweet ids here, not user ids.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; `users` itself is not modified.
    """
    tweet_df = convert_tweet_tree_to_df(tweet['id'])
    source_id = int(tweet['id'])

    # The root edge carries 'ROOT' instead of a parent tweet id; map it to 0 so the
    # whole column parses as integers. [2:-1] strips the leading space + quote and
    # the trailing quote. Tweet ids need 64 bits, so the width is stated explicitly
    # rather than relying on the platform's default integer.
    parent_ids = (
        tweet_df["Tw1"]
        .str.replace("'ROOT'", "'0'", regex=False)
        .str[2:-1]
        .astype("int64")
    )
    child_ids = tweet_df["Tw2"].str[2:-1].astype("int64")

    on_source = (child_ids == source_id) & (parent_ids == source_id)
    u1 = pd.DataFrame(parent_ids[on_source].unique())
    u2 = pd.DataFrame(child_ids[on_source].unique())
    return pd.concat([users, u1, u2], ignore_index=True)

In [143]:
# Run get_original_tweets over every source tweet, then count the distinct ids collected.
size = len(data.index)
org_tweets = pd.DataFrame([])
for i in range(size):
    org_tweets = get_original_tweets(data.iloc[i], org_tweets)
org_tweets = org_tweets[0].unique()
print(org_tweets.shape)

(1490,)


In [136]:
# Show the first eight source tweets with their label and publisher.
data.head(8)

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841
5,767223774072676354,former 3 doors down guitarist matt roberts has...,non-rumor,428333
6,715515982584881152,craigslist ad: ‘get paid $15 an hour to protes...,unverified,18643437
7,514106273852174337,just in: missing afghan soldiers found trying ...,true,14173315


In [154]:
# Shape of the array of distinct values in the publisher column, i.e. (count,).
data["publisher"].unique().shape  # number of unique publishers

(704,)