In [1]:
import pandas as pd

## Utility Functions

In [2]:
def convert_tweet_tree_to_df(tweet_id, year):
    """Load one propagation-tree file and return its edges as a DataFrame.

    Reads ``twitter<year>/tree/<tweet_id>.txt`` (no header row), splitting
    every line on commas into five raw fields. The middle field straddles the
    ``]->[`` edge separator and is split once more, so each row ends up with
    six columns.

    Parameters
    ----------
    tweet_id : int or str
        Identifier used as the file stem.
    year : int or str
        Suffix of the dataset directory, e.g. ``"15"`` or ``15``.

    Returns
    -------
    pandas.DataFrame
        Columns ``P1, Tw1, T1, P2, Tw2, T2`` in that order. ``P1`` has its
        first character removed and ``T2`` its last character removed
        (presumably the outer list brackets -- confirm against the tree
        files). All other fields are left exactly as parsed, so any quotes or
        leading spaces in them are preserved.
    """
    # str() on both parts lets callers pass the year as an int as well as a string.
    tree_path = 'twitter' + str(year) + '/tree/' + str(tweet_id) + '.txt'
    tweet_tree = pd.read_csv(tree_path, header=None, sep=',', engine='python')
    tweet_tree.columns = ["P1", "Tw1", "Udf", "Tw2", "T2"]
    tweet_tree['P1'] = tweet_tree['P1'].str[1:]
    # Raw string: the pattern is a regex, and "\]" / "\[" are invalid escape
    # sequences in a normal string literal.
    edge_halves = tweet_tree['Udf'].str.split(r"\]->\[", n=1, expand=True)
    tweet_tree['T1'] = edge_halves[0]
    tweet_tree['P2'] = edge_halves[1]
    # .copy() detaches the column selection from the parent frame so the
    # assignment below does not trigger SettingWithCopyWarning.
    tweet_tree = tweet_tree[['P1', 'Tw1', 'T1', 'P2', 'Tw2', 'T2']].copy()
    tweet_tree['T2'] = tweet_tree['T2'].str[:-1]
    return tweet_tree

In [12]:
def get_tweet_publisher(tweet, year, _=None):
    """Return the publisher field from the first row of a tweet's tree file.

    Loads the propagation tree for ``tweet['id']`` and takes the ``P2`` value
    of its first row, dropping that value's first and last characters
    (presumably the surrounding quotes -- confirm against the tree files).

    Parameters
    ----------
    tweet : mapping or pandas.Series
        Row-like object exposing the tweet identifier under ``'id'``.
    year : int or str
        Dataset suffix forwarded to ``convert_tweet_tree_to_df``.
    _ : any, optional
        Ignored. Kept, now with a default, so existing calls that pass a
        third positional argument (e.g. via ``DataFrame.apply(..., args=...)``)
        keep working while new callers can omit it.

    Returns
    -------
    str
        The trimmed ``P2`` value of the first tree row.
    """
    tweet_df = convert_tweet_tree_to_df(tweet['id'], year)
    # .iloc selects the first row by position rather than by the index label 0.
    return tweet_df['P2'].iloc[0][1:-1]

## Reading Datasets

In [4]:
# Twitter15 source tweets: tab-separated rows without a header line;
# the two fields are named while parsing.
data_15 = pd.read_csv(
    'twitter15/source_tweets.txt',
    sep="\t",
    header=None,
    names=["id", "content"],
)

In [5]:
# Twitter15 labels: colon-separated "label:id" rows without a header line.
labels_15 = pd.read_csv(
    'twitter15/label.txt',
    sep=":",
    header=None,
    names=["label", "id"],
)

In [6]:
# Attach each tweet's label by matching the 'id' column against the
# id-indexed label table (left join: every tweet row is kept).
data_15 = data_15.join(
    labels_15.set_index('id'),
    on='id',
    how='left',
)

In [18]:
# Look up every tweet's publisher from its propagation-tree file (one file
# read per row). The third positional argument is ignored by
# get_tweet_publisher, so pass None explicitly instead of IPython's `_`
# (the last cell output), which is hidden kernel state.
data_15['publisher'] = data_15.apply(get_tweet_publisher, axis=1, args=("15", None))

In [19]:
# Preview the first rows of the Twitter15 frame with label and publisher attached.
data_15.head()

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841


In [20]:
# (rows, columns) of the Twitter15 frame.
data_15.shape

(1490, 4)

In [21]:
# Non-null entries per column; any value below the row count means missing data.
data_15.count()

id           1490
content      1490
label        1490
publisher    1490
dtype: int64

In [22]:
# Twitter16 source tweets: tab-separated rows without a header line;
# the two fields are named while parsing.
data_16 = pd.read_csv(
    'twitter16/source_tweets.txt',
    sep="\t",
    header=None,
    names=["id", "content"],
)

In [23]:
# Twitter16 labels: colon-separated "label:id" rows without a header line.
labels_16 = pd.read_csv(
    'twitter16/label.txt',
    sep=":",
    header=None,
    names=["label", "id"],
)

In [24]:
# Attach each tweet's label by matching the 'id' column against the
# id-indexed label table (left join: every tweet row is kept).
data_16 = data_16.join(
    labels_16.set_index('id'),
    on='id',
    how='left',
)

In [25]:
# Look up every tweet's publisher from its propagation-tree file (one file
# read per row). The third positional argument is ignored by
# get_tweet_publisher, so pass None explicitly instead of IPython's `_`
# (the last cell output), which is hidden kernel state.
data_16['publisher'] = data_16.apply(get_tweet_publisher, axis=1, args=("16", None))

In [26]:
# Preview the first rows of the Twitter16 frame with label and publisher attached.
data_16.head()

Unnamed: 0,id,content,label,publisher
0,656955120626880512,correct predictions in back to the future ii URL,false,1942819082
1,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...,true,44945327
2,613404935003217920,cops bought the alleged church shooter burger ...,false,14511951
3,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
4,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989


In [27]:
# (rows, columns) of the Twitter16 frame.
data_16.shape

(818, 4)

In [28]:
# Non-null entries per column; any value below the row count means missing data.
data_16.count()

id           818
content      818
label        818
publisher    818
dtype: int64

In [29]:
# Stack the Twitter15 rows on top of the Twitter16 rows and renumber the
# index from 0 so row labels are unique across both years.
data = pd.concat(
    (data_15, data_16),
    axis=0,
    ignore_index=True,
)

In [30]:
# Display the combined Twitter15 + Twitter16 frame.
data

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841
...,...,...,...,...
2303,693546915892428800,jeb bush campaign kicks off 3-state farewell t...,non-rumor,14075928
2304,544269749405097984,breaking: live coverage of hostage situation u...,true,15250661
2305,760109079133990912,“after school satan clubs”? URL,unverified,44945327
2306,779633844680962048,this network of tunnels is from the stone age ...,unverified,918346674


In [31]:
# (rows, columns) of the combined frame, before de-duplication.
data.shape

(2308, 4)

In [32]:
# Number of distinct tweet ids in the combined frame; a value below the row
# count means some tweets appear in both yearly datasets.
data["id"].nunique(dropna=False)

2139

In [33]:
# Drop rows that are exact duplicates across ALL columns, keeping the first
# occurrence. The index is not renumbered, so gaps remain where rows were
# removed.
# NOTE(review): rows sharing an id but differing in any other column survive
# this step -- confirm the resulting row count equals the number of unique ids.
data = data.drop_duplicates()

In [34]:
# Display the combined frame after dropping fully duplicated rows.
data

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841
...,...,...,...,...
2302,693171092555431936,watch: tommy chong made a pro-bernie sanders v...,non-rumor,16664681
2303,693546915892428800,jeb bush campaign kicks off 3-state farewell t...,non-rumor,14075928
2304,544269749405097984,breaking: live coverage of hostage situation u...,true,15250661
2306,779633844680962048,this network of tunnels is from the stone age ...,unverified,918346674


## Preprocessing