In [76]:
import pandas as pd
from fast_ml.model_development import train_valid_test_split
from string import punctuation
import re
import nltk
from nltk import word_tokenize
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shivamarora/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Utility Functions

In [2]:
def convert_tweet_tree_to_df(tweet_id, year):
    tweet_tree = pd.read_csv( 'twitter' + year + '/tree/' + str(tweet_id) + '.txt', 
                             header=None, sep=',', engine='python')
    tweet_tree.columns = ["P1", "Tw1", "Udf", "Tw2", "T2"]
    tweet_tree['P1'] = tweet_tree['P1'].str[1:]
    temp = tweet_tree['Udf'].str.split("\]->\[", n = 1, expand = True)
    tweet_tree['T1'] = temp[0]
    tweet_tree['P2'] = temp[1]
    tweet_tree = tweet_tree[['P1','Tw1','T1','P2','Tw2','T2']]
    tweet_tree['T2'] = tweet_tree['T2'].str[:-1]
    return tweet_tree

In [3]:
def get_tweet_publisher(tweet, year, _):
    tweet_df = convert_tweet_tree_to_df(tweet['id'], year)
    return tweet_df['P2'][0][1:-1]

In [45]:
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in punctuation])
    return punctuationfree

In [73]:
def tokenization(text):
#     tokens = re.split(' ',text)
#     return tokens
    return word_tokenize(text)
#     tokenized = []

    
#     for sent in text: 
#         if type(sent) is str:
#           texts = [word for word in word_tokenize(texts)] # if word not in stopWords]
#           # print(texts)
#           tokenized.append(texts)
#     # texts = ' '.join(texts)
#     # print(texts)
#     return tokenized

In [74]:
from nltk.corpus import stopwords
stopWords = stopwords.words('english')
def remove_stopwords(text):
    output= [i for i in text if i not in stopWords]
    return output

## Reading Datasets

In [4]:
data_15 = pd.read_csv('twitter15/source_tweets.txt', sep="\t", header=None)
data_15.columns = ["id", "content"]

In [5]:
labels_15 = pd.read_csv('twitter15/label.txt', sep=":", header=None)
labels_15.columns = ["label", "id"]

In [6]:
data_15 = data_15.join(labels_15.set_index('id'), on='id')

In [7]:
data_15['publisher'] = data_15.apply(get_tweet_publisher, axis=1, args=("15", _))

In [8]:
data_15.head()

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841


In [9]:
data_15.shape

(1490, 4)

In [10]:
data_15.count()

id           1490
content      1490
label        1490
publisher    1490
dtype: int64

In [11]:
data_16 = pd.read_csv('twitter16/source_tweets.txt', sep="\t", header=None)
data_16.columns = ["id", "content"]

In [12]:
labels_16 = pd.read_csv('twitter16/label.txt', sep=":", header=None)
labels_16.columns = ["label", "id"]

In [13]:
data_16 = data_16.join(labels_16.set_index('id'), on='id')

In [14]:
data_16['publisher'] = data_16.apply(get_tweet_publisher, axis=1, args=("16", _))

In [15]:
data_16.head()

Unnamed: 0,id,content,label,publisher
0,656955120626880512,correct predictions in back to the future ii URL,false,1942819082
1,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...,true,44945327
2,613404935003217920,cops bought the alleged church shooter burger ...,false,14511951
3,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
4,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989


In [16]:
data_16.shape

(818, 4)

In [17]:
data_16.count()

id           818
content      818
label        818
publisher    818
dtype: int64

In [18]:
data = pd.concat([data_15, data_16], ignore_index=True)

In [19]:
data

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841
...,...,...,...,...
2303,693546915892428800,jeb bush campaign kicks off 3-state farewell t...,non-rumor,14075928
2304,544269749405097984,breaking: live coverage of hostage situation u...,true,15250661
2305,760109079133990912,“after school satan clubs”? URL,unverified,44945327
2306,779633844680962048,this network of tunnels is from the stone age ...,unverified,918346674


In [20]:
data.shape

(2308, 4)

In [21]:
len(data["id"].unique())

2139

In [22]:
data = data.drop_duplicates(subset=None, keep='first', inplace=False)

In [23]:
data

Unnamed: 0,id,content,label,publisher
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified,431917957
1,714598641827246081,an open letter to trump voters from his top st...,unverified,94215989
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor,30313925
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor,2557521
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true,2883841
...,...,...,...,...
2302,693171092555431936,watch: tommy chong made a pro-bernie sanders v...,non-rumor,16664681
2303,693546915892428800,jeb bush campaign kicks off 3-state farewell t...,non-rumor,14075928
2304,544269749405097984,breaking: live coverage of hostage situation u...,true,15250661
2306,779633844680962048,this network of tunnels is from the stone age ...,unverified,918346674


In [24]:
len(data["publisher"].unique())

906

## Preprocessing

In [40]:
X_train, y_train, X_valid, y_valid, X_test, y_test = train_valid_test_split(data, target = 'label', 
                                                                            train_size=0.8, valid_size=0.1, test_size=0.1)

In [41]:
print(y_train.value_counts())

non-rumor     484
false         460
true          442
unverified    325
Name: label, dtype: int64


In [42]:
print(y_valid.value_counts())

false         64
true          61
non-rumor     50
unverified    39
Name: label, dtype: int64


In [43]:
print(y_test.value_counts())

true          76
false         51
non-rumor     45
unverified    42
Name: label, dtype: int64


### Content Preprocessing

In [46]:
#Punctuation Removal
data['content']= data['content'].apply(lambda x:remove_punctuation(x))

In [48]:
#lowering the text
data['content']= data['content'].apply(lambda x: x.lower())

In [75]:
#tokenization
data['content']= data['content'].apply(lambda x: tokenization(x))

TypeError: expected string or bytes-like object

In [69]:
#Removing stop words
data['content']= data['content'].apply(lambda x:remove_stopwords(x))

In [70]:
data["content"].head()

0    [🔥ca kkk grand wizard 🔥 endorses hillaryclinto...
1    [an open letter to trump voters from his top s...
2    [america is a nation of second chances —potus ...
3    [brandon marshall visits and offers advice sup...
4    [rip elly may clampett so sad to learn beverly...
Name: content, dtype: object

In [72]:
"is" in stopWords

True