## Twitter API data collection
In this notebook, data from the Twitter API is collected for the users mentioned in the Twitter16\* data set. The following user features are collected:
- `verified profile`
- `account age`
- `#followers`
- `#tweets`.

The following content (tweet) features are collected:
- `#favourites`
- `#retweets`
- `#replies`
- `length`
- `#hashs`
- `#mentions`
- `#URLs`.

Following Vosoughi et al.\*, `user_engagement` is defined as (`#tweets` + `#retweets` + `#replies` + `#favourites`) / `account age`.

The `sentiment_score` is based on the VADER sentiment analysis tool\*\*.

\* Vosoughi, S., Roy, D., and Aral, S.: The spread of true and false news online. *Science* 359, 6380 (2018), 1146–1151.

\*\* https://github.com/cjhutto/vaderSentiment

### Overview of notebook:
1. Load Twitter16 data
2. Content features
3. User features (Twitter API)
4. User engagement metric
5. Sentiment score
6. Export dataframe

In [6]:
import os
import sys
import time
from datetime import datetime

import pandas as pd
import tweepy
import vaderSentiment
from IPython.display import clear_output
from tqdm import tqdm
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [7]:
def countdown(t, step=1, msg='sleeping'):  # in seconds
    """Block for ``t`` seconds while showing a countdown in the cell output.

    Parameters
    ----------
    t : int
        Total number of seconds to wait.
    step : int, default 1
        Number of seconds between two refreshes of the countdown message.
    msg : str, default 'sleeping'
        Verb used in the progress message and in the final message.

    Returns
    -------
    None
        The function only prints and sleeps.
    """
    pad_str = ' ' * len('%d' % step)
    for remaining in range(t, 0, -step):
        print('%s for the next %d seconds %s\r' % (msg, remaining, pad_str))
        # Replace the previous countdown line instead of stacking new ones.
        clear_output(wait=True)
        # The last interval may be shorter than `step`; never sleep past `t`.
        time.sleep(min(step, remaining))
    print('Done %s for %d seconds!  %s' % (msg, t, pad_str))


Done sleeping for 10 seconds!   


### 1. Load Twitter16 data 
#### Load labels

In [3]:
# Load the Twitter16 labels; the file is parsed as "<label>:<tweet_id>" per line
df1 = pd.read_csv(
    'src/Twitter16/label.txt',
    sep=":",
    header=None,
    names=["label", "tweet_id"],
)

# Keep only the tweets labelled 'false' or 'true'
df1 = df1.loc[df1['label'].isin(['false', 'true'])]
df1.shape

(412, 2)

In [4]:
# Encode the labels as integers: 'false' -> 1, 'true' -> 0
mapping_dict = dict(false=1, true=0)
df1 = df1.assign(label=df1['label'].map(mapping_dict))
df1.head()

Unnamed: 0,label,tweet_id
0,1,656955120626880512
1,0,615689290706595840
2,1,613404935003217920
5,0,614467824313106432
9,1,622891631293935616


#### Load tweets

In [5]:
# Load the source tweets; the file is parsed as "<tweet_id><TAB><tweet>" per line.
# The tab separator is spelled as the escape "\t" so that it is visible in the
# source and cannot be silently turned into spaces by an editor.
df2 = pd.read_csv('src/Twitter16/source_tweets.txt', sep="\t", header=None)
df2.columns = ["tweet_id", "tweet"]

#### Merge dataframes

In [8]:
# Attach the tweet text to the labelled tweets, matching rows on tweet_id
df = df1.merge(df2, on='tweet_id')
df.head()

Unnamed: 0,label,tweet_id,tweet
0,1,656955120626880512,correct predictions in back to the future ii URL
1,0,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...
2,1,613404935003217920,cops bought the alleged church shooter burger ...
3,0,614467824313106432,god put a rainbow over the white house 🌈 URL
4,1,622891631293935616,#wakeupamerica🇺🇸 who needs a #gun registry whe...


### 2. Determine content features
#### #URLs, #mentions, #hashtags and length

In [9]:
def n_URLs(row):
    """Return how many times the token 'URL' occurs in the text ``row``."""
    url_token = 'URL'
    return row.count(url_token)

def n_mentions(row):
    """Return how many '@' characters occur in the text ``row``."""
    mention_marker = '@'
    return row.count(mention_marker)

def n_hashs(row):
    """Return how many '#' characters occur in the text ``row``."""
    hash_marker = '#'
    return row.count(hash_marker)

# Derive the content features from the tweet text.
# The helpers only need the text, so they are mapped over the 'tweet' column
# instead of applying a lambda to every full row (axis=1); this also works
# when the frame has no rows.
tweet_text = df['tweet']
df['length'] = tweet_text.map(len)
df['#URLs'] = tweet_text.map(n_URLs)
df['#mentions'] = tweet_text.map(n_mentions)
df['#hashs'] = tweet_text.map(n_hashs)
df.head()

Unnamed: 0,label,tweet_id,tweet,length,#URLs,#mentions,#hashs
0,1,656955120626880512,correct predictions in back to the future ii URL,48,1,0,0
1,0,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...,96,1,1,1
2,1,613404935003217920,cops bought the alleged church shooter burger ...,75,1,0,0
3,0,614467824313106432,god put a rainbow over the white house 🌈 URL,44,1,0,0
4,1,622891631293935616,#wakeupamerica🇺🇸 who needs a #gun registry whe...,96,2,0,3


#### Add new columns

In [10]:
# Placeholder columns, initialised to 0 for every row.
# The insertion order is kept unchanged because code that addresses columns
# by position may depend on it.
new_columns = [
    'verified',
    '#followers',
    # user engagement
    '#replies',
    '#retweets',
    '#tweets',
    '#favourites',
    'account_age',
]
for column in new_columns:
    df[column] = [0] * len(df)
df.head()

Unnamed: 0,label,tweet_id,tweet,length,#URLs,#mentions,#hashs,verified,#followers,#replies,#retweets,#tweets,#favourites,account_age
0,1,656955120626880512,correct predictions in back to the future ii URL,48,1,0,0,0,0,0,0,0,0,0
1,0,615689290706595840,.@whitehouse in rainbow colors for #scotusmarr...,96,1,1,1,0,0,0,0,0,0,0
2,1,613404935003217920,cops bought the alleged church shooter burger ...,75,1,0,0,0,0,0,0,0,0,0
3,0,614467824313106432,god put a rainbow over the white house 🌈 URL,44,1,0,0,0,0,0,0,0,0,0
4,1,622891631293935616,#wakeupamerica🇺🇸 who needs a #gun registry whe...,96,2,0,3,0,0,0,0,0,0,0


### 3. User features (Twitter API)
#### Connect to Twitter API

In [11]:
# Twitter API credentials.
# Secrets must not be stored in the notebook: they are read from environment
# variables, and a missing variable raises a KeyError that names it.
# NOTE(review): any keys that were ever committed with this notebook should be
# treated as leaked and regenerated.
api_key = os.environ["TWITTER_API_KEY"]
api_secrets = os.environ["TWITTER_API_SECRET"]
access_token = os.environ["TWITTER_ACCESS_TOKEN"]
access_secret = os.environ["TWITTER_ACCESS_SECRET"]
bearer_token = os.environ["TWITTER_BEARER_TOKEN"]

# Authenticate to Twitter
auth = tweepy.OAuthHandler(api_key, api_secrets)
auth.set_access_token(access_token, access_secret)

#### Collect retweet and reply count in two parts to prevent API overload

In [18]:
METRICS_BATCH_SIZE = 200  # requests sent before pausing between the two batches


def collect_public_metrics(frame, client, start, stop):
    """Fill '#replies' and '#retweets' for the rows at positions [start, stop).

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame with the columns 'tweet_id', '#replies' and '#retweets';
        it is modified in place.
    client : tweepy.Client
        Client used to look up each tweet's public metrics.
    start, stop : int
        Positional row range to process; ``stop`` is capped at ``len(frame)``.

    Returns
    -------
    int
        Number of rows for which the lookup failed (those rows are left as is).
    """
    id_col = frame.columns.get_loc('tweet_id')
    reply_col = frame.columns.get_loc('#replies')
    retweet_col = frame.columns.get_loc('#retweets')
    error_types = {}
    for pos in tqdm(range(start, min(stop, len(frame)))):
        tweet_id = frame.iloc[pos, id_col]
        try:
            tweet = client.get_tweet(tweet_id, tweet_fields=["public_metrics"]).data
            metrics = tweet.public_metrics
            # Read both values before writing so a row is never half-updated.
            reply_count = metrics['reply_count']
            retweet_count = metrics['retweet_count']
            frame.iloc[pos, reply_col] = reply_count
            frame.iloc[pos, retweet_col] = retweet_count
        except Exception as e:
            # Best effort: keep going, but remember what went wrong.
            error_name = type(e).__name__
            error_types[error_name] = error_types.get(error_name, 0) + 1
    failed = sum(error_types.values())
    print("exceptions: ", failed)
    if error_types:
        # Make systematic failures (e.g. authentication errors) visible.
        print("exception types: ", error_types)
    return failed


# connect to API once and reuse the client for every request
client = tweepy.Client(bearer_token=bearer_token)

# first batch of requests
j = collect_public_metrics(df, client, 0, METRICS_BATCH_SIZE)

# wait 15+ minutes
countdown(60*15 + 5, 1, 'Sleeping')

# remaining requests
k = collect_public_metrics(df, client, METRICS_BATCH_SIZE, len(df))

100%|██████████| 200/200 [00:37<00:00,  5.28it/s]


exceptions:  200


100%|██████████| 212/212 [00:42<00:00,  5.04it/s]

exceptions:  212





In [None]:
# Display the last 10 rows of df
df.tail(10)

#### Collect verified, #followers, #tweets and #favourites in two parts to prevent API overload

In [19]:
USER_BATCH_SIZE = 200  # requests sent before pausing between the two batches


def _twitter_date(timestamp):
    """Build a datetime from the month, day and year tokens of ``timestamp``.

    The string is split on single spaces and tokens 1, 2 and 5 are used,
    so the time of day is ignored.
    """
    tokens = timestamp.split(' ')
    return datetime.strptime('-'.join(tokens[n] for n in (1, 2, 5)), '%b-%d-%Y')


def collect_user_features(frame, api, start, stop):
    """Fill the user-related columns for the rows at positions [start, stop).

    For every row the status with the row's 'tweet_id' is fetched and the
    columns 'verified', '#followers', '#tweets', '#favourites' and
    'account_age' (days between account creation and the tweet) are set.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding 'tweet_id' and the columns listed above; modified in place.
    api : tweepy.API
        API object used to fetch each status.
    start, stop : int
        Positional row range to process; ``stop`` is capped at ``len(frame)``.

    Returns
    -------
    int
        Number of rows for which the lookup failed (those rows are left as is).
    """
    id_col = frame.columns.get_loc('tweet_id')
    error_types = {}
    for pos in tqdm(range(start, min(stop, len(frame)))):
        tweet_id = frame.iloc[pos, id_col]
        try:
            status = api.get_status(id=tweet_id)._json
            user = status['user']
            # account age
            tdelta = _twitter_date(status['created_at']) - _twitter_date(user['created_at'])
            # Collect every value first so a row is never half-updated.
            values = {
                'verified': user['verified'],
                '#followers': user['followers_count'],
                '#tweets': user['statuses_count'],
                '#favourites': status['favorite_count'],
                'account_age': abs(tdelta.days),
            }
            for column, value in values.items():
                frame.iloc[pos, frame.columns.get_loc(column)] = value
        except Exception as e:
            # Best effort: keep going, but remember what went wrong.
            # (Exception, not a bare except, so Ctrl-C still interrupts the loop.)
            error_name = type(e).__name__
            error_types[error_name] = error_types.get(error_name, 0) + 1
    if error_types:
        # Make systematic failures (e.g. authentication errors) visible.
        print("exception types: ", error_types)
    return sum(error_types.values())


# connect to API once and reuse it for every request
api = tweepy.API(auth)

# first batch of requests
l = collect_user_features(df, api, 0, USER_BATCH_SIZE)

# wait 15+ minutes
countdown(60*15 + 5, 1, 'Sleeping')

# remaining requests
m = collect_user_features(df, api, USER_BATCH_SIZE, len(df))

In [None]:
# Failure counters of the first (l) and second (m) batch, one per line
print(l, m, sep='\n')

In [None]:
# Display the (rows, columns) shape of df
df.shape

In [None]:
# Drop the rows in which all of the columns below are still 0
# (noted by the author as 96 non-existing profiles).
api_columns = ['#replies', '#retweets', 'verified', '#followers', '#tweets', '#favourites']
is_empty = df[api_columns[0]] == 0
for column in api_columns[1:]:
    is_empty &= df[column] == 0
no_exist_idx = df.index[is_empty]
df = df.drop(index=no_exist_idx)
df.shape

In [None]:
# Work on an independent copy of df from here on.
# Note: this re-binds df2, which held the raw source tweets earlier in the notebook.
df2 = df.copy(deep=True)

In [None]:
# map verified profile column
mapping2_dict = {True: 1, False: 0}
df2['verified'] = df2['verified'].map(mapping2_dict)
df2.head()

### 4. User engagement metric

In [None]:
# user_engagement = (#tweets + #retweets + #replies + #favourites) / account_age
interactions = df2['#tweets'] + df2['#retweets'] + df2['#replies'] + df2['#favourites']
# An account_age of 0 would turn the ratio into inf; mask it so the ratio
# becomes NaN (i.e. "undefined") for those rows instead.
account_age = df2['account_age'].where(df2['account_age'] != 0)
df2['user_engagement'] = interactions / account_age
df2.head()

### 5. Sentiment score

In [None]:
# compute sentiment score for tweets
tweets = df2['tweet'].to_list()

analyzer = SentimentIntensityAnalyzer()

# Store the 'compound' value of polarity_scores for every tweet. The column is
# assigned by name in one step, so it does not depend on the column position
# and is not first created as an integer column that is then filled with floats.
df2['sentiment_score'] = [analyzer.polarity_scores(text)['compound'] for text in tweets]

In [None]:
# Display the first rows of df2
df2.head()

### 6. Export dataframe

In [None]:
# Write df2 to a CSV file, leaving out the index
df2.to_csv(path_or_buf='./twitter16_full.csv', index=False)