# Explore Twitter news comments
Let's look at the comments on news articles posted on Twitter to determine how representative they are of different audiences.

In [4]:
import pandas as pd
auth_data = pd.read_csv('../../data/auth_data/twitter_auth.csv', sep=',', index_col=False).iloc[0, :]

In [68]:
import twitter
twitter_api = twitter.Api(consumer_key=auth_data.loc['consumer_api_key'],
                          consumer_secret=auth_data.loc['consumer_api_key_secret'],
                          access_token_key=auth_data.loc['auth_access_token'],
                          access_token_secret=auth_data.loc['auth_access_token_secret'])

In [12]:
test_twitter_account = 'nytimes'
test_twitter_timeline = twitter_api.GetUserTimeline(screen_name=test_twitter_account)
print(test_twitter_timeline)

[Status(ID=1358850596200136706, ScreenName=nytimes, Created=Mon Feb 08 18:50:03 +0000 2021, Text='The discovery of a mysterious metal slab in Turkey was a throwback to November, when a shiny monolith appeared in t… https://t.co/nJRrRJjFNZ'), Status(ID=1358849355030142977, ScreenName=nytimes, Created=Mon Feb 08 18:45:07 +0000 2021, Text='Dogecoin, a digital currency created as a joke, is now the world’s 10th largest cryptocurrency. Its price is up by… https://t.co/eKc609Dqpd'), Status(ID=1358848086748053505, ScreenName=nytimes, Created=Mon Feb 08 18:40:04 +0000 2021, Text='Three climbers who set out to scale the summit of K2, the world’s second-tallest mountain, have been missing since… https://t.co/liVhNrIzt1'), Status(ID=1358847932854919168, ScreenName=nytimes, Created=Mon Feb 08 18:39:28 +0000 2021, Text='Rescue efforts were underway on Monday in Uttarakhand, where a Himalayan glacier broke and caused massive flooding.… https://t.co/eWFl2c9SB1'), Status(ID=1358846839076839426, Screen

Let's look for posts within a small time frame.

In [71]:
start_date_str

'2018-04-01'

In [74]:
# help(twitter_api.GetSearch)
from datetime import datetime
test_twitter_account = 'nytimes'
start_date = (2018, 4, 1)
end_date = (2018, 5, 1)
start_date_str = '%d-%.2d-%.2d'%start_date
end_date_str = '%d-%.2d-%.2d'%end_date
## TODO: how to get tweets from time interval from specific account?
# test_raw_query = f'q=from:({test_twitter_account}) until:{end_date_str} since:{start_date_str})'
test_raw_query = f'q=from%3A{test_twitter_account} until%3A{end_date_str}'
# start_time = datetime(*start_date)
# end_time = datetime(*end_date)
# test_posts = twitter_api.GetSearch(term='@nytimes')
test_posts = twitter_api.GetSearch(raw_query=test_raw_query)

TwitterError: [{'code': 32, 'message': 'Could not authenticate you.'}]

Update: to do the full archive search we need to write the query from scratch? Cool.

Following this recipe: https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Full-Archive-Search/full-archive-search.py

In [166]:
import requests
from datetime import datetime
import json
from time import sleep
def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers
SLEEP_TIME=60*5
def connect_to_endpoint(search_url, headers, params):
    success = False
    while(not success):
        response = requests.request("GET", search_url, headers=headers, params=params)
    #     print(response.status_code)
        # rate-limit => sleep to recover
        if(response.status_code == 429):
            print(f'sleeping for {SLEEP_TIME} because rate limit error {response}')
            sleep(SLEEP_TIME)
        elif(response.status_code != 200):
            raise Exception(response.status_code, response.text)
        else:
            success = True
    return response.json()

def collect_all_tweets(search_url, headers, query_params):
    combined_tweets = []
    has_next_page = True
    ctr = 0
    while(has_next_page):
        json_response = connect_to_endpoint(search_url, headers, query_params)
        if('data' in json_response):
            combined_tweets.extend(json_response['data'])
        has_next_page = ('meta' in json_response) and ('next_token' in json_response['meta'])
        if(has_next_page):
            query_params['next_token'] = json_response['meta']['next_token']
        ctr += 1
        if(ctr % 10 == 0):
            print(f'collected {len(combined_tweets)} total')
    return combined_tweets
    
bearer_token = auth_data.loc['bearer_token']
search_url = "https://api.twitter.com/2/tweets/search/all"
# Optional params: start_time,end_time,since_id,until_id,max_results,next_token,
# expansions,tweet.fields,media.fields,poll.fields,place.fields,user.fields
user_account = 'nytimes'
start_date = datetime(year=2018, month=4, day=1)
end_date = datetime(year=2018, month=5, day=1)

time_zone = '-05:00'
date_fmt = f'%Y-%m-%dT%H:%M:%S{time_zone}'
query_params = {'query': f'(from:{user_account})', 
                'start_time' : f'{datetime.strftime(start_date, date_fmt)}', 
                'end_time':f'{datetime.strftime(end_date, date_fmt)}',
                'max_results' : 500,
                'tweet.fields' : 'id,text,conversation_id,created_at',
#                 'media.fields' : ['url'] # TODO: get full URL
               }
headers = create_headers(bearer_token)
query_tweets = collect_all_tweets(search_url, headers, query_params)
print(json.dumps(query_tweets))
# print(len(json_response))
# print(json.dumps(json_response, indent=4, sort_keys=True))



In [169]:
## write to file
import pandas as pd
out_file = '../../data/twitter_data/sample_nytimes_data.gz'
query_tweet_data = pd.DataFrame(query_tweets)
query_tweet_data.to_csv(out_file, sep='\t', compression='gzip', index=False)

In [170]:
query_tweet_data.loc[:, 'text'].iloc[0]

"The new CDC director's $375,000 salary is getting cut https://t.co/8VobyiPlRe"

In [131]:
## expand URLs in tweet
import http.client
# def expandURL(link):
#     conn = http.client.HTTPSConnection(link) #use HTTPS !
#     conn.request('HEAD')
#     response = conn.getresponse()
#     return response.getheader('location')
# import urllib3
test_url = "https://t.co/8VobyiPlRe"
import requests
def expand_url(link):
    response = requests.get(link)
    full_url = response.url
    return full_url
test_full_url = expand_url(test_url)
print(test_full_url)

https://www.nytimes.com/2018/04/30/health/cdc-director-salary-redfield.html?smid=tw-nytimes&smtyp=cur


In [171]:
## expand all URLs
import re
url_matcher = re.compile('https://t\.co[^…#]+$') # ignore ellipsis, hashtags
query_tweet_data = query_tweet_data.assign(**{
    'post_URL' : query_tweet_data.loc[:, 'text'].apply(lambda x: url_matcher.findall(x) if url_matcher.search(x) is not None else None)
})
url_query_tweet_data = query_tweet_data[query_tweet_data.loc[:, 'post_URL'].apply(lambda x: x is not None)]
from time import sleep
SLEEP_TIME = 1 # need sleep time to avoid IP getting blocked lol
def retrieve_urls(links):
    full_URLs = []
    for link in links:
        full_URL = expand_url(link)
        full_URLs.append(full_URL)
        sleep(SLEEP_TIME)
    return full_URLs
from tqdm import tqdm
tqdm.pandas()
url_query_tweet_data = url_query_tweet_data.assign(**{
    'full_URLs' : url_query_tweet_data.loc[:, 'post_URL'].progress_apply(lambda x: retrieve_urls(x))
})
## cleanup URL end text

  from pandas import Panel
100%|██████████| 2930/2930 [1:38:23<00:00,  2.01s/it]


In [172]:
import re
url_suffix_matcher = re.compile('(?<=html)\?.+$')
# test_url = 'https://www.nytimes.com/2018/04/30/us/politics/questions-mueller-wants-to-ask-trump-russia.html?partner=rss&emc=rss&smid=tw-nytimes&smtyp=cur'
# url_suffix_matcher.sub('', test_url)
url_query_tweet_data = url_query_tweet_data.assign(**{
    'clean_URLs' : url_query_tweet_data.loc[:, 'full_URLs'].apply(lambda x: list(map(lambda y: url_suffix_matcher.sub('', y), x)))
})

In [173]:
url_query_tweet_data.loc[:, 'clean_URLs'].iloc[0]

['https://www.nytimes.com/2018/04/30/health/cdc-director-salary-redfield.html']

How many of these tweets overlap with the articles that we already have?

In [174]:
news_article_data = pd.read_csv('../../data/nyt_comments/ArticlesApril2018.csv', sep=',', index_col=False)
display(news_article_data.head())

Unnamed: 0,articleID,articleWordCount,byline,documentType,headline,keywords,multimedia,newDesk,printPage,pubDate,sectionName,snippet,source,typeOfMaterial,webURL
0,5adf6684068401528a2aa69b,781,By JOHN BRANCH,article,Former N.F.L. Cheerleaders’ Settlement Offer: ...,"['Workplace Hazards and Violations', 'Football...",68,Sports,0,2018-04-24 17:16:49,Pro Football,"“I understand that they could meet with us, pa...",The New York Times,News,https://www.nytimes.com/2018/04/24/sports/foot...
1,5adf653f068401528a2aa697,656,By LISA FRIEDMAN,article,E.P.A. to Unveil a New Rule. Its Effect: Less ...,"['Environmental Protection Agency', 'Pruitt, S...",68,Climate,0,2018-04-24 17:11:21,Unknown,The agency plans to publish a new regulation T...,The New York Times,News,https://www.nytimes.com/2018/04/24/climate/epa...
2,5adf4626068401528a2aa628,2427,By PETE WELLS,article,"The New Noma, Explained","['Restaurants', 'Noma (Copenhagen, Restaurant)...",66,Dining,0,2018-04-24 14:58:44,Unknown,What’s it like to eat at the second incarnatio...,The New York Times,News,https://www.nytimes.com/2018/04/24/dining/noma...
3,5adf40d2068401528a2aa619,626,By JULIE HIRSCHFELD DAVIS and PETER BAKER,article,Unknown,"['Macron, Emmanuel (1977- )', 'Trump, Donald J...",68,Washington,0,2018-04-24 14:35:57,Europe,President Trump welcomed President Emmanuel Ma...,The New York Times,News,https://www.nytimes.com/2018/04/24/world/europ...
4,5adf3d64068401528a2aa60f,815,By IAN AUSTEN and DAN BILEFSKY,article,Unknown,"['Toronto, Ontario, Attack (April, 2018)', 'Mu...",68,Foreign,0,2018-04-24 14:21:21,Canada,"Alek Minassian, 25, a resident of Toronto’s Ri...",The New York Times,News,https://www.nytimes.com/2018/04/24/world/canad...


In [175]:
news_article_URLs = set(news_article_data.loc[:, 'webURL'].unique())
url_query_tweet_data = url_query_tweet_data.assign(**{
    'overlap_URLs' : url_query_tweet_data.loc[:, 'clean_URLs'].apply(lambda x: list(set(x) & news_article_URLs))
})

In [176]:
overlap_news_tweet_articles = set([y for x in url_query_tweet_data.loc[:, 'overlap_URLs'].values for y in x])
print(f'{len(overlap_news_tweet_articles)}/{len(news_article_URLs)} articles mentioned by at least one tweet')

556/1324 articles mentioned by at least one tweet


OK! We can recover about 42% of all observed articles from the Twitter data, which is pretty good considering the delay between data collection and data generation.

Let's save this data for later.

In [177]:
url_query_tweet_data.to_csv('../../data/twitter_data/sample_nytimes_URL_data.gz', sep='\t', compression='gzip', index=False)

### Assess engagement via replies

Now that we have a reasonable amount of posts, let's see if we can get reply-tweets.

Key question 1: do people tend to ask (1) relevant and (2) answerable-by-rewriting questions?

Key question 2: do we have a reasonable diversity of authors in terms of {geography, age, experience}?

In [None]:
auth_data = pd.read_csv('../../data/auth_data/twitter_auth.csv').iloc[0, :]
bearer_token = auth_data.loc['bearer_token']

In [201]:
url_query_tweet_data.head()

Unnamed: 0,id,text,created_at,conversation_id,post_URL,full_URLs,clean_URLs,overlap_URLs
0,991179622137163776,"The new CDC director's $375,000 salary is gett...",2018-05-01T04:56:51.000Z,991179622137163776,[https://t.co/8VobyiPlRe],[https://www.nytimes.com/2018/04/30/health/cdc...,[https://www.nytimes.com/2018/04/30/health/cdc...,[]
1,991175159829680129,President Trump was long advised to avoid twee...,2018-05-01T04:39:07.000Z,991175159829680129,[https://t.co/5HoElKHf7W],[https://www.nytimes.com/2018/04/30/us/politic...,[https://www.nytimes.com/2018/04/30/us/politic...,[]
3,991165837896798209,Detroit was crumbling. Here’s how it’s revivin...,2018-05-01T04:02:05.000Z,991165837896798209,[https://t.co/dnMPpUyJqp https://t.co/ddgE8gKfZc],[https://t.co/dnMPpUyJqp%20https://t.co/ddgE8g...,[https://t.co/dnMPpUyJqp%20https://t.co/ddgE8g...,[]
5,991158296508854272,A T-Mobile-Sprint merger could affect your cel...,2018-05-01T03:32:07.000Z,991158296508854272,[https://t.co/lXGHqfto7T],[https://www.nytimes.com/2018/04/30/business/t...,[https://www.nytimes.com/2018/04/30/business/t...,[]
6,991156012165095424,Today was the deadliest day for journalists in...,2018-05-01T03:23:02.000Z,991156012165095424,[https://t.co/1WE5bV0PLl],[https://www.nytimes.com/2018/04/30/world/asia...,[https://www.nytimes.com/2018/04/30/world/asia...,[https://www.nytimes.com/2018/04/30/world/asia...


In [204]:
## get time value
from datetime import datetime
import re
date_fmt = '%Y-%m-%d'
url_query_tweet_data = url_query_tweet_data.assign(**{
    'date' : url_query_tweet_data.loc[:, 'created_at'].apply(lambda x: datetime.strptime(x.split('T')[0], date_fmt))
})
display(url_query_tweet_data.loc[:, ['date', 'created_at']].head())

Unnamed: 0,date,created_at
0,2018-05-01,2018-05-01T04:56:51.000Z
1,2018-05-01,2018-05-01T04:39:07.000Z
3,2018-05-01,2018-05-01T04:02:05.000Z
5,2018-05-01,2018-05-01T03:32:07.000Z
6,2018-05-01,2018-05-01T03:23:02.000Z


In [205]:
## how to get tweet replies?? query conversation ID: https://developer.twitter.com/en/docs/twitter-api/conversation-id
from datetime import timedelta
tweet_reply_sample_size = 50
test_query_tweet_data = url_query_tweet_data.iloc[10]
test_tweet_id = test_query_tweet_data.loc['conversation_id']
# tmp debugging
# test_tweet_id = '1361756606896209924' # 2021-02-16 article
# test_tweet_id = '1361755900764110848' # 2021-02-16 article
search_url = "https://api.twitter.com/2/tweets/search/all"
# restrict replies to 1 week after post
reply_date_limit = 7
start_date = test_query_tweet_data.loc['date']
end_date = start_date + timedelta(days=reply_date_limit)
# start_date = datetime(year=2018, month=4, day=1)
# end_date = datetime(year=2018, month=4, day=8)

## mine stuff
from importlib import reload
import data_helpers
reload(data_helpers)
from data_helpers import create_headers, collect_all_tweets

time_zone = '-05:00'
date_fmt = f'%Y-%m-%dT%H:%M:%S{time_zone}'
query_params = {
    'query': f'conversation_id:{test_tweet_id}', 
    # tmp debugging
#     'query' : f'from:nytimes', 
    'start_time' : f'{datetime.strftime(start_date, date_fmt)}', 
    'end_time':f'{datetime.strftime(end_date, date_fmt)}',
    'max_results' : tweet_reply_sample_size,
    'tweet.fields' : 'id,text,created_at',
#     'user.fields' : ['username', 'description', 'location', 'verified'],
               }
headers = create_headers(bearer_token)
query_tweets = collect_all_tweets(search_url, headers, query_params, verbose=True)
print(json.dumps(query_tweets))

sleeping for 300 because rate limit error <Response [429]>
sleeping for 300 because rate limit error <Response [429]>
sleeping for 300 because rate limit error <Response [429]>
sleeping for 300 because rate limit error <Response [429]>
sleeping for 300 because rate limit error <Response [429]>
sleeping for 300 because rate limit error <Response [429]>
[{"created_at": "2018-05-07T05:58:34.000Z", "id": "993369478921629697", "text": "@nytimes Blah blah. There are more important things in life."}, {"created_at": "2018-05-04T23:27:25.000Z", "id": "992546268936196096", "text": "@nytimes Your the Criminal Muller"}, {"created_at": "2018-05-03T16:56:04.000Z", "id": "992085394232561664", "text": "@nytimes President Trump! Go tell Robert Mueller to go pound sand in his ass. Witch Hunt fake news!"}, {"created_at": "2018-05-03T05:47:16.000Z", "id": "991917082815021056", "text": "@nytimes Funny like 24 hrs after I said this, Trump is \u201crestating\u201d it."}, {"created_at": "2018-05-03T01:51:45.0

OK! We got some example replies.

Let's scale this up and query 100 comments from 10 articles.

In [274]:
import numpy as np
np.random.seed(123)
from importlib import reload
import data_helpers
reload(data_helpers)
from data_helpers import create_headers, collect_all_tweets

sample_reply_size = 10 # TODO: this applies to number of questions, not number of replies
sample_article_size = 50
# get articles that overlap with NYT data for direct comparison
sample_tweet_ids = np.random.choice(url_query_tweet_data[url_query_tweet_data.loc[:, 'overlap_URLs'].apply(lambda x: len(x) > 0)].loc[:, 'id'].unique(),
                                    sample_article_size, replace=False)
tweet_collection_day_range = 1 # only want 1 day after for maximum likelihood of direct reply
max_tweets = 500
sample_article_tweet_replies = []
for sample_tweet_id_i in tqdm(sample_tweet_ids):
#     print(f'collecting replies for {sample_tweet_id_i}')
    start_date_i = url_query_tweet_data[url_query_tweet_data.loc[:, 'id']==sample_tweet_id_i].loc[:, 'date'].min()
    end_date_i = start_date_i + timedelta(days=tweet_collection_day_range)
    time_zone = '-05:00'
    date_fmt = f'%Y-%m-%dT%H:%M:%S{time_zone}'
    query_params = {
        'query': f'conversation_id:{sample_tweet_id_i}', 
        # tmp debugging
    #     'query' : f'from:nytimes', 
        'start_time' : f'{datetime.strftime(start_date_i, date_fmt)}', 
        'end_time':f'{datetime.strftime(end_date_i, date_fmt)}',
        'max_results' : max_tweets,
        'tweet.fields' : 'id,text,created_at', # need "referenced_tweets.id" to get the immediate parent
        'user.fields' : 'id,name,username,description,location,verified',
        'expansions' : 'author_id,referenced_tweets.id',
    }
    headers = create_headers(bearer_token)
    reply_tweets_i = collect_all_tweets(search_url, headers, query_params, verbose=True, max_tweets=max_tweets)
    if(len(reply_tweets_i) > 0):
        reply_tweets_i = reply_tweets_i.assign(**{'conversation_id' : sample_tweet_id_i})
        sample_article_tweet_replies.append(reply_tweets_i)

  4%|▍         | 2/50 [00:01<00:27,  1.77it/s]

sleeping for 300 because rate limit error <Response [429]>


  8%|▊         | 4/50 [05:02<48:46, 63.62s/it]  

sleeping for 300 because rate limit error <Response [429]>


 10%|█         | 5/50 [10:03<1:41:07, 134.84s/it]

sleeping for 300 because rate limit error <Response [429]>


 12%|█▏        | 6/50 [15:04<2:15:23, 184.62s/it]

sleeping for 300 because rate limit error <Response [429]>


 14%|█▍        | 7/50 [20:05<2:37:17, 219.48s/it]

sleeping for 300 because rate limit error <Response [429]>


 18%|█▊        | 9/50 [25:06<1:56:47, 170.91s/it]

sleeping for 300 because rate limit error <Response [429]>


 22%|██▏       | 11/50 [30:08<1:35:36, 147.10s/it]

sleeping for 300 because rate limit error <Response [429]>


 28%|██▊       | 14/50 [35:09<56:57, 94.93s/it]   

sleeping for 300 because rate limit error <Response [429]>


 32%|███▏      | 16/50 [40:11<1:02:14, 109.84s/it]

sleeping for 300 because rate limit error <Response [429]>


 34%|███▍      | 17/50 [45:12<1:31:56, 167.16s/it]

sleeping for 300 because rate limit error <Response [429]>


 36%|███▌      | 18/50 [50:13<1:50:31, 207.25s/it]

sleeping for 300 because rate limit error <Response [429]>


 40%|████      | 20/50 [55:14<1:22:26, 164.88s/it]

sleeping for 300 because rate limit error <Response [429]>


 44%|████▍     | 22/50 [1:00:16<1:07:17, 144.19s/it]

sleeping for 300 because rate limit error <Response [429]>


 48%|████▊     | 24/50 [1:05:17<58:03, 133.99s/it]  

sleeping for 300 because rate limit error <Response [429]>


 52%|█████▏    | 26/50 [1:10:19<51:36, 129.03s/it]  

sleeping for 300 because rate limit error <Response [429]>


 56%|█████▌    | 28/50 [1:15:20<46:23, 126.54s/it]  

sleeping for 300 because rate limit error <Response [429]>


 60%|██████    | 30/50 [1:20:22<41:48, 125.41s/it]  

sleeping for 300 because rate limit error <Response [429]>


 66%|██████▌   | 33/50 [1:25:24<24:47, 87.49s/it] 

sleeping for 300 because rate limit error <Response [429]>


 68%|██████▊   | 34/50 [1:30:25<40:24, 151.55s/it]

sleeping for 300 because rate limit error <Response [429]>


 72%|███████▏  | 36/50 [1:35:26<32:05, 137.57s/it]

sleeping for 300 because rate limit error <Response [429]>


 74%|███████▍  | 37/50 [1:40:27<40:25, 186.56s/it]

sleeping for 300 because rate limit error <Response [429]>


 76%|███████▌  | 38/50 [1:45:28<44:10, 220.87s/it]

sleeping for 300 because rate limit error <Response [429]>


 78%|███████▊  | 39/50 [1:50:29<44:53, 244.90s/it]

sleeping for 300 because rate limit error <Response [429]>


 80%|████████  | 40/50 [1:55:30<43:37, 261.78s/it]

sleeping for 300 because rate limit error <Response [429]>


 82%|████████▏ | 41/50 [2:00:31<41:01, 273.53s/it]

sleeping for 300 because rate limit error <Response [429]>


 86%|████████▌ | 43/50 [2:05:33<23:02, 197.50s/it]

sleeping for 300 because rate limit error <Response [429]>


 88%|████████▊ | 44/50 [2:10:34<22:51, 228.50s/it]

sleeping for 300 because rate limit error <Response [429]>


 90%|█████████ | 45/50 [2:15:35<20:51, 250.25s/it]

sleeping for 300 because rate limit error <Response [429]>


 94%|█████████▍| 47/50 [2:20:36<09:17, 186.00s/it]

sleeping for 300 because rate limit error <Response [429]>


 98%|█████████▊| 49/50 [2:25:38<02:34, 154.55s/it]

sleeping for 300 because rate limit error <Response [429]>


100%|██████████| 50/50 [2:30:39<00:00, 180.79s/it]


In [224]:
# debugging how to get author info
# ## get user info
# from importlib import reload
# import data_helpers
# reload(data_helpers)
# from data_helpers import connect_to_endpoint, collect_all_tweets
# sample_tweet_id_i = sample_tweet_ids[0]
# start_date_i = url_query_tweet_data[url_query_tweet_data.loc[:, 'id']==sample_tweet_id_i].loc[:, 'date'].min()
# end_date_i = start_date_i + timedelta(days=tweet_collection_day_range)
# query_params = {
#     'query' : f'conversation_id:{sample_tweet_id_i}',
#     'start_time' : f'{datetime.strftime(start_date_i, date_fmt)}', 
#     'end_time':f'{datetime.strftime(end_date_i, date_fmt)}',
#     'max_results' : 50,
#     'tweet.fields' : 'id,text,created_at',
#     'user.fields' : 'id,name,username,description,location,verified',
#     'expansions' : 'author_id',
# }
# reply_tweets_i = collect_all_tweets(search_url, headers, query_params, verbose=True, max_tweets=50)
# # json_response = connect_to_endpoint(search_url, headers, query_params)
# # print(json_response)

In [295]:
## join with original articles
sample_article_tweet_reply_data = []
for data_i in sample_article_tweet_replies:
    ## restrict to direct replies, i.e. referenced tweet ID must be same as conversation ID
    if('referenced_tweets' in data_i.columns):
        data_i = data_i.assign(**{
            'tweet_reference_id' : data_i.loc[:, 'referenced_tweets'].apply(lambda x: x[0]['id'])
        })
        data_i = data_i[data_i.loc[:, 'tweet_reference_id'] == data_i.loc[:, 'conversation_id']]
    if(data_i.shape[0] > 0):
        conversation_id_i = data_i.loc[:, 'conversation_id'].iloc[0]
        article_data_i = url_query_tweet_data[url_query_tweet_data.loc[:, 'conversation_id']==conversation_id_i]
        article_data_i.rename(columns={'text' : 'article_title'}, inplace=True)
        data_df_i = pd.merge(data_i, article_data_i.loc[:, ['conversation_id', 'overlap_URLs', 'date', 'article_title']],
                             on='conversation_id', how='left')
        sample_article_tweet_reply_data.append(data_df_i)
sample_article_tweet_reply_data = pd.concat(sample_article_tweet_reply_data, axis=0)
display(sample_article_tweet_reply_data.head())

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sample_article_tweet_reply_data = pd.concat(sample_article_tweet_reply_data, axis=0)


Unnamed: 0,article_title,author_id,conversation_id,created_at,date,description,id,location,name,overlap_URLs,referenced_tweets,text,tweet_reference_id,username,verified
0,President Trump dropped his insistence on an i...,3260065987,981860006604795904,2018-04-05T19:47:56.000Z,2018-04-05,,981981783926419457,,Nanci Boice,[https://www.nytimes.com/2018/04/04/world/midd...,"[{'type': 'replied_to', 'id': '981860006604795...",@nytimes Seriously? He will demand it again w...,981860006604795904,sbnanci,False
1,President Trump dropped his insistence on an i...,2863846840,981860006604795904,2018-04-05T19:47:34.000Z,2018-04-05,For posts in other languages just click Translate,981981694105407489,"Johannesburg, South Africa",Maxine Alves,[https://www.nytimes.com/2018/04/04/world/midd...,"[{'type': 'replied_to', 'id': '981860006604795...",@nytimes Yes they need time to give some money...,981860006604795904,maxine_alves,False
2,President Trump dropped his insistence on an i...,20750015,981860006604795904,2018-04-05T17:55:59.000Z,2018-04-05,,981953610610348033,United States,John Adame,[https://www.nytimes.com/2018/04/04/world/midd...,"[{'type': 'replied_to', 'id': '981860006604795...",@nytimes Trump said he would led the Generals ...,981860006604795904,jadda566,False
3,President Trump dropped his insistence on an i...,915714834037186561,981860006604795904,2018-04-05T15:37:39.000Z,2018-04-05,,981918800848154624,,Daniel Haycocks,[https://www.nytimes.com/2018/04/04/world/midd...,"[{'type': 'replied_to', 'id': '981860006604795...",@nytimes He flips and flops like a flippy flop...,981860006604795904,DanielScottHayc,False
4,President Trump dropped his insistence on an i...,886116398304382976,981860006604795904,2018-04-05T15:09:56.000Z,2018-04-05,,981911825624195072,,黑太阳 BIACK SUN,[https://www.nytimes.com/2018/04/04/world/midd...,"[{'type': 'replied_to', 'id': '981860006604795...",@nytimes https://t.co/UCYwu99LZc,981860006604795904,PeterAmy2017,False


In [296]:
## extract questions
from importlib import reload
import data_helpers
reload(data_helpers)
from data_helpers import extract_questions_all_data
min_question_len = 10
sample_article_tweet_reply_data = sample_article_tweet_reply_data.assign(**{
    'questions' : extract_questions_all_data(sample_article_tweet_reply_data.loc[:, 'text'].values, min_question_len=min_question_len)
})
sample_article_tweet_question_data = sample_article_tweet_reply_data[sample_article_tweet_reply_data.loc[:, 'questions'].apply(lambda x: len(x) > 0)]
print(f'{sample_article_tweet_question_data.shape[0]} questions')
## get equal size sample of questions per article
import numpy as np
np.random.seed(123)
questions_per_article = 20
sample_article_question_label_data = []
for article_i, data_i in sample_article_tweet_question_data.groupby('conversation_id'):
    if(data_i.loc[:, 'article_title'].iloc[0] != '' and type(data_i.loc[:, 'overlap_URLs'].iloc[0]) is list):
        if(data_i.shape[0] > questions_per_article):
            data_i = data_i.loc[np.random.choice(data_i.index, questions_per_article, replace=False), :]
        # one question per tweet
        data_i = data_i.assign(**{'question' : data_i.loc[:, 'questions'].apply(lambda x: x[0]),
                                  'URL' : data_i.loc[:, 'overlap_URLs'].apply(lambda x: x[0] if len(x) > 0 else '')})
        data_i = data_i.loc[:, ['conversation_id', 'article_title', 'date', 'question', 'URL']]
        sample_article_question_label_data.append(data_i)
sample_article_question_label_data = pd.concat(sample_article_question_label_data, axis=0)

186 questions


In [297]:
## write to file
sample_article_tweet_reply_data.to_csv('../../data/twitter_data/sample_nytimes_reply_data.gz', compression='gzip', sep='\t')
sample_article_question_label_data.to_csv('../../data/twitter_data/sample_nytimes_reply_data_labels.tsv', sep='\t', index=False)

After labeling the data, how many valid questions do we have?

In [298]:
sample_article_question_label_data = pd.read_csv('../../data/twitter_data/sample_nytimes_reply_data_labels.tsv',
                                                 sep='\t', index_col=False)
display(sample_article_question_label_data.head())

Unnamed: 0,conversation_id,article_title,date,question,URL,question_is_relevant,question_is_clarification,submission_contains_answer,submission_can_include_question_answer
0,980638438692130818,China Slaps Tariffs on U.S. Products in Warnin...,2018-04-02,@nytimes @damonbethea1 what do they mean by ch...,https://www.nytimes.com/2018/04/01/world/asia/...,1,1,0,1
1,980852200992886785,President Trump suggested Putin visit the Whit...,2018-04-02,@nytimes Does Putin like Big Macs or Quarter P...,https://www.nytimes.com/2018/04/02/world/europ...,1,0,0,0
2,980852200992886785,President Trump suggested Putin visit the Whit...,2018-04-02,"@nytimes He subverts our election, tiris to mu...",https://www.nytimes.com/2018/04/02/world/europ...,1,0,0,0
3,981034071961808896,The EPA announced a plan to relax Obama-era gr...,2018-04-03,@nytimes So they love state’s rights only when...,https://www.nytimes.com/2018/04/02/climate/tru...,1,0,0,0
4,981034071961808896,The EPA announced a plan to relax Obama-era gr...,2018-04-03,Uhmmm... have these peoples ever been on the 405?,https://www.nytimes.com/2018/04/02/climate/tru...,1,0,0,0


In [307]:
pd.set_option('display.max_colwidth', 500)
valid_sample_article_question_data = sample_article_question_label_data[(sample_article_question_label_data.loc[:, 'question_is_relevant']==1) &
                                                                        (sample_article_question_label_data.loc[:, 'question_is_clarification']==1) &
                                                                        (sample_article_question_label_data.loc[:, 'submission_contains_answer']==0) &
                                                                        (sample_article_question_label_data.loc[:, 'submission_can_include_question_answer']==1)]
print(f'{valid_sample_article_question_data.shape[0]}/{sample_article_question_label_data.shape[0]} valid questions')
display(valid_sample_article_question_data.loc[:, ['article_title', 'URL', 'question']].head(10))

34/158 valid questions


Unnamed: 0,article_title,URL,question
0,China Slaps Tariffs on U.S. Products in Warning Shot to Trump https://t.co/krRoVOKAdc,https://www.nytimes.com/2018/04/01/world/asia/china-tariffs-united-states.html,@nytimes @damonbethea1 what do they mean by china defying trump?
15,"The EPA announced a plan to relax Obama-era greenhouse gas rules and said it would make California, which sets its own standards, fall in line https://t.co/OccvkjYaiv",https://www.nytimes.com/2018/04/02/climate/trump-auto-emissions-rules.html,"However, if a state wants to have tuffer standards then why should the Fed's be able to stop that?"
20,"A mentally ill man on suicide watch hanged himself, gang members were allowed to beat other prisoners, and those whose cries for medical attention were ignored resorted to setting fires in their cells. https://t.co/MMA2gkeAaU",https://www.nytimes.com/2018/04/03/us/mississippi-private-prison-abuse.html,@nytimes Are you suggesting this is unique to private prisons?
22,"President Trump dropped his insistence on an immediate withdrawal of troops from Syria after commanders told him they needed time to complete their mission, senior administration officials said https://t.co/Q9PNCEOlQc",https://www.nytimes.com/2018/04/04/world/middleeast/trump-syria-troops.html,@nytimes @GEsfandiari Begs a leading question - what is the mission?
36,"The concerns included unusually large spending on office furniture and first-class travel, as well as certain demands by Scott Pruitt for security coverage https://t.co/s5FBeIDmrX",https://www.nytimes.com/2018/04/05/business/epa-officials-questioned-scott-pruitt.html,@nytimes Isn’t there a Whistleblower Protection Act?
37,"Breaking News: 5 EPA officials were sidelined after raising concerns about the security demands and unusual spending of the agency’s chief, Scott Pruitt https://t.co/Za6DfGGhyS",https://www.nytimes.com/2018/04/05/business/epa-officials-questioned-scott-pruitt.html,@nytimes Isn’t there a Whistleblower Protection Act?
45,"Park Geun-hye, South Korea's impeached and ousted president, was sentenced to 24 years in prison on a variety of criminal charges https://t.co/IHOOJuBb19",https://www.nytimes.com/2018/04/06/world/asia/park-geun-hye-south-korea.html,@nytimes Could we follow suit in the USA?
51,"Long-term use of the medications is surging in the U.S., according to an analysis by The New York Times. One reason: withdrawal symptoms that make it difficult to stop. https://t.co/cYnJS4JOIA",https://www.nytimes.com/2018/04/07/health/antidepressants-withdrawal-prozac-cymbalta.html,@nytimes Why all of a sudden are so many people depressed?
53,"Long-term use of the medications is surging in the U.S., according to an analysis by The New York Times. One reason: withdrawal symptoms that make it difficult to stop. https://t.co/cYnJS4JOIA",https://www.nytimes.com/2018/04/07/health/antidepressants-withdrawal-prozac-cymbalta.html,@nytimes Why is this article so negative about antidepressants?
54,"Long-term use of the medications is surging in the U.S., according to an analysis by The New York Times. One reason: withdrawal symptoms that make it difficult to stop. https://t.co/cYnJS4JOIA",https://www.nytimes.com/2018/04/07/health/antidepressants-withdrawal-prozac-cymbalta.html,How is it that such products are approved by the FDA?


Overall very low validity for questions, due in part to social pressure on Twitter to engage with headline/subjective opinions.