Input authentication keys from the Twitter developers' dashboard

In [1]:
consumer_key = ''
consumer_secret = ''
access_token_key = ''
access_token_secret = ''

Import necessary packages

In [2]:
import twitter  # For API
import pandas as pd  # For JSONL file handling

# Part 1: Retrieving tweets

Authenticate the API using the keys obtained from Twitter.

In [3]:
api = twitter.Api(
    consumer_key,
    consumer_secret,
    access_token_key,
    access_token_secret
)

The API returns responses in a `Status` type (sa seen below), defined by the author of the module. We need to convert it to a dictionary for further usage

In [4]:
type(api.GetUserTimeline(screen_name='midasiiitd', count=200)[0])

twitter.models.Status

Request maximum possible tweets by username `midasiiitd` from the API. A maximum of 200 tweets can be retrieved per request, as according to the [official twitter docs](https://developer.twitter.com/en/docs/tweets/timelines/api-reference/get-statuses-user_timeline.html#parameters).

In [5]:
tweets = [
    tweet.AsDict() for tweet 
    in api.GetUserTimeline(screen_name='midasiiitd', count=200)
]

An example response

In [6]:
tweets[0]

{'created_at': 'Fri Apr 05 16:08:37 +0000 2019',
 'favorite_count': 7,
 'hashtags': [],
 'id': 1114198161562775553,
 'id_str': '1114198161562775553',
 'lang': 'en',
 'retweet_count': 1,
 'source': '<a href="http://twitter.com" rel="nofollow">Twitter Web Client</a>',
 'text': 'We have emailed the task details to all candidates who have applied to @midasIIITD internship through IIITD portal.… https://t.co/gZwyr7D2Sw',
 'truncated': True,
 'urls': [{'expanded_url': 'https://twitter.com/i/web/status/1114198161562775553',
   'url': 'https://t.co/gZwyr7D2Sw'}],
 'user': {'created_at': 'Mon Jul 23 11:26:04 +0000 2018',
  'default_profile': True,
  'description': 'MIDAS is a group of researchers at IIIT-Delhi who study, analyze, and build different multimedia systems for society leveraging multimodal information.',
  'favourites_count': 140,
  'followers_count': 233,
  'following': True,
  'friends_count': 42,
  'id': 1021355762575073281,
  'id_str': '1021355762575073281',
  'lang': 'en',
  'l

Create a pandas `DataFrame` from the list of dictionaries we obtained from the API.

In [7]:
tweets_df = pd.DataFrame(tweets)

In [8]:
tweets_df.head()

Unnamed: 0,created_at,favorite_count,hashtags,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,media,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted_status,source,text,truncated,urls,user,user_mentions
0,Fri Apr 05 16:08:37 +0000 2019,7.0,[],1114198161562775553,1114198161562775553,,,,en,,...,,,1.0,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",We have emailed the task details to all candid...,True,[{'expanded_url': 'https://twitter.com/i/web/s...,{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 1021355762575073281, 'id_str': '102135..."
1,Fri Apr 05 04:05:11 +0000 2019,,[],1114016105079693312,1114016105079693312,,,,en,,...,,,16.0,{'created_at': 'Thu Apr 04 20:44:40 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @rfpvjr: Our NAACL paper on polarization in...,,[{'expanded_url': 'https://arxiv.org/abs/1904....,{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 399948170, 'id_str': '399948170', 'nam..."
2,Fri Apr 05 04:04:43 +0000 2019,,[],1114015987395854336,1114015987395854336,,,,en,"[{'display_url': 'pic.twitter.com/ccX4Uhxjn8',...",...,,,10.0,{'created_at': 'Fri Apr 05 04:00:11 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @kdnuggets: Effective Transfer Learning For...,,"[{'expanded_url': 'https://buff.ly/2K7XvvN', '...",{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 20167623, 'id_str': '20167623', 'name'..."
3,Wed Apr 03 18:31:53 +0000 2019,,[],1113509442849525760,1113509442849525760,,,,en,,...,,,55.0,{'created_at': 'Wed Apr 03 17:07:30 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @stanfordnlp: What’s new in @Stanford CS224...,,[],{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 118263124, 'id_str': '118263124', 'nam..."
4,Wed Apr 03 17:04:32 +0000 2019,,[],1113487457780215808,1113487457780215808,,,,en,,...,,,841.0,{'created_at': 'Wed Apr 03 12:38:17 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @DeepMindAI: Today we're releasing a large-...,,[],{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 4783690002, 'id_str': '4783690002', 'n..."


Finally, dump the DataFrame in `jsonl` format.

In [9]:
tweets_df.to_json('tweets.jsonl', orient='records', lines=True)

# Part 2: Reading the tweets' data from `jsonl` file

In [10]:
tweets_df = pd.read_json('tweets.jsonl', lines=True)

In [11]:
tweets_df.head()

Unnamed: 0,created_at,favorite_count,hashtags,id,id_str,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,lang,media,...,quoted_status_id,quoted_status_id_str,retweet_count,retweeted_status,source,text,truncated,urls,user,user_mentions
0,2019-04-05 16:08:37,7.0,[],1114198161562775553,1114198161562775552,,,,en,,...,,,1.0,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",We have emailed the task details to all candid...,1.0,[{'expanded_url': 'https://twitter.com/i/web/s...,{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 1021355762575073281, 'id_str': '102135..."
1,2019-04-05 04:05:11,,[],1114016105079693312,1114016105079693312,,,,en,,...,,,16.0,{'created_at': 'Thu Apr 04 20:44:40 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @rfpvjr: Our NAACL paper on polarization in...,,[{'expanded_url': 'https://arxiv.org/abs/1904....,{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 399948170, 'id_str': '399948170', 'nam..."
2,2019-04-05 04:04:43,,[],1114015987395854336,1114015987395854336,,,,en,"[{'display_url': 'pic.twitter.com/ccX4Uhxjn8',...",...,,,10.0,{'created_at': 'Fri Apr 05 04:00:11 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @kdnuggets: Effective Transfer Learning For...,,"[{'expanded_url': 'https://buff.ly/2K7XvvN', '...",{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 20167623, 'id_str': '20167623', 'name'..."
3,2019-04-03 18:31:53,,[],1113509442849525760,1113509442849525760,,,,en,,...,,,55.0,{'created_at': 'Wed Apr 03 17:07:30 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @stanfordnlp: What’s new in @Stanford CS224...,,[],{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 118263124, 'id_str': '118263124', 'nam..."
4,2019-04-03 17:04:32,,[],1113487457780215808,1113487457780215808,,,,en,,...,,,841.0,{'created_at': 'Wed Apr 03 12:38:17 +0000 2019...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @DeepMindAI: Today we're releasing a large-...,,[],{'created_at': 'Mon Jul 23 11:26:04 +0000 2018...,"[{'id': 4783690002, 'id_str': '4783690002', 'n..."


Let us first see what are all the columns we have

In [12]:
tweets_df.columns

Index(['created_at', 'favorite_count', 'hashtags', 'id', 'id_str',
       'in_reply_to_screen_name', 'in_reply_to_status_id',
       'in_reply_to_user_id', 'lang', 'media', 'quoted_status',
       'quoted_status_id', 'quoted_status_id_str', 'retweet_count',
       'retweeted_status', 'source', 'text', 'truncated', 'urls', 'user',
       'user_mentions'],
      dtype='object')

From above, we can see that we'll need following columns:

Column | Needed for
--|--
`text` | The text of the tweet
`created_at` | Date and time of the tweet
`favorite_count` | The number of favorites/likes
`retweet_count` | The number of retweets
`images_count`* | Number of Images present in Tweet

*We will create a new column `images _count` using a function that will count the photo entries in the `media` column of the tweet.

If a file is associated with a tweet, the corresponding `media` column is a list of dictionaries, containing detailed info about each of the media files (as shown in by the code cell below). One of the info included is the type of file. We will seperate out the ones with `type` attribute set to `photo`.

In [13]:
# Get the index to the first non-NaN entry in media column
idx = tweets_df.index[~tweets_df.media.isna()][0]

# Print the attributes of the first file in the media list
tweets_df.media[idx][0].keys()

dict_keys(['display_url', 'expanded_url', 'id', 'media_url', 'media_url_https', 'sizes', 'type', 'url'])

In [14]:
def count_images(media_files):
    if media_files:  # Check for NaN values
        count = sum([media['type'] == 'photo' for media in media_files])
    else:
        count = 0

    return count

Map the above function to the `media` column, and make a new column named `images_count`.

In [15]:
tweets_df['images_count'] = tweets_df.media.apply(count_images)

Finally, fill all the `NaN` values with 0 and display the required columns.

In [16]:
tweets_df.fillna(0,  inplace=True)

# Final Output

In [18]:
tweets_df.loc[:, ['text', 'created_at', 'favorite_count', 'retweet_count', 'images_count']]

Unnamed: 0,text,created_at,favorite_count,retweet_count,images_count
0,We have emailed the task details to all candid...,2019-04-05 16:08:37,7.0,1.0,0
1,RT @rfpvjr: Our NAACL paper on polarization in...,2019-04-05 04:05:11,0.0,16.0,0
2,RT @kdnuggets: Effective Transfer Learning For...,2019-04-05 04:04:43,0.0,10.0,1
3,RT @stanfordnlp: What’s new in @Stanford CS224...,2019-04-03 18:31:53,0.0,55.0,0
4,RT @DeepMindAI: Today we're releasing a large-...,2019-04-03 17:04:32,0.0,841.0,0
5,RT @ylecun: Congratulations Jitendra Malik !\n...,2019-04-03 09:03:40,0.0,16.0,0
6,RT @IIITDelhi: Another chance to take admissio...,2019-04-03 07:46:02,0.0,4.0,0
7,Dear @midasIIITD internship candidates who hav...,2019-04-02 04:20:13,8.0,1.0,0
8,Looking forward to your paper submission to @I...,2019-04-02 02:44:54,5.0,1.0,0
9,RT @ngrams: Reproducibility in multimedia rese...,2019-04-02 02:35:44,0.0,7.0,0
