# 02-03 : Combine Twitter Data

In [16]:
import pandas as pd
from datetime import datetime
import json
import pickle
from pprint import pprint
from typing import List, Dict

## Data Load

In [2]:
def read_pickle_tweets(file_path:str) -> List[Dict]:
    """Load pickled tweet objects from ``file_path`` as plain dictionaries.

    The file is unpickled, then every object in it is serialised through its
    own ``.json()`` method and parsed back with ``json.loads``. The result is
    one dictionary per object, in iteration order.

    NOTE: ``pickle.load`` can execute arbitrary code while loading, so only
    point this at pickle files from a trusted source.
    """
    with open(file_path, 'rb') as handle:
        scraped = pickle.load(handle)

    # Round-trip each object through JSON text to obtain a plain dict.
    tweets = []
    for item in scraped:
        tweets.append(json.loads(item.json()))
    return tweets

In [3]:
# Load both raw tweet pickles into lists of dicts.
js_vodacom = read_pickle_tweets('../../data/raw/02_01_tweets.pkl')
js_vodafone = read_pickle_tweets('../../data/raw/02_02_tweets_vodafone.pkl')

# Report how many tweets each file produced.
print(f'Vodacom  : {len(js_vodacom)}')
print(f'Vodafone : {len(js_vodafone)}')

Vodacom  : 983
Vodafone : 1482


## Convert to DataFrame

In [4]:
# Pretty-print a single raw tweet dict (index 101) to inspect its structure.
pprint(js_vodacom[101])

{'_type': 'snscrape.modules.twitter.Tweet',
 'cashtags': [],
 'conversationId': 1643599377834061826,
 'coordinates': None,
 'date': '2023-04-07 15:08:10+00:00',
 'hashtags': [],
 'id': 1644356395607678977,
 'id_str': '1644356395607678977',
 'inReplyToTweetId': 1643599377834061826,
 'inReplyToUser': {'_type': 'snscrape.modules.twitter.UserRef',
                   'displayname': 'Vodacom',
                   'id': 14574763,
                   'username': 'Vodacom'},
 'lang': 'en',
 'likeCount': 0,
 'links': [],
 'media': {'animated': [], 'photos': [], 'videos': []},
 'mentionedUsers': [{'_type': 'snscrape.modules.twitter.UserRef',
                     'displayname': 'Vodacom',
                     'id': 14574763,
                     'username': 'Vodacom'},
                    {'_type': 'snscrape.modules.twitter.UserRef',
                     'displayname': 'Evan Greenwood',
                     'id': 2867399789,
                     'username': 'CodeOfTheVoid'},
                    {'_t

In [54]:
# Timestamp layout used by the serialised tweets, e.g. '2023-04-07 15:08:10+00:00'.
_TWEET_TIMESTAMP_FORMAT = '%Y-%m-%d %H:%M:%S%z'


def _parse_tweet_timestamp(value):
    """Parse a tweet timestamp string; return ``None`` for a missing/empty value."""
    if not value:
        return None
    return datetime.strptime(value, _TWEET_TIMESTAMP_FORMAT)


def extract_tweet_data(tweet:Dict) -> Dict:
    """Flatten one raw tweet dict into a single-level record.

    Args:
        tweet: A tweet dictionary with at least the keys read below
            (``id``, ``conversationId``, ``date``, ``rawContent``, ``lang``,
            the count fields, ``sourceLabel``, ``url`` and a nested ``user``
            dict). ``inReplyToUser`` may be ``None`` or absent when the tweet
            is not a reply.

    Returns:
        A flat dictionary with one entry per extracted field. ``date`` and
        ``userCreated`` are timezone-aware ``datetime`` objects (or ``None``
        when the source value is missing). The three ``inReplyToUser*``
        fields are ``None`` when the tweet has no reply-to user.

    Raises:
        KeyError: If a required top-level or ``user`` key is missing.
        ValueError: If a timestamp does not match the expected format.
    """
    # Tweets that are not replies carry ``inReplyToUser = None``; substitute an
    # empty dict so the lookups below yield None instead of raising TypeError.
    reply_user = tweet.get('inReplyToUser') or {}
    user = tweet['user']

    return {
        'id': tweet['id'],
        'conversationId': tweet['conversationId'],
        'date': _parse_tweet_timestamp(tweet['date']),
        'rawContent': tweet['rawContent'],
        'inReplyToTweetId': tweet['inReplyToTweetId'],
        'inReplyToUserId': reply_user.get('id'),
        'inReplyToUsername': reply_user.get('username'),
        'inReplyToDisplayName': reply_user.get('displayname'),
        'language': tweet['lang'],
        'likeCount': tweet['likeCount'],
        'quoteCount': tweet['quoteCount'],
        'replyCount': tweet['replyCount'],
        'retweetCount': tweet['retweetCount'],
        'source': tweet['sourceLabel'],
        'url': tweet['url'],
        'userCreated': _parse_tweet_timestamp(user['created']),
        'userDisplayName': user['displayname'],
        'userFollowersCount': user['followersCount'],
        'userId': user['id'],
        'userLocation': user['location'],
        'userStatusesCount': user['statusesCount'],
        'userUsername': user['username'],
    }

# Smoke-test the extractor on a single tweet (index 20) and pretty-print the flat result.
pprint(extract_tweet_data(js_vodacom[20]))

{'conversationId': 1662066053428510721,
 'date': datetime.datetime(2023, 7, 9, 22, 48, 37, tzinfo=datetime.timezone.utc),
 'id': 1678174347033296897,
 'inReplyToDisplayName': 'Vodacom',
 'inReplyToTweetId': 1662066053428510721,
 'inReplyToUserId': 14574763,
 'inReplyToUsername': 'Vodacom',
 'language': 'en',
 'likeCount': 0,
 'quoteCount': 0,
 'rawContent': '@Vodacom This tobi thing is nothing but useless... Kept on '
               'saying it will send sms but no sms is received...',
 'replyCount': 2,
 'retweetCount': 0,
 'source': 'Twitter for Android',
 'url': 'https://twitter.com/kbwalefafatshe/status/1678174347033296897',
 'userCreated': datetime.datetime(2009, 10, 30, 14, 21, 8, tzinfo=datetime.timezone.utc),
 'userDisplayName': 'Kabelo Tlhabanelo',
 'userFollowersCount': 2313,
 'userId': 86322365,
 'userLocation': '🇿🇦',
 'userStatusesCount': 2288,
 'userUsername': 'kbwalefafatshe'}
