# 02-03 : Combine Twitter Data

In [1]:
import pandas as pd
from datetime import datetime
import json
import pickle
from pprint import pprint
from typing import List, Dict

## Data Load

In [2]:
def read_pickle_tweets(file_path:str) -> List[Dict]:
    """Load pickled tweet objects and convert each one to a plain dictionary.

    Parameters
    ----------
    file_path : str
        Path to a pickle file holding an iterable of objects that expose a
        ``json()`` method returning a JSON string.

    Returns
    -------
    List[Dict]
        One decoded JSON dictionary per pickled object, in the stored order.

    Notes
    -----
    ``pickle.load`` can execute arbitrary code while unpickling, so only point
    this at files from a trusted source.
    """
    # deserialize the stored tweet objects
    with open(file_path, 'rb') as handle:
        raw_tweets = pickle.load(handle)

    # round-trip every object through its JSON form to obtain a plain dict
    tweets = []
    for raw in raw_tweets:
        tweets.append(json.loads(raw.json()))
    return tweets

In [3]:
# Load the raw scrape result of each brand as a list of plain dictionaries
js_vodacom = read_pickle_tweets('../../data/raw/02_01_tweets.pkl')
js_vodafone = read_pickle_tweets('../../data/raw/02_02_tweets_vodafone.pkl')

# Report how many tweets were loaded per brand
print(f'Vodacom  : {len(js_vodacom)}')
print(f'Vodafone : {len(js_vodafone)}')

Vodacom  : 983
Vodafone : 1482


## Convert to DataFrame

### Functions

In [4]:
# Pretty-print a single raw tweet (index 101) to see which fields are available
pprint(js_vodacom[101])

{'_type': 'snscrape.modules.twitter.Tweet',
 'cashtags': [],
 'conversationId': 1643599377834061826,
 'coordinates': None,
 'date': '2023-04-07 15:08:10+00:00',
 'hashtags': [],
 'id': 1644356395607678977,
 'id_str': '1644356395607678977',
 'inReplyToTweetId': 1643599377834061826,
 'inReplyToUser': {'_type': 'snscrape.modules.twitter.UserRef',
                   'displayname': 'Vodacom',
                   'id': 14574763,
                   'username': 'Vodacom'},
 'lang': 'en',
 'likeCount': 0,
 'links': [],
 'media': {'animated': [], 'photos': [], 'videos': []},
 'mentionedUsers': [{'_type': 'snscrape.modules.twitter.UserRef',
                     'displayname': 'Vodacom',
                     'id': 14574763,
                     'username': 'Vodacom'},
                    {'_type': 'snscrape.modules.twitter.UserRef',
                     'displayname': 'Evan Greenwood',
                     'id': 2867399789,
                     'username': 'CodeOfTheVoid'},
                    {'_t

In [36]:
def extract_tweet_data(tweet:Dict) -> Dict:
    """Flatten one raw tweet dictionary into a single-level record.

    Identifiers are converted to strings (presumably to keep large 64-bit ids
    exact once they sit in a DataFrame column). Missing reply information is
    returned as ``None`` rather than the literal string ``'None'`` so that
    null checks such as ``isna()`` work on the resulting columns.

    Parameters
    ----------
    tweet : Dict
        Raw tweet dictionary. It must contain every key read below, including
        a nested ``'user'`` dictionary. ``'inReplyToTweetId'`` and
        ``'inReplyToUser'`` may be ``None`` for tweets that are not replies.

    Returns
    -------
    Dict
        Flat record with tweet fields, reply fields and ``user*`` fields.
        ``'date'`` and ``'userCreated'`` are timezone-aware ``datetime``
        objects; ids are ``str`` (or ``None`` when absent).

    Raises
    ------
    KeyError
        If an expected key is missing from ``tweet`` or its nested dicts.
    ValueError
        If a date string does not match ``'%Y-%m-%d %H:%M:%S%z'``.
    """
    date_format = '%Y-%m-%d %H:%M:%S%z'

    # Bind the nested dictionaries once instead of re-indexing for every field
    user = tweet['user']
    reply_user = tweet['inReplyToUser']
    reply_tweet_id = tweet['inReplyToTweetId']

    return {
        'id': str(tweet['id']),
        'conversationId': str(tweet['conversationId']),
        'date': datetime.strptime(tweet['date'], date_format),
        'rawContent': tweet['rawContent'],
        # str(None) would yield the string 'None'; keep a real null instead
        'inReplyToTweetId': None if reply_tweet_id is None else str(reply_tweet_id),
        'inReplyToUserId': str(reply_user['id']) if reply_user else None,
        'inReplyToUsername': reply_user['username'] if reply_user else None,
        'inReplyToDisplayName': reply_user['displayname'] if reply_user else None,
        'language': tweet['lang'],
        'likeCount': tweet['likeCount'],
        'quoteCount': tweet['quoteCount'],
        'replyCount': tweet['replyCount'],
        'retweetCount': tweet['retweetCount'],
        'source': tweet['sourceLabel'],
        'url': tweet['url'],
        'userCreated': datetime.strptime(user['created'], date_format),
        'userDisplayName': user['displayname'],
        'userFollowersCount': user['followersCount'],
        'userId': str(user['id']),
        'userLocation': user['location'],
        'userStatusesCount': user['statusesCount'],
        'userUsername': user['username'],
    }

# Smoke-test the function on one tweet (index 20) and pretty-print the flattened record
pprint(extract_tweet_data(js_vodacom[20]))

{'conversationId': '1662066053428510721',
 'date': datetime.datetime(2023, 7, 9, 22, 48, 37, tzinfo=datetime.timezone.utc),
 'id': '1678174347033296897',
 'inReplyToDisplayName': 'Vodacom',
 'inReplyToTweetId': '1662066053428510721',
 'inReplyToUserId': '14574763',
 'inReplyToUsername': 'Vodacom',
 'language': 'en',
 'likeCount': 0,
 'quoteCount': 0,
 'rawContent': '@Vodacom This tobi thing is nothing but useless... Kept on '
               'saying it will send sms but no sms is received...',
 'replyCount': 2,
 'retweetCount': 0,
 'source': 'Twitter for Android',
 'url': 'https://twitter.com/kbwalefafatshe/status/1678174347033296897',
 'userCreated': datetime.datetime(2009, 10, 30, 14, 21, 8, tzinfo=datetime.timezone.utc),
 'userDisplayName': 'Kabelo Tlhabanelo',
 'userFollowersCount': 2313,
 'userId': '86322365',
 'userLocation': 'üáøüá¶',
 'userStatusesCount': 2288,
 'userUsername': 'kbwalefafatshe'}


## Create DataFrames

In [39]:
# Flatten every raw Vodacom tweet into one row of the frame
df_vodacom = pd.DataFrame(list(map(extract_tweet_data, js_vodacom)))

# Persist the interim result as a gzip-compressed parquet file
df_vodacom.to_parquet(
    '../../data/interim/02-03_twitter_vodacom.parquet',
    compression='gzip',
)

# Report the dimensions and preview the first rows
print(df_vodacom.shape)
display(df_vodacom.head())

(983, 22)


Unnamed: 0,id,conversationId,date,rawContent,inReplyToTweetId,inReplyToUserId,inReplyToUsername,inReplyToDisplayName,language,likeCount,...,retweetCount,source,url,userCreated,userDisplayName,userFollowersCount,userId,userLocation,userStatusesCount,userUsername
0,1684937340169375744,1684937340169375744,2023-07-28 14:42:21+00:00,"Serious question:\n\nHas anyone, anywhere ever...",,,,,en,2,...,1,Twitter for Android,https://twitter.com/TopEditorInt/status/168493...,2009-03-04 11:44:38+00:00,TopEditor Internatio,2956,22759776,USA & SA & UK,70294,TopEditorInt
1,1684892139375710208,1684890636997029888,2023-07-28 11:42:44+00:00,@Vodacom That’s why I wanted to talk to someon...,1.6848917624965652e+18,14574763.0,Vodacom,Vodacom,en,0,...,0,Twitter for iPhone,https://twitter.com/AkonaMhlana/status/1684892...,2009-05-20 14:38:05+00:00,Ntaba ayilali👏🏽,135,41370409,South Africa,866,AkonaMhlana
2,1684890636997029888,1684890636997029888,2023-07-28 11:36:46+00:00,@Vodacom what number should I dial to talk to...,,14574763.0,Vodacom,Vodacom,en,0,...,0,Twitter for iPhone,https://twitter.com/AkonaMhlana/status/1684890...,2009-05-20 14:38:05+00:00,Ntaba ayilali👏🏽,135,41370409,South Africa,866,AkonaMhlana
3,1684595913744953345,1684595913744953345,2023-07-27 16:05:38+00:00,"If you knew how useless your chatbot is , you ...",,,,,en,0,...,0,Twitter for iPhone,https://twitter.com/Moselanku/status/168459591...,2010-10-04 23:34:40+00:00,Moselanku,1325,198678570,Limpopo,58673,Moselanku
4,1683916251821756416,1683916251821756416,2023-07-25 19:04:54+00:00,That Vodacom Tobi is useless when doing sim swap.,,,,,en,0,...,0,Twitter for iPhone,https://twitter.com/PoliteMashaba07/status/168...,2018-11-13 09:03:29+00:00,N'wa Mashaba,1906,1062269709696024576,"Pretoria, South Africa",1806,PoliteMashaba07


In [40]:
# Flatten every raw Vodafone tweet into one row of the frame
df_vodafone = pd.DataFrame(list(map(extract_tweet_data, js_vodafone)))

# Persist the interim result as a gzip-compressed parquet file
df_vodafone.to_parquet(
    '../../data/interim/02-03_twitter_vodafone.parquet',
    compression='gzip',
)

# Report the dimensions and preview the first rows
print(df_vodafone.shape)
display(df_vodafone.head())

(1482, 22)


Unnamed: 0,id,conversationId,date,rawContent,inReplyToTweetId,inReplyToUserId,inReplyToUsername,inReplyToDisplayName,language,likeCount,...,retweetCount,source,url,userCreated,userDisplayName,userFollowersCount,userId,userLocation,userStatusesCount,userUsername
0,1686000773698654210,1686000773698654210,2023-07-31 13:08:03+00:00,Tried customer service. No good. Tried Tobi -...,,,,,en,0,...,0,Twitter for Android,https://twitter.com/JohndeWinton/status/168600...,2011-09-23 08:59:27+00:00,John de Winton💙,75,378488745,Wales,3835,JohndeWinton
1,1685735867090964480,1685735867090964480,2023-07-30 19:35:24+00:00,@VodafoneUK Your Tobi chat ‘help’ service is t...,,20678384.0,VodafoneUK,Vodafone UK,en,5,...,0,Twitter for iPhone,https://twitter.com/garry__spence/status/16857...,2012-08-06 00:33:55+00:00,Garry Spence,25737,739523767,"Scotland, United Kingdom",10492,garry__spence
2,1685556031110144000,1685556031110144000,2023-07-30 07:40:48+00:00,@VodafoneUK @Ofcom Vodafone seem completely un...,,20678384.0,VodafoneUK,Vodafone UK,en,1,...,0,Twitter for iPhone,https://twitter.com/vincentconnolly/status/168...,2011-05-21 08:55:53+00:00,AMUdoc 💙,1034,302499728,,4164,vincentconnolly
3,1684951668805283840,1684923531279142912,2023-07-28 15:39:17+00:00,@VodafoneUK Yes and still no further with my e...,1.6849238633590129e+18,20678384.0,VodafoneUK,Vodafone UK,en,0,...,0,Twitter for Android,https://twitter.com/SarahGi49130661/status/168...,2023-01-17 12:36:35+00:00,Sarah Gilmour (Haft) ✡️,15,1615327112449204226,"England, United Kingdom",172,SarahGi49130661
4,1684838696481169408,1684829224362729473,2023-07-28 08:10:22+00:00,"@admdly It's my pleasure, Adam 😊 Before you go...",1.6848310101841715e+18,124745222.0,admdly,Adam,en,1,...,0,Verint Messaging,https://twitter.com/VodafoneUK/status/16848386...,2009-02-12 13:52:15+00:00,Vodafone UK,252054,20678384,,672026,VodafoneUK
