In [17]:
# This is a Jupyter notebook that we are using for BADM590SMA. It will contain many scripts that are
# directly from the Bonzanini textbook. His GitHub can be found at:
# https://github.com/bonzanini/Book-SocialMediaMiningPython. His scripts will be cited here, but we (or you)
# may modify parts of his scripts as needed.
# You will need to pip install tweepy for this notebook

In [18]:
import os
import sys
import tweepy
from tweepy import API
from tweepy import OAuthHandler

In [19]:
# Change the name of the twitter handle below for the user you are investigating
twitterhandle = 'Illinois_Alma'

In [20]:
def get_twitter_auth():
    """Setup Twitter authentication.

    Return: tweepy.OAuthHandler object
    """
    try:
        consumer_key = os.environ['TWITTER_CONSUMER_KEY']
        consumer_secret = os.environ['TWITTER_CONSUMER_SECRET']
        access_token = os.environ['TWITTER_ACCESS_TOKEN']
        access_secret = os.environ['TWITTER_ACCESS_SECRET']
    except KeyError:
        sys.stderr.write("TWITTER_* environment variables not set\n")
        sys.exit(1)    
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    return auth

In [21]:
def get_twitter_client():
    """Setup Twitter API client.

    Return: tweepy.API object
    """
    auth = get_twitter_auth()
    client = API(auth)
    return client

In [22]:
#######################################################################
# Getting recent tweets from a user's timeline (Rest API)
#######################################################################

In [23]:
# Chap02-03/twitter_get_user_timeline.py
import sys
import json
from tweepy import Cursor

user = twitterhandle # Change this user name to whatever you want
client = get_twitter_client()

fname = "user_timeline_{}.jsonl".format(user)
with open(fname, 'w') as f:
    for page in Cursor(client.user_timeline, screen_name=user, count=200).pages(16):
        for status in page:
            f.write(json.dumps(status._json)+"\n")

In [24]:
#######################################################################
# Analyzing hastag frequencies for the user timeline pulled above
#######################################################################

In [25]:
# Chap02-03/twitter_hashtag_frequency.py 
import sys 
from collections import Counter 
import json 

user = twitterhandle # Change this user name to whatever you want
fname = "user_timeline_{}.jsonl".format(user)


def get_hashtags(tweet): 
  entities = tweet.get('entities', {}) 
  hashtags = entities.get('hashtags', []) 
  return [tag['text'].lower() for tag in hashtags]

with open(fname, 'r') as f: 
    hashtags = Counter() 
    for line in f: 
        tweet = json.loads(line) 
        hashtags_in_tweet = get_hashtags(tweet) 
        hashtags.update(hashtags_in_tweet) 
    for tag, count in hashtags.most_common(20): 
        print("{}: {}".format(tag, count))

illini: 502
illinois: 456
illinoishomecoming: 70
tbt: 35
illinois2021: 26
wewillwin: 25
illinoisresearch: 24
vetmed: 18
illinoisfall: 17
attheunion: 16
crittercam: 14
election2016: 14
illinidays: 14
bragginrights: 14
chambana: 12
illinoiswantsme: 12
b1g: 11
livestream: 10
ischoolui: 10
cancer: 9


In [26]:
#######################################################################
# Analyzing hastag stats for the user timeline pulled above
#######################################################################

In [27]:
# Chap02-03/twitter_hashtag_stats.py
import sys
from collections import defaultdict
import json

def usage():
    print("Usage:")
    print("python {} <filename.jsonl>".format(sys.argv[0]))

with open(fname, 'r') as f:
    hashtag_count = defaultdict(int)
    for line in f:
        tweet = json.loads(line)
        hashtags_in_tweet = get_hashtags(tweet)
        n_of_hashtags = len(hashtags_in_tweet)
        hashtag_count[n_of_hashtags] += 1
        
    tweets_with_hashtags = sum([count for n_of_tags, count in hashtag_count.items() if n_of_tags > 0])
    tweets_no_hashtags = hashtag_count[0]
    tweets_total = tweets_no_hashtags + tweets_with_hashtags
    tweets_with_hashtags_percent = "%.2f" % (tweets_with_hashtags / tweets_total * 100)
    tweets_no_hashtags_percent = "%.2f" % (tweets_no_hashtags / tweets_total * 100)
    print("{} tweets without hashtags ({}%)".format(tweets_no_hashtags, tweets_no_hashtags_percent))
    print("{} tweets with at least one hashtag ({}%)".format(tweets_with_hashtags, tweets_with_hashtags_percent))

    for tag_count, tweet_count in hashtag_count.items():
        if tag_count > 0:
            percent_total = "%.2f" % (tweet_count / tweets_total * 100)
            percent_elite = "%.2f" % (tweet_count / tweets_with_hashtags * 100)
            print("{} tweets with {} hashtags ({}% total, {}% elite)".format(tweet_count, tag_count, percent_total, percent_elite))


1494 tweets without hashtags (46.70%)
1705 tweets with at least one hashtag (53.30%)
1166 tweets with 1 hashtags (36.45% total, 68.39% elite)
422 tweets with 2 hashtags (13.19% total, 24.75% elite)
97 tweets with 3 hashtags (3.03% total, 5.69% elite)
15 tweets with 4 hashtags (0.47% total, 0.88% elite)
5 tweets with 5 hashtags (0.16% total, 0.29% elite)


In [None]:
#######################################################################
# Analyzing user mentions for the user timeline pulled above
#######################################################################

In [35]:
# Chap02-03/twitter_mention_frequency.py
import sys
from collections import Counter
import json

def get_mentions(tweet):
    entities = tweet.get('entities', {})
    hashtags = entities.get('user_mentions', [])
    return [tag['screen_name'] for tag in hashtags]

user = twitterhandle # Change this user name to whatever you want
fname = "user_timeline_{}.jsonl".format(user)
with open(fname, 'r') as f:
    users = Counter()
    for line in f:
        tweet = json.loads(line)
        mentions_in_tweet = get_mentions(tweet)
        users.update(mentions_in_tweet)
    for user, count in users.most_common(20):
        print("{}: {}".format(user, count))

Illinois_Alma: 314
LASillinois: 133
EngineeringAtIL: 105
UIAA: 99
IlliniFootball: 97
IlliniMBB: 95
uiucbusiness: 81
Illini_Union: 79
uofiadmissions: 68
IlliniAthletics: 67
UIarchives: 66
edILLINOIS: 66
ACESIllinois: 53
VetMedIllinois: 53
StateFarmCenter: 48
KAMillinois: 47
MarchingIllini: 47
UIPD: 43
UofIFS: 42
IlliniVBall: 41


In [28]:
###############################################################################
# Now let's grab the user's network (You can adjust the number of max friends)
###############################################################################

In [29]:
# Chap02-03/twitter_get_user.py
import os
import sys
import json
import time
import math
from tweepy import Cursor

MAX_FRIENDS = 15000

def usage():
    print("Usage:")
    print("python {} <username>".format(sys.argv[0]))

def paginate(items, n):
    """Generate n-sized chunks from items"""
    for i in range(0, len(items), n):
        yield items[i:i+n]

screen_name = twitterhandle
client = get_twitter_client()
dirname = "users/{}".format(screen_name)
max_pages = math.ceil(MAX_FRIENDS / 5000)
try:
    os.makedirs(dirname, mode=0o755, exist_ok=True)
except OSError:
    print("Directory {} already exists".format(dirname))
except Exception as e:
    print("Error while creating directory {}".format(dirname))
    print(e)
    sys.exit(1)

# get followers for a given user
fname = "users/{}/followers.jsonl".format(screen_name)
with open(fname, 'w') as f:
    for followers in Cursor(client.followers_ids, screen_name=screen_name).pages(max_pages):
        for chunk in paginate(followers, 100):
            users = client.lookup_users(user_ids=chunk)
            for user in users:
                f.write(json.dumps(user._json)+"\n")
        if len(followers) == 5000:
            print("More results available. Sleeping for 60 seconds to avoid rate limit")
            time.sleep(60)

# get friends for a given user
fname = "users/{}/friends.jsonl".format(screen_name)
with open(fname, 'w') as f:
    for friends in Cursor(client.friends_ids, screen_name=screen_name).pages(max_pages):
        for chunk in paginate(friends, 100):
            users = client.lookup_users(user_ids=chunk)
            for user in users:
                f.write(json.dumps(user._json)+"\n")
        if len(friends) == 5000:
            print("More results available. Sleeping for 60 seconds to avoid rate limit")
            time.sleep(60)

# get user's profile
fname = "users/{}/user_profile.json".format(screen_name)
with open(fname, 'w') as f:
    profile = client.get_user(screen_name=screen_name)
    f.write(json.dumps(profile._json, indent=4))

More results available. Sleeping for 60 seconds to avoid rate limit
More results available. Sleeping for 60 seconds to avoid rate limit
More results available. Sleeping for 60 seconds to avoid rate limit
More results available. Sleeping for 60 seconds to avoid rate limit


In [30]:
# Wait for the above to finish as you might hit some rate limits (Check to see if the kernel is still running)

In [31]:
#########################################################################
# Now run some following/follower stats (If you pull someone with very 
# low number of friends, then this will give you a "Sample larger than
# population" error)
#########################################################################

In [32]:
# Chap02-03/twitter_followers_stats.py
import sys
import json
from random import sample
import time

def usage():
    print("Usage:")
    print("python {} <username>".format(sys.argv[0]))

followers_file = 'users/{}/followers.jsonl'.format(screen_name)
friends_file = 'users/{}/friends.jsonl'.format(screen_name)
with open(followers_file) as f1, open(friends_file) as f2:
    t0 = time.time()
    followers = []
    friends = []
    for line in f1:
        profile = json.loads(line)
        followers.append(profile['screen_name'])
    for line in f2:
        profile = json.loads(line)
        friends.append(profile['screen_name'])
    t1 = time.time()
    mutual_friends = [user for user in friends if user in followers]
    followers_not_following = [user for user in followers if user not in friends]
    friends_not_following = [user for user in friends if user not in followers]
    t2 = time.time()
    print("----- Timing -----")
    print("Initialize data: {}".format(t1-t0))
    print("Set-based operations: {}".format(t2-t1))
    print("Total time: {}".format(t2-t0))
    print("----- Stats -----")
    print("{} has {} followers".format(screen_name, len(followers)))
    print("{} has {} friends".format(screen_name, len(friends)))
    print("{} has {} mutual friends".format(screen_name, len(mutual_friends)))
    print("{} friends are not following {} back".format(len(friends_not_following), screen_name))
    print("{} followers are not followed back by {}".format(len(followers_not_following), screen_name))

    some_mutual_friends = ', '.join(sample(mutual_friends, 5))
    print("Some mutual friends: {}".format(some_mutual_friends))

----- Timing -----
Initialize data: 1.2465589046478271
Set-based operations: 5.538280010223389
Total time: 6.784838914871216
----- Stats -----
Illinois_Alma has 14974 followers
Illinois_Alma has 8527 friends
Illinois_Alma has 366 mutual friends
8161 friends are not following Illinois_Alma back
14608 followers are not followed back by Illinois_Alma
Some mutual friends: Michael_Adams1, commodityweek, NinjaYamaPT, __aphro, BrochuDanielle


In [36]:
#########################################################################
# Let's save the content of the user's tweets to a CSV file
#########################################################################

In [39]:
import json
import csv

user = twitterhandle # Change this user name to whatever you want
fname = "user_timeline_{}.jsonl".format(user)
csvname = "user_timeline_{}.csv".format(user)

tempcsv = open(csvname, 'w')
csvwriter = csv.writer(tempcsv)
csvwriter.writerow(['Date', 'Tweet'])

with open(fname, 'r') as f:
    for line in f:
        tweet = json.loads(line)
        csvwriter.writerow([tweet['created_at'], tweet['text']])