# Packages

In [1]:
# packages
import tweepy
import pandas as pd
import os

print("Tweepy version: " + tweepy.__version__)
print("Pandas version: " + pd.__version__)

Tweepy version: 4.4.0
Pandas version: 1.3.4


In [2]:
# import tokens from config.py file
if os.path.isfile("config.py"):
    print("config.py exists\nAPI keys and tokens are imported")
    from config import consumer_key, consumer_secret, access_token, access_token_secret
else:
    print("config.py does not exist\nPlease add config.py to proceed")

config.py exists
API keys and tokens are imported


# Setting up API

In [3]:
# setup consumer API key
auth = tweepy.OAuthHandler(
    consumer_key,
    consumer_secret
)

# setup access token
auth.set_access_token(
    access_token,
    access_token_secret
)

# create API variable
api = tweepy.API(
    auth, 
    wait_on_rate_limit = True
)

In [4]:
# check if API credentials work
try:
    api.verify_credentials()
    print("Authentication OK")
except:
    print("Error during authentication")

Authentication OK


# Testing Twitter API

In [5]:
# test if user_timeline method works with own twitter account
tweets_lw = api.user_timeline(
    screen_name = "lukas_warode",
    count = 1,
    include_rts = False,
    tweet_mode = "extended"
)

# print type of user_timeline method object
type(tweets_lw)

# function to convert tweepy object to a pandas dataframe
def timeline_to_df(tweepy_timeline):
    json_data = [r._json for r in tweepy_timeline]
    df = pd.json_normalize(json_data)
    return df

# apply function 
tweets_lw_df = timeline_to_df(tweets_lw)

# print full text column of tweet dataframe
pd.options.display.max_colwidth = int(tweets_lw_df["full_text"].str.len())
print(tweets_lw_df["full_text"])

0    @p_c_bauer @MichaelImre Nice project! Seems to be a very rare coincidence, I worked basically on the same project last year while using the same name 😄\nhttps://t.co/QQtOQ...
Name: full_text, dtype: object


In [6]:

# print full text column of tweet dataframe
pd.options.display.max_colwidth = int(tweets_lw_df["full_text"].str.len())
print(tweets_lw_df["full_text"])

0    @p_c_bauer @MichaelImre Nice project! Seems to be a very rare coincidence, I worked basically on the same project last year while using the same name 😄\nhttps://t.co/QQtOQ...
Name: full_text, dtype: object


# Use csv file from WZB project to extract list of German MPs' Twitter accounts
## (Project author: Markus Konrad)

In [7]:
# read csv as dataframe from GitHub repository
wzb_df = pd.read_csv("https://raw.githubusercontent.com/WZBSocialScienceCenter/mdb-twitter-network/master/data/deputies_twitter_20190702.csv")

# create subset with the 2 relevant columns and drop NAs
twitter_df = wzb_df[["twitter_name", "party"]].dropna()

# Sampling approaches
## a) Get random MP Twitter handles

In [8]:
# Function to extract random MPs' Twitter handeles
def random_sample_handle(df, n):
    sample = df[["twitter_name"]].sample(n = n)
    name_string = sample.to_string(index = False, header = False)
    return name_string

# apply function
print(
    random_sample_handle(
        df = twitter_df,
        n = 5
    )
)

 andrealindholz
 lothar_binding
 stefangelbhaar
           w_sk
fritzfelgentreu


## b) Extract Twitter handles by popularity

In [9]:
# follower count function
def follower_count_fun(twitter_handle):
    try: 
        user = api.get_user(screen_name = twitter_handle)
        count = user.followers_count
        return count
    except tweepy.TweepyException:
        pass

# for demonstration and simplification purposes we create a subset with Green MPs
twitter_df_greens = twitter_df[twitter_df["party"] == "DIE GRÜNEN"]

# store Twitter handles as list from data frame (column) with a function
def col_to_tidy_list(df, col):
    col_string = df[[col]].to_string(index = False, header = False)
    tidy_string = col_string.replace(" ", "")
    tidy_list = tidy_string.split("\n")
    return tidy_list

# test and print results
twitter_handles_list = col_to_tidy_list(
    twitter_df_greens,
    "twitter_name"
)

print(twitter_handles_list)

['kirstenkappert', 'konstantinnotz', 'markuskurthmdb', 'babetteschefin', 'sven_kindler', 'agnieszka_mdb', 'goeringeckardt', 'markustressel', 'beatewaro', 'julia_verlinden', 'jtrittin', 'k_sa', 'ulle_schauws', 'schickgerhard', 'manuelsarrazin', 'tabearoessner', 'crueffer', 'lisapaus', 'fostendorff', 'cem_oezdemir', 'nouripour', 'gruenebeate', 'irenemihalic', 'tobiaslindner', 'steffilemke', 'monikalazar', 'renatekuenast', 'chriskuehn_mdb', 'stephankuehn', 'oliver_krischer', 'mariaklschmeink', 'uwekekeritz', 'djanecek', 'brihasselmann', 'hajdukbundestag', 'kaigehring', 'matthiasgastel', 'katjadoerner', 'katdro', 'ebner_sha', 'ekindeligoez', 'fbrantner', 'kerstinandreae', 'abaerbock', 'w_sk', 'lieblingxhain', 'stefangelbhaar', 'danywagner_da', 'badulrichmartha', 'gruenclaudia', 'derdanyal', 'margaretebause', 'filizgreen', 'owvonholtz', 'svenlehmann', 'annachristmann']


In [10]:
# apply function in a for loop and store follower count in list
follower_count_list = []

for twitter_name in twitter_handles_list:
    follower_count_list.append(
        follower_count_fun(twitter_name)
    )

In [11]:
# print results 
print(follower_count_list)

[6194, 84955, 3963, None, 19422, 13577, 200537, 2012, None, 9569, 115093, 7415, 7682, 11899, 6524, 9208, 3429, 10953, None, 279590, 27582, 5744, 8857, 8897, 17437, 6331, 76514, 4450, 7485, 19540, 7651, 3351, 13616, 35254, 2390, 11903, 7413, 17858, 8398, 5337, 9375, 12738, 8865, 410042, 7506, 12863, 7740, 1707, 5589, 3047, 16986, 7330, 5547, 1740, 21858, 5051]


In [12]:
# add follower count list to data fraeme as a numeric column
twitter_df_greens["follower_count"] = follower_count_list

# print transformed data frame
print(twitter_df_greens)

        twitter_name       party  follower_count
34    kirstenkappert  DIE GRÜNEN          6194.0
47    konstantinnotz  DIE GRÜNEN         84955.0
67    markuskurthmdb  DIE GRÜNEN          3963.0
68    babetteschefin  DIE GRÜNEN             NaN
71      sven_kindler  DIE GRÜNEN         19422.0
84     agnieszka_mdb  DIE GRÜNEN         13577.0
92    goeringeckardt  DIE GRÜNEN        200537.0
98     markustressel  DIE GRÜNEN          2012.0
122        beatewaro  DIE GRÜNEN             NaN
129  julia_verlinden  DIE GRÜNEN          9569.0
135         jtrittin  DIE GRÜNEN        115093.0
166             k_sa  DIE GRÜNEN          7415.0
176     ulle_schauws  DIE GRÜNEN          7682.0
179    schickgerhard  DIE GRÜNEN         11899.0
185   manuelsarrazin  DIE GRÜNEN          6524.0
189    tabearoessner  DIE GRÜNEN          9208.0
194         crueffer  DIE GRÜNEN          3429.0
221         lisapaus  DIE GRÜNEN         10953.0
226      fostendorff  DIE GRÜNEN             NaN
228     cem_oezdemir

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_df_greens["follower_count"] = follower_count_list


In [13]:
# filter observation with highest follower count
max_followers = twitter_df_greens["follower_count"].max()

twitter_df_greens[twitter_df_greens["follower_count"] == max_followers]

# get twitter name column with highest follower count as string
most_followers_mp = twitter_df_greens[twitter_df_greens["follower_count"] == max_followers]["twitter_name"].to_string(index = False, header = False)

print(most_followers_mp)

abaerbock


# Tweet extraction
## convert `user_timeline` of **Annalena Baerbock** to data frame

In [14]:
# extract tweets
baerbock_tweets = api.user_timeline(
    # MP with most followers (Greens) - Annalena Baerbock
    screen_name = most_followers_mp,
    # maximum number of tweets extractable
    count = 200,
    # do not include retweets
    include_rts = False,
    # scope of retrieved information
    tweet_mode = "extended"
)

# apply function that converts timeline object to data frame
baerbock_tweets_df = timeline_to_df(baerbock_tweets)

# print data frame
print(baerbock_tweets_df)

                         created_at                   id               id_str  \
0    Tue Dec 07 14:58:54 +0000 2021  1468233562742788111  1468233562742788111   
1    Thu Dec 02 19:01:22 +0000 2021  1466482642942054400  1466482642942054400   
2    Thu Dec 02 19:01:22 +0000 2021  1466482641578999817  1466482641578999817   
3    Thu Dec 02 19:01:21 +0000 2021  1466482640102510601  1466482640102510601   
4    Sun Nov 28 15:45:00 +0000 2021  1464983672415608834  1464983672415608834   
..                              ...                  ...                  ...   
173  Sat May 08 08:30:44 +0000 2021  1390947262277238785  1390947262277238785   
174  Sat May 08 08:30:44 +0000 2021  1390947260129857537  1390947260129857537   
175  Fri May 07 14:27:36 +0000 2021  1390674683910053895  1390674683910053895   
176  Fri May 07 14:26:36 +0000 2021  1390674430251122692  1390674430251122692   
177  Sat May 01 11:23:27 +0000 2021  1388454010798096386  1388454010798096386   

                           

## Save relevant columns as `.csv` file

In [15]:
# create subset of complete data frame
baerbock_tweets_subset_df = baerbock_tweets_df[[
    "id", 
    "created_at",
    "full_text",
    "display_text_range",
    "in_reply_to_user_id",
    "in_reply_to_screen_name",
    "is_quote_status",
    "retweet_count",
    "favorite_count",
    "possibly_sensitive"
]]

# print subsetted data frame
print(baerbock_tweets_subset_df)

                      id                      created_at  \
0    1468233562742788111  Tue Dec 07 14:58:54 +0000 2021   
1    1466482642942054400  Thu Dec 02 19:01:22 +0000 2021   
2    1466482641578999817  Thu Dec 02 19:01:22 +0000 2021   
3    1466482640102510601  Thu Dec 02 19:01:21 +0000 2021   
4    1464983672415608834  Sun Nov 28 15:45:00 +0000 2021   
..                   ...                             ...   
173  1390947262277238785  Sat May 08 08:30:44 +0000 2021   
174  1390947260129857537  Sat May 08 08:30:44 +0000 2021   
175  1390674683910053895  Fri May 07 14:27:36 +0000 2021   
176  1390674430251122692  Fri May 07 14:26:36 +0000 2021   
177  1388454010798096386  Sat May 01 11:23:27 +0000 2021   

                                                                                                                                                                          full_text  \
0    Mit dem #Koalitionsvertrag, den SPD, FDP und wir heute unterzeichnet haben, kommen wir endl

In [16]:
# save data frame as csv in case it does not already exist
if os.path.isfile("baerbock_tweets.csv"):
    print("baerbock_tweets.csv already exists")
else:
    print("baerbock_tweets.csv did not exist before\nTweets are saved in a csv file")
    baerbock_tweets_subset_df.to_csv("baerbock_tweets.csv")

baerbock_tweets.csv did not exist before
Tweets are saved in a csv file
