# Notebook for scraping the Mentions Dataset

- Scraping every tweet, where an MP is mentioned.

In [1]:
!pip install tweepy

Collecting tweepy
  Using cached tweepy-4.12.1-py3-none-any.whl (101 kB)
Installing collected packages: tweepy
Successfully installed tweepy-4.12.1


In [35]:
import tweepy
from twitter_authentication import bearer_token
import time
import pandas as pd
import time
# Reference: https://github.com/jdfoote/Intro-to-Programming-and-Data-Science/blob/fall2021/extra_topics/twitter_v2_example.ipynb

In [36]:
# Access to the Twitter API
client = tweepy.Client(bearer_token, wait_on_rate_limit=True)

In [37]:
# Read in extracted usernames
politicians = pd.read_csv("twitter_usernames_extracted.csv", sep=";")
politicians = politicians.drop(["Unnamed: 0", "Unnamed: 0.1"], axis=1)
len(politicians)

740

In [38]:
# Filter only for politcians with an active Twitter Account
politicians = politicians.loc[politicians.twitter != "[]"]
print(len(politicians))
politicians

523


Unnamed: 0,full_name,party,twitter,username
0,Sanae Abdi,SPD,https://twitter.com/abdisanae'],abdisanae
1,Valentin Abel,FDP,['https://twitter.com/Valentin_C_Abel'],Valentin_C_Abel
3,Katja Adler,FDP,['https://twitter.com/katjadler'],katjadler
4,Stephanie Aeffner,Bündnis 90/Die Grünen,['https://twitter.com/s_aeffner'],s_aeffner
6,Gökay Akbulut,Die Linke,['https://twitter.com/akbulutgokay?lang=de'],akbulutgokay
...,...,...,...,...
735,Kay-Uwe Ziegler,AfD,['https://twitter.com/KayUweZiegler71'],KayUweZiegler71
736,Paul Ziemiak,CDU/CSU,['https://twitter.com/PaulZiemiak'],PaulZiemiak
737,Stefan Zierke,SPD,['http://twitter.com/zierke'],zierke
738,Dr. Jens Zimmermann,SPD,['https://twitter.com/jenszspd'],jenszspd


# Build Functions

In [39]:
# Function to extract every party with its corresponding politicians
# We only scrape 40 politicians at once, so depending on how many poiticans exist in one party, we create a different amount of dataframes
# We make different Runs, for every dataframe with its 40 politicians in it
def extract_party(party_name):
    party = politicians.loc[politicians.party == party_name]
    
    party_1 = party[0:40]
    party_2 = party[40:82]
    party_3 = party[82:122]
    party_4 = party[122:144]

    party_polis_1 = []
    party_polis_2 = []
    party_polis_3 = []
    party_polis_4 = []
    
    for user in party_1.username:
        user = user.replace("@","")
        party_polis_1.append(user)

    for user in party_2.username:
        user = user.replace("@","")
        party_polis_2.append(user)

    for user in party_3.username:
        user = user.replace("@","")
        party_polis_3.append(user)

    for user in party_4.username:
        user = user.replace("@","")
        party_polis_4.append(user)

    party_polis_1 = " OR ".join([f"@{username}" for username in party_polis_1])
    party_polis_2 = " OR ".join([f"@{username}" for username in party_polis_2])
    party_polis_3 = " OR ".join([f"@{username}" for username in party_polis_3])
    party_polis_4 = " OR ".join([f"@{username}" for username in party_polis_4])
    
    liste = []

    if party_polis_1 != "":
        liste.append(party_polis_1)
    if party_polis_2 != "":
        liste.append(party_polis_2)
    if party_polis_3 != "":
        liste.append(party_polis_3)
    if party_polis_4 != "":
        liste.append(party_polis_4)
        
    return liste

In [40]:
# Function to scrape the Metion Dataset
def scrape_mentions(FILE_NAME, query_term_liste):
    start = time.time()
    df_counter = 0
    
    # A query that looks, if a politician is mentioned
    for i in query_term_liste:

        tweets = []
        counter = 0

        # Scraping with the relevant fields and time stamps
        for response in tweepy.Paginator(client.search_all_tweets, 
                                            query = f'({i}) lang:de',
                                            user_fields = ['username', 'public_metrics', 'description', 'location'],
                                            tweet_fields = ['created_at', 'geo', 'public_metrics', 'text'],
                                            expansions = ['author_id', 'entities.mentions.username'],
                                            start_time = '2022-02-01T00:00:00Z',
                                            end_time = '2022-04-01T00:00:00Z',
                                            max_results=500):
            time.sleep(1)
            tweets.append(response)
            counter = counter + 1
            print(f"Response Nummer: {counter}")

        end = time.time()
        print(f"Das Scrapen von hat {(end - start)/60} Minuten gebraucht.")
        print(len(tweets))

        result = []
        user_dict = {}
            # Loop through each response object
        for response in tweets:
                # Take all of the users, and put them into a dictionary of dictionaries with the info we want to keep
            for user in response.includes['users']:
                user_dict[user.id] = {'username': user.username, 
                                          'followers': user.public_metrics['followers_count'],
                                          'tweets': user.public_metrics['tweet_count'],
                                          'description': user.description,
                                          'location': user.location
                                         }
            for tweet in response.data:
                #print(tweet.entities["mentions"])
                if tweet.entities != None:
                    mentions = tweet.entities["mentions"] 
                else:
                    mentions = "None"
                    
                mention_in_tweet = []

                for d in mentions:
                    username = d
                    if isinstance(username, dict):
                        mention_in_tweet.append(username["username"])
                    else:
                        username ="NONE"
                        mention_in_tweet.append(username)
                        #print(mention_in_tweet)
                    # For each tweet, find the author's information
                author_info = user_dict[tweet.author_id]
                    # Put all of the information we want to keep in a single dictionary for each tweet
                result.append({'author_id': tweet.author_id, 
                                   'username': author_info['username'],
                                   'author_followers': author_info['followers'],
                                   'author_tweets': author_info['tweets'],
                                   'author_description': author_info['description'],
                                   'author_location': author_info['location'],
                                   'text': tweet.text,
                                   'created_at': tweet.created_at,
                                   'quote_count': tweet.public_metrics['quote_count'],
                                   'retweets': tweet.public_metrics['retweet_count'],
                                   'replies': tweet.public_metrics['reply_count'],
                                   'likes': tweet.public_metrics['like_count'],
                                   'mentioned': mention_in_tweet
                                  })

            # Change this list of dictionaries into a dataframe
            df = pd.DataFrame(result)
            df.sort_values(by=['created_at'], ascending=False)
            print("DIESER RUN IST FERTIG**************************************************")

        df_counter = df_counter + 1

        df["Run"] = f"{FILE_NAME}_polis_{df_counter}"
        df.to_csv(f"mentions_{FILE_NAME}_{df_counter}.csv")
        print(len(df))

# Start Scraping

In [41]:
# Check how many politicians per party
politicians.party.value_counts()

 SPD                      144
 Bündnis 90/Die Grünen    106
 CDU/CSU                  102
 FDP                       81
 AfD                       53
 Die Linke                 32
 fraktionslos               5
Name: party, dtype: int64

In [42]:
# Get every politician from the regarding party
afd = extract_party(" AfD")
linke = extract_party(" Die Linke")
cducsu = extract_party(" CDU/CSU")
gruene = extract_party(" Bündnis 90/Die Grünen")
spd =  extract_party(" SPD")
afd[1]

'@Schneider_AfD OR @Uwe_Schulz_AfD OR @Th_Seitz_AfD OR @Martin_Sichert OR @dirkspaniel OR @Rene_Springer OR @Beatrix_vStorch OR @Alice_Weidel OR @h_weyel OR @wolfgangwiehle OR @ChrWirthMdB OR @joachimwundrak OR @KayUweZiegler71'

##### Start Scraping

In [45]:
#scrape_mentions("Linke", linke)

In [46]:
scrape_mentions("AfD", afd)

Response Nummer: 1
Response Nummer: 2
Response Nummer: 3
Response Nummer: 4
Response Nummer: 5
Response Nummer: 6
Response Nummer: 7
Response Nummer: 8
Response Nummer: 9
Response Nummer: 10
Response Nummer: 11
Response Nummer: 12
Response Nummer: 13
Response Nummer: 14
Response Nummer: 15
Response Nummer: 16
Response Nummer: 17
Response Nummer: 18
Response Nummer: 19
Response Nummer: 20
Response Nummer: 21
Response Nummer: 22
Response Nummer: 23
Response Nummer: 24
Response Nummer: 25
Response Nummer: 26
Response Nummer: 27
Response Nummer: 28
Response Nummer: 29
Response Nummer: 30
Response Nummer: 31
Response Nummer: 32
Response Nummer: 33
Response Nummer: 34
Response Nummer: 35
Response Nummer: 36
Response Nummer: 37
Response Nummer: 38
Response Nummer: 39
Response Nummer: 40
Response Nummer: 41
Response Nummer: 42
Response Nummer: 43
Response Nummer: 44
Response Nummer: 45
Response Nummer: 46
Response Nummer: 47
Response Nummer: 48
Response Nummer: 49
Response Nummer: 50
Response 

Rate limit exceeded. Sleeping for 108 seconds.


Response Nummer: 300
Response Nummer: 301
Response Nummer: 302
Response Nummer: 303
Response Nummer: 304
Das Scrapen von hat 15.179393502076467 Minuten gebraucht.
304
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG**************************************************
DIESER RUN IST FERTIG********************

In [47]:
scrape_mentions("Gruene", gruene)

Response Nummer: 1
Response Nummer: 2
Response Nummer: 3
Response Nummer: 4
Response Nummer: 5
Response Nummer: 6
Response Nummer: 7
Response Nummer: 8
Response Nummer: 9
Response Nummer: 10
Response Nummer: 11
Response Nummer: 12
Response Nummer: 13
Response Nummer: 14
Response Nummer: 15
Response Nummer: 16
Response Nummer: 17
Response Nummer: 18
Response Nummer: 19
Response Nummer: 20
Response Nummer: 21
Response Nummer: 22
Response Nummer: 23
Response Nummer: 24
Response Nummer: 25
Response Nummer: 26
Response Nummer: 27
Response Nummer: 28
Response Nummer: 29
Response Nummer: 30
Response Nummer: 31
Response Nummer: 32
Response Nummer: 33
Response Nummer: 34
Response Nummer: 35
Response Nummer: 36
Response Nummer: 37
Response Nummer: 38
Response Nummer: 39
Response Nummer: 40
Response Nummer: 41
Response Nummer: 42
Response Nummer: 43
Response Nummer: 44
Response Nummer: 45
Response Nummer: 46
Response Nummer: 47
Response Nummer: 48
Response Nummer: 49
Response Nummer: 50
Response 

Rate limit exceeded. Sleeping for 13 seconds.


Response Nummer: 943
Response Nummer: 944
Response Nummer: 945
Response Nummer: 946
Response Nummer: 947
Response Nummer: 948
Response Nummer: 949
Response Nummer: 950
Response Nummer: 951
Response Nummer: 952
Response Nummer: 953
Response Nummer: 954
Response Nummer: 955
Response Nummer: 956
Response Nummer: 957
Response Nummer: 958
Response Nummer: 959
Response Nummer: 960
Response Nummer: 961
Response Nummer: 962
Response Nummer: 963
Response Nummer: 964
Response Nummer: 965
Response Nummer: 966
Response Nummer: 967
Response Nummer: 968
Response Nummer: 969
Response Nummer: 970
Response Nummer: 971
Response Nummer: 972
Response Nummer: 973
Response Nummer: 974
Response Nummer: 975
Response Nummer: 976
Response Nummer: 977
Response Nummer: 978
Response Nummer: 979
Response Nummer: 980
Response Nummer: 981
Response Nummer: 982
Response Nummer: 983
Response Nummer: 984
Response Nummer: 985
Response Nummer: 986
Response Nummer: 987
Response Nummer: 988
Response Nummer: 989
Response Numm

Rate limit exceeded. Sleeping for 22 seconds.


Response Nummer: 1243
Response Nummer: 1244
Response Nummer: 1245
Response Nummer: 1246
Response Nummer: 1247
Response Nummer: 1248
Response Nummer: 1249
Response Nummer: 1250
Response Nummer: 1251
Response Nummer: 1252
Response Nummer: 1253
Response Nummer: 1254
Response Nummer: 1255
Response Nummer: 1256
Response Nummer: 1257
Response Nummer: 1258
Response Nummer: 1259
Response Nummer: 1260
Response Nummer: 1261
Response Nummer: 1262
Response Nummer: 1263
Response Nummer: 1264
Response Nummer: 1265
Response Nummer: 1266
Response Nummer: 1267
Response Nummer: 1268
Response Nummer: 1269
Response Nummer: 1270
Response Nummer: 1271
Response Nummer: 1272
Response Nummer: 1273
Response Nummer: 1274
Response Nummer: 1275
Response Nummer: 1276
Response Nummer: 1277
Response Nummer: 1278
Response Nummer: 1279
Response Nummer: 1280
Response Nummer: 1281
Response Nummer: 1282
Response Nummer: 1283
Response Nummer: 1284
Response Nummer: 1285
Response Nummer: 1286
Response Nummer: 1287
Response N

In [48]:
scrape_mentions("SPD", spd)

Response Nummer: 1
Response Nummer: 2
Response Nummer: 3
Response Nummer: 4
Response Nummer: 5
Response Nummer: 6
Response Nummer: 7
Response Nummer: 8
Response Nummer: 9
Response Nummer: 10
Response Nummer: 11
Response Nummer: 12
Response Nummer: 13
Response Nummer: 14
Response Nummer: 15
Response Nummer: 16
Response Nummer: 17
Response Nummer: 18
Response Nummer: 19
Response Nummer: 20
Response Nummer: 21
Response Nummer: 22
Response Nummer: 23
Response Nummer: 24
Response Nummer: 25
Response Nummer: 26
Response Nummer: 27
Response Nummer: 28
Response Nummer: 29
Response Nummer: 30
Response Nummer: 31
Response Nummer: 32
Response Nummer: 33
Response Nummer: 34
Response Nummer: 35
Response Nummer: 36
Response Nummer: 37
Response Nummer: 38
Response Nummer: 39
Response Nummer: 40
Response Nummer: 41
Response Nummer: 42
Response Nummer: 43
Response Nummer: 44
Response Nummer: 45
Response Nummer: 46
Response Nummer: 47
Response Nummer: 48
Response Nummer: 49
Response Nummer: 50
Response 

In [49]:
scrape_mentions("CDUCSU", cducsu)

Response Nummer: 1
Response Nummer: 2
Response Nummer: 3
Response Nummer: 4
Response Nummer: 5
Response Nummer: 6
Response Nummer: 7
Response Nummer: 8
Response Nummer: 9
Response Nummer: 10
Response Nummer: 11
Response Nummer: 12
Response Nummer: 13
Response Nummer: 14
Response Nummer: 15
Response Nummer: 16
Response Nummer: 17
Response Nummer: 18
Response Nummer: 19
Response Nummer: 20
Response Nummer: 21
Response Nummer: 22
Response Nummer: 23
Response Nummer: 24
Response Nummer: 25
Response Nummer: 26
Response Nummer: 27
Response Nummer: 28
Response Nummer: 29
Response Nummer: 30
Response Nummer: 31
Response Nummer: 32
Response Nummer: 33
Response Nummer: 34
Response Nummer: 35
Response Nummer: 36
Response Nummer: 37
Response Nummer: 38
Response Nummer: 39
Response Nummer: 40
Response Nummer: 41
Response Nummer: 42
Response Nummer: 43
Response Nummer: 44
Response Nummer: 45
Response Nummer: 46
Response Nummer: 47
Response Nummer: 48
Response Nummer: 49
Response Nummer: 50
Response 

Rate limit exceeded. Sleeping for 3 seconds.


Response Nummer: 509
Response Nummer: 510
Response Nummer: 511
Response Nummer: 512
Response Nummer: 513
Response Nummer: 514
Response Nummer: 515
Response Nummer: 516
Response Nummer: 517
Response Nummer: 518
Response Nummer: 519
Response Nummer: 520
Response Nummer: 521
Response Nummer: 522
Response Nummer: 523
Response Nummer: 524
Response Nummer: 525
Response Nummer: 526
Response Nummer: 527
Response Nummer: 528
Response Nummer: 529
Response Nummer: 530
Response Nummer: 531
Response Nummer: 532
Response Nummer: 533
Response Nummer: 534
Response Nummer: 535
Response Nummer: 536
Response Nummer: 537
Response Nummer: 538
Response Nummer: 539
Response Nummer: 540
Response Nummer: 541
Response Nummer: 542
Response Nummer: 543
Response Nummer: 544
Response Nummer: 545
Response Nummer: 546
Response Nummer: 547
Response Nummer: 548
Response Nummer: 549
Response Nummer: 550
Response Nummer: 551
Response Nummer: 552
Response Nummer: 553
Response Nummer: 554
Response Nummer: 555
Response Numm

In [36]:
#df.loc[df['username'] == 'Markus_Soeder', 'username']

Series([], Name: username, dtype: object)