In [28]:
import pandas as pd
import requests
import time
import datetime

In [29]:
def get_date_n_days_ago(ndelta=0):
    """
    Returns date n days ago in format YYYY_MM_DD. Without parameter it returns today's date.
    """
    # get current time as datetime
    current_time = datetime.datetime.today()

    # check if a historic date is requested
    if ndelta == 0:

        current_time = str(current_time)

        return f"{current_time[:4]}_{current_time[5:7]}_{current_time[8:10]}"

    # get current time yesterday
    current_time_yesterday = str(current_time - datetime.timedelta(ndelta))

    return f"{current_time_yesterday[:4]}_{current_time_yesterday[5:7]}_{current_time_yesterday[8:10]}"

In [79]:
def create_url(keywords, start_date, end_date, max_results = 10):
    """
    Create the url with current keywords
    """

    #Change to the endpoint you want to collect data from
    search_url = "https://api.twitter.com/2/tweets/search/all"

    #change params based on the endpoint you are using
    query_params = {'query': keywords,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,geo.place_id',
                    'tweet.fields': 'id,text,created_at,lang,public_metrics,source',
                    'user.fields': 'id,created_at,location,public_metrics',
                    'place.fields': 'full_name,country_code,geo,place_type',
                    'next_token': {}}

    # return tuple: [0] is search_url and [1] is the dict with query params
    return (search_url, query_params)



In [81]:
def connect_to_endpoint(url, headers, params, next_token):
    """
    Returns the api response in a json format and sets 'next_token' value
    """

    #params dict received from create_url function, set next_token value
    params['next_token'] = next_token

    # call api
    response = requests.request("GET", url, headers = headers, params = params)
    print(response.headers["x-rate-limit-remaining"])
    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    # return api response as json
    return response.json()

In [42]:
def call_api(headers,
             keywords,
             start_time,
             end_time,
             max_results,
             new_token=None):
    """
    Calls the twitter api and returns a dataframe containig all fetched information
    """
    # sleep 4 seconds to not exceed api call limit per 15 Minutes (300)
    time.sleep(4)

    # create url by calling above defined function; url is a tuple
    url = create_url(keywords, start_time, end_time, max_results)

    # call api with above defined function
    response = connect_to_endpoint(url[0], headers, url[1], new_token)

    #response has 3 keys:
    #"data": tweet information in a dict
    #"includes": dict with one key "users" which is a dict of user information
    #"meta": api request information


    # create first DataFrame about tweets out of data key
    tweet_df = pd.json_normalize(response["data"])

    # rename columns to not get identical names with user_df
    tweet_df.rename(columns={'id':'tweet_id',
                             'created_at': "tweet_created_at"},
                    inplace=True)
    print(tweet_df.iloc[-1]["tweet_created_at"])
    # create second DataFrame about users out of response key
    user_df = pd.json_normalize(response["includes"]["users"])

    # rename columns to not get identical names with tweet_df
    user_df.rename(columns={'id':'author_id',
                            'created_at': "profile_created_at"},
                    inplace=True)

    # merge two dataframes into one
    merged_df = tweet_df.merge(user_df, how = "outer", on="author_id")


    collected_tweets = response["meta"]["result_count"]
    next_token = response["meta"].get("next_token", False)

    # return a df containing all information from the api call, the next_token for continuous search and number of collected tweets
    return merged_df, next_token, collected_tweets


In [54]:
def search_twitter(headers, party, keywords, start_time,
             end_time, max_results, tweet_amount):
    """
    Searches twitter for a keywords searchstring until a maximum 'tweet_amount' has been reached or
    there are no more results and next_token is False.
    Returns a DataFrame containing all fetched information
    """

    # create a main DataFrame to which all the api call results for one party are concatenated
    one_party_df = pd.DataFrame()
    headers = headers

    # set collected tweet counter to zero
    counter = 0

    # call the api the first time resulting in a returned DataFrame, the next token and number of collected tweets
    single_api_call_df, next_token, collected_tweets = call_api(headers, keywords,
                                   start_time, end_time,
                                   max_results,
                                   new_token=None)

    # concat the DataFrames to save function call result
    one_party_df = pd.concat([one_party_df, single_api_call_df], ignore_index=True)

    # add number of fetched tweets to counter
    counter += collected_tweets

    # while there is a next token in api call result
    # call the api again with the next_token as additional parameter until the value becomes false
    while next_token:

        print(f"Currently at party {party} and in total {counter} tweets")
        # call api with new next_token
        single_api_call_df, next_token, collected_tweets = call_api(headers, keywords,
                                       start_time, end_time,
                                       max_results,
                                       new_token=next_token)
        
        
        # save result and add number of tweets
        one_party_df = pd.concat([one_party_df, single_api_call_df], ignore_index=True)
        counter += collected_tweets
        
        
        # break the loop if the predefined search limit is reached
        if counter > tweet_amount:
            print(f"Reached predefined search limit of {tweet_amount} tweets")
            break

    # after collecting all tweets create a new column containing the party's name
    one_party_df["party"] = party

    # return a DataFrame with all tweets for one time period
    return one_party_df

In [87]:
missing_cdu = search_twitter(headers, "CDU", query_dict["CDU"][0], "2021-08-27T00:00:01.000Z","2021-08-27T05:10:30.000Z", 500, 10000)

240
2021-08-27T00:00:03.000Z


In [89]:
final = pd.concat([raw_data, missing_cdu])

In [96]:
final.to_csv("raw_twitter_08_31.csv", index=False)

In [95]:
len(final)

276129

In [262]:
data = pd.read_csv("correct_tweet_database_08_31.csv", lineterminator="\n")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [265]:
data2 = load_and_clean_csv(data)

In [289]:
import requests

In [None]:
response = requests.post("http://127.0.0.1:8000/run_app/")

In [266]:
data2

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment,username
0,SPD,2021-08-31 23:59:24,229106593,1432855573284937730,Heftige Merkel-Attacke - ist Olaf Scholz eine ...,Twitter Web App,1,0,0,2010-12-21 14:55:20,56,118,7921,Cuxhaven,1,13und13
1,SPD,2021-08-31 23:58:48,3625766896,1432855420482334725,@larsklingbeil Klingbeil und was ist mit der S...,Twitter for iPhone,0,0,0,2015-09-12 00:29:18,3,174,326,Wilder Süden,1,RitschF
2,SPD,2021-08-31 23:58:42,1170234519951159296,1432855397963083780,"Hauptsächlich Esken das ref, spielt sich vor d...",Twitter Web App,0,0,1,2019-09-07 07:17:17,1680,1434,43503,irgendwo bei Lummerland,1,dummchens
3,SPD,2021-08-31 23:52:45,1170234519951159296,1432853899514421248,"Scholz ist blöder als lang..\nMister Teflon.,\...",Twitter Web App,1,1,1,2019-09-07 07:17:17,1680,1434,43503,irgendwo bei Lummerland,1,dummchens
4,SPD,2021-08-31 23:58:12,1730034720,1432855271869661184,"""Menschen dürfen nicht GEGENEINANDER ausgespie...",Twitter Web App,0,0,0,2013-09-04 22:24:43,7,46,5377,,1,HmmelinkReinhol
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276112,CDU,2021-08-27 00:11:45,884094208923103236,1431046741130874884,"@SearchSquirrel Wenns so richtig rund läuft, z...",Twitter Web App,0,0,2,2017-07-09 16:57:40,1044,791,62699,,1,imageschaden
276113,CDU,2021-08-27 00:10:44,1418165833205960706,1431046485492260870,@georgrestle @ArminLaschet @MarkusBlume Ansons...,Twitter for iPhone,0,0,0,2021-07-22 11:07:59,57,374,2270,Nowhereland,1,HerrThinkPink
276123,CDU,2021-08-27 00:04:41,18309034,1431044965640118279,@danielreitzig @ArminLaschet Nachdem die CDU b...,Twitter Web App,0,0,2,2008-12-22 15:42:26,309,1505,14250,Planet Erde,1,kickahh
276124,CDU,2021-08-27 00:01:20,18309034,1431044121393836034,@abendblatt Herr Röhse hat jahrelang den ÖPNV...,Twitter Web App,0,0,0,2008-12-22 15:42:26,309,1505,14250,Planet Erde,-2,kickahh


In [269]:
old_master = pd.read_csv("/Users/nicolas/Downloads/df_all_v2.csv", lineterminator="\n")

In [274]:
old_master.drop(columns=["Unnamed: 0", "avg_len_of_tweet"], inplace = True)

In [277]:
new_master = pd.concat([old_master, data2])

In [284]:
new_master.to_csv("new_master_clean_for_upload.csv", index=False)

In [285]:
test = pd.read_csv("new_master_clean_for_upload.csv", lineterminator="\n")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [252]:
data.to_csv("correct_tweet_database_08_31.csv", index=False)

In [253]:
data2 = load_and_clean_csv(data)

In [255]:
mock_prediction = {"AFD": 0.24, "CDU": 0.11, "GRUENE": 0.13, "FDP": 0.17, "SPD": 0.08, "LINKE": 0.07, "OTHER": 0.2}

In [201]:
data.to_csv("fixing_bug.csv", index=False)

In [200]:
data = load_and_clean_csv(data)

In [209]:
heute_clean = load_and_clean_csv(heute)

In [240]:
fixing = pd.read_csv("~/Downloads/tweet_database_08_31.csv", lineterminator="\n")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [56]:
def get_data(headers, start_time = False, end_time=False):
    """
    Returns a DataFrame containing all tweets for one day for 7 diffrent search keywords
    for each party defined in query_dict
    """

    # create a master DataFrame for all parties
    all_parties_df = pd.DataFrame()

    # set max results per api call
    max_results = 500

    if not (start_time and end_time):
        # convert to ISO 8601: YYYY-MM-DDTHH:mm:sssZ
        # this is UTC; we are not accounting for german time zone +02:00
        start_time = f"{get_date_n_days_ago(1).replace('_', '-')}T00:00:01.000Z"
        end_time = f"{get_date_n_days_ago().replace('_', '-')}T00:00:01.000Z"
    else:
        start_time = start_time
        end_time = end_time

    # for key, value in query_dict from twitter_api_params.py
    for party, keywords in query_dict.items():

        # set maximum tweet amount from keywords dict
        tweet_amount =  keywords[1]

        # collect the DataFrame for one party-keyword combination
        one_party_df = search_twitter(headers, party, keywords[0], start_time,
                             end_time, max_results, tweet_amount)

        # concat to master DataFrame
        all_parties_df = pd.concat([all_parties_df, one_party_df], ignore_index=True)

    # save master DataFrame as a csv
    #all_parties_df.to_csv(
    #    f"temp_tweet_database_{str(current_time_yesterday)[5:7]}_{str(current_time_yesterday)[8:10]}.csv"
    #)


    return all_parties_df

In [82]:
raw_data = get_data(headers, "2021-08-27T00:00:01.000Z", "2021-09-01T00:00:01.000Z")

298
2021-08-31T21:53:50.000Z
Currently at party SPD and in total 500 tweets
297
2021-08-31T21:01:21.000Z
Currently at party SPD and in total 998 tweets
296
2021-08-31T20:07:42.000Z
Currently at party SPD and in total 1496 tweets
295
2021-08-31T19:16:59.000Z
Currently at party SPD and in total 1996 tweets
294
2021-08-31T18:24:36.000Z
Currently at party SPD and in total 2495 tweets
293
2021-08-31T17:24:10.000Z
Currently at party SPD and in total 2993 tweets
292
2021-08-31T16:33:15.000Z
Currently at party SPD and in total 3493 tweets
291
2021-08-31T15:45:11.000Z
Currently at party SPD and in total 3993 tweets
290
2021-08-31T14:50:50.000Z
Currently at party SPD and in total 4492 tweets
289
2021-08-31T13:56:37.000Z
Currently at party SPD and in total 4991 tweets
288
2021-08-31T13:10:29.000Z
Currently at party SPD and in total 5491 tweets
287
2021-08-31T12:27:36.000Z
Currently at party SPD and in total 5991 tweets
286
2021-08-31T11:42:00.000Z
Currently at party SPD and in total 6491 tweets
2

192
2021-08-27T09:21:01.000Z
Currently at party SPD and in total 53392 tweets
191
2021-08-27T08:26:09.000Z
Currently at party SPD and in total 53889 tweets
190
2021-08-27T07:33:36.000Z
Currently at party SPD and in total 54385 tweets
189
2021-08-27T06:29:25.000Z
Currently at party SPD and in total 54882 tweets
188
2021-08-27T04:56:31.000Z
Currently at party SPD and in total 55381 tweets
187
2021-08-27T00:00:56.000Z
186
2021-08-31T21:29:51.000Z
Currently at party AFD and in total 500 tweets
185
2021-08-31T20:21:50.000Z
Currently at party AFD and in total 1000 tweets
184
2021-08-31T19:33:10.000Z
Currently at party AFD and in total 1499 tweets
183
2021-08-31T18:48:50.000Z
Currently at party AFD and in total 1998 tweets
182
2021-08-31T18:08:42.000Z
Currently at party AFD and in total 2498 tweets
181
2021-08-31T16:41:39.000Z
Currently at party AFD and in total 2995 tweets
180
2021-08-31T15:16:22.000Z
Currently at party AFD and in total 3494 tweets
179
2021-08-31T14:00:47.000Z
Currently at p

231
2021-08-30T19:23:03.000Z
Currently at party CDU and in total 18989 tweets
230
2021-08-30T18:58:19.000Z
Currently at party CDU and in total 19488 tweets
229
2021-08-30T18:29:49.000Z
Currently at party CDU and in total 19986 tweets
228
2021-08-30T18:01:46.000Z
Currently at party CDU and in total 20486 tweets
227
2021-08-30T17:36:16.000Z
Currently at party CDU and in total 20986 tweets
226
2021-08-30T17:11:45.000Z
Currently at party CDU and in total 21486 tweets
225
2021-08-30T16:47:32.000Z
Currently at party CDU and in total 21985 tweets
224
2021-08-30T16:25:41.000Z
Currently at party CDU and in total 22484 tweets
223
2021-08-30T16:01:11.000Z
Currently at party CDU and in total 22984 tweets
222
2021-08-30T15:40:46.000Z
Currently at party CDU and in total 23482 tweets
221
2021-08-30T15:21:26.000Z
Currently at party CDU and in total 23981 tweets
220
2021-08-30T15:02:55.000Z
Currently at party CDU and in total 24481 tweets
219
2021-08-30T14:41:11.000Z
Currently at party CDU and in total

125
2021-08-29T14:22:07.000Z
Currently at party CDU and in total 71868 tweets
124
2021-08-29T13:38:31.000Z
Currently at party CDU and in total 72368 tweets
299
2021-08-29T13:02:09.000Z
Currently at party CDU and in total 72868 tweets
298
2021-08-29T12:32:49.000Z
Currently at party CDU and in total 73366 tweets
297
2021-08-29T12:06:01.000Z
Currently at party CDU and in total 73866 tweets
296
2021-08-29T11:39:12.000Z
Currently at party CDU and in total 74366 tweets
295
2021-08-29T11:11:36.000Z
Currently at party CDU and in total 74863 tweets
294
2021-08-29T10:48:07.000Z
Currently at party CDU and in total 75361 tweets
293
2021-08-29T10:26:32.000Z
Currently at party CDU and in total 75858 tweets
292
2021-08-29T10:05:51.000Z
Currently at party CDU and in total 76358 tweets
291
2021-08-29T09:42:59.000Z
Currently at party CDU and in total 76857 tweets
290
2021-08-29T09:17:28.000Z
Currently at party CDU and in total 77357 tweets
289
2021-08-29T08:54:23.000Z
Currently at party CDU and in total

195
2021-08-31T19:32:21.000Z
Currently at party GRUENE and in total 998 tweets
194
2021-08-31T18:23:44.000Z
Currently at party GRUENE and in total 1498 tweets
193
2021-08-31T17:04:15.000Z
Currently at party GRUENE and in total 1998 tweets
192
2021-08-31T15:37:40.000Z
Currently at party GRUENE and in total 2498 tweets
191
2021-08-31T14:00:28.000Z
Currently at party GRUENE and in total 2998 tweets
190
2021-08-31T12:42:30.000Z
Currently at party GRUENE and in total 3498 tweets
189
2021-08-31T11:01:03.000Z
Currently at party GRUENE and in total 3998 tweets
188
2021-08-31T09:33:57.000Z
Currently at party GRUENE and in total 4498 tweets
187
2021-08-31T08:09:10.000Z
Currently at party GRUENE and in total 4998 tweets
186
2021-08-31T06:31:58.000Z
Currently at party GRUENE and in total 5498 tweets
185
2021-08-31T02:19:30.000Z
Currently at party GRUENE and in total 5998 tweets
184
2021-08-30T21:20:27.000Z
Currently at party GRUENE and in total 6498 tweets
183
2021-08-30T20:00:03.000Z
Currently at

267
2021-08-29T20:31:03.000Z
Currently at party LINKE and in total 5998 tweets
266
2021-08-29T16:32:18.000Z
Currently at party LINKE and in total 6498 tweets
265
2021-08-29T11:22:47.000Z
Currently at party LINKE and in total 6997 tweets
264
2021-08-29T06:09:01.000Z
Currently at party LINKE and in total 7496 tweets
263
2021-08-28T19:22:00.000Z
Currently at party LINKE and in total 7995 tweets
262
2021-08-28T14:16:07.000Z
Currently at party LINKE and in total 8495 tweets
261
2021-08-28T08:48:23.000Z
Currently at party LINKE and in total 8994 tweets
260
2021-08-27T19:05:10.000Z
Currently at party LINKE and in total 9494 tweets
259
2021-08-27T12:09:48.000Z
Currently at party LINKE and in total 9994 tweets
258
2021-08-27T00:05:13.000Z
257
2021-08-31T18:14:07.000Z
Currently at party OTHERS and in total 500 tweets
256
2021-08-31T13:16:55.000Z
Currently at party OTHERS and in total 999 tweets
255
2021-08-31T06:39:06.000Z
Currently at party OTHERS and in total 1496 tweets
254
2021-08-30T18:25:1

In [83]:
raw_data.to_csv("raw_twitter_08_27", index=False)

In [86]:
pd.read_csv("raw_twitter_08_27", lineterminator="\n")

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,tweet_created_at,tweet_id,source,lang,text,author_id,public_metrics.retweet_count,public_metrics.reply_count,public_metrics.like_count,public_metrics.quote_count,...,public_metrics.tweet_count,public_metrics.listed_count,withheld.copyright,withheld.country_codes_x,withheld.country_codes_y,party,withheld.scope_x,withheld.scope_y,withheld.country_codes,withheld.scope
0,2021-08-31T23:59:24.000Z,1432855573284937730,Twitter Web App,de,Heftige Merkel-Attacke - ist Olaf Scholz eine ...,229106593,1,0,0,0,...,7921,0,,,,SPD,,,,
1,2021-08-31T23:58:48.000Z,1432855420482334725,Twitter for iPhone,de,@larsklingbeil Klingbeil und was ist mit der S...,3625766896,0,0,0,0,...,326,0,,,,SPD,,,,
2,2021-08-31T23:58:42.000Z,1432855397963083780,Twitter Web App,de,"Hauptsächlich Esken das ref, spielt sich vor d...",1170234519951159296,0,0,1,0,...,43503,3,,,,SPD,,,,
3,2021-08-31T23:52:45.000Z,1432853899514421248,Twitter Web App,de,"Scholz ist blöder als lang..\nMister Teflon.,\...",1170234519951159296,1,1,1,0,...,43503,3,,,,SPD,,,,
4,2021-08-31T23:58:12.000Z,1432855271869661184,Twitter Web App,de,"""Menschen dürfen nicht GEGENEINANDER ausgespie...",1730034720,0,0,0,0,...,5377,0,,,,SPD,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275664,2021-08-27T00:06:11.000Z,1431045341604978690,Twitter for Android,de,@Poronyos @marcobuelow @DiePARTEI Spezialisier...,2370044285,0,0,0,0,...,1512,0,,,,OTHERS,,,,
275665,2021-08-27T00:03:29.000Z,1431044660051550210,Twitter for Android,de,@SWeissnicht @Poronyos @marcobuelow @DiePARTEI...,2370044285,0,1,0,0,...,1512,0,,,,OTHERS,,,,
275666,2021-08-27T00:02:48.000Z,1431044491104903177,Twitter for Android,de,@Poronyos @marcobuelow @DiePARTEI Heimat lässt...,2370044285,0,1,0,0,...,1512,0,,,,OTHERS,,,,
275667,2021-08-27T00:00:23.000Z,1431043881278259201,Twitter for Android,de,@Poronyos @marcobuelow @DiePARTEI Ich will dir...,2370044285,0,1,0,0,...,1512,0,,,,OTHERS,,,,


In [36]:
import os
from dotenv import load_dotenv
from os.path import join, dirname


def get_credentials(headers):


    # create authorization dict for the api
    return headers

# all queries:
query_cdu = """(@cducsubt OR @CDU OR @ArminLaschet  OR #Laschet OR #ArminLaschet  OR #arminlaschet OR #laschet OR #cdu OR #CDU OR CDU/CSU OR Laschet)
lang:de -is:retweet
-#GRUENEN -@Die_Gruenen -Baerbock -@ABaerbock
-#SPD -@spdde -Scholz -@OlafScholz
-#AFD -@AfD -Weidel -@Alice_Weidel -Chrupalla -@Tino_Chrupalla
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_linke = """(@dieLinke OR @Janine_Wissler OR  @DietmarBartsch OR #DieLinke OR #DieLINKE OR #Linke OR #dielinke OR #Bartsch OR #Wissler OR Wissler OR DieLinke)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-@spdde -@OlafScholz -@spdbt -#SPD -#spd -#Spd -#Scholz -#OlafScholz -#SCHOLZ -#scholz -Scholz -SPD
-@AfD -@Alice_Weidel -@Tino_Chrupalla -#AFD -#AfD  -#afd  -#Weidel -#weidel -#Chrupalla -AFD -Weidel -Chrupalla
-@fdp -@fdpbt -@c_lindner -#FDP -#fdp -#Fdp -#Lindner -#lindner -#LINDNER -#ChristianLindner -Lindner -FDP
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_afd = """( @AfD OR @Alice_Weidel OR @Tino_Chrupalla OR #AFD OR #AfD OR #afd OR #AlternativefürDeutschland OR #Weidel OR #weidel OR #AliceWeidel OR #WEIDEL OR #Chrupalla OR #chrupalla OR #TinoChruppala OR AFD OR Weidel OR Chrupalla)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#arminlaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-@spdde -@OlafScholz -@spdbt -#SPD -#spd -#Spd -#Scholz -#OlafScholz -#SCHOLZ -#scholz -Scholz -SPD -Sozialdemokraten
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_fdp = """(@fdp OR @fdpbt OR @c_lindner OR #FDP OR #fdp OR #Fdp OR #Lindner OR #lindner OR #LINDNER OR #ChristianLindner OR Lindner OR FDP)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-@spdde -@OlafScholz -@spdbt -#SPD -#spd -#Spd -#Scholz -#OlafScholz -#SCHOLZ -#scholz -Scholz -SPD
-@AfD -@Alice_Weidel -@Tino_Chrupalla -#AFD -#AfD -#afd -#Weidel -#weidel -#Chrupalla -AFD -Weidel -Chrupalla
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_others = """(#FreieWaehler OR #FreieWähler OR @HubertAiwanger OR #FREIEWÄHLER OR @FREIEWAEHLER_BV #freiewähler2021 OR #diePARTEI OR @DiePARTEI OR @Tierschutzparte OR NPD OR @Piratenpartei OR #Piraten OR #dieBasis OR @diebasispartei OR #Volt OR @VoltDeutschland OR @oedp_de OR @bgepartei OR @TodenhoeferTeam OR #TeamTodenhoefer)
lang:de -is:retweet"""

query_spd = """( @spdde OR @OlafScholz OR @spdbt OR #SPD OR #spd OR #Spd OR #Scholz OR #OlafScholz OR #SCHOLZ OR #scholz OR Scholz OR SPD OR Sozialdemokraten)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#arminlaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-#AFD -@AfD -Weidel -@Alice_Weidel -Chrupalla -@Tino_Chrupalla
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""


query_gruene = """( @Die_Gruenen OR @ABaerbock OR @GrueneBundestag OR #Gruene OR #Grünen OR #Grüne OR #GRUENEN OR #AnnalenaBaerbock OR #Baerbock OR #baerbock OR Baerbock OR Grüne OR Gruene)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#arminlaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-#SPD -@spdde -Scholz -@OlafScholz
-#AFD -@AfD -Weidel -@Alice_Weidel -Chrupalla -@Tino_Chrupalla
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

# create a query dict to iterate over with the party name, its search string and the maximum number of tweets
query_dict = {
    "SPD": (query_spd, 100000),
    "AFD": (query_afd, 50000),
    "CDU": (query_cdu, 100000),
    "FDP": (query_fdp, 50000),
    "GRUENE": (query_gruene, 100000),
    "LINKE": (query_linke, 40000),
    "OTHERS": (query_others, 40000)
}


In [190]:
def load_and_clean_csv(df):
    '''
    Function loads DF data from the Twitter API+Sentiment and returns a cleaned DF
    '''
    # rename columns
    df = df.rename(columns={"tweet_created_at": "tweet_date",
                                "public_metrics.retweet_count": "retweet_count",
                                "public_metrics.reply_count": "reply_count",
                                "public_metrics.like_count": "like_count",
                                "profile_created_at": "profile_creation_date",
                                "public_metrics.followers_count": "followers_count",
                                "public_metrics.following_count": "following_count",
                                "public_metrics.tweet_count": "user_tweet_count"
                                })

    # Including only columns that we want to use in the future
    df = df[['party',
                'tweet_date',
                'author_id',
                'tweet_id',
                'text',
                'source',
                'retweet_count',
                'reply_count',
                'like_count',
                'profile_creation_date',
                'followers_count',
                'following_count',
                'user_tweet_count',
                'location',
                'sentiment',
                'username'
                ]]

    # Clean dataset columns:
    # Change dtype
    df["tweet_date"] = df["tweet_date"].astype(str)
    df = df[df.tweet_date.str.match('(\d{4}-\d{2}-\d{2}.\d{2}:\d{2}:\d{2})')]
    df = df[(df.tweet_date.str.len() == 23) | (df.tweet_date.str.len() == 24)]
    df['tweet_date'] = df['tweet_date'].str.slice(0,19)
    df["tweet_date"] = pd.to_datetime(df["tweet_date"])

    df["profile_creation_date"] = df["profile_creation_date"].astype(str)
    df = df[df.profile_creation_date.str.match(
        '(\d{4}-\d{2}-\d{2}.\d{2}:\d{2}:\d{2})')]
    df['profile_creation_date'] = df['profile_creation_date'].str.slice(0,19)
    df["profile_creation_date"] = pd.to_datetime(df["profile_creation_date"])
    # Drop duplicates
    df = df.drop_duplicates()
    # Transform sentiment to numeric type
    dict_to_numeric = {"negative": -2, "neutral": 1, "positive": 2}
    df["sentiment"].replace(dict_to_numeric, inplace=True)

    return df

In [100]:
heute = pd.read_csv("/Users/nicolas/Downloads/tweet_database_08_31.csv", lineterminator="\n")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [105]:
test = load_and_clean_csv(heute)

In [106]:
test 

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment
0,SPD,2021-08-31 23:59:24,229106593,1432855573284937730,Heftige Merkel-Attacke - ist Olaf Scholz eine ...,Twitter Web App,1,0,0,2010-12-21 14:55:20,56,118,7921,Cuxhaven,1
1,SPD,2021-08-31 23:58:48,3625766896,1432855420482334725,@larsklingbeil Klingbeil und was ist mit der S...,Twitter for iPhone,0,0,0,2015-09-12 00:29:18,3,174,326,Wilder Süden,1
2,SPD,2021-08-31 23:58:42,1170234519951159296,1432855397963083780,"Hauptsächlich Esken das ref, spielt sich vor d...",Twitter Web App,0,0,1,2019-09-07 07:17:17,1680,1434,43503,irgendwo bei Lummerland,1
3,SPD,2021-08-31 23:52:45,1170234519951159296,1432853899514421248,"Scholz ist blöder als lang..\nMister Teflon.,\...",Twitter Web App,1,1,1,2019-09-07 07:17:17,1680,1434,43503,irgendwo bei Lummerland,1
4,SPD,2021-08-31 23:58:12,1730034720,1432855271869661184,"""Menschen dürfen nicht GEGENEINANDER ausgespie...",Twitter Web App,0,0,0,2013-09-04 22:24:43,7,46,5377,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276112,CDU,2021-08-27 00:11:45,884094208923103236,1431046741130874884,"@SearchSquirrel Wenns so richtig rund läuft, z...",Twitter Web App,0,0,2,2017-07-09 16:57:40,1044,791,62699,,1
276113,CDU,2021-08-27 00:10:44,1418165833205960706,1431046485492260870,@georgrestle @ArminLaschet @MarkusBlume Ansons...,Twitter for iPhone,0,0,0,2021-07-22 11:07:59,57,374,2270,Nowhereland,1
276123,CDU,2021-08-27 00:04:41,18309034,1431044965640118279,@danielreitzig @ArminLaschet Nachdem die CDU b...,Twitter Web App,0,0,2,2008-12-22 15:42:26,309,1505,14250,Planet Erde,1
276124,CDU,2021-08-27 00:01:20,18309034,1431044121393836034,@abendblatt Herr Röhse hat jahrelang den ÖPNV...,Twitter Web App,0,0,0,2008-12-22 15:42:26,309,1505,14250,Planet Erde,-2


In [101]:
alt = pd.read_csv("/Users/nicolas/Downloads/df_all_v2.csv")

In [118]:
alt.drop(columns=["Unnamed: 0", "avg_len_of_tweet"], inplace=True)

In [143]:
x = list(alt.columns)

In [144]:
x == y

True

In [142]:
y = list(test.columns)

In [122]:
final = pd.concat([alt, test])

In [145]:
z = list(final.columns) 

In [135]:
final.to_csv("final_database.csv")

In [170]:
alt.columns

Index(['party', 'tweet_date', 'author_id', 'tweet_id', 'text', 'source',
       'retweet_count', 'reply_count', 'like_count', 'profile_creation_date',
       'followers_count', 'following_count', 'user_tweet_count', 'location',
       'sentiment'],
      dtype='object')

In [150]:
len(why)

1548375

In [165]:
p = test.groupby(["party", "sentiment"]).sum()

In [167]:
p.reset_index()

Unnamed: 0,party,sentiment,author_id,tweet_id,retweet_count,reply_count,like_count,followers_count,following_count,user_tweet_count
0,AFD,-2,1.522466e+22,2.604563e+22,23571.0,19450.0,170583.0,23502324.0,12399763.0,303649800.0
1,AFD,1,9.524636e+21,1.701117e+22,27466.0,13847.0,144419.0,56902980.0,9958140.0,257752000.0
2,AFD,2,1.558836e+21,2.523203e+21,3527.0,1474.0,15313.0,2068643.0,1697277.0,29004390.0
3,CDU,-2,3.055902e+22,6.670265e+22,49181.0,30066.0,582531.0,99351417.0,31615655.0,800532100.0
4,CDU,1,3.059118e+22,7.061437e+22,81920.0,55253.0,825609.0,229007637.0,38124432.0,1044086000.0
5,CDU,2,2.901859e+21,6.463014e+21,3896.0,6059.0,69226.0,14078650.0,3474812.0,83875460.0
6,FDP,-2,9.123059e+21,1.730651e+22,4333.0,8345.0,69684.0,18601476.0,8181011.0,199899400.0
7,FDP,1,6.574044e+21,1.397969e+22,8016.0,8822.0,82624.0,33180991.0,7769059.0,200111400.0
8,FDP,2,1.006617e+21,1.979056e+21,405.0,900.0,7237.0,2227933.0,962262.0,19544800.0
9,GRUENE,-2,1.311074e+22,2.56169e+22,12753.0,17138.0,122035.0,40952181.0,12122662.0,294671600.0


In [204]:
why = pd.read_csv("final_database.csv", lineterminator="\n")

In [206]:
test

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment
0,SPD,2021-08-31 23:59:24,229106593,1432855573284937730,Heftige Merkel-Attacke - ist Olaf Scholz eine ...,Twitter Web App,1,0,0,2010-12-21 14:55:20,56,118,7921,Cuxhaven,1
1,SPD,2021-08-31 23:58:48,3625766896,1432855420482334725,@larsklingbeil Klingbeil und was ist mit der S...,Twitter for iPhone,0,0,0,2015-09-12 00:29:18,3,174,326,Wilder Süden,1
2,SPD,2021-08-31 23:58:42,1170234519951159296,1432855397963083780,"Hauptsächlich Esken das ref, spielt sich vor d...",Twitter Web App,0,0,1,2019-09-07 07:17:17,1680,1434,43503,irgendwo bei Lummerland,1
3,SPD,2021-08-31 23:52:45,1170234519951159296,1432853899514421248,"Scholz ist blöder als lang..\nMister Teflon.,\...",Twitter Web App,1,1,1,2019-09-07 07:17:17,1680,1434,43503,irgendwo bei Lummerland,1
4,SPD,2021-08-31 23:58:12,1730034720,1432855271869661184,"""Menschen dürfen nicht GEGENEINANDER ausgespie...",Twitter Web App,0,0,0,2013-09-04 22:24:43,7,46,5377,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
276112,CDU,2021-08-27 00:11:45,884094208923103236,1431046741130874884,"@SearchSquirrel Wenns so richtig rund läuft, z...",Twitter Web App,0,0,2,2017-07-09 16:57:40,1044,791,62699,,1
276113,CDU,2021-08-27 00:10:44,1418165833205960706,1431046485492260870,@georgrestle @ArminLaschet @MarkusBlume Ansons...,Twitter for iPhone,0,0,0,2021-07-22 11:07:59,57,374,2270,Nowhereland,1
276123,CDU,2021-08-27 00:04:41,18309034,1431044965640118279,@danielreitzig @ArminLaschet Nachdem die CDU b...,Twitter Web App,0,0,2,2008-12-22 15:42:26,309,1505,14250,Planet Erde,1
276124,CDU,2021-08-27 00:01:20,18309034,1431044121393836034,@abendblatt Herr Röhse hat jahrelang den ÖPNV...,Twitter Web App,0,0,0,2008-12-22 15:42:26,309,1505,14250,Planet Erde,-2


In [141]:
len(final),len(why) - len(heute), len(alt)

(1548375, 1431776, 1272583)

In [109]:
heute.columns

Index(['Unnamed: 0', 'tweet_created_at', 'tweet_id', 'source', 'lang', 'text',
       'author_id', 'public_metrics.retweet_count',
       'public_metrics.reply_count', 'public_metrics.like_count',
       'public_metrics.quote_count', 'geo.place_id', 'name',
       'profile_created_at', 'username', 'location',
       'public_metrics.followers_count', 'public_metrics.following_count',
       'public_metrics.tweet_count', 'public_metrics.listed_count',
       'withheld.copyright', 'withheld.country_codes_x',
       'withheld.country_codes_y', 'party', 'withheld.scope_x',
       'withheld.scope_y', 'withheld.country_codes', 'withheld.scope',
       'sentiment'],
      dtype='object')