In [28]:
import pandas as pd
import requests
import time
import datetime

In [29]:
def get_date_n_days_ago(ndelta=0):
    """
    Returns date n days ago in format YYYY_MM_DD. Without parameter it returns today's date.
    """
    # get current time as datetime
    current_time = datetime.datetime.today()

    # check if a historic date is requested
    if ndelta == 0:

        current_time = str(current_time)

        return f"{current_time[:4]}_{current_time[5:7]}_{current_time[8:10]}"

    # get current time yesterday
    current_time_yesterday = str(current_time - datetime.timedelta(ndelta))

    return f"{current_time_yesterday[:4]}_{current_time_yesterday[5:7]}_{current_time_yesterday[8:10]}"

In [11]:
def create_url(keywords, start_date, end_date, max_results = 10):
    """
    Create the url with current keywords
    """

    #Change to the endpoint you want to collect data from
    search_url = "https://api.twitter.com/2/tweets/search/all"

    #change params based on the endpoint you are using
    query_params = {'query': keywords,
                    'start_time': start_date,
                    'end_time': end_date,
                    'max_results': max_results,
                    'expansions': 'author_id,geo.place_id',
                    'tweet.fields': 'id,text,created_at,lang,public_metrics,source',
                    'user.fields': 'id,created_at,location,public_metrics',
                    'place.fields': 'full_name,country_code,geo,place_type',
                    'next_token': {}}

    # return tuple: [0] is search_url and [1] is the dict with query params
    return (search_url, query_params)



In [12]:
def connect_to_endpoint(url, headers, params, next_token):
    """
    Returns the api response in a json format and sets 'next_token' value
    """

    #params dict received from create_url function, set next_token value
    params['next_token'] = next_token

    # call api
    response = requests.request("GET", url, headers = headers, params = params)

    if response.status_code != 200:
        raise Exception(response.status_code, response.text)

    # return api response as json
    return response.json()

In [17]:
def call_api(headers,
             keywords,
             start_time,
             end_time,
             max_results,
             new_token=None):
    """
    Calls the twitter api and returns a dataframe containig all fetched information
    """
    # sleep 4 seconds to not exceed api call limit per 15 Minutes (300)
    time.sleep(4)

    # create url by calling above defined function; url is a tuple
    url = create_url(keywords, start_time, end_time, max_results)

    # call api with above defined function
    response = connect_to_endpoint(url[0], headers, url[1], new_token)

    #response has 3 keys:
    #"data": tweet information in a dict
    #"includes": dict with one key "users" which is a dict of user information
    #"meta": api request information


    # create first DataFrame about tweets out of data key
    tweet_df = pd.json_normalize(response["data"])

    # rename columns to not get identical names with user_df
    tweet_df.rename(columns={'id':'tweet_id',
                             'created_at': "tweet_created_at"},
                    inplace=True)
    print(tweet_df.iloc[1]["tweet_created_at"])
    # create second DataFrame about users out of response key
    user_df = pd.json_normalize(response["includes"]["users"])

    # rename columns to not get identical names with tweet_df
    user_df.rename(columns={'id':'author_id',
                            'created_at': "profile_created_at"},
                    inplace=True)

    # merge two dataframes into one
    merged_df = tweet_df.merge(user_df, how = "outer", on="author_id")


    collected_tweets = response["meta"]["result_count"]
    next_token = response["meta"].get("next_token", False)

    # return a df containing all information from the api call, the next_token for continuous search and number of collected tweets
    return merged_df, next_token, collected_tweets


In [18]:
def search_twitter(party, keywords, start_time,
             end_time, max_results, tweet_amount):
    """
    Searches twitter for a keywords searchstring until a maximum 'tweet_amount' has been reached or
    there are no more results and next_token is False.
    Returns a DataFrame containing all fetched information
    """

    # create a main DataFrame to which all the api call results for one party are concatenated
    one_party_df = pd.DataFrame()
    headers = get_credentials()

    # set collected tweet counter to zero
    counter = 0

    # call the api the first time resulting in a returned DataFrame, the next token and number of collected tweets
    single_api_call_df, next_token, collected_tweets = call_api(headers, keywords,
                                   start_time, end_time,
                                   max_results,
                                   new_token=None)

    # concat the DataFrames to save function call result
    one_party_df = pd.concat([one_party_df, single_api_call_df], ignore_index=True)

    # add number of fetched tweets to counter
    counter += collected_tweets

    # while there is a next token in api call result
    # call the api again with the next_token as additional parameter until the value becomes false
    while next_token:


        # call api with new next_token
        single_api_call_df, next_token, collected_tweets = call_api(headers, keywords,
                                       start_time, end_time,
                                       max_results,
                                       new_token=next_token)
        
        print(f"Currently at party {party} and in total {counter} tweets")
        # save result and add number of tweets
        one_party_df = pd.concat([one_party_df, single_api_call_df], ignore_index=True)
        counter += collected_tweets


        # break the loop if the predefined search limit is reached
        if counter > tweet_amount:
            print(f"Reached predefined search limit of {tweet_amount} tweets")
            break

    # after collecting all tweets create a new column containing the party's name
    one_party_df["party"] = party

    # return a DataFrame with all tweets for one time period
    return one_party_df

In [19]:
def get_data(start_time = False, end_time=False):
    """
    Returns a DataFrame containing all tweets for one day for 7 diffrent search keywords
    for each party defined in query_dict
    """

    # create a master DataFrame for all parties
    all_parties_df = pd.DataFrame()

    # set max results per api call
    max_results = 500

    if not (start_time and end_time):
        # convert to ISO 8601: YYYY-MM-DDTHH:mm:sssZ
        # this is UTC; we are not accounting for german time zone +02:00
        start_time = f"{get_date_n_days_ago(1).replace('_', '-')}T00:00:01.000Z"
        end_time = f"{get_date_n_days_ago().replace('_', '-')}T00:00:01.000Z"
    else:
        start_time = start_time
        end_time = end_time

    # for key, value in query_dict from twitter_api_params.py
    for party, keywords in query_dict.items():

        # set maximum tweet amount from keywords dict
        tweet_amount =  keywords[1]

        # collect the DataFrame for one party-keyword combination
        one_party_df = search_twitter(party, keywords[0], start_time,
                             end_time, max_results, tweet_amount)

        # concat to master DataFrame
        all_parties_df = pd.concat([all_parties_df, one_party_df], ignore_index=True)

    # save master DataFrame as a csv
    #all_parties_df.to_csv(
    #    f"temp_tweet_database_{str(current_time_yesterday)[5:7]}_{str(current_time_yesterday)[8:10]}.csv"
    #)


    return all_parties_df

In [30]:
raw_data = get_data("2021-08-31T12:03:00.000Z", "2021-08-31T12:05:00.000Z")

2021-08-31T12:04:50.000Z
2021-08-31T12:04:34.000Z
2021-08-31T12:04:46.000Z
2021-08-31T12:04:44.000Z
2021-08-31T12:04:39.000Z
2021-08-31T12:04:49.000Z
2021-08-31T12:03:40.000Z


In [34]:
raw_data.shape

(105, 20)

In [33]:
test = load_and_clean_csv(raw_data)

KeyError: "['sentiment'] not in index"

In [16]:
def load_and_clean_csv(df):
    '''
    Function loads DF data from the Twitter API+Sentiment and returns a cleaned DF
    '''
    # rename columns
    df = df.rename(columns={"tweet_created_at": "tweet_date",
                                "public_metrics.retweet_count": "retweet_count",
                                "public_metrics.reply_count": "reply_count",
                                "public_metrics.like_count": "like_count",
                                "profile_created_at": "profile_creation_date",
                                "public_metrics.followers_count": "followers_count",
                                "public_metrics.following_count": "following_count",
                                "public_metrics.tweet_count": "user_tweet_count"
                                })

    # Including only columns that we want to use in the future
    df = df[['party',
                'tweet_date',
                'author_id',
                'tweet_id',
                'text',
                'source',
                'retweet_count',
                'reply_count',
                'like_count',
                'profile_creation_date',
                'followers_count',
                'following_count',
                'user_tweet_count',
                'location',
                'sentiment'
                ]]

    # Clean dataset columns:
    # Change dtype
    df["tweet_date"] = df["tweet_date"].astype(str)
    df = df[df.tweet_date.str.match('(\d{4}-\d{2}-\d{2}.\d{2}:\d{2}:\d{2})')]
    #df = df[(df.tweet_date.str.len() == 23) | (df.tweet_date.str.len() == 24)]
    df['tweet_date'] = df['tweet_date'].str.slice(0,19)
    df["tweet_date"] = pd.to_datetime(df["tweet_date"])
    import ipdb
    ipdb.set_trace()
    df['profile_creation_date'] = df['profile_creation_date'].str.slice(0,19)
    df["profile_creation_date"] = pd.to_datetime(df["profile_creation_date"])
    # Drop duplicates
    df = df.drop_duplicates()
    # Transform sentiment to numeric type
    dict_to_numeric = {"negative": -2, "neutral": 1, "positive": 2}
    df["sentiment"].replace(dict_to_numeric, inplace=True)

    return df

In [36]:
import os
from dotenv import load_dotenv
from os.path import join, dirname


def get_credentials(headers):


    # create authorization dict for the api
    return headers

# all queries:
query_cdu = """(@cducsubt OR @CDU OR @ArminLaschet  OR #Laschet OR #ArminLaschet  OR #arminlaschet OR #laschet OR #cdu OR #CDU OR CDU/CSU OR Laschet)
lang:de -is:retweet
-#GRUENEN -@Die_Gruenen -Baerbock -@ABaerbock
-#SPD -@spdde -Scholz -@OlafScholz
-#AFD -@AfD -Weidel -@Alice_Weidel -Chrupalla -@Tino_Chrupalla
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_linke = """(@dieLinke OR @Janine_Wissler OR  @DietmarBartsch OR #DieLinke OR #DieLINKE OR #Linke OR #dielinke OR #Bartsch OR #Wissler OR Wissler OR DieLinke)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-@spdde -@OlafScholz -@spdbt -#SPD -#spd -#Spd -#Scholz -#OlafScholz -#SCHOLZ -#scholz -Scholz -SPD
-@AfD -@Alice_Weidel -@Tino_Chrupalla -#AFD -#AfD  -#afd  -#Weidel -#weidel -#Chrupalla -AFD -Weidel -Chrupalla
-@fdp -@fdpbt -@c_lindner -#FDP -#fdp -#Fdp -#Lindner -#lindner -#LINDNER -#ChristianLindner -Lindner -FDP
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_afd = """( @AfD OR @Alice_Weidel OR @Tino_Chrupalla OR #AFD OR #AfD OR #afd OR #AlternativefürDeutschland OR #Weidel OR #weidel OR #AliceWeidel OR #WEIDEL OR #Chrupalla OR #chrupalla OR #TinoChruppala OR AFD OR Weidel OR Chrupalla)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#arminlaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-@spdde -@OlafScholz -@spdbt -#SPD -#spd -#Spd -#Scholz -#OlafScholz -#SCHOLZ -#scholz -Scholz -SPD -Sozialdemokraten
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_fdp = """(@fdp OR @fdpbt OR @c_lindner OR #FDP OR #fdp OR #Fdp OR #Lindner OR #lindner OR #LINDNER OR #ChristianLindner OR Lindner OR FDP)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-@spdde -@OlafScholz -@spdbt -#SPD -#spd -#Spd -#Scholz -#OlafScholz -#SCHOLZ -#scholz -Scholz -SPD
-@AfD -@Alice_Weidel -@Tino_Chrupalla -#AFD -#AfD -#afd -#Weidel -#weidel -#Chrupalla -AFD -Weidel -Chrupalla
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

query_others = """(#FreieWaehler OR #FreieWähler OR @HubertAiwanger OR #FREIEWÄHLER OR @FREIEWAEHLER_BV #freiewähler2021 OR #diePARTEI OR @DiePARTEI OR @Tierschutzparte OR NPD OR @Piratenpartei OR #Piraten OR #dieBasis OR @diebasispartei OR #Volt OR @VoltDeutschland OR @oedp_de OR @bgepartei OR @TodenhoeferTeam OR #TeamTodenhoefer)
lang:de -is:retweet"""

query_spd = """( @spdde OR @OlafScholz OR @spdbt OR #SPD OR #spd OR #Spd OR #Scholz OR #OlafScholz OR #SCHOLZ OR #scholz OR Scholz OR SPD OR Sozialdemokraten)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#arminlaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-@Die_Gruenen -@ABaerbock -@GrueneBundestag -#Gruene -#Grünen -#Grüne -#GRUENEN -#AnnalenaBaerbock -#Baerbock -#baerbock -Baerbock -Grüne -Gruene
-#AFD -@AfD -Weidel -@Alice_Weidel -Chrupalla -@Tino_Chrupalla
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""


query_gruene = """( @Die_Gruenen OR @ABaerbock OR @GrueneBundestag OR #Gruene OR #Grünen OR #Grüne OR #GRUENEN OR #AnnalenaBaerbock OR #Baerbock OR #baerbock OR Baerbock OR Grüne OR Gruene)
lang:de -is:retweet
-@cducsubt -@CDU -@ArminLaschet -#Laschet -#ArminLaschet -#arminlaschet -#laschet -#cdu -#CDU -CDU/CSU -Laschet
-#SPD -@spdde -Scholz -@OlafScholz
-#AFD -@AfD -Weidel -@Alice_Weidel -Chrupalla -@Tino_Chrupalla
-#FDP -@fdp -Lindner -@c_lindner
-#DieLinke -@dieLinke -Wissler -@Janine_Wissler -Bartsch -@DietmarBartsch
-#FreieWaehler -@FREIEWAEHLER_BV
-#diePARTEI -@DiePARTEI
-@Tierschutzparte -NPD -@Piratenpartei -#Piraten -#dieBasis -@diebasispartei -#Volt -@VoltDeutschland"""

# create a query dict to iterate over with the party name, its search string and the maximum number of tweets
query_dict = {
    "SPD": (query_spd, 100000),
    "AFD": (query_afd, 50000),
    "CDU": (query_cdu, 100000),
    "FDP": (query_fdp, 50000),
    "GRUENE": (query_gruene, 100000),
    "LINKE": (query_linke, 40000),
    "OTHERS": (query_others, 40000)
}
