# Get Data

I collected the data from Twitter in 2021 and 2022.

These are queries for both the 9-Euro-Ticket and the Verkehrswende more generally. I downloaded the Verkehrswende data twice: the first file is from 2021, and the code below is the 2022 update.

In [None]:
!twarc2 searches --archive --start-time 2022-03-23 --end-time 2022-07-15 "../Verkehrswende-Vergleich/Code/Queries/9euroticket.txt" "../Verkehrswende-Vergleich/Data/9euroticket.csv"

In [None]:
!twarc2 search --start-time 2021-06-16 --end-time 2022-7-15 --archive "Verkehrswende OR Mobilitätswende -is:retweet" > "../Verkehrswende-Vergleich/Data/Verkehrswende_2022.jsonl"

In [None]:
# Convert the Verkehrswende JSONL download into a flat CSV file.
# NOTE(review): --extra-input-columns presumably tells the converter to accept the
# `author.withheld.scope` field, which it would otherwise not expect — confirm
# against the twarc-csv documentation.
! twarc2 csv --extra-input-columns "author.withheld.scope" "../Verkehrswende-Vergleich/Data/Verkehrswende_2022.jsonl" "../Verkehrswende-Vergleich/Data/Verkehrswende_2022.csv"

In [None]:
# Convert the 9-Euro-Ticket JSONL download into a flat CSV file.
# NOTE(review): --extra-input-columns presumably tells the converter to accept the
# `author.withheld.scope` field, which it would otherwise not expect — confirm
# against the twarc-csv documentation.
! twarc2 csv --extra-input-columns "author.withheld.scope" "../Verkehrswende-Vergleich/Data/9euroticket.jsonl" "../Verkehrswende-Vergleich/Data/9euroticket.csv"

# Data wrangling

In [None]:
def _extract_entity_field(tagstring, key):
    """Return the value stored under ``key`` for every entity in a serialized list.

    ``tagstring`` is the text form of an ``entities.*`` cell. It is parsed as
    JSON first (which understands ``true``/``false``/``null``) and, failing
    that, as a Python literal. The text is never executed.

    Returns ``None`` when the cell cannot be parsed (for example the ``'nan'``
    text of a missing value), is not a list, or an entry lacks ``key``.
    """
    import ast
    import json

    try:
        entities = json.loads(tagstring)
    except (TypeError, ValueError):
        try:
            entities = ast.literal_eval(tagstring)
        except (TypeError, ValueError, SyntaxError, MemoryError, RecursionError):
            return None

    if not isinstance(entities, (list, tuple)):
        return None
    try:
        return [entity[key] for entity in entities]
    except (KeyError, TypeError, IndexError):
        return None


def statistics_tweet(df):
    """Transform a standard twarc2 tweet dataframe in place for further analysis.

    Steps, in order:

    * ``foll_ratio`` (followers / following) and ``like_foll`` (likes /
      followers) are added; infinite results from a division by zero are
      stored as NaN.
    * Rows without a retweet, like or follower count are dropped.
    * The three ``entities.*`` columns are cast to ``str``; the counts are
      copied into integer columns ``retweets``, ``likes`` and ``followers``
      and the raw count columns plus the ``attachments.*`` columns are
      removed (attachment columns that do not exist are skipped).
    * ``mentions``, ``urls`` and ``hashtags`` receive the usernames, expanded
      URLs and tags parsed from the entity columns (``None`` where nothing
      could be parsed). ``hashtags`` is then cast to ``str``.
    * ``id`` is cast to ``str``, ``name`` is ``author.username`` plus the
      hashtag string, and a missing or empty ``type`` becomes ``'Tweet'``.
    * ``clean_urls``, ``clean_hashtags`` and ``clean_mentions`` hold the list
      text with brackets, commas and quotes removed, which leaves the items
      separated by spaces for the Gephi import.

    Args:
        df: dataframe with the twarc2 CSV columns used above; it is modified
            in place.

    Returns:
        None. A summary with the column count before and after is printed.
    """
    original_column_count = len(df.columns)

    # The caller's frame is modified on purpose, so the chained-assignment
    # warning is switched off (this stays off after the call, as before).
    # Guarded in case the installed pandas does not offer the option.
    try:
        pd.set_option('mode.chained_assignment', None)
    except (AttributeError, KeyError):
        pass

    followers_col = 'author.public_metrics.followers_count'
    following_col = 'author.public_metrics.following_count'
    likes_col = 'public_metrics.like_count'
    retweets_col = 'public_metrics.retweet_count'
    infinities = [np.inf, -np.inf]

    # A zero denominator yields +/-inf; store NaN instead of relying on the
    # global 'mode.use_inf_as_na' option.
    df['foll_ratio'] = (df[followers_col] / df[following_col]).replace(infinities, np.nan)
    df['like_foll'] = (df[likes_col] / df[followers_col]).replace(infinities, np.nan)

    # Drop rows lacking any of the three counts (infinite counts are treated
    # as missing). This also removes rows that consist of NaNs only.
    count_columns = [retweets_col, likes_col, followers_col]
    df[count_columns] = df[count_columns].replace(infinities, np.nan)
    df.dropna(subset=count_columns, inplace=True)

    # Create and make columns more human-readable
    for entity_column in ('entities.mentions', 'entities.urls', 'entities.hashtags'):
        df[entity_column] = df[entity_column].astype(str)
    df['retweets'] = df[retweets_col].astype(int)
    df['likes'] = df[likes_col].astype(int)
    df['followers'] = df[followers_col].astype(int)
    df.drop(
        columns=[
            retweets_col,
            likes_col,
            followers_col,
            'attachments.media_keys',
            'attachments.poll.end_datetime',
            'attachments.poll.id',
            'attachments.poll.options',
            'attachments.poll.voting_status',
        ],
        inplace=True,
        errors='ignore',  # not every export contains the attachment columns
    )

    # Collect lists of mentions, urls and hashtags. The original casing is
    # kept because later steps match on it.
    df['mentions'] = df['entities.mentions'].apply(_extract_entity_field, args=('username',))
    df['urls'] = df['entities.urls'].apply(_extract_entity_field, args=('expanded_url',))
    df['hashtags'] = df['entities.hashtags'].apply(_extract_entity_field, args=('tag',))

    # Prepare data for visualization
    df['id'] = df['id'].astype(str)
    df['hashtags'] = df['hashtags'].astype(str)
    df['name'] = df['author.username'] + ' ' + df['hashtags']
    df['type'] = df['type'].fillna('Tweet')
    df['type'] = df['type'].replace(to_replace=[''], value='Tweet')

    # Prepare data for the Gephi export: strip the list markup so that only
    # the space-separated items remain.
    list_markup = r"[\[\],']"
    for source in ('urls', 'hashtags', 'mentions'):
        df['clean_' + source] = df[source].astype(str).str.replace(list_markup, '', regex=True)

    print('Dataframe with basic statistics transformed. Hashtags, mentions and urls parsed.\nOrinal count of columns: {}, new count of colums: {}.'.format(original_column_count, len(df.columns)))


In [None]:
# Transform the tweet dataframe in place and print the column summary.
# NOTE(review): assumes `df` already holds the twarc2 CSV export — the cell that
# loads it is not part of this section.
statistics_tweet(df)

In [None]:
# Inspect the hashtag strings after the list markup has been stripped.
df['clean_hashtags']

The dataframe is written to a CSV file with pandas' `.to_csv` function so that it is saved and can be analysed further.

In [None]:
# Persist the transformed 2022 dataframe.
# NOTE(review): the index is written too (no index=False), so it comes back as an
# "Unnamed: 0" column when the file is read again.
df.to_csv("../Verkehrswende-Vergleich/Data/Verkehrswende_2022_transformed.csv")

## Merge datasets

Nun gilt es, die bisherigen Daten mit den neuen Daten zusammenzubringen – dafür müssen sie passend gemacht werden. Erste Frage: Was ist der richtige Datensatz?

In [None]:
# Load the previously transformed dataset (presumably the 2021 download).
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df = pd.read_csv("../Verkehrswende-Vergleich/Data/Verkehrswende/Verkehrswende_transformed.csv", low_memory=False)

In [None]:
# NOTE(review): get_tweet_frequency and data_path are not defined in this
# section; the result is used below as a dataframe with a `tweet_count` column.
frequency = get_tweet_frequency(data_path, "Verkehrswende")
# Show the row with the highest tweet_count.
frequency.loc[frequency['tweet_count'].idxmax()]

In [None]:
# Load the transformed 2022 dataset into a second frame (df_) for comparison.
df_ = pd.read_csv("../Verkehrswende-Vergleich/Data/Verkehrswende_2022_transformed.csv", low_memory=False)

In [None]:
# Columns that exist in df but not in df_.
print(df.columns.difference(df_.columns))

In [None]:
# Columns that exist in df_ but not in df.
print(df_.columns.difference(df.columns))

In [None]:
# Remove three columns from df (presumably the ones the column comparison
# reported as missing from df_) so that both frames share one schema.
df = df.drop(
    columns=[
        'author.withheld.copyright',
        'Unnamed: 0.1',
        'in_reply_to_user.withheld.scope',
    ]
)

Der Datensatz passt. Nun gilt es, die Datensätze mit `pd.concat` zu verbinden.

In [None]:
# Show the created_at values of df_; a negative n makes head() return
# everything except the last 20 rows.
df_['created_at'].head(-20)

In [None]:
# Stack the older and the 2022 dataset row-wise; join="inner" keeps only the
# columns that both frames have in common.
# NOTE(review): the original index labels of both frames are kept, so labels can
# repeat in df_concat; ignore_index=True would renumber them.
df_concat = pd.concat([df, df_], join="inner")

In [None]:
# Reorder the columns of df by their sorted labels (rows are untouched).
df = df.sort_index(axis="columns")

In [None]:
# Reorder the columns of df_ by their sorted labels (rows are untouched).
df_ = df_.sort_index(axis="columns")

In [None]:
# Note the order, otherwise it's not working.
# From here on `df` refers to the combined frame. No copy is made: `df` and
# `df_concat` name the same object until one of them is rebound.
df = df_concat

In [None]:
# Show the created_at values of df_ except for the last 5 rows.
df_['created_at'].head(-5)

In [None]:
# Parse created_at into timezone-aware UTC datetimes. The column assignment
# changes the frame object itself, so any other name bound to the same object
# sees the parsed column as well.
df_concat['created_at'] = pd.to_datetime(df_concat['created_at'], utc=True)
# sort_values returns a new frame; only the name df_concat is rebound to it.
df_concat = df_concat.sort_values(by=['created_at'])

In [None]:
# Summary statistics for df_ (the 2022 dataset).
df_.describe()

In [None]:
# Bring the combined frame into created_at order, then display the timestamps
# without the final five rows.
df = df.sort_values('created_at')
df['created_at'].iloc[:-5]

In [None]:
# Persist the combined dataset.
# NOTE(review): the index is written too (no index=False), so it comes back as an
# "Unnamed: 0" column when the file is read again.
df.to_csv("../Verkehrswende-Vergleich/Data/Verkehrswende/Verkehrswende_combined.csv")

## Hashtag harmonization

In [19]:
# Harmonize the hashtag spellings in `clean_hashtags`.
#
# The column is lower-cased first and every rule below is written in lower case.
# The result was always lower-cased at the end, so rules that only changed
# capitalisation had no effect and are left out; matching after lower-casing
# additionally catches spellings such as "OEPNV" or "elektromobilität" that the
# case-sensitive rules missed. The vectorised `.str` methods pass missing values
# through instead of failing on them.
hashtag_rewrites = [
    # Verkehrswende
    (r"verkehrs-wende", "verkehrswende"),
    # Mobilitätswende
    (r"mobilitaetswende", "mobilitätswende"),
    (r"mobilitäts-wende", "mobilitätswende"),
    # Energiewende
    (r"energie-wende", "energiewende"),
    # ÖPNV
    (r"oepnv", "öpnv"),
    # E-Mobilität
    (r"elektromobilität", "e-mobilität"),
    (r"elektromobilitaet", "e-mobilität"),
    (r"emobilität", "e-mobilität"),
    (r"emobility", "e-mobilität"),
    # Singular/plural variants: only whole hashtags are rewritten (\b), so an
    # already plural "radwege"/"grenzwerte" is not turned into "radwegee"/
    # "grenzwertee" and longer words containing "autos" stay intact.
    (r"\bradweg\b", "radwege"),
    (r"\bgrenzwert\b", "grenzwerte"),
    (r"\bautos\b", "auto"),
]

harmonized_hashtags = df['clean_hashtags'].str.lower()
for spelling, canonical in hashtag_rewrites:
    harmonized_hashtags = harmonized_hashtags.str.replace(spelling, canonical, regex=True)
df['clean_hashtags'] = harmonized_hashtags

In [20]:
# Show the first rows of the harmonized hashtag column.
df['clean_hashtags'].head()

0                               none
1                               none
2                           eurobike
3    auto fahrradfahrer sterben bahn
4                    stuttgart21 s21
Name: clean_hashtags, dtype: object

In [None]:
# Persist the combined dataset with harmonized hashtags.
# NOTE(review): the index is written too (no index=False), so it comes back as an
# "Unnamed: 0" column when the file is read again.
df.to_csv("../Verkehrswende-Vergleich/Data/Verkehrswende/Verkehrswende_combined_hashtagsync.csv")