In [1]:
import pandas as pd
import numpy as np

In [2]:
pd.set_option("display.max_columns", None)

# Load and Clean CSV with API Data

## Function

In [3]:
def load_and_clean_csv(party, csv_path):
    '''
    Function loads CSV data from the Twitter API+Sentiment and returns a cleaned DF
    '''
    # Load CSV Dataset via Path
    df = pd.read_csv(csv_path, lineterminator='\n')
    
    # Create "Party" Column and rename other columns
    df['party'] = party
    df = df.rename(columns={"tweet_created_at": "tweet_date", 
                             "public_metrics.retweet_count": "retweet_count",
                             "public_metrics.reply_count": "reply_count",
                             "public_metrics.like_count": "like_count",
                             "profile_created_at": "profile_creation_date",
                             "public_metrics.followers_count": "followers_count",
                             "public_metrics.following_count": "following_count",
                             "public_metrics.tweet_count": "user_tweet_count"
                            })
    
    # Including only columns that we want to use in the future
    df = df[['party', 
             'tweet_date',
             'author_id',
             'tweet_id',
             'text',
             'source',
             'retweet_count',
             'reply_count',
             'like_count',
             'profile_creation_date',
             'followers_count',
             'following_count',
             'user_tweet_count',
             'location',
             'sentiment'
            ]]
    
    # Clean dataset columns:
       # Change dtype
    df['tweet_date'] = df['tweet_date'].str.slice(0,19)
    df["tweet_date"] = pd.to_datetime(df["tweet_date"])
    df['profile_creation_date'] = df['profile_creation_date'].str.slice(0,19)
    df["profile_creation_date"] = pd.to_datetime(df["profile_creation_date"])
       # Drop duplicates
    df = df.drop_duplicates()
       # Transform sentiment to numeric type
    dict_to_numeric = {"negative": -2, "neutral": 1, "positive": 2}
    df["sentiment"].replace(dict_to_numeric, inplace=True)

    return df

## Load CSV Data

In [4]:
df_cdu = load_and_clean_csv("CDU", "/Users/finnzurmuehlen/Downloads/sentiments_sample_luca.csv")
df_cdu.head()

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment
0,CDU,2021-08-23 23:59:54,1106890880,1429956596575227906,"Heute beim Versuch von SPD Wahl zu überzeugen,...",Twitter for iPad,0,0,0,2013-01-20 18:04:25,815,4999,45639,,-2
1,CDU,2021-08-23 23:59:39,40453076,1429956534465818641,"Wenn ein Laschet ein Mann wäre, würde man so e...",Twitter for Android,0,0,1,2009-05-16 12:10:39,398,1013,6894,"Kiel, Schleswig-Holstein",-2
2,CDU,2021-08-23 23:59:33,379140899,1429956507039371268,Tritt @ArminLaschet jetzt der @AfDimBundestag ...,Twitter for Android,0,0,0,2011-09-24 12:53:33,640,5000,61841,BRD,1
3,CDU,2021-08-23 23:59:32,1321047775681851392,1429956503377690630,@MissBJArmstrong @DPflugk Deutschland steht ku...,Twitter for iPhone,4,1,10,2020-10-27 11:15:33,1204,4482,34815,"Quito, Ecuador",1
4,CDU,2021-08-23 23:59:28,1176821601179971585,1429956487800102914,@ECMOKaragianni1 Gibt es einen Grund warum Pol...,Twitter for Android,0,1,7,2019-09-25 11:31:55,459,745,5560,,1


In [5]:
df_linke = load_and_clean_csv("LINKE", '/Users/finnzurmuehlen/Downloads/sample_api_linke_with_sentiment.csv')
df_linke.head(3)

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment
0,LINKE,2021-08-23 23:59:03,1026773320400818183,1429956383043166208,.@dielinke hätte es gerne gesehen wenn die Men...,Twitter for iPhone,0,1,1,2018-08-07 10:13:30,908,2490,52292,,1
1,LINKE,2021-08-23 23:57:20,1026773320400818183,1429955948328767492,Die Grundrechte sind ein Schutz vor der Diktat...,Twitter for iPhone,4,1,31,2018-08-07 10:13:30,908,2490,52292,,1
2,LINKE,2021-08-23 23:56:49,4078952415,1429955818347184128,@chicksonpolitix @dieLinke Nach den Grünen? Mi...,Twitter Web App,0,1,0,2015-10-31 00:55:02,5,156,179,Dresden,1


# Concat. DataFrames

## Concat. Funtion

In [6]:
def concat_dfs(list_of_dfs):
    '''
    Function concatenates multiple dataframes into one DF
    '''
    df_all = pd.concat(list_of_dfs)
    df_all = df_all.reset_index(drop=True)
    return df_all

## Concat DFs

In [7]:
list_of_dfs = [df_cdu, df_linke]
df_all = concat_dfs(list_of_dfs)

# Feature Engineering

## Engineering Function (Non-Sentiment Features)

In [8]:
def create_non_sentiment_features(df):
    
    # Create: "Len per tweet of each party"
    df["avg_len_of_tweet"] = df["text"].str.len()
    
    #Rename Columns
    df = df.rename(columns={"followers_count": "avg_followers_count",
                       "following_count": "avg_following_count", 
                       "user_tweet_count": "avg_user_tweet_count"
                      })
    
    df_temp = df.groupby([pd.Grouper(key='tweet_date',freq='D'), "party"]).agg({
    "reply_count": "sum", 
    "retweet_count": "sum",
    "like_count": "sum",
    "avg_len_of_tweet": "mean",
    "avg_followers_count": "mean",
    "avg_following_count": "mean",
    "avg_user_tweet_count": "mean"
    }) 
    #Create: Followers Ratio
    df_temp["avg_ff_ratio"] = df_temp["avg_followers_count"] / df_temp["avg_following_count"]
    
    # Create: share of tweets that a party has in comparison to all tweets on a given day 
    df_temp_2 = df.groupby([pd.Grouper(key='tweet_date',freq='D'), "party"]).agg({
    "text": "count"}).groupby(level=0).apply(lambda x: x/x.sum())
    
    # Create: Share of tweets that come from a unique user for each party on a given day
    df_temp_3 = df.groupby([pd.Grouper(key='tweet_date',freq='D'), "party"]).agg({
    "author_id": "nunique",
    "text": "count"})
    df_temp_3["share_unique_users"] = df_temp_3["author_id"] / df_temp_3["text"]
    df_temp_3 = df_temp_3["share_unique_users"]
    
     # Join the different temporary DFs into a final DataFrame
    df_final = df_temp.join(df_temp_2).join(df_temp_3)
    df_final = df_final.rename(columns={'text': "share_of_tweets"})
    
    return df_final
    

In [9]:
df_non_sentiment = create_non_sentiment_features(df_all)
df_non_sentiment.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reply_count,retweet_count,like_count,avg_len_of_tweet,avg_followers_count,avg_following_count,avg_user_tweet_count,avg_ff_ratio,share_of_tweets,share_unique_users
tweet_date,party,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-08-14,CDU,232,445,4673,174.213115,728.45082,707.69877,15750.348361,1.029323,0.497452,0.803279
2021-08-14,LINKE,331,427,2953,171.663286,2152.957404,722.817444,20038.324544,2.978563,0.502548,0.732252
2021-08-15,CDU,232,1924,17818,175.445565,3182.612903,622.550403,23234.497984,5.112217,0.5,0.693548
2021-08-15,LINKE,284,236,1532,187.139113,763.358871,660.407258,16896.21371,1.155891,0.5,0.616935
2021-08-16,CDU,306,571,5417,186.070994,762.279919,870.62069,26863.643002,0.875559,0.49798,0.675456


## Engineering Function (Sentiment Features)

In [10]:
import numpy as np

In [11]:
def create_sentiment_features(df):
    '''
    Generates the following features: "Weighted Sentiment", "Share of positive tweets", "Share of negative tweets".
    '''
    df = df[["tweet_date","party","retweet_count", "like_count", "sentiment"]]
    # Generate "Weighted Sentiment"
    df["like_count"] = df["like_count"]+10
    df["retweet_count"] = df["retweet_count"]+10
    df["weighted_sentiment"] = np.log10(df["like_count"]) * np.log10(df["retweet_count"]) * df["sentiment"]
    
    # Generate "Share of positive tweets"
    df["share_of_positive_tweets"] = df["sentiment"]    
    dict_only_positive = {-2: 0, 1: 0, 2: 1}
    df["share_of_positive_tweets"].replace(dict_only_positive, inplace=True)
    
    # Generate "Share of negative tweets"
    df["share_of_negative_tweets"] = df["sentiment"]
    dict_only_negative = {-2: 1, 1: 0, 2: 0}
    df["share_of_negative_tweets"].replace(dict_only_negative, inplace=True)
    
    df["share_of_positive_tweets2"] = df["share_of_positive_tweets"]
    df["share_of_negative_tweets2"] = df["share_of_negative_tweets"]
    df = df.groupby([pd.Grouper(key='tweet_date',freq='D'), "party"]).agg({
        "weighted_sentiment": "mean",
        "share_of_positive_tweets": "sum",
        "share_of_positive_tweets2": "count",
        "share_of_negative_tweets": "sum",
        "share_of_negative_tweets2": "count",})
    df["share_of_positive_tweets"] = df["share_of_positive_tweets"] / df["share_of_positive_tweets2"]
    df["share_of_negative_tweets"] = df["share_of_negative_tweets"] / df["share_of_negative_tweets2"]
    df = df.drop(columns=["share_of_positive_tweets2", "share_of_negative_tweets2"])
    
    return df

In [12]:
df_sentiment = create_sentiment_features(df_all)
df_sentiment.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["like_count"] = df["like_count"]+10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["retweet_count"] = df["retweet_count"]+10
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["weighted_sentiment"] = np.log10(df["like_count"]) * np.log10(df["retweet_count"]) * df["sentiment"]
A value is trying

Unnamed: 0_level_0,Unnamed: 1_level_0,weighted_sentiment,share_of_positive_tweets,share_of_negative_tweets
tweet_date,party,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2021-08-14,CDU,-0.354261,0.053279,0.452869
2021-08-14,LINKE,0.047853,0.058824,0.35497
2021-08-15,CDU,-0.443922,0.02621,0.451613
2021-08-15,LINKE,0.025391,0.048387,0.350806
2021-08-16,CDU,-0.564539,0.032454,0.488844


## Join Function (Combine Sentiment & Non-Sentiment Features)

In [13]:
def join_features(df1, df2):
    df_joined = df1.join(df2)
    return df_joined

In [14]:
df_joined = join_features(df_non_sentiment, df_sentiment)
df_joined.tail()

Unnamed: 0_level_0,Unnamed: 1_level_0,reply_count,retweet_count,like_count,avg_len_of_tweet,avg_followers_count,avg_following_count,avg_user_tweet_count,avg_ff_ratio,share_of_tweets,share_unique_users,weighted_sentiment,share_of_positive_tweets,share_of_negative_tweets
tweet_date,party,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-08-21,LINKE,363,362,2167,184.799599,1453.296593,616.182365,34740.12024,2.358549,0.499499,0.579158,0.042635,0.07014,0.360721
2021-08-22,CDU,707,786,9656,165.232,1813.296,972.902,21205.772,1.863801,0.501002,0.71,-0.364977,0.054,0.462
2021-08-22,LINKE,310,296,1510,189.335341,1232.351406,814.65261,19690.032129,1.512732,0.498998,0.616466,-0.003416,0.060241,0.349398
2021-08-23,CDU,202,201,1747,182.788,2040.852,737.678,20625.63,2.766589,0.500501,0.712,-0.215111,0.024,0.404
2021-08-23,LINKE,310,372,2567,181.348697,1384.254509,625.87976,18985.222445,2.211694,0.499499,0.725451,-0.124307,0.056112,0.398798


# Final DataFrame

## Load Poll API Data

In [15]:
df_poll = pd.read_csv('/Users/finnzurmuehlen/Downloads/polls_data_updated.csv')[["Date", "CDU/CSU", "Linke"]]
new_row = {'Date':'2021-08-23', 'CDU/CSU': 22.00, 'Linke': 7.00}
df_poll = df_poll.append(new_row, ignore_index=True)
df_poll

Unnamed: 0,Date,CDU/CSU,Linke
0,2021-05-26,25.50,7.88
1,2021-05-27,27.00,8.50
2,2021-05-28,26.75,7.75
3,2021-05-29,25.00,7.00
4,2021-05-30,25.25,6.75
...,...,...,...
85,2021-08-19,24.17,7.17
86,2021-08-20,35.00,10.00
87,2021-08-21,28.50,8.50
88,2021-08-22,22.00,7.00


## Function: Create Final DF for RNN

In [23]:
def create_rnn_final_df(df_poll ,df_joined):
    '''
    Joines (how=outer) engineered features DF and poll DF for the German parties
    '''
    # Rename df_poll columns and change dtype to datetime
    df_poll = df_poll.rename(columns = {"Date": "tweet_date", "Linke": "LINKE","CDU/CSU":"CDU"})
    df_poll["tweet_date"] = pd.to_datetime(df_poll["tweet_date"])
    df_poll = df_poll.set_index("tweet_date")
    
    # Unstack the indexes in order to join on the tweet date and parties
    df_poll = pd.DataFrame(df_poll.T.unstack(level = 0))
    df_poll.index = df_poll.index.set_names(['tweet_date', 'party'])
    
    # Join both DFs together
    df_final = df_poll.join(df_joined, how = "outer")
    
    # Rename new column as "poll"
    df_final = df_final.rename(columns = {0: "poll"})
    
    return df_final

## Create Final Dataframe

In [24]:
df_final = create_rnn_final_df(df_poll, df_joined)
df_final

Unnamed: 0_level_0,Unnamed: 1_level_0,poll,reply_count,retweet_count,like_count,avg_len_of_tweet,avg_followers_count,avg_following_count,avg_user_tweet_count,avg_ff_ratio,share_of_tweets,share_unique_users,weighted_sentiment,share_of_positive_tweets,share_of_negative_tweets
tweet_date,party,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2021-05-26,CDU,25.50,,,,,,,,,,,,,
2021-05-26,LINKE,7.88,,,,,,,,,,,,,
2021-05-27,CDU,27.00,,,,,,,,,,,,,
2021-05-27,LINKE,8.50,,,,,,,,,,,,,
2021-05-28,CDU,26.75,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-21,LINKE,8.50,363.0,362.0,2167.0,184.799599,1453.296593,616.182365,34740.120240,2.358549,0.499499,0.579158,0.042635,0.070140,0.360721
2021-08-22,CDU,22.00,707.0,786.0,9656.0,165.232000,1813.296000,972.902000,21205.772000,1.863801,0.501002,0.710000,-0.364977,0.054000,0.462000
2021-08-22,LINKE,7.00,310.0,296.0,1510.0,189.335341,1232.351406,814.652610,19690.032129,1.512732,0.498998,0.616466,-0.003416,0.060241,0.349398
2021-08-23,CDU,22.00,202.0,201.0,1747.0,182.788000,2040.852000,737.678000,20625.630000,2.766589,0.500501,0.712000,-0.215111,0.024000,0.404000
