In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw tweet database. The line terminator is pinned to "\n" — presumably
# so carriage returns embedded in tweet text do not split rows (TODO confirm
# against the raw file).
df = pd.read_csv('../../raw_data/final_database.csv', lineterminator= "\n" )


In [3]:
# Parse tweet_date into datetimes: the feature functions below bucket tweets per
# calendar day with pd.Grouper(key='tweet_date', freq='D').
df["tweet_date"] = pd.to_datetime(df["tweet_date"])


In [4]:
def create_non_sentiment_features(df):
    '''
    Aggregates the non-sentiment tweet features per calendar day and party.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per tweet. Columns read: "tweet_date" (datetime-like, needed by
        the daily grouper), "party", "text", "author_id", "reply_count",
        "retweet_count", "like_count", "followers_count", "following_count"
        and "user_tweet_count". The passed frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Indexed by (tweet_date, party) with the columns, in this order:
        reply_count, retweet_count, like_count (daily sums),
        avg_len_of_tweet, avg_followers_count, avg_following_count,
        avg_user_tweet_count (daily means),
        avg_ff_ratio (mean followers / mean following; inf or NaN when the
        mean following count is 0),
        share_of_tweets (party tweets / all tweets of that day) and
        share_unique_users (distinct authors / tweets of that party and day).
    '''
    numeric_columns = [
        "reply_count",
        "retweet_count",
        "like_count",
        "avg_len_of_tweet",
        "avg_followers_count",
        "avg_following_count",
        "avg_user_tweet_count",
    ]

    # Build a new frame instead of writing the helper column into the caller's
    # DataFrame. Missing values become 0 in every column before casting.
    tweets = (
        df.assign(avg_len_of_tweet=df["text"].str.len())
        .rename(columns={"followers_count": "avg_followers_count",
                         "following_count": "avg_following_count",
                         "user_tweet_count": "avg_user_tweet_count"})
        .fillna(0)
        .astype({column: float for column in numeric_columns})
    )

    # One grouper serves every aggregate, so all results share the same index.
    per_day_and_party = tweets.groupby([pd.Grouper(key='tweet_date', freq='D'), 'party'])

    df_final = per_day_and_party.agg({
        "reply_count": "sum",
        "retweet_count": "sum",
        "like_count": "sum",
        "avg_len_of_tweet": "mean",
        "avg_followers_count": "mean",
        "avg_following_count": "mean",
        "avg_user_tweet_count": "mean",
    })

    # Followers ratio
    df_final["avg_ff_ratio"] = df_final["avg_followers_count"] / df_final["avg_following_count"]

    # Share of tweets a party has compared to all tweets on a given day.
    # transform("sum") keeps the (tweet_date, party) index untouched; a
    # groupby(...).apply(...) here gets the group key prepended to the index on
    # pandas >= 2.0, which breaks the alignment with the other features.
    tweet_counts = per_day_and_party["text"].count()
    daily_totals = tweet_counts.groupby(level=0).transform("sum")
    df_final["share_of_tweets"] = tweet_counts / daily_totals

    # Share of tweets that come from a unique user for each party on a given day
    df_final["share_unique_users"] = per_day_and_party["author_id"].nunique() / tweet_counts

    return df_final


def create_sentiment_features(df):
    '''
    Generates the following features per calendar day and party:
    "weighted_sentiment", "share_of_positive_tweets", "share_of_negative_tweets".

    Parameters
    ----------
    df : pandas.DataFrame
        One row per tweet with the columns "tweet_date" (datetime-like),
        "party", "retweet_count", "like_count" and "sentiment". The passed
        frame is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Indexed by (tweet_date, party) with the daily means of
        - weighted_sentiment: log10(likes + 10) * log10(retweets + 10) * sentiment,
        - share_of_positive_tweets: fraction of tweets whose sentiment equals 2,
        - share_of_negative_tweets: fraction of tweets whose sentiment equals -2.
    '''
    # Missing values become 0: such a tweet has zero likes/retweets and a
    # sentiment that counts as neither positive nor negative.
    tweets = (
        df[["tweet_date", "party", "retweet_count", "like_count", "sentiment"]]
        .fillna(0)
        .astype({"retweet_count": float, "like_count": float, "sentiment": float})
    )

    # The +10 offset makes each log10 factor at least 1, so a tweet without any
    # engagement still contributes its plain sentiment value.
    engagement_weight = np.log10(tweets["like_count"] + 10) * np.log10(tweets["retweet_count"] + 10)

    # Explicit 0/1 indicators instead of a chained replace(..., inplace=True):
    # the chained form does not write back to the frame under pandas
    # Copy-on-Write, and it let any label outside {-2, 1, 2} leak into the sums.
    per_tweet = pd.DataFrame({
        "tweet_date": tweets["tweet_date"],
        "party": tweets["party"],
        "weighted_sentiment": engagement_weight * tweets["sentiment"],
        "share_of_positive_tweets": (tweets["sentiment"] == 2).astype(float),
        "share_of_negative_tweets": (tweets["sentiment"] == -2).astype(float),
    })

    # The mean of a 0/1 indicator is the share of tweets carrying that label.
    return per_tweet.groupby([pd.Grouper(key='tweet_date', freq='D'), "party"]).mean()


def join_features(df1, df2):
    '''
    Combines two feature tables by joining df2 onto df1 with DataFrame.join
    defaults (index-aligned, all rows of df1 kept) and returns the new frame.
    '''
    return df1.join(df2)


def load_poll_df():
    '''
    Returns the table produced by clean_data(), reduced to the date column and
    the party columns listed below.

    NOTE(review): clean_data() is neither defined nor imported in the visible
    part of this notebook - confirm where it comes from before calling this.
    '''
    poll_columns = ["Date", "CDU/CSU", 'SPD', 'Grüne', 'FDP', "Linke", 'AfD', 'other']
    return clean_data()[poll_columns]


def create_rnn_final_df(df_poll, df_joined):
    '''
    Joins (how=outer) the engineered features DF and the poll DF for the German parties.

    df_poll is expected in wide form: a "Date" column plus one column per party.
    df_joined is joined on the (tweet_date, party) index built here. The result
    carries the poll value in a column named "poll" followed by the columns of
    df_joined. Prints "Success" before returning.
    '''
    # Poll column labels -> labels used for the "party" index level
    party_labels = {
        "CDU/CSU": "CDU",
        "Grüne": "GRUENE",
        "Linke": "LINKE",
        "AfD": "AFD",
        "other": "OTHER",
    }

    # Wide poll table keyed by a datetime index named "tweet_date"
    polls_wide = (
        df_poll.rename(columns={"Date": "tweet_date", **party_labels})
        .assign(tweet_date=lambda frame: pd.to_datetime(frame["tweet_date"]))
        .set_index("tweet_date")
    )

    # Reshape to one row per (date, party); the single value column is labelled 0
    polls_long = polls_wide.T.unstack(level=0).to_frame()
    polls_long.index = polls_long.index.set_names(['tweet_date', 'party'])

    # Outer join keeps keys that exist in only one of the two tables
    merged = polls_long.join(df_joined, how="outer").rename(columns={0: "poll"})

    print("Success")
    return merged

In [5]:
# Build the two daily per-party feature tables from the tweet frame and combine
# them. Both functions group by (tweet_date, party), so the index-based join in
# join_features lines the rows up on that key.
df_non_sentiment = create_non_sentiment_features(df)
df_sentiment = create_sentiment_features(df)
df_joined = join_features(df_non_sentiment, df_sentiment)

In [59]:
# Turn the (tweet_date, party) index levels back into ordinary columns, then
# relabel every cell equal to "OTHERS" as "OTHER" so both spellings share one label.
df_joined = df_joined.reset_index()
df_joined = df_joined.replace("OTHERS", "OTHER")

In [72]:
# NOTE(review): the output captured for this cell has execution count 72, higher
# than the aggregation cell that follows (62), so it appears to show the frame
# after that re-aggregation rather than the state left by the previous cell.
# Confirm with Restart Kernel -> Run All.
df_joined

Unnamed: 0_level_0,Unnamed: 1_level_0,reply_count,retweet_count,like_count,avg_len_of_tweet,avg_followers_count,avg_following_count,avg_user_tweet_count,avg_ff_ratio,share_of_tweets,share_unique_users,weighted_sentiment,share_of_positive_tweets,share_of_negative_tweets
tweet_date,party,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2021-07-23,AFD,5414.0,7430.0,50589.0,183.483988,9897.783065,844.707798,30258.523792,11.717405,0.241723,0.599421,-0.822698,0.028768,0.587842
2021-07-23,CDU,5430.0,8987.0,53965.0,191.227455,5095.104329,921.350581,26716.583105,5.530039,0.207085,0.673073,-0.466696,0.035058,0.480465
2021-07-23,FDP,2551.0,1854.0,15224.0,171.541570,4400.461124,791.273287,26521.715935,5.561241,0.113623,0.675905,-0.385271,0.061971,0.476905
2021-07-23,GRUENE,6288.0,7621.0,60041.0,182.306444,3682.779195,773.788140,24578.680470,4.759415,0.264028,0.619016,-0.267626,0.060460,0.439125
2021-07-23,LINKE,401.0,268.0,1625.0,181.312292,1976.892027,746.588040,36722.935216,2.647902,0.026328,0.616279,-0.050804,0.061462,0.373754
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-08-31,FDP,4082.0,2626.0,30829.0,148.846072,2399.575094,725.615001,20007.450027,3.306954,0.112983,0.721005,-0.545316,0.051131,0.523784
2021-08-31,GRUENE,4542.0,4733.0,38891.0,180.245516,5248.775876,772.032746,23105.896824,6.798644,0.122323,0.688827,-0.115527,0.063847,0.393780
2021-08-31,LINKE,2134.0,1411.0,10948.0,176.158121,2309.812119,883.784511,27386.891000,2.613547,0.056139,0.547508,-0.117520,0.057727,0.389745
2021-08-31,OTHER,1183.0,1580.0,9130.0,164.704657,1844.708946,689.155025,16861.637255,2.676769,0.032850,0.670343,0.000828,0.088848,0.371936


In [62]:
# Collapse rows that share a (day, party) key after "OTHERS" was relabelled as
# "OTHER". Keys that occur only once pass through unchanged.
# NOTE(review): the "mean" entries are unweighted means of already-aggregated
# daily values, so they are only approximate where two rows are merged; an exact
# result needs the label fix applied to the raw tweets before feature creation.
df_joined = df_joined.groupby([pd.Grouper(key='tweet_date', freq='D'), "party"]).agg({
    "reply_count": "sum",
    "retweet_count": "sum",
    "like_count": "sum",
    "avg_len_of_tweet": "mean",
    "avg_followers_count": "mean",
    "avg_following_count": "mean",
    "avg_user_tweet_count": "mean",
    "avg_ff_ratio": "mean",
    # Shares of disjoint tweet sets add up: (a + b) / total == a / total + b / total.
    # Averaging them would make the daily shares no longer sum to 1.
    "share_of_tweets": "sum",
    "share_unique_users": "mean",
    "weighted_sentiment": "mean",
    "share_of_positive_tweets": "mean",
    "share_of_negative_tweets": "mean",
})

In [86]:
# Load the poll table. create_rnn_final_df renames its "Date" column to
# "tweet_date" and maps the party columns onto the party labels of the tweet features.
df_poll = pd.read_csv('polls_data_2021_v7.csv')

In [87]:
# Outer-join poll values and tweet features on (tweet_date, party); the function
# prints "Success" before it returns.
df_final = create_rnn_final_df(df_poll, df_joined)


Success


In [90]:
# Persist the combined table next to the notebook; with to_csv defaults the
# (tweet_date, party) index is written as the leading columns.
df_final.to_csv("rnn_model_df_31.csv")