In [1]:
import pandas as pd
import numpy as np

In [2]:
# Remove the column limit so wide DataFrames are displayed with every column.
pd.set_option("display.max_columns", None)

# Load and Clean CSV with API Data

## Function

In [3]:
def load_and_clean_csv(party, csv_path):
    '''
    Load one CSV export (Twitter API fields plus a sentiment label) and return a cleaned DF.

    Parameters
    ----------
    party : str
        Value written into the "party" column of every row.
    csv_path : str or path-like
        Location of the CSV file. Records are expected to end with a line feed.

    Returns
    -------
    pandas.DataFrame
        Only the columns listed in `kept_columns`, with the two date columns parsed
        to datetimes, exact duplicate rows removed (the original row labels are kept,
        the index is not reset) and the sentiment labels replaced by numbers.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the CSV.
    '''
    # Load CSV Dataset via Path
    raw = pd.read_csv(csv_path, lineterminator='\n')

    # API field name -> shorter name used in the rest of the notebook
    column_names = {"tweet_created_at": "tweet_date",
                    "public_metrics.retweet_count": "retweet_count",
                    "public_metrics.reply_count": "reply_count",
                    "public_metrics.like_count": "like_count",
                    "profile_created_at": "profile_creation_date",
                    "public_metrics.followers_count": "followers_count",
                    "public_metrics.following_count": "following_count",
                    "public_metrics.tweet_count": "user_tweet_count"
                    }

    # Including only columns that we want to use in the future
    kept_columns = ['party',
                    'tweet_date',
                    'author_id',
                    'tweet_id',
                    'text',
                    'source',
                    'retweet_count',
                    'reply_count',
                    'like_count',
                    'profile_creation_date',
                    'followers_count',
                    'following_count',
                    'user_tweet_count',
                    'location',
                    'sentiment'
                    ]

    # .copy() turns the column selection into an independent frame, so the column
    # assignments below neither warn (SettingWithCopy) nor touch `raw`.
    df = raw.assign(party=party).rename(columns=column_names)[kept_columns].copy()

    # Clean dataset columns:
       # Change dtype
    df["tweet_date"] = pd.to_datetime(df["tweet_date"])
    df["profile_creation_date"] = pd.to_datetime(df["profile_creation_date"])
       # Drop duplicates
    df = df.drop_duplicates()
       # Transform sentiment to numeric type
    # NOTE(review): neutral maps to 1 (not 0) -- confirm this asymmetric scale is intended.
    dict_to_numeric = {"negative": -2, "neutral": 1, "positive": 2}
    # Assign the result back instead of calling replace(..., inplace=True) on
    # df["sentiment"]: with pandas Copy-on-Write the chained in-place call would
    # modify a temporary object and leave `df` unchanged.
    # Labels missing from the mapping are left as they are; infer_objects() gives
    # an integer dtype whenever every label was mapped.
    df["sentiment"] = df["sentiment"].replace(dict_to_numeric).infer_objects()

    return df

## Load CSV Data

In [4]:
# Load the CDU sample and preview its first three rows.
# NOTE(review): the path is an absolute location on one user's machine, so this
# cell only runs where the file exists at exactly that place.
df_cdu = load_and_clean_csv("CDU", "/Users/finnzurmuehlen/Downloads/sentiments_sample_luca.csv")
df_cdu.head(3)

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment
0,CDU,2021-08-23 23:59:54+00:00,1106890880,1429956596575227906,"Heute beim Versuch von SPD Wahl zu überzeugen,...",Twitter for iPad,0,0,0,2013-01-20 18:04:25+00:00,815,4999,45639,,-2
1,CDU,2021-08-23 23:59:39+00:00,40453076,1429956534465818641,"Wenn ein Laschet ein Mann wäre, würde man so e...",Twitter for Android,0,0,1,2009-05-16 12:10:39+00:00,398,1013,6894,"Kiel, Schleswig-Holstein",-2
2,CDU,2021-08-23 23:59:33+00:00,379140899,1429956507039371268,Tritt @ArminLaschet jetzt der @AfDimBundestag ...,Twitter for Android,0,0,0,2011-09-24 12:53:33+00:00,640,5000,61841,BRD,1


In [5]:
# Load the LINKE sample and preview its first three rows.
# NOTE(review): the path is an absolute location on one user's machine, so this
# cell only runs where the file exists at exactly that place.
df_linke = load_and_clean_csv("LINKE", '/Users/finnzurmuehlen/Downloads/sample_api_linke_with_sentiment.csv')
df_linke.head(3)

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment
0,LINKE,2021-08-23 23:59:03+00:00,1026773320400818183,1429956383043166208,.@dielinke hätte es gerne gesehen wenn die Men...,Twitter for iPhone,0,1,1,2018-08-07 10:13:30+00:00,908,2490,52292,,1
1,LINKE,2021-08-23 23:57:20+00:00,1026773320400818183,1429955948328767492,Die Grundrechte sind ein Schutz vor der Diktat...,Twitter for iPhone,4,1,31,2018-08-07 10:13:30+00:00,908,2490,52292,,1
2,LINKE,2021-08-23 23:56:49+00:00,4078952415,1429955818347184128,@chicksonpolitix @dieLinke Nach den Grünen? Mi...,Twitter Web App,0,1,0,2015-10-31 00:55:02+00:00,5,156,179,Dresden,1


# Concat. DataFrames

## Concat. Function

In [6]:
def concat_dfs(list_of_dfs):
    '''
    Stack the given dataframes on top of each other and return the result
    with a fresh 0..n-1 row index (the original row labels are discarded).
    '''
    # ignore_index=True renumbers the rows while concatenating
    return pd.concat(list_of_dfs, ignore_index=True)

## Concat DFs

In [7]:
# Combine the per-party frames into one frame with a continuous row index.
list_of_dfs = [df_cdu, df_linke]
df_all = concat_dfs(list_of_dfs)

# Feature Engineering

## Engineering Function (Non-Sentiment Features)

In [47]:
def create_non_sentiment_features(df):
    '''
    Aggregate tweet-level data into one row per (calendar day, party) with non-sentiment features.

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet-level data. Uses the columns "tweet_date" (datetime), "party",
        "author_id", "text", "reply_count", "retweet_count", "like_count",
        "followers_count", "following_count" and "user_tweet_count".
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by (tweet_date, party) with the columns
        reply_count, retweet_count, like_count   : daily sums
        avg_len_of_tweet                         : mean number of characters per tweet
        avg_followers_count, avg_following_count,
        avg_user_tweet_count                     : daily means
        avg_ff_ratio                             : mean followers / mean following
                                                   (a ratio of means; inf when the mean
                                                   following count is 0)
        share_of_tweets                          : the party's tweets divided by all
                                                   tweets of that day
        share_unique_users                       : distinct authors divided by tweets
    '''
    # assign()/rename() return new frames, so the caller's DataFrame does not
    # silently gain an "avg_len_of_tweet" column.
    tweets = df.assign(avg_len_of_tweet=df["text"].str.len()).rename(
        columns={"followers_count": "avg_followers_count",
                 "following_count": "avg_following_count",
                 "user_tweet_count": "avg_user_tweet_count"
                 })

    # One grouping (calendar day x party) shared by every aggregation below
    daily = tweets.groupby([pd.Grouper(key='tweet_date', freq='D'), "party"])

    df_final = daily.agg({
        "reply_count": "sum",
        "retweet_count": "sum",
        "like_count": "sum",
        "avg_len_of_tweet": "mean",
        "avg_followers_count": "mean",
        "avg_following_count": "mean",
        "avg_user_tweet_count": "mean"
    })
    # Create: Followers Ratio
    df_final["avg_ff_ratio"] = df_final["avg_followers_count"] / df_final["avg_following_count"]

    # Create: share of tweets that a party has in comparison to all tweets on a given day.
    # transform("sum") keeps the (day, party) index; groupby(...).apply(lambda x: x / x.sum())
    # adds an extra index level on pandas >= 2.0, which breaks the join below.
    tweets_per_party = daily["text"].count()
    tweets_per_day = tweets_per_party.groupby(level=0).transform("sum")
    share_of_tweets = (tweets_per_party / tweets_per_day).rename("share_of_tweets")

    # Create: Share of tweets that come from a unique user for each party on a given day
    share_unique_users = (daily["author_id"].nunique() / tweets_per_party).rename("share_unique_users")

    # Join the different features into the final DataFrame
    return df_final.join(share_of_tweets).join(share_unique_users)
    

In [51]:
# Build the daily per-party feature table and preview its first rows.
df_final = create_non_sentiment_features(df_all)
df_final.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,reply_count,retweet_count,like_count,avg_len_of_tweet,avg_followers_count,avg_following_count,avg_user_tweet_count,avg_ff_ratio,share_of_tweets,share_unique_users
tweet_date,party,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2021-08-14 00:00:00+00:00,CDU,232,445,4673,174.213115,728.45082,707.69877,15750.348361,1.029323,0.497452,0.803279
2021-08-14 00:00:00+00:00,LINKE,331,427,2953,171.663286,2152.957404,722.817444,20038.324544,2.978563,0.502548,0.732252
2021-08-15 00:00:00+00:00,CDU,232,1924,17818,175.445565,3182.612903,622.550403,23234.497984,5.112217,0.5,0.693548
2021-08-15 00:00:00+00:00,LINKE,284,236,1532,187.139113,763.358871,660.407258,16896.21371,1.155891,0.5,0.616935
2021-08-16 00:00:00+00:00,CDU,306,571,5417,186.070994,762.279919,870.62069,26863.643002,0.875559,0.49798,0.675456


## Engineering Function (Sentiment Features)

In [53]:
# Peek at one tweet-level row before building the sentiment features.
# NOTE(review): the saved output of this cell lists columns (`len_of_tweet`,
# `followers_following_ratio`) that no cell visible here creates -- likely
# leftover kernel state; re-run from a fresh kernel to confirm.
df_all.head(1)

Unnamed: 0,party,tweet_date,author_id,tweet_id,text,source,retweet_count,reply_count,like_count,profile_creation_date,followers_count,following_count,user_tweet_count,location,sentiment,len_of_tweet,followers_following_ratio,avg_len_of_tweet
0,CDU,2021-08-23 23:59:54+00:00,1106890880,1429956596575227906,"Heute beim Versuch von SPD Wahl zu überzeugen,...",Twitter for iPad,0,0,0,2013-01-20 18:04:25+00:00,815,4999,45639,,-2,268,0.163033,268


In [None]:
def sample_feature_engineering(df, positive_value=2, negative_value=-2, engagement_offset=10):
    '''
    Generates the following features: "Weighted Sentiment", "Share of positive tweets", "Share of negative tweets".

    Parameters
    ----------
    df : pandas.DataFrame
        Tweet-level data with the numeric columns "retweet_count", "like_count"
        and "sentiment". The frame is not modified.
    positive_value, negative_value : int
        Numeric codes that mark a positive / negative tweet. The defaults match the
        encoding written by load_and_clean_csv (negative = -2, neutral = 1, positive = 2).
    engagement_offset : int
        Added to the like and retweet counts before taking log10. With the default
        of 10, a tweet without likes or retweets gets a weight of log10(10) = 1
        instead of log10(0) = -inf.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns
        retweet_count, like_count : the input counts plus `engagement_offset`
        sentiment                 : unchanged
        weighted_sentiment        : log10(like_count) * log10(retweet_count) * sentiment,
                                    computed on the offset counts
        share_of_positive_tweets  : 1 where sentiment == positive_value, else 0
        share_of_negative_tweets  : 1 where sentiment == negative_value, else 0
        The two "share" columns are per-tweet indicators; their mean over a group
        of tweets is the share of positive / negative tweets in that group.
    '''
    # .copy() detaches the selection from the caller's frame, so the column
    # assignments below neither warn (SettingWithCopy) nor leak into `df`.
    df_sample = df[["retweet_count", "like_count", "sentiment"]].copy()

    # Generate "Weighted Sentiment"
    df_sample["like_count"] = df_sample["like_count"] + engagement_offset
    df_sample["retweet_count"] = df_sample["retweet_count"] + engagement_offset
    # NOTE(review): the like factor was previously written as a bare `np(...)` call,
    # which cannot run; log10 is assumed here to mirror the retweet factor -- confirm.
    df_sample["weighted_sentiment"] = (
        np.log10(df_sample["like_count"])
        * np.log10(df_sample["retweet_count"])
        * df_sample["sentiment"]
    )

    # Generate "Share of positive tweets"
    # A comparison is used instead of a lookup dict so that every other code
    # (including missing values) becomes 0.
    df_sample["share_of_positive_tweets"] = (df_sample["sentiment"] == positive_value).astype(int)

    # Generate "Share of negative tweets"
    df_sample["share_of_negative_tweets"] = (df_sample["sentiment"] == negative_value).astype(int)

    # Return DF
    return df_sample