In [1]:
# Imports
from os import link
import pandas as pd
import numpy as np
import datetime


cdu_path = '/Users/finnzurmuehlen/Downloads/2021_cdu_with_sentiment.csv'
spd_path = '/Users/finnzurmuehlen/Downloads/2021_spd_with_sentiment.csv'
gruene_path = '/Users/finnzurmuehlen/Downloads/2021_fdp_with_sentiment.csv'
fdp_path = '/Users/finnzurmuehlen/Downloads/2021_gruene_with_sentiment.csv'
linke_path = '/Users/finnzurmuehlen/Downloads/2021_0723-0826_Tweets_Linken_sentiment.csv'
afd_path = '/Users/finnzurmuehlen/Downloads/2021_afd_with_sentiment.csv'
other_path = '/Users/finnzurmuehlen/Downloads/2021_others_with_sentiment.csv'

parties = ["CDU", "SPD", "GRUENE", "FDP", "LINKE", "AFD", "OTHER"]
paths = [cdu_path, spd_path, gruene_path, fdp_path, linke_path, afd_path, other_path]

def load_and_clean_csv(parties, paths):
    '''
    Function loads CSV data from the Twitter API+Sentiment and returns a cleaned DF
    '''
    list_of_dfs = []
    for party, path in zip(parties, paths):
        # Load CSV Dataset via Path
        df = pd.read_csv(path, lineterminator='\n', low_memory=False)

        # Create "Party" Column and rename other columns
        df['party'] = party
        df = df.rename(columns={"tweet_created_at": "tweet_date",
                                "public_metrics.retweet_count": "retweet_count",
                                "public_metrics.reply_count": "reply_count",
                                "public_metrics.like_count": "like_count",
                                "profile_created_at": "profile_creation_date",
                                "public_metrics.followers_count": "followers_count",
                                "public_metrics.following_count": "following_count",
                                "public_metrics.tweet_count": "user_tweet_count"
                                })

        # Including only columns that we want to use in the future
        df = df[['party',
                'tweet_date',
                'author_id',
                'tweet_id',
                'text',
                'source',
                'retweet_count',
                'reply_count',
                'like_count',
                'profile_creation_date',
                'followers_count',
                'following_count',
                'user_tweet_count',
                'location',
                'sentiment'
                ]]

        # Clean dataset columns:
        # Change dtype
        df["tweet_date"] = df["tweet_date"].astype(str)
        df = df[df.tweet_date.str.match('(\d{4}-\d{2}-\d{2}.\d{2}:\d{2}:\d{2})')]
        df = df[(df.tweet_date.str.len() == 23) | (df.tweet_date.str.len() == 24)]
        df['tweet_date'] = df['tweet_date'].str.slice(0,19)
        df["tweet_date"] = pd.to_datetime(df["tweet_date"])
        df['profile_creation_date'] = df['profile_creation_date'].str.slice(0,19)
        df["profile_creation_date"] = pd.to_datetime(df["profile_creation_date"])
        # Drop duplicates
        df = df.drop_duplicates()
        # Transform sentiment to numeric type
        dict_to_numeric = {"negative": -2, "neutral": 1, "positive": 2}
        df["sentiment"].replace(dict_to_numeric, inplace=True)

        list_of_dfs.append(df)

    print(list_of_dfs)
    return list_of_dfs

In [3]:
list_of_dfs = load_and_clean_csv(parties, paths)

[       party          tweet_date               author_id  \
0        CDU 2021-08-26 23:59:59              20253637.0   
1        CDU 2021-08-26 23:56:43            4715414661.0   
2        CDU 2021-08-26 23:55:40            3214660877.0   
3        CDU 2021-08-26 23:54:03             923923849.0   
4        CDU 2021-08-26 23:53:42  1.3810139609174098e+18   
...      ...                 ...                     ...   
370554   CDU 2021-07-23 04:05:05     1386575024233062406   
370555   CDU 2021-07-23 04:02:13               612503689   
370556   CDU 2021-07-23 04:01:26     1133313594802802688   
370557   CDU 2021-07-23 04:00:56      882160247594418177   
370558   CDU 2021-07-23 04:00:35     1360490553411043336   

                      tweet_id  \
0        1.431043780610859e+18   
1       1.4310429607250575e+18   
2        1.431042695691133e+18   
3       1.4310422871029514e+18   
4       1.4310422005954888e+18   
...                        ...   
370554     1418421884966813703   
370555

In [4]:
len(list_of_dfs)

7

In [5]:
import pandas as pd
df_test = pd.read_csv('/Users/finnzurmuehlen/Downloads/df_final_py_test.csv')
df_test

Unnamed: 0,tweet_date,party,poll,reply_count,retweet_count,like_count,avg_len_of_tweet,avg_followers_count,avg_following_count,avg_user_tweet_count,avg_ff_ratio,share_of_tweets,share_unique_users,weighted_sentiment,share_of_positive_tweets,share_of_negative_tweets
0,2021-05-26,AFD,10.50,,,,,,,,,,,,,
1,2021-05-26,CDU,25.67,,,,,,,,,,,,,
2,2021-05-26,FDP,12.83,,,,,,,,,,,,,
3,2021-05-26,GRUENE,23.33,,,,,,,,,,,,,
4,2021-05-26,LINKE,6.17,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
646,2021-08-26,FDP,12.00,7423.0,9063.0,63348.0,178.698807,4663.179284,690.310107,20607.243817,6.755195,0.193819,0.638167,-0.149995,0.062775,0.406026
647,2021-08-26,GRUENE,18.00,3976.0,3199.0,29066.0,155.032074,3785.480448,749.313049,17960.210457,5.051935,0.110768,0.716828,-0.577197,0.056459,0.527460
648,2021-08-26,LINKE,7.00,887.0,713.0,5282.0,190.801338,963.538462,684.646823,20400.862207,1.407351,0.036379,0.590635,-0.134164,0.060201,0.400669
649,2021-08-26,OTHER,6.00,1550.0,2166.0,11618.0,175.401146,1340.969436,638.417861,14171.402579,2.100457,0.050955,0.579752,-0.129097,0.087393,0.409742
