# Twitter RNN - Bundestagswahlen in Deutschland 2017 
## Maren, Sophia & Malin 

## Preprocessing 
### "Chunk 2"

In [1]:
import pandas as pd
import json
import numpy as np
import os
import re

In [2]:
# Funktionen die wir während des Preprocessings anwenden 

def get_tweets_by(party, all_tweets):
    """Collect the text of every tweet written by a member of a party.

    Args:
        party: iterable of screen names belonging to one political party.
        all_tweets: indexable collection of tweets; entry ``i`` must expose
            ``['user']['screen_name']`` and ``['text']``.

    Returns:
        A list with the ``'text'`` value of each matching tweet, in the order
        the tweets are visited. A tweet is added once per entry of ``party``
        that equals its screen name.
    """
    # Entries are looked up by position (all_tweets[position]) rather than by
    # iterating the container directly, exactly as the lookup is defined for
    # positions 0 .. len(all_tweets) - 1.
    return [
        all_tweets[position]['text']
        for position in range(len(all_tweets))
        for screen_name in party
        if all_tweets[position]['user']['screen_name'] == screen_name
    ]

def clean_text(text):
    """Strip Twitter artefacts and unwanted symbols from a string.

    Steps, in order: remove URLs, @mentions and the standalone retweet
    marker "RT"; remove a fixed set of emoji and every token that starts
    with an apostrophe; transliterate German umlauts and ß; put a space in
    front of every dot; finally drop brackets, slashes, colons, ampersands,
    underscores, commas and digits.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string.
    """
    # remove URL links
    text = re.sub(r"http\S+", "", text)
    # remove @screennames
    text = re.sub(r"@\S+", "", text)
    # remove the retweet indicator "RT"; the word boundaries keep words that
    # merely contain these letters (e.g. "SPORT") intact
    text = re.sub(r"\bRT\b", "", text)

    # remove emojis etc.; order matters, the quoted forms must be handled
    # before the bare symbol and before the generic apostrophe-token rule
    for pattern in (
        r"🇩🇪",
        r"😁",
        r"😀",
        r"'❤'",
        r"❤",
        r"🎉",
        r"'️'",
        r"'\S+",
    ):
        text = re.sub(pattern, "", text)

    # replace umlauts (lower and upper case) and ß
    for char, replacement in (
        ("ü", "ue"),
        ("ö", "oe"),
        ("ä", "ae"),
        ("Ü", "Ue"),
        ("Ö", "Oe"),
        ("Ä", "Ae"),
        ("ß", "ss"),
    ):
        text = text.replace(char, replacement)

    # add a space before each dot to keep it as its own token
    text = text.replace(".", " .")

    # remove single symbols; the result of this step has to be returned,
    # otherwise the symbols silently stay in the text
    return text.translate(str.maketrans("", "", "[]/:&_,0123456789"))

In [3]:
# Open the JSON file that stores which account names belong to which party.
# JSON text is UTF-8 and the file contains the non-ASCII key 'Grüne', so the
# encoding is given explicitly instead of relying on the platform default.
with open('followed-accounts.json', encoding='utf-8') as json_file:
    followed_accounts = json.load(json_file)

# Account names per party; these are passed to get_tweets_by() as `party`.
CDU = followed_accounts['CDU/CSU']
SPD = followed_accounts['SPD']
FDP = followed_accounts['FDP']
LINKE = followed_accounts['Linke']
GRUENE = followed_accounts['Grüne']
AFD = followed_accounts['AfD']


In [6]:
# Directory in which the Twitter data is stored.
# This notebook opens the second of the 3 chunks into which the 10 GB of data
# were split (to speed up processing).
path = 'recorded-tweets/chunk2'

# Collect every .json file in that directory. The files are loaded in two
# batches (data1 here, data2 in a later cell) so that each run is faster.
# NOTE(review): os.listdir() returns entries in arbitrary order, so the
# positions used below are not guaranteed to be stable across machines.
all_files = [
    os.path.join(path, name)
    for name in os.listdir(path)
    if name.endswith('.json')
]

# First batch: the files at positions 0-199 of all_files.
data1 = [pd.read_json(all_files[idx], orient='index') for idx in range(200)]

In [9]:
# Directory in which the Twitter data is stored.
path = 'recorded-tweets/chunk2'

# Collect every .json file in that directory.
all_files = [
    os.path.join(path, name)
    for name in os.listdir(path)
    if name.endswith('.json')
]

# Second batch: the files at positions 201-435 of all_files.
# NOTE(review): position 200 is not covered by this range; confirm whether
# leaving that file out is intentional.
data2 = [
    pd.read_json(all_files[idx], orient='index')
    for idx in range(201, 436)
]


## Speichern der Tweets für jede Partei separat

### AFD data1

In [163]:
# Create an empty list to store the tweets of one party.
# Individual tweets kept raising errors here; the affected positions were
# simply skipped by hand, which is why some positions are left out instead of
# running one plain loop over everything.

# Walk through the data1 list (positions 0-199, minus the skipped ones).
skipped_data1 = {76, 87, 125, 172, 178}

tweets_AFD = []
for idx in range(200):
    if idx in skipped_data1:
        continue
    tweets_AFD.append(get_tweets_by(AFD, data1[idx]))

### AFD data2

In [164]:
# Walk through the data2 list (positions 0-234, minus the skipped ones).
# The tweets are appended to the same list as before (tweets_AFD).
skipped_data2 = {20, 76, 95, 122, 177}

for idx in range(235):
    if idx in skipped_data2:
        continue
    tweets_AFD.append(get_tweets_by(AFD, data2[idx]))

### Linke data1

In [165]:
# The same process is now repeated for the other 5 parties.
# The errors do not depend on the party and seem to be caused by the tweet
# itself, so the skipped positions are identical for every party.
skipped_data1 = {76, 87, 125, 172, 178}

tweets_LINKE = []
for idx in range(200):
    if idx in skipped_data1:
        continue
    tweets_LINKE.append(get_tweets_by(LINKE, data1[idx]))

### Linke data2

In [166]:
# Append the Linke tweets from data2 (positions 0-234, minus the skipped
# ones) to tweets_LINKE.
skipped_data2 = {20, 76, 95, 122, 177}

for idx in range(235):
    if idx in skipped_data2:
        continue
    tweets_LINKE.append(get_tweets_by(LINKE, data2[idx]))

### CDU data1

In [167]:
# Collect the CDU/CSU tweets from data1 (positions 0-199, minus the skipped
# ones).
skipped_data1 = {76, 87, 125, 172, 178}

tweets_CDU = []
for idx in range(200):
    if idx in skipped_data1:
        continue
    tweets_CDU.append(get_tweets_by(CDU, data1[idx]))

### CDU data2

In [168]:
# Append the CDU/CSU tweets from data2 (positions 0-234, minus the skipped
# ones) to tweets_CDU.
skipped_data2 = {20, 76, 95, 122, 177}

for idx in range(235):
    if idx in skipped_data2:
        continue
    tweets_CDU.append(get_tweets_by(CDU, data2[idx]))

### SPD data

In [169]:
# Positions that are left out of each batch.
skipped_data1 = {76, 87, 125, 172, 178}
skipped_data2 = {20, 76, 95, 122, 177}

tweets_SPD = []

# data1: positions 0-199, minus the skipped ones
for idx in range(200):
    if idx in skipped_data1:
        continue
    tweets_SPD.append(get_tweets_by(SPD, data1[idx]))

# data2: positions 0-234, minus the skipped ones
for idx in range(235):
    if idx in skipped_data2:
        continue
    tweets_SPD.append(get_tweets_by(SPD, data2[idx]))

### Grüne data

In [170]:
# Positions that are left out of each batch.
skipped_data1 = {76, 87, 125, 172, 178}
skipped_data2 = {20, 76, 95, 122, 177}

tweets_GRUENE = []

# data1: positions 0-199, minus the skipped ones
for idx in range(200):
    if idx in skipped_data1:
        continue
    tweets_GRUENE.append(get_tweets_by(GRUENE, data1[idx]))

# data2: positions 0-234, minus the skipped ones
for idx in range(235):
    if idx in skipped_data2:
        continue
    tweets_GRUENE.append(get_tweets_by(GRUENE, data2[idx]))

### FDP data

In [171]:
# Positions that are left out of each batch.
skipped_data1 = {76, 87, 125, 172, 178}
skipped_data2 = {20, 76, 95, 122, 177}

tweets_FDP = []

# data1: positions 0-199, minus the skipped ones
for idx in range(200):
    if idx in skipped_data1:
        continue
    tweets_FDP.append(get_tweets_by(FDP, data1[idx]))

# data2: positions 0-234, minus the skipped ones
for idx in range(235):
    if idx in skipped_data2:
        continue
    tweets_FDP.append(get_tweets_by(FDP, data2[idx]))

### Cleanen der strings, separat für jede Partei

In [172]:
# Build one string per party from the collected tweets and apply the clean
# function to it.
def _join_tweets(batches):
    """Join the tweet texts of all batches into one space-separated string."""
    # Each element of `batches` is the list returned by one get_tweets_by()
    # call. Flatten before joining: calling str() on an inner list would put
    # its Python repr (brackets, quotes, commas, escape sequences) into the
    # text instead of the tweets themselves.
    return ' '.join(str(tweet) for batch in batches for tweet in batch)


str_AFD = _join_tweets(tweets_AFD)
str_LINKE = _join_tweets(tweets_LINKE)
str_CDU = _join_tweets(tweets_CDU)
str_SPD = _join_tweets(tweets_SPD)
str_GRUENE = _join_tweets(tweets_GRUENE)
str_FDP = _join_tweets(tweets_FDP)

clean_str_AFD = clean_text(str_AFD)
clean_str_LINKE = clean_text(str_LINKE)
clean_str_CDU = clean_text(str_CDU)
clean_str_SPD = clean_text(str_SPD)
clean_str_GRUENE = clean_text(str_GRUENE)
clean_str_FDP = clean_text(str_FDP)

### Erstelle .txt files - eine pro Partei

In [173]:
# Write the cleaned AfD text to its own file; the context manager closes the
# file even if the write raises.
with open("AFD_2.txt", "w+", encoding="utf8", errors='ignore') as f:
    f.write(clean_str_AFD)

In [174]:
# Write the cleaned Linke text to its own file; the context manager closes
# the file even if the write raises.
with open("LINKE_2.txt", "w+", encoding="utf8", errors='ignore') as f:
    f.write(clean_str_LINKE)

In [175]:
# Write the cleaned CDU/CSU text to its own file; the context manager closes
# the file even if the write raises.
with open("CDU_2.txt", "w+", encoding="utf8", errors='ignore') as f:
    f.write(clean_str_CDU)

In [176]:
# Write the cleaned SPD text to its own file; the context manager closes the
# file even if the write raises.
with open("SPD_2.txt", "w+", encoding="utf8", errors='ignore') as f:
    f.write(clean_str_SPD)

In [177]:
# Write the cleaned Grüne text to its own file; the context manager closes
# the file even if the write raises.
with open("GRUENE_2.txt", "w+", encoding="utf8", errors='ignore') as f:
    f.write(clean_str_GRUENE)

In [178]:
# Write the cleaned FDP text to its own file; the context manager closes the
# file even if the write raises.
with open("FDP_2.txt", "w+", encoding="utf8", errors='ignore') as f:
    f.write(clean_str_FDP)