In [1]:
# Heatmap
import os
import gzip
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Reset font to default
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']

# Folder with the gzipped JSON tweet dumps.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = '/Users/hamzabartl/Documents/DataSciencePy/pythonProject/7528718/german-tweet-sample-2019-04'

# Read every *.json.gz file in the folder. Each file's parsed content is
# appended with list.extend, so it is presumably a JSON array of tweet objects.
all_tweets = []

for filename in os.listdir(folder_path):
    if not filename.endswith('.json.gz'):
        continue
    with gzip.open(os.path.join(folder_path, filename), 'rt', encoding='utf-8', errors='ignore') as file:
        try:
            all_tweets.extend(json.load(file))
        except json.JSONDecodeError:
            # Report the unreadable file and continue with the remaining ones.
            print(f"Fehler beim Lesen der Datei: {filename}")

# Flatten nested tweet objects into columns such as 'user_name'.
df = pd.json_normalize(all_tweets, sep='_')

# Drop tweets without text BEFORE casting to str: astype(str) turns a missing
# value into the literal string 'nan', which dropna would no longer remove.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))

# Remove retweets. Match 'RT @' rather than the bare prefix 'RT' so that
# original tweets that merely start with those letters (e.g. 'RTL ...') are kept.
# .copy() gives an independent frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[~df['text'].str.startswith('RT @')].copy()

# Parse 'created_at' into datetimes and derive hour of day and weekday (0 = Monday).
# NOTE(review): hours are taken in whatever time zone 'created_at' carries
# (presumably UTC for Twitter data) — convert first if local time is intended.
df = df.assign(created_at=pd.to_datetime(df['created_at']))
df = df.assign(
    hour=df['created_at'].dt.hour,
    weekday=df['created_at'].dt.dayofweek,
)

# Tweet counts per (weekday, hour). Reindex to the full 7 x 24 grid so that a
# weekday or hour without any tweet still gets its own (zero) cell; otherwise
# rows/columns would silently shift under the fixed axis labels below.
heatmap_data = (
    df.groupby(['weekday', 'hour'])
    .size()
    .unstack(fill_value=0)
    .reindex(index=range(7), columns=range(24), fill_value=0)
)

weekday_labels = ['Montag', 'Dienstag', 'Mittwoch', 'Donnerstag', 'Freitag', 'Samstag', 'Sonntag']

# Creating the heatmap
plt.figure(figsize=(12, 6))
sns.heatmap(heatmap_data, cmap='YlGnBu', linewidths=.5)
plt.xlabel('Stunde des Tages')
plt.ylabel('Wochentag')
plt.title('Twitter-Aktivität zu verschiedenen Tageszeiten und Wochentagen')
# seaborn draws row i between y = i and y = i + 1, so the label belongs at the
# cell centre i + 0.5 (ticks at 0..6 would sit on the cell edges).
plt.yticks([i + 0.5 for i in range(7)], weekday_labels, rotation=0)
plt.show()


In [2]:
#Kreisdiagramm Stimmung mit neutral
import os
import json
import pandas as pd
import re
from textblob import TextBlob
import matplotlib.pyplot as plt

# Folder with the JSON tweet dumps.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = '/Users/hamzabartl/Documents/DataSciencePy/pythonProject/recorded-tweets'

# Read at most `max_files` successfully parsed *.json files.
all_tweets = []
file_count = 0
max_files = 500

# os.listdir returns entries in arbitrary order; sorting makes the sample of
# `max_files` files the same on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as file:
        try:
            json_data = json.load(file)
        except json.JSONDecodeError:
            # Unreadable files are reported and do not count towards the limit.
            print(f"Fehler beim Lesen der Datei: {filename}")
            continue
    all_tweets.extend(json_data)
    file_count += 1
    if file_count >= max_files:
        break

# Flatten nested tweet objects into columns.
df = pd.json_normalize(all_tweets, sep='_')

# Drop tweets without text BEFORE casting to str: astype(str) turns a missing
# value into the literal string 'nan', which dropna would no longer remove.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))

# Remove retweets. Match 'RT @' rather than the bare prefix 'RT' so that
# original tweets that merely start with those letters (e.g. 'RTL ...') are kept.
# .copy() gives an independent frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[~df['text'].str.startswith('RT @')].copy()

# Tweet text normalisation
def clean_tweet(text):
    """Return `text` without URLs, @-mentions and hashtags.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is removed.
    """
    # Order matters only in that URLs are removed before mentions and hashtags.
    for pattern in (r'http\S+', r'@\w+', r'#\w+'):
        text = re.sub(pattern, '', text)
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Add the normalised tweet text as its own column.
df['cleaned_text'] = df['text'].map(clean_tweet)

def extract_hashtags(text):
    """Return the hashtags in `text`, without the leading '#', in order of appearance."""
    return [match.group(1) for match in re.finditer(r'#(\w+)', text)]

# One list of hashtags per tweet, taken from the original (uncleaned) text.
df['hashtags'] = df['text'].map(extract_hashtags)

# Sentiment label: positive, negative or neutral
def get_sentiment(text):
    """Label `text` by the sign of its TextBlob polarity score.

    Returns 'positive' for a polarity above zero, 'negative' below zero and
    'neutral' for exactly zero. If none of the comparisons holds, None is
    returned.

    NOTE(review): TextBlob's default analyzer is built on an English lexicon;
    confirm it is appropriate for the language of these tweets.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    if polarity == 0:
        return 'neutral'
    return None

# Sentiment label per tweet, computed on the cleaned text.
df['sentiment'] = df['cleaned_text'].map(get_sentiment)

def sentiment_for_hashtags(df, hashtags, case_sensitive=False):
    """Count sentiment labels of the tweets that carry each given hashtag.

    Parameters
    ----------
    df : DataFrame with a 'hashtags' column (list of hashtag strings per row)
        and a 'sentiment' column.
    hashtags : iterable of hashtag strings without the leading '#'.
    case_sensitive : if False (default), '#AfD', '#afd' and '#AFD' are treated
        as the same hashtag, as Twitter itself does. Pass True for exact matching.

    Returns
    -------
    dict mapping each requested hashtag that occurs in at least one row to the
    `value_counts()` Series of that subset's 'sentiment' column. Hashtags
    without any matching row are omitted.
    """
    hashtag_sentiments = {}
    for hashtag in hashtags:
        if case_sensitive:
            matches = df['hashtags'].apply(lambda tags: hashtag in tags)
        else:
            wanted = hashtag.casefold()
            matches = df['hashtags'].apply(
                lambda tags: any(tag.casefold() == wanted for tag in tags)
            )
        # astype(bool) keeps the mask boolean even when the frame is empty.
        df_hashtag = df[matches.astype(bool)]
        if not df_hashtag.empty:
            hashtag_sentiments[hashtag] = df_hashtag['sentiment'].value_counts()
    return hashtag_sentiments

# Hashtags (without '#') whose sentiment distribution is plotted.
hashtags_to_analyze = ['AFD']

hashtag_sentiments = sentiment_for_hashtags(df, hashtags_to_analyze)

# One pie chart per hashtag for which sentiment counts were returned.
for hashtag in hashtag_sentiments:
    sentiment_counts = hashtag_sentiments[hashtag]
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140)
    ax.set_title(f'Stimmungsverteilung für #{hashtag}')
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.show()


In [3]:
#Kreisdiagramm Stimmung ohne neutral
import os
import json
import pandas as pd
import re
from textblob import TextBlob
import matplotlib.pyplot as plt

# Folder with the JSON tweet dumps.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = '/Users/hamzabartl/Documents/DataSciencePy/pythonProject/recorded-tweets'

# Read at most `max_files` successfully parsed *.json files.
all_tweets = []
file_count = 0
max_files = 500

# os.listdir returns entries in arbitrary order; sorting makes the sample of
# `max_files` files the same on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as file:
        try:
            json_data = json.load(file)
        except json.JSONDecodeError:
            # Unreadable files are reported and do not count towards the limit.
            print(f"Fehler beim Lesen der Datei: {filename}")
            continue
    all_tweets.extend(json_data)
    file_count += 1
    if file_count >= max_files:
        break

# Flatten nested tweet objects into columns.
df = pd.json_normalize(all_tweets, sep='_')

# Drop tweets without text BEFORE casting to str: astype(str) turns a missing
# value into the literal string 'nan', which dropna would no longer remove.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))

# Remove retweets. Match 'RT @' rather than the bare prefix 'RT' so that
# original tweets that merely start with those letters (e.g. 'RTL ...') are kept.
# .copy() gives an independent frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[~df['text'].str.startswith('RT @')].copy()

# Tweet text normalisation
def clean_tweet(text):
    """Return `text` without URLs, @-mentions and hashtags.

    Runs of whitespace are collapsed to a single space and leading/trailing
    whitespace is removed.
    """
    without_urls = re.sub(r'http\S+', '', text)
    without_mentions = re.sub(r'@\w+', '', without_urls)
    without_hashtags = re.sub(r'#\w+', '', without_mentions)
    return re.sub(r'\s+', ' ', without_hashtags).strip()

# Add the normalised tweet text as its own column.
df['cleaned_text'] = df['text'].map(clean_tweet)

def extract_hashtags(text):
    """Return the hashtags in `text`, without the leading '#', in order of appearance."""
    return [match.group(1) for match in re.finditer(r'#(\w+)', text)]

# One list of hashtags per tweet, taken from the original (uncleaned) text.
df['hashtags'] = df['text'].map(extract_hashtags)

# Sentiment label: positive or negative only (neutral tweets get no label)
def get_sentiment(text):
    """Label `text` by the sign of its TextBlob polarity score.

    Returns 'positive' for a polarity above zero and 'negative' below zero.
    For any other polarity (including exactly zero) the result is None, so
    such tweets carry no label.

    NOTE(review): TextBlob's default analyzer is built on an English lexicon;
    confirm it is appropriate for the language of these tweets.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return None

# Sentiment label per tweet, computed on the cleaned text.
df['sentiment'] = df['cleaned_text'].map(get_sentiment)

def sentiment_for_hashtags(df, hashtags, case_sensitive=False):
    """Count sentiment labels of the tweets that carry each given hashtag.

    Parameters
    ----------
    df : DataFrame with a 'hashtags' column (list of hashtag strings per row)
        and a 'sentiment' column.
    hashtags : iterable of hashtag strings without the leading '#'.
    case_sensitive : if False (default), '#AfD', '#afd' and '#AFD' are treated
        as the same hashtag, as Twitter itself does. Pass True for exact matching.

    Returns
    -------
    dict mapping each requested hashtag that occurs in at least one row to the
    `value_counts()` Series of that subset's 'sentiment' column. Rows whose
    sentiment is missing are not counted by `value_counts()`. Hashtags without
    any matching row are omitted.
    """
    hashtag_sentiments = {}
    for hashtag in hashtags:
        if case_sensitive:
            matches = df['hashtags'].apply(lambda tags: hashtag in tags)
        else:
            wanted = hashtag.casefold()
            matches = df['hashtags'].apply(
                lambda tags: any(tag.casefold() == wanted for tag in tags)
            )
        # astype(bool) keeps the mask boolean even when the frame is empty.
        df_hashtag = df[matches.astype(bool)]
        if not df_hashtag.empty:
            hashtag_sentiments[hashtag] = df_hashtag['sentiment'].value_counts()
    return hashtag_sentiments

# Hashtags (without '#') whose sentiment distribution is plotted.
hashtags_to_analyze = ['AFD']

hashtag_sentiments = sentiment_for_hashtags(df, hashtags_to_analyze)

# One pie chart per hashtag for which sentiment counts were returned.
for hashtag in hashtag_sentiments:
    sentiment_counts = hashtag_sentiments[hashtag]
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.pie(sentiment_counts, labels=sentiment_counts.index, autopct='%1.1f%%', startangle=140)
    ax.set_title(f'Stimmungsverteilung für #{hashtag}')
    ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
    plt.show()


In [None]:
import os
import gzip
import json
import pandas as pd
import re
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module='urllib3')

# Reset font to default
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']

# Folder with the gzipped JSON tweet dumps.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = '/Users/hamzabartl/Documents/DataSciencePy/pythonProject/7528718/german-tweet-sample-2019-04'

# Read every *.json.gz file in the folder. Each file's parsed content is
# appended with list.extend, so it is presumably a JSON array of tweet objects.
all_tweets = []

for filename in os.listdir(folder_path):
    if not filename.endswith('.json.gz'):
        continue
    with gzip.open(os.path.join(folder_path, filename), 'rt', encoding='utf-8', errors='ignore') as file:
        try:
            all_tweets.extend(json.load(file))
        except json.JSONDecodeError:
            # Report the unreadable file and continue with the remaining ones.
            print(f"Fehler beim Lesen der Datei: {filename}")

# Flatten nested tweet objects into columns.
df = pd.json_normalize(all_tweets, sep='_')

# Drop tweets without text BEFORE casting to str: astype(str) turns a missing
# value into the literal string 'nan', which dropna would no longer remove.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))

# Remove retweets. Match 'RT @' rather than the bare prefix 'RT' so that
# original tweets that merely start with those letters (e.g. 'RTL ...') are kept.
# .copy() gives an independent frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[~df['text'].str.startswith('RT @')].copy()

# Hashtag extraction
def extract_hashtags(text):
    """Return the hashtags in `text`, without the leading '#', in order of appearance."""
    return [match.group(1) for match in re.finditer(r'#(\w+)', text)]

df['hashtags'] = df['text'].apply(extract_hashtags)

# Keep the tweets that carry the hashtag #AfD. Twitter treats hashtags
# case-insensitively, so '#afd' and '#AFD' are counted as well.
# .copy() makes the subset independent so the column assignment below does not
# raise SettingWithCopyWarning.
df_afd = df[df['hashtags'].apply(lambda tags: any(tag.casefold() == 'afd' for tag in tags))].copy()

# Make sure 'created_at' is handled as datetime.
df_afd['created_at'] = pd.to_datetime(df_afd['created_at'])

# Restrict to tweets from April 2019.
df_afd_april_2019 = df_afd[
    (df_afd['created_at'] >= '2019-04-01') & (df_afd['created_at'] < '2019-05-01')
].copy()

# Tweets per calendar day. Reindex to every day of April so that days without
# any tweet are plotted as 0 instead of being skipped by the line chart.
df_afd_april_2019['date'] = df_afd_april_2019['created_at'].dt.date
april_days = pd.date_range('2019-04-01', '2019-04-30').date
afd_counts_april_2019 = (
    df_afd_april_2019.groupby('date').size().reindex(april_days, fill_value=0)
)

if df_afd_april_2019.empty:
    # Nothing to plot, and idxmax below would be meaningless.
    print('Keine Tweets mit dem Hashtag #AfD im April 2019 gefunden.')
else:
    # Line chart of the daily counts.
    plt.figure(figsize=(12, 6))
    afd_counts_april_2019.plot(kind='line', marker='o')
    plt.xlabel('Datum')
    plt.ylabel('Anzahl der Tweets')
    plt.title('Anzahl der Tweets mit dem Hashtag #AfD im April 2019')
    plt.grid(True)
    plt.show()

    # Day with the most tweets (the earliest one if several days tie).
    max_date = afd_counts_april_2019.idxmax()

    # Tweets posted on that day.
    df_max_date = df_afd_april_2019[df_afd_april_2019['date'] == max_date].copy()

    # Tweets per hour of that day; hours without tweets are shown as 0.
    # NOTE(review): hours are in the time zone of 'created_at' (presumably UTC) — confirm.
    df_max_date['hour'] = df_max_date['created_at'].dt.hour
    afd_counts_hour = df_max_date.groupby('hour').size().reindex(range(24), fill_value=0)

    # Line chart of the hourly counts.
    plt.figure(figsize=(12, 6))
    afd_counts_hour.plot(kind='line', marker='o')
    plt.xlabel('Uhrzeit')
    plt.ylabel('Anzahl der Tweets')
    plt.title(f'Anzahl der Tweets mit dem Hashtag #AfD am {max_date}')
    plt.grid(True)
    plt.xticks(range(0, 24, 2))  # show even hours only
    plt.show()


In [5]:
import os
import gzip
import json
import pandas as pd
import re
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=UserWarning, module='urllib3')

# Reset font to default
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['DejaVu Sans']

# Folder with the gzipped JSON tweet dumps.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = '/Users/hamzabartl/Documents/DataSciencePy/pythonProject/7528718/german-tweet-sample-2019-04'

# Read every *.json.gz file in the folder. Each file's parsed content is
# appended with list.extend, so it is presumably a JSON array of tweet objects.
all_tweets = []

for filename in os.listdir(folder_path):
    if not filename.endswith('.json.gz'):
        continue
    with gzip.open(os.path.join(folder_path, filename), 'rt', encoding='utf-8', errors='ignore') as file:
        try:
            all_tweets.extend(json.load(file))
        except json.JSONDecodeError:
            # Report the unreadable file and continue with the remaining ones.
            print(f"Fehler beim Lesen der Datei: {filename}")

# Flatten nested tweet objects into columns.
df = pd.json_normalize(all_tweets, sep='_')

# Drop tweets without text BEFORE casting to str: astype(str) turns a missing
# value into the literal string 'nan', which dropna would no longer remove.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))

# Remove retweets. Match 'RT @' rather than the bare prefix 'RT' so that
# original tweets that merely start with those letters (e.g. 'RTL ...') are kept.
# .copy() gives an independent frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[~df['text'].str.startswith('RT @')].copy()

# Hashtag extraction
def extract_hashtags(text):
    """Return the hashtags in `text`, without the leading '#', in order of appearance."""
    return [match.group(1) for match in re.finditer(r'#(\w+)', text)]

df['hashtags'] = df['text'].apply(extract_hashtags)

# Keep the tweets that carry the hashtag #AfD. Twitter treats hashtags
# case-insensitively, so '#afd' and '#AFD' are counted as well.
# .copy() makes the subset independent so the column assignment below does not
# raise SettingWithCopyWarning.
df_afd = df[df['hashtags'].apply(lambda tags: any(tag.casefold() == 'afd' for tag in tags))].copy()

# Make sure 'created_at' is handled as datetime.
df_afd['created_at'] = pd.to_datetime(df_afd['created_at'])

# Restrict to tweets from April 2019.
df_afd_april_2019 = df_afd[
    (df_afd['created_at'] >= '2019-04-01') & (df_afd['created_at'] < '2019-05-01')
].copy()

# Tweets per calendar day. Reindex to every day of April so that days without
# any tweet are plotted as 0 instead of being skipped by the line chart.
df_afd_april_2019['date'] = df_afd_april_2019['created_at'].dt.date
april_days = pd.date_range('2019-04-01', '2019-04-30').date
afd_counts_april_2019 = (
    df_afd_april_2019.groupby('date').size().reindex(april_days, fill_value=0)
)

if df_afd_april_2019.empty:
    # Nothing to plot, and idxmax below would be meaningless.
    print('Keine Tweets mit dem Hashtag #AfD im April 2019 gefunden.')
else:
    # Line chart of the daily counts.
    plt.figure(figsize=(12, 6))
    afd_counts_april_2019.plot(kind='line', marker='o')
    plt.xlabel('Datum')
    plt.ylabel('Anzahl der Tweets')
    plt.title('Anzahl der Tweets mit dem Hashtag #AfD im April 2019')
    plt.grid(True)
    plt.show()

    # Day with the most tweets (the earliest one if several days tie).
    max_date = afd_counts_april_2019.idxmax()

    # Tweets posted on that day.
    df_max_date = df_afd_april_2019[df_afd_april_2019['date'] == max_date].copy()

    # Tweets per hour of that day; hours without tweets are shown as 0.
    # NOTE(review): hours are in the time zone of 'created_at' (presumably UTC) — confirm.
    df_max_date['hour'] = df_max_date['created_at'].dt.hour
    afd_counts_hour = df_max_date.groupby('hour').size().reindex(range(24), fill_value=0)

    # Line chart of the hourly counts.
    plt.figure(figsize=(12, 6))
    afd_counts_hour.plot(kind='line', marker='o')
    plt.xlabel('Uhrzeit')
    plt.ylabel('Anzahl der Tweets')
    plt.title(f'Anzahl der Tweets mit dem Hashtag #AfD am {max_date}')
    plt.grid(True)
    plt.xticks(range(0, 24, 2))  # show even hours only
    plt.show()


In [6]:
import os
import json
import pandas as pd
import re
from collections import Counter
import matplotlib.pyplot as plt

# Folder with the JSON tweet dumps.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
folder_path = '/Users/hamzabartl/Documents/DataSciencePy/pythonProject/recorded-tweets'

# Read at most `max_files` successfully parsed *.json files.
all_tweets = []
file_count = 0
max_files = 600

# os.listdir returns entries in arbitrary order; sorting makes the sample of
# `max_files` files the same on every run.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.json'):
        continue
    with open(os.path.join(folder_path, filename), 'r', encoding='utf-8', errors='ignore') as file:
        try:
            json_data = json.load(file)
        except json.JSONDecodeError:
            # Unreadable files are reported and do not count towards the limit.
            print(f"Fehler beim Lesen der Datei: {filename}")
            continue
    all_tweets.extend(json_data)
    file_count += 1
    if file_count >= max_files:
        break

# Flatten nested tweet objects into columns.
df = pd.json_normalize(all_tweets, sep='_')

# Drop tweets without text BEFORE casting to str: astype(str) turns a missing
# value into the literal string 'nan', which dropna would no longer remove.
df = df.dropna(subset=['text'])
df = df.assign(text=df['text'].astype(str))

# Remove retweets. Match 'RT @' rather than the bare prefix 'RT' so that
# original tweets that merely start with those letters (e.g. 'RTL ...') are kept.
# .copy() gives an independent frame so that later column assignments do not
# raise SettingWithCopyWarning.
df = df[~df['text'].str.startswith('RT @')].copy()

# Hashtag extraction
def extract_hashtags(text):
    """Return the hashtags in `text`, without the leading '#', in order of appearance."""
    return [match.group(1) for match in re.finditer(r'#(\w+)', text)]

df['hashtags'] = df['text'].apply(extract_hashtags)

# Count how often each hashtag occurs across all tweets.
# NOTE(review): counting is case-sensitive, so '#AfD' and '#afd' are separate
# entries — confirm that this is intended.
hashtags = Counter(tag for tags in df['hashtags'] for tag in tags)

# The 10 most frequent hashtags as (hashtag, count) pairs.
common_hashtags = hashtags.most_common(10)

if common_hashtags:
    labels, values = zip(*common_hashtags)

    plt.figure(figsize=(10, 6))
    plt.bar(labels, values)
    plt.xlabel('Hashtags')
    plt.ylabel('Häufigkeit')
    plt.title('Top 10 Hashtags')
    plt.show()
else:
    # zip(*[]) yields nothing to unpack, so an empty result is reported
    # instead of failing with a ValueError.
    print('Keine Hashtags gefunden.')

In [7]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Wayback Machine snapshot of the AfD Twitter profile.
archive_url = "https://web.archive.org/web/20190507094611/https://twitter.com/AfD"

# A timeout keeps the cell from hanging indefinitely if the archive does not respond.
response = requests.get(archive_url, timeout=30)
response.raise_for_status()  # raise if the request was not successful

soup = BeautifulSoup(response.text, 'html.parser')

# Every <div> carrying the CSS class 'tweet' is treated as one tweet.
tweets = soup.find_all('div', {'class': 'tweet'})

tweets_list = []

for tweet in tweets:
    # Look each child element up once; any missing element or attribute
    # yields an empty string instead of aborting the whole scrape.
    text_node = tweet.find('p', {'class': 'tweet-text'})
    timestamp_node = tweet.find('a', {'class': 'tweet-timestamp'})

    tweet_id = tweet.get('data-item-id', '')
    tweet_text = text_node.text if text_node is not None else ''
    tweet_date = timestamp_node.get('title', '') if timestamp_node is not None else ''
    username = tweet.get('data-screen-name', '')
    tweets_list.append([tweet_date, tweet_id, tweet_text, username])

tweets_df = pd.DataFrame(tweets_list, columns=['date', 'id', 'content', 'username'])
print(tweets_df.head())
# NOTE(review): the file name says "May4" while the snapshot timestamp in
# archive_url is 2019-05-07 — confirm which date is meant.
tweets_df.to_csv("archived_tweets_AfD_May4_2019.csv", index=False)
