In [1]:
import json
import requests
import pandas as pd
import matplotlib.pyplot as plt



In [2]:
def get_posts_with_keyword(keyword, sample_size):
    sample_size = abs(round(sample_size))
    URL = f'https://mastodon.social/api/v1/timelines/tag/{keyword}'
    params = {
        'limit': 40
    }
    
    results = []
    
    loop_max = round(sample_size / 40)
    index = 0
    while True:
        if index == loop_max:
            break
        
        r = requests.get(URL, params=params)
        toots = json.loads(r.text)
    
        if len(toots) == 0:
            break
        
        results.extend(toots)
        
        max_id = toots[-1]['id']
        params['max_id'] = max_id
    
        index += 1
        if index % 10 == 0:
            print('loop counter: ' + str(index) + ' of ' + str(loop_max))
        
    df = pd.DataFrame(results)
    return df

In [3]:
def keywords_occurrences_in_posts(dataframe, content_keyword_in_dataframe, keywords_to_search):
    contents = list(dataframe[content_keyword_in_dataframe])
    counter = 0
    for content in contents:
        content = content.lower().replace(" ", "").replace("-", "").replace("_", "")
        for keyword in keywords_to_search:
            keyword = keyword.replace(" ", "")
            if keyword in content:
                counter += 1
                break
    return counter / len(contents)

In [4]:
def keywords_occurrences_in_posts_for_two_dataframes(dataframe1, dataframe2, content_keyword_in_dataframe, keywords_to_search):
    contents1 = list(dataframe1[content_keyword_in_dataframe])
    counter1 = 0
    for content in contents1:
        content = content.lower().replace(" ", "").replace("-", "").replace("_", "")
        for keyword in keywords_to_search:
            keyword = keyword.replace(" ", "")
            if keyword in content:
                counter1 += 1
                break
    
    contents2 = list(dataframe2[content_keyword_in_dataframe])
    counter2 = 0
    for content in contents2:
        content = content.lower().replace(" ", "").replace("-", "").replace("_", "")
        for keyword in keywords_to_search:
            keyword = keyword.replace(" ", "")
            if keyword in content:
                counter2 += 1
                break
    
    return [counter1 / len(contents1), counter2 / len(contents2)]

In [5]:
def plot_occurrences(dataframe, title, content_keyword_in_dataframe, keywords_list_to_search):
    title_result = ""
    for title_word in title.split(" "):
        title_result += title_word.upper()[0] + title_word.lower()[1:]
        title_result += " "
    title = title_result.strip()

    keywords_occurrences = {}
    for keywords in keywords_list_to_search:
        keywords_occurrences[keywords[0].upper()[0] + keywords[0].lower()[1:]] = keywords_occurrences_in_posts(dataframe, content_keyword_in_dataframe, keywords)
    
    keywords_occurrences = {k: v for k, v in sorted(keywords_occurrences.items(), key=lambda item: item[1])}
    plt.bar(list(keywords_occurrences.keys()), list(keywords_occurrences.values()))
    
    plt.title('Keywords Occurrences in Percentages for Posts containing ' + title)
    plt.xlabel('Keywords')
    plt.ylabel('Occurrences in Percentages')
    plt.yscale('log')
    
    yticks = [0.020, 0.010, 0.005, 0.002, 0.001]
    plt.yticks(yticks, yticks)
    
    fig = plt.gcf()
    fig.set_size_inches(12, 6)
    fig.savefig('plot1.png', format='png', dpi=100)
    
    plt.show()
    plt.close()

In [6]:
def plot_occurrences_for_two_dataframes(dataframe1, dataframe2, title1, title2, content_keyword_in_dataframe, keywords_list_to_search):
    title1_result = ""
    for title1_word in title1.split(" "):
        title1_result += title1_word.upper()[0] + title1_word.lower()[1:]
        title1_result += " "
    title1 = title1_result.strip()
    
    plt.bar([title1.upper()[0] + title1.lower()[1:], title2.upper()[0] + title2.lower()[1:]], keywords_occurrences_in_posts_for_two_dataframes(dataframe1, dataframe2, content_keyword_in_dataframe, keywords_list_to_search))
    plt.title('Keywords Occurrences in Percentages for Posts containing ' + keywords_list_to_search[0].upper()[0] + keywords_list_to_search[0].lower()[1:])
    plt.xlabel('Keywords')
    plt.ylabel('Occurrences in Percentages')
    
    fig = plt.gcf()
    fig.set_size_inches(6, 6)
    fig.savefig('plot2.png', format='png', dpi=100)
    
    plt.show()
    plt.close()

In [7]:
coffee_sample_size = 10000
frankfurt_sample_size = 5000
munich_sample_size = 5000

dataframe_coffee = get_posts_with_keyword('coffee', coffee_sample_size)
dataframe_frankfurt = get_posts_with_keyword('frankfurt', frankfurt_sample_size)
dataframe_munich = get_posts_with_keyword('munich', munich_sample_size)

plot_occurrences(dataframe_coffee, 'Coffee', 'content', [['espresso'], ['caffe latte', 'coffee latte', 'kaffee latte'], ['black coffee', 'schwarzer kaffee'], ['mocha', 'mokka'], ['americano'], ['cappuccino'], ['flat white'], ['cafe au lait', 'milk coffee', 'milchkaffee'], ['macchiato'], ['iced coffee', 'eiskaffee']])
plot_occurrences_for_two_dataframes(dataframe_frankfurt, dataframe_munich, 'Frankfurt', 'München', 'content', ['coffee', 'kaffee'])

10 of 250
20 of 250
30 of 250
40 of 250
50 of 250
60 of 250
70 of 250
80 of 250
90 of 250
100 of 250
110 of 250
120 of 250
130 of 250
140 of 250
150 of 250
160 of 250
170 of 250
180 of 250
190 of 250
200 of 250
210 of 250
220 of 250
230 of 250
240 of 250
250 of 250
10 of 125
20 of 125
30 of 125
40 of 125
50 of 125


KeyError: -1