# agof Use Case

In this use case, we will fetch, process and extract some preliminary results from the 'daily digital facts' market media study by agof (Arbeitsgemeinschaft Online Forschung).

This notebook is divided into three parts:
1. Downloading Raw Data
2. Processing Downloaded Data
3. Extracting Facts

You are encouraged to play around with it. The blocks with comments beginning with `# curiosity: ...` or `# sanity check: ...` have been included to further your understanding of the logic. These blocks are not vital to the core logic of this use case and can therefore be skipped when you translate the notebook to core4 jobs.

Lines of code which produce a large output have been commented out with three hash signs (`###`) for better readability. You can uncomment them in order to run them.

### 1. Downloading Raw Data

In [None]:
# fetching the webpage

import requests

url = "https://www.agof.de/service-downloads/downloadcenter/download-daily-digital-facts/"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops us from scraping an HTTP error page by mistake.
rv = requests.get(url, timeout=30)
rv.raise_for_status()
body = rv.content.decode("utf-8")

In [None]:
# curiosity: how is the content we fetched stored by python?

type(body)

In [None]:
# sanity check: have we fetched the right thing?

### body

In [None]:
# scraping the fetched content

from bs4 import BeautifulSoup

# parse the downloaded page with the "html.parser" parser
soup = BeautifulSoup(body, "html.parser")
# note: despite its name, tables_list holds every table *row* (<tr>) of the page
tables_list = soup.find_all("tr")

In [None]:
# sanity check: have we scraped correctly? (1/2)

tables_list[1]

In [None]:
# sanity check: have we scraped correctly? (2/2)

tables_list[1].text

In [None]:
# isolating relevant links from the list of scraped html table rows (<tr>...</tr>)

# two-stage filter: rows mentioning "Angebote Ranking" first, then the subset
# of those rows that also mentions "xlsx"
links = [row for row in tables_list if "Angebote Ranking" in row.text]
links_list = [row for row in links if "xlsx" in row.text]

In [None]:
# sanity check: did we isolate the right links?

str(links_list[0])

In [None]:
# using regular expressions to extract the link from each string in the list

import re
re.findall("href=[\"\'](.+?)[\"\']", str(links_list[0]))

In [None]:
# take the first href target found in the HTML of every selected table row
href_re = re.compile("href=[\"\'](.+?)[\"\']")
xls = [href_re.findall(str(row))[0] for row in links_list]

In [None]:
# sanity check: does our list of links look right?

# xls

In [None]:
xls[0]

In [None]:
len(xls)

In [None]:
# process the first item, then create a loop to process a couple more
rv = requests.get(xls[0], timeout=30)
# fail right here on an HTTP error instead of saving an error page as .xlsx
rv.raise_for_status()

In [None]:
# the context manager guarantees the file is flushed and closed before
# pandas opens it again in a later cell
with open("/tmp/test.xlsx", "wb") as fh:
    fh.write(rv.content)

In [None]:
import pandas as pd
#df2 = pd.read_excel("/tmp/test.xlsx", skiprows = 8) #- also possible but we lose the metadata

In [None]:
# read data without skipping rows and find out the start
df = pd.read_excel("/tmp/test.xlsx", header=None)

In [None]:
df.shape

In [None]:
df.head()

In [None]:
# Rows 0-6 of the raw sheet are read as metadata: column 0 holds a label and
# column 1 the matching value. Each assert checks the label before its value
# is taken, so a sheet with a different layout fails here instead of silently
# producing mislabelled metadata.
assert df.iloc[0, 0] == "Analyse"
analyse = df.iloc[0, 1]
assert df.iloc[1, 0] == "Grundgesamtheit"
grundgesamtheit = df.iloc[1, 1]
assert df.iloc[2, 0] == "Zeitraum"
zeitraum = df.iloc[2, 1]
assert df.iloc[3, 0] == "Vorfilter"
vorfilter = df.iloc[3, 1]
# row 4 is read without a label check; presumably the case count (Fallzahl)
# belonging to the Vorfilter row -- TODO confirm against a real sheet
vorfilter_fallzahl = df.iloc[4, 1]
assert df.iloc[5, 0] == "Zielgruppe"
zielgruppe = df.iloc[5, 1]
# row 6 is likewise unchecked; presumably the case count of the Zielgruppe
zielgruppe_fallzahl = df.iloc[6, 1]

In [None]:
# Scan column 0 (from row 7 on) for the "Basis" row, which marks the start of
# the data. The scan is bounded by the sheet length as well as by the 1000-row
# cap, so a sheet without a "Basis" row raises the intended RuntimeError
# instead of an IndexError from reading past the last row.
ln = 7
search_end = min(len(df), 1001)
while ln < search_end and df.iloc[ln, 0] != "Basis":
    ln += 1
if ln >= search_end:
    raise RuntimeError("failed to identify start of data")

In [None]:
ln

In [None]:
dframe = df.iloc[ln:].copy()

In [None]:
dframe.head()

In [None]:
# everything from the "Basis" row downwards is kept as data; the row directly
# above it supplies the column captions
dframe = df.iloc[ln:].copy()
cols = list(df.iloc[ln-1])
cols[0] = "Titel"
# captions: empty cells become "", line breaks become spaces and dots are
# removed; str() keeps a non-text caption (e.g. a number) from crashing on
# .replace
dframe.columns = ["" if pd.isnull(c)
                  else str(c).replace("\n", " ").replace(".", "") for c in cols]
# drop every column that has no caption (re-binding instead of inplace=True)
if "" in dframe.columns:
    dframe = dframe.drop(columns=[""])
# attach the sheet-level metadata to every row
dframe["Analyse"] = analyse
dframe["Grundgesamtheit"] = grundgesamtheit
dframe["Zeitraum"] = zeitraum
dframe["Vorfilter"] = vorfilter
dframe["Zielgruppe"] = zielgruppe

In [None]:
dframe.head()

In [None]:
dframe.Zeitraum.value_counts()

In [None]:
def process(df):
    """Turn one raw ranking sheet into a data frame with metadata columns.

    The sheet is expected in the form produced by ``pd.read_excel(...,
    header=None)``: metadata label/value pairs in columns 0 and 1 of the
    first rows, then a caption row, then the data starting at the row whose
    first cell is ``"Basis"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw sheet content, addressed purely by position.

    Returns
    -------
    pandas.DataFrame
        The rows from the ``"Basis"`` row downwards. Column names are the
        cleaned captions of the row above ``"Basis"`` (first column renamed
        to ``"Titel"``, line breaks replaced by spaces, dots removed),
        columns without a caption are dropped, and the columns ``Analyse``,
        ``Grundgesamtheit``, ``Zeitraum``, ``Vorfilter`` and ``Zielgruppe``
        are appended with the sheet's metadata values.

    Raises
    ------
    AssertionError
        If a metadata label is not found in its expected row.
    RuntimeError
        If no ``"Basis"`` row exists between row 7 and row 1000 (or the end
        of the sheet, whichever comes first).
    """
    # (row, label) pairs of the metadata block; values sit in column 1
    meta_rows = (
        (0, "Analyse"),
        (1, "Grundgesamtheit"),
        (2, "Zeitraum"),
        (3, "Vorfilter"),
        (5, "Zielgruppe"),
    )
    meta = {}
    for row, label in meta_rows:
        assert df.iloc[row, 0] == label, \
            "expected %r in row %d of the sheet" % (label, row)
        meta[label] = df.iloc[row, 1]

    # Bound the search by the sheet length so that a missing "Basis" row
    # raises RuntimeError rather than an IndexError from iloc.
    max_row = 1000
    search_end = min(len(df), max_row + 1)
    ln = 7
    while ln < search_end and df.iloc[ln, 0] != "Basis":
        ln += 1
    if ln >= search_end:
        raise RuntimeError("failed to identify start of data")

    d = df.iloc[ln:].copy()
    cols = list(df.iloc[ln - 1])
    cols[0] = "Titel"
    # str() lets non-text captions (numbers, dates) pass through .replace
    d.columns = ["" if pd.isnull(c)
                 else str(c).replace("\n", " ").replace(".", "") for c in cols]
    if "" in d.columns:
        d = d.drop(columns=[""])
    # dict order equals meta_rows order, so the column order is stable
    for label, value in meta.items():
        d[label] = value
    return d
    

In [None]:
# Download and process the next reports (at most 30 in total, fewer if the
# page lists fewer links). The first report is already available as dframe.
frames = [dframe]
for i in range(1, min(30, len(xls))):
    rv = requests.get(xls[i], timeout=30)
    rv.raise_for_status()  # do not save an HTTP error page as a workbook
    path = "/tmp/test"+str(i)+".xlsx"
    # context manager: the file is closed before pandas reads it
    with open(path, "wb") as fh:
        fh.write(rv.content)
    # a separate name keeps the global df (the first sheet) intact, so the
    # step-by-step cells above stay re-runnable
    raw_sheet = pd.read_excel(path, header=None)
    frames.append(process(raw_sheet))

fin_df = pd.concat(frames)

In [None]:
fin_df.shape

In [None]:
fin_df.Zeitraum.value_counts()

In [None]:
import datetime
# German month names mapped to their zero-padded month number ("01".."12")
MONAT = {
    name: "{:02d}".format(number)
    for number, name in enumerate(
        ["Januar", "Februar", "März", "April", "Mai", "Juni", "Juli",
         "August", "September", "Oktober", "November", "Dezember"],
        start=1,
    )
}


def _zeitraum_parts(label):
    """Remove the "Letzter Monat (" prefix and ")" and split on whitespace."""
    return label.replace("Letzter Monat (", "").replace(")", "").split()


monat = fin_df.Zeitraum.apply(_zeitraum_parts)

In [None]:
type(monat)

In [None]:
monat.head()

In [None]:
fin_df["Monat"] = [datetime.datetime.strptime("01." + MONAT[m[0]] + "." + m[1], "%d.%m.%Y") for m in monat]

In [None]:
# pd.to_numeric works on the whole column at once; anything that cannot be
# parsed as a number becomes NaN (errors="coerce")
fin_df["val"] = pd.to_numeric(fin_df["Kontakte Mio"], errors="coerce")
# ISO date string (YYYY-MM-DD) of the month, used as grouping key for the plots
fin_df["Date"] = fin_df.Monat.dt.strftime("%Y-%m-%d")

In [None]:
%matplotlib notebook
import matplotlib.pyplot as plt

In [None]:
import numpy as np
# NaN is replaced by 0 in *every* column; the later cells rely on this when
# they filter with `Medientyp != 0`
fin_df = fin_df.replace(np.nan, 0)
g = fin_df.groupby(["Date"]).val.sum()
# Series.plot draws on the current axes when a figure is already open, so an
# explicit figure/axes pair guarantees this chart gets its own figure
fig, ax = plt.subplots()
g.plot.bar(ax=ax)
ax.set_ylabel("Contacts")
fig.tight_layout()

In [None]:
fin_df.columns

In [None]:
fin_df.Medientyp.unique()

In [None]:
%matplotlib notebook
# rows whose Medientyp is 0 are excluded from the per-media-type totals
df_new = fin_df[fin_df.Medientyp != 0]
g1 = df_new.groupby(["Medientyp"]).val.sum()
# explicit figure/axes: Series.plot would otherwise reuse the current axes
# and could draw into the figure of an earlier cell
fig, ax = plt.subplots()
g1.plot.bar(ax=ax, rot=0)  # rot=0 keeps the tick labels horizontal
ax.set_ylabel("Contacts")
fig.tight_layout()

In [None]:
list(g1.index)

In [None]:
%matplotlib notebook

has_media_type = fin_df.Medientyp != 0
df_new = fin_df[has_media_type]

# Monthly contacts for each media group: one row per Date, one column per
# Medientyp
contacts = df_new.groupby(["Date","Medientyp"]).val.sum()
g1 = contacts.unstack()
# contact of different media group per month
# g1 = df_new.groupby(["Date","Medientyp"]).val.sum().unstack(0)

plt.rcParams["figure.figsize"] = [7,7]
# DataFrame.plot.bar returns the Axes it drew on; label it directly
ax = g1.plot.bar(rot=45)
ax.set_ylabel("Contacts")
ax.legend(fontsize='small')
plt.tight_layout()