In [None]:
# Imports for parsing the intranet HTML extract.
# Each post's header (title) is read from the first <h1> tag of its element.
from bs4 import BeautifulSoup
import re
import pandas as pd
from numpy import nan

In [None]:
# Read the archived intranet page and turn every post into one row of
# (date, title, message).
# NOTE(review): no encoding is passed to open(), so the platform default is
# used — confirm the archive's encoding and pass it explicitly if needed.
with open("data/archiv.html", "r") as f:
    html_dump = f.read()

# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
# "html.parser" ships with Python; swap in "lxml" if that is what the archive
# was originally parsed with.
soup = BeautifulSoup(html_dump, "html.parser")

# Collect the rows in a list and build the frame once; appending to a
# DataFrame row by row re-allocates on every insert.
records = []

# find all elements with the class csc-default
for element in soup.find_all(class_='csc-default'):
    # extract the header from the element; skip elements that have no <h1>
    header = element.find('h1')
    if header is None:
        continue

    # The header is split at the first underscore into date and title.
    # Without an underscore the whole header is the title and the date is
    # unknown.
    date, separator, remainder = header.text.partition("_")
    if separator:
        title = remainder
    else:
        date, title = None, header.text

    # concatenate all paragraphs and collapse runs of whitespace
    message = " ".join(p.text for p in element.find_all('p'))
    message = re.sub(r'\s+', ' ', message).strip()

    records.append((date, title, message))

intranet_df = pd.DataFrame(records, columns=["date", "title", "message"])

In [None]:
# Show the first 25 parsed posts.
display(intranet_df.iloc[:25])

In [None]:
# sometimes the date can be found in the title
# The pattern is a raw string: "\w" and "\d" inside a normal string literal are
# invalid escape sequences and trigger a SyntaxWarning on recent Python.
missing_date = intranet_df["date"].isna()
intranet_df.loc[missing_date, "date"] = (
    intranet_df.loc[missing_date, "title"]
    .str.extract(r"^\w?(\d{2}\.\d{2}\.)", expand=False)
)
intranet_df.head(10)

In [None]:
# Rows that still have no date after looking at the title.
intranet_df[intranet_df["date"].isna()]

In [None]:
# add point after month (is missing in some cases)
# Keep only the leading "dd.mm" and append the trailing "." to every value.
# Missing dates stay missing. Plain string concatenation avoids building a
# helper frame whose index has to line up with intranet_df, and the raw string
# avoids invalid escape sequences in the pattern.
intranet_df["date"] = (
    intranet_df["date"].str.extract(r"^\w?(\d{2}\.\d{2})", expand=False) + "."
)
intranet_df.head(10)

In [None]:
# Infer the year of every post. The dates only carry day and month, so the
# loop walks the rows in order, starting at NEWEST_YEAR, and decrements the
# year when the months of the neighbouring rows indicate a year boundary.
# NOTE(review): this presumes the rows are ordered newest first — confirm.
NEWEST_YEAR = 2024  # year assigned to the first row
OLDEST_YEAR = 2020  # the last entry is from 2020 but has no successor


def _month_at(dates, idx):
    """Return the month of the "dd.mm." string stored at label ``idx``.

    Parameters
    ----------
    dates : pd.Series
        Series of date strings; missing dates are None/NaN.
    idx : int
        Index label to look up.

    Returns
    -------
    int or None
        The month as an integer, or None when ``idx`` is not a label of
        ``dates`` or the entry is not a string (missing date).
    """
    if idx not in dates.index:
        return None
    value = dates.loc[idx]
    if not isinstance(value, str):
        return None
    return int(value.split(".")[1])


temp_df = pd.DataFrame(columns=["date", "year"])
temp_df["date"] = intranet_df.date

year = NEWEST_YEAR
temp_df.loc[0, "year"] = NEWEST_YEAR
for i in range(1, len(intranet_df) - 1):
    # rows without a date keep a missing year
    month = _month_at(intranet_df.date, i)
    if month is None:
        continue

    # Nearest dated precursor: one row back, otherwise two rows back.
    # Labels outside the frame count as "no precursor" instead of raising.
    precursor_month = None
    precursor_year_available = False
    for idx in (i - 1, i - 2):
        candidate = _month_at(intranet_df.date, idx)
        if candidate is not None:
            precursor_month = candidate
            precursor_year_available = bool(pd.notna(temp_df.loc[idx, "year"]))
            break

    # Nearest dated successor: one row ahead, otherwise two rows ahead.
    successor_month = None
    for idx in (i + 1, i + 2):
        candidate = _month_at(intranet_df.date, idx)
        if candidate is not None:
            successor_month = candidate
            break

    if precursor_month:
        if month > precursor_month:
            if not successor_month:
                print("problem with row", i, " month > precursor but no successor available")
                continue
            # The month jumps up relative to the precursor. Count it as a year
            # boundary only when the successor shares the month and the
            # precursor already has a year.
            if month == successor_month and precursor_year_available:
                year -= 1
        elif successor_month and successor_month == precursor_month and month != precursor_month:
            # sometimes the date is not correct: precursor and successor agree
            # with each other but not with this row, so leave its year missing
            continue

    temp_df.loc[i, "year"] = year

# the last entry is from 2020 but has no successor
temp_df.loc[len(intranet_df) - 1, "year"] = OLDEST_YEAR

In [None]:
# Join "dd.mm." with the inferred year and parse the result. Rows without a
# year get a missing concat_date and receive no timestamp here.
has_year = temp_df["year"].notna()
temp_df["concat_date"] = temp_df["date"].str.cat(temp_df["year"].astype(str))
temp_df.loc[~has_year, "concat_date"] = nan
temp_df.loc[has_year, "timestamp"] = pd.to_datetime(
    temp_df.loc[has_year, "concat_date"],
    format="%d.%m.%Y",
)
temp_df.head()

In [None]:
# Number of rows without a concatenated date.
temp_df["concat_date"].isna().sum()

In [None]:
# Impute the timestamp of every row without a year as the mean of the
# timestamps of the rows directly before and after it.
rows_without_year = temp_df.index[temp_df["year"].isna()]
for i in rows_without_year:
    neighbours = [i - 1, i + 1]
    try:
        temp_df.loc[i, "timestamp"] = temp_df.loc[neighbours, "timestamp"].mean()
    except KeyError:
        # a neighbouring label does not exist; leave this row untouched
        continue

In [None]:
# Inspect rows 14 to 23 (positional slice).
temp_df.iloc[14:24]

In [None]:
# Inspect the last ten rows.
temp_df.iloc[-10:]

In [None]:
# Sanity check: both frames must have the same number of rows.
temp_df.shape[0] == intranet_df.shape[0]

In [None]:
# Attach the final date and a flag telling whether it was imputed (no
# concatenated date was available), then drop the raw "date" column.
# Built as a new frame with errors="ignore" so the cell can be re-run: the
# in-place drop raised KeyError on a second run because "date" was already gone.
intranet_df = (
    intranet_df
    .assign(
        timestamp=temp_df["timestamp"].dt.date,
        timestamp_imputed=temp_df["concat_date"].isna(),
    )
    .drop(columns="date", errors="ignore")
)

In [None]:
# Inspect the last twenty rows of the final frame.
intranet_df.iloc[-20:]

In [None]:
# Write the result as a tab-separated UTF-8 file without the index column.
intranet_df.to_csv(
    "data/intranet_data.tsv",
    sep="\t",
    encoding="utf-8",
    index=False,
)