In [1]:
import pandas as pd
from docx import Document

# Extract 1985 data from old report

Up to data year 2018 (i.e. report written in 2019), John Rune included data from 1985 in the tables in section 5 of the report (under the heading "Tilførsler av næringssalter til prioriterte kystavsnitt"). Note that the tables in sections 6 and 7 begin in 1990 - **it is only sections 5.1 to 5.12 that include extra data for 1985**.

I cannot find any raw data in NIVA's systems for 1985, but Miljødirektoratet would like the extra row included if possible. This notebook extracts the values from John Rune's 2017 report (written in 2018) and saves them as a CSV.

**These values should be used with caution as they are not reproducible**.

In [2]:
# Read the old Word report, pull the 1985 row out of each of the main tables
# and save the values as a CSV.
docx_path = r"../jse_data_1985/TEOTIL_JSE__utkast_2018.docx"
doc = Document(docx_path)

# Column names, in the order the cells are read from each table row
header = [
    "År",
    "Akvakultur",
    "Jordbruk",
    "Befolkning",
    "Industri",
    "Bakgrunn",
    "Totalt",
    "Menneskeskapt",
]

# Section names for sections 5.1 to 5.12 in blank template.
# NOTE: these are attached to the extracted rows by position, so the order here
# must match the order in which the tables appear in the document.
section_names = [
    "Norges kystområder: fosfor (tonn)",
    "Norges kystområder: nitrogen (tonn)",
    "Sverige – Strømtangen fyr: fosfor (tonn)",
    "Sverige – Strømtangen fyr: nitrogen (tonn)",
    "Indre Oslofjord: fosfor (tonn)",
    "Indre Oslofjord: nitrogen (tonn)",
    "Svenskegrensa – Lindesnes: fosfor (tonn)",
    "Svenskegrensa – Lindesnes: nitrogen (tonn)",
    "Lindesnes – Stad: fosfor (tonn)",
    "Lindesnes – Stad: nitrogen (tonn)",
    "Stad – Russland: fosfor (tonn)",
    "Stad – Russland: nitrogen (tonn)",
]

# Get table data from doc
data = []
for table in doc.tables:
    # Get just the main tables (anything with 10 rows or fewer is skipped)
    if len(table.rows) <= 10:
        continue

    # rows[1] is read as the first data row (rows[0] is presumably the column
    # header - confirm against the report). Surrounding whitespace is stripped
    # from every cell; str.strip() also removes non-breaking spaces ('\xa0'),
    # so cells holding only '\xa0' become '' and a year cell such as '1985 '
    # still matches below.
    row_data = [cell.text.strip() for cell in table.rows[1].cells]

    # Just the tables with 1985 in the first data row
    if row_data and row_data[0] == "1985":
        data.append(row_data)

df = pd.DataFrame.from_records(data, columns=header)

# One row is expected per section name; fail loudly if the document layout
# does not match, because the names are assigned purely by position.
assert len(df) == len(section_names), (
    f"Expected {len(section_names)} tables starting with 1985, found {len(df)}."
)
df["section_name"] = section_names

# Rename without 'inplace' so the cell produces the same result when re-run
df = df.rename(columns={"Befolkning": "Avløp"})

csv_path = r"../jse_data_1985/data_1985.csv"
df.to_csv(csv_path, index=False)
df

Unnamed: 0,År,Akvakultur,Jordbruk,Avløp,Industri,Bakgrunn,Totalt,Menneskeskapt,section_name
0,1985,361,,2687,601,1269,,,Norges kystområder: fosfor (tonn)
1,1985,1710,,22815,7901,54275,,,Norges kystområder: nitrogen (tonn)
2,1985,0,228.0,222,34,179,662.0,483.0,Sverige – Strømtangen fyr: fosfor (tonn)
3,1985,0,7526.0,2832,201,6377,16937.0,10560.0,Sverige – Strømtangen fyr: nitrogen (tonn)
4,1985,0,18.0,130,7,17,171.0,154.0,Indre Oslofjord: fosfor (tonn)
5,1985,0,474.0,3498,472,380,4824.0,4444.0,Indre Oslofjord: nitrogen (tonn)
6,1985,3,401.0,928,133,369,1834.0,1465.0,Svenskegrensa – Lindesnes: fosfor (tonn)
7,1985,12,14631.0,11929,5659,17660,49891.0,32231.0,Svenskegrensa – Lindesnes: nitrogen (tonn)
8,1985,358,,908,304,198,1768.0,1135.0,Lindesnes – Stad: fosfor (tonn)
9,1985,1698,,5650,717,13865,,,Lindesnes – Stad: nitrogen (tonn)
