# Notebook 1 - preprocessing

## Intro

This Jupyter notebook contains part of the modeling behind the publication:
> Krych, K. & Pettersen, JB. (2024). Long-term lifetime trends of large appliances since the introduction in Norwegian households. Journal of Industrial Ecology. 

Here, we preprocess the data, e.g., derive complete time series based on data points through interpolation and regression. It is not necessary to run this notebook, unless the raw data has been changed, or processed data has been deleted. 

## Imports and parameter definition

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.interpolate import interp1d
import scipy.stats
from scipy.optimize import curve_fit

In [None]:
# Appliance categories handled in this notebook
durables = [
    'fridge & fridge freezer',
    'freezer',
    'washing machine',
    'tumble dryer',
    'dishwasher',
    'oven',
]
# First and last model year (both inclusive) and the resulting list of years
TimeStart, TimeEnd = 1940, 2022
MyYears = [*range(TimeStart, TimeEnd + 1)]
# Absolute path of the workbook, looked up in the current working directory
excel = os.path.abspath('data.xlsx')
# When True, the cells below write their results back into the workbook
overwrite = True

In [None]:
def df_to_excel_overlay(excel, df, sheet_name):
    """Write the values of ``df`` into an existing workbook, below the header row.

    The workbook is opened in append mode with ``if_sheet_exists="overlay"``,
    so the data is written on top of the current contents of ``sheet_name``
    instead of replacing the sheet. Writing starts at the second row
    (``startrow=1``); neither the index nor the column names of ``df`` are
    written.

    Parameters
    ----------
    excel : path of the workbook to write to
    df : DataFrame whose values are written
    sheet_name : name of the target sheet
    """
    # Using the writer as a context manager guarantees that it is closed (and
    # its file handle released) even when to_excel raises.
    with pd.ExcelWriter(excel, mode='a', if_sheet_exists="overlay", engine='openpyxl') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False, startrow=1, header=False)

## Inflows of appliances (I) - interpolation

In [None]:
# Raw data points for the appliance inflows, read from sheet 'I_data' of the workbook
df_i_data = pd.read_excel(excel, sheet_name='I_data')

In [None]:
def perform_inflows_interpolation(df_d, durable, MyYears):
    """Linearly interpolate the data points of one durable over ``MyYears``.

    Before interpolating, the series is padded so that it spans the whole
    period:

    * if the data starts after ``MyYears[0]``, zeros are added at
      ``MyYears[0]`` and at the year just before the first data point;
    * if the data ends before ``MyYears[-1]``, the value of the last data
      year is repeated at ``MyYears[-1]``.

    Several rows for the same year are averaged.

    Parameters
    ----------
    df_d : DataFrame with the columns 'time' and 'value' for a single durable
    durable : name of the durable, stored in the 'durable' column of padding rows
    MyYears : ordered sequence of years at which the series is evaluated

    Returns
    -------
    numpy array with one interpolated value per entry of ``MyYears``

    Raises
    ------
    ValueError
        If ``df_d`` contains no rows.
    """
    if df_d.empty:
        raise ValueError(f"no inflow data points available for durable {durable!r}")

    min_year = df_d['time'].min()  # the first year with data
    max_year = df_d['time'].max()  # the last year with data

    padding = []
    if min_year > MyYears[0]:
        # No inflows before the first data point
        padding.append({"time": MyYears[0], "durable": durable, "value": 0})
        padding.append({"time": min_year - 1, "durable": durable, "value": 0})
    if max_year < MyYears[-1]:
        # The last year may have several rows (they are averaged below), so
        # take their mean instead of .item(), which requires exactly one row.
        last_value = df_d.loc[df_d['time'] == max_year, 'value'].mean()
        padding.append({"time": MyYears[-1], "durable": durable, "value": last_value})
    if padding:
        df_d = pd.concat([df_d, pd.DataFrame(padding)], ignore_index=True)

    # One value per year: rows sharing a year are averaged
    df_d = df_d.groupby('time').mean(numeric_only=True).reset_index()
    f_linear = interp1d(df_d['time'], df_d['value'])
    return f_linear(MyYears)

In [None]:
# Empty result table: one row per (durable, year) pair and a single 'value' column
df_i_ip = pd.DataFrame(index=pd.MultiIndex.from_product([durables, MyYears], names=['durable','time']), columns=['value'])
for durable in durables:
    # Data points belonging to this durable only
    df_d = df_i_data[df_i_data['durable'] == durable]
    # Fill all rows of this durable with its interpolated series (one value per year in MyYears)
    df_i_ip.loc[durable,:] = perform_inflows_interpolation(df_d, durable, MyYears)
# Turn the (durable, time) index back into ordinary columns
df_i_ip = df_i_ip.reset_index()

In [None]:
# Write the interpolated inflows to sheet 'I' of the workbook when overwriting is enabled
if overwrite:
    df_to_excel_overlay(excel,df_i_ip,'I')

## People per dwelling (PpD) - regression

In [None]:
# Raw data points for people per dwelling, read from sheet 'PpD_data' of the workbook
df_ppd_data = pd.read_excel(excel, sheet_name='PpD_data')

In [None]:
def logistic(x, ti, a, C0, C1):
    """
    Four-parameter logistic curve evaluated at x.

    For a positive slope the curve goes from C0 (far below ti) to C1 (far
    above ti) and passes through the midpoint of C0 and C1 at x = ti.

    x: vector of observation points (time)
    ti: inflection time
    a: slope
    C0: start value
    C1: end value
    """
    span = C1 - C0
    decay = np.exp(-a * (x - ti))
    return span / (1 + decay) + C0

In [None]:
# Box constraints of the fit, in the parameter order of logistic(): ti, a, C0, C1
lower_bounds = [1900, 0, 5, 0]
higher_bounds = [2100, 1, 10, 10]
popt, pcov = curve_fit(
    logistic,
    df_ppd_data['time'],
    df_ppd_data['value'],
    bounds=(lower_bounds, higher_bounds),
)
# Evaluate the fitted curve for every model year
df_ppd_rg = pd.DataFrame({'time': MyYears, 'value': logistic(MyYears, *popt)})
df_ppd_rg

In [None]:
# Write the fitted people-per-dwelling series to sheet 'PpD' when overwriting is enabled
if overwrite:
    df_to_excel_overlay(excel,df_ppd_rg,'PpD')

## Inflows of cabins (C) - interpolation

In [None]:
# Cabin data points from columns A:D of sheet 'C_data'
df_c_data = pd.read_excel(excel, sheet_name='C_data', usecols="A:D")
# The cabin series starts in 1900, i.e. earlier than the model years
MyYears_extended = list(range(1900, TimeEnd + 1))
# Linear interpolation between the data points
f_cabins = interp1d(df_c_data['time'].tolist(), df_c_data['value'].tolist())
df_c_ip = pd.DataFrame({'time': MyYears_extended, 'value': f_cabins(MyYears_extended)})
df_c_ip

In [None]:
# Write the interpolated cabin series to sheet 'C' when overwriting is enabled
if overwrite:
    df_to_excel_overlay(excel,df_c_ip,'C')

## Share of cabins electrified (SoCE) - regression

In [None]:
# Data points for the share of cabins electrified, from columns A:D of sheet 'SoCE_data'
df_soce = pd.read_excel(excel, sheet_name='SoCE_data', usecols="A:D")
def linear(x, a, b):
    """Straight line with slope a and intercept b, evaluated at x."""
    scaled = a * x
    return scaled + b
# Fit a straight line to the logarithm of the share (in percent), i.e. an
# exponential curve on the original scale.
x_data, y_data = df_soce['time'], df_soce['value']
with np.errstate(divide='ignore'): # to ignore "RuntimeWarning: divide by zero encountered in..."
    y_data_log = np.log(y_data*100)
# log(0) gives -inf, which cannot be fitted; such points are set to 0 on the log scale
y_data_log[y_data_log == -np.inf] = 0
popt, pcov = curve_fit(linear, x_data, y_data_log)
model_years = np.array(MyYears)
soce = np.exp(popt[0]*model_years+popt[1])/100
soce[soce >1] = 1  # a share cannot exceed 100 %
# No electrified cabins before 1960. Selecting by year (instead of slicing with
# 1960-TimeStart) stays correct when TimeStart is later than 1960, where the
# slice stop would become negative and zero almost the whole series.
soce[model_years < 1960] = 0
soce[soce <0] = 0
df_soce_rg = pd.DataFrame(data=soce, index=pd.MultiIndex.from_product([MyYears], names=['time']), columns=['value'])
df_soce_rg = df_soce_rg.reset_index()
df_soce_rg

In [None]:
# Write the fitted share of cabins electrified to sheet 'SoCE' when overwriting is enabled
if overwrite:
    df_to_excel_overlay(excel,df_soce_rg,'SoCE')

## The number of dwellings

The number of dwellings is calculated using various model parameters. During the uncertainty analysis, the values of these parameters might vary, which is why in the preliminary analysis, the number of dwellings is calculated for each model run. However, in the simplified analysis, where many of the parameters are considered fixed, the number of dwellings is also a fixed parameter. Pre-calculating the number of dwellings reduces the computation time in the final analysis.

In [None]:
# Load the sheets needed by the dwelling calculation below
df_ppd = pd.read_excel(excel, sheet_name='PpD')  # people per dwelling
df_p = pd.read_excel(excel, sheet_name='P')  # divided by people per dwelling below; presumably population - confirm
df_c = pd.read_excel(excel, sheet_name='C')  # cabins
df_soce = pd.read_excel(excel, sheet_name='SoCE')  # share of cabins electrified
df_k_cab = pd.read_excel(excel, sheet_name='k-cab')  # first value is used below as the Weibull shape parameter
df_l_cab = pd.read_excel(excel, sheet_name='lambda-cab')  # first value is used below as the Weibull scale parameter

In [None]:
# Dwellings derived from sheets 'P' and 'PpD': P divided by people per dwelling.
# NOTE(review): the division pairs rows by index, so both sheets are assumed
# to list the same years in the same order - confirm.
dwellings = np.array(df_p['value']/df_ppd['value'])

# Cabin sub-model: `s` is the yearly series from sheet 'C' that the surviving
# cohorts have to add up to; cohort survival follows a Weibull distribution.
t = df_c['time']
s = df_c['value']
scale = df_l_cab['value'].values[0]
shape = df_k_cab['value'].values[0]
# sf[y, m]: share of cohort m still surviving in year y (zero before the cohort exists)
sf = np.zeros((len(t), len(t)))
for m in range(0, len(t)):  # cohort index
    sf[m::,m] = scipy.stats.weibull_min.sf(np.arange(0,len(t)-m), c=shape, loc = 0, scale=scale)

# MFA calculations start (assuming sf[0] != 0 and no negative inflows)
i = np.zeros(len(t))  # inflow per year
s_c = np.zeros((len(t), len(t)))  # s_c[y, m]: amount of cohort m present in year y
i[0] = s[0] / sf[0, 0]
s_c[:, 0] = i[0] * sf[:, 0]
for m in range(1, len(t)):
    # Inflow needed so that, together with the survivors of earlier cohorts, the total equals s[m]
    i[m] = (s[m] - s_c[m, :].sum()) / sf[m,m]
    s_c[m::, m] = i[m] * sf[m::, m]

o_c = np.zeros_like(s_c)
o_c[1::,:] = -1 * np.diff(s_c,n=1,axis=0)
o_c[np.diag_indices(len(t))] = i - np.diag(s_c) # allow for outflow in year 0 already

# The cabin series starts earlier than the model years; derive how many leading
# years/cohorts have to be dropped instead of hard-coding 40 (= 1940 - 1900).
cabin_offset = TimeStart - int(t.iloc[0])
if not 0 <= cabin_offset < len(t):
    raise ValueError("sheet 'C' must start no later than TimeStart and reach beyond it")

# Work on a copy so that df_soce itself is left untouched
soce = df_soce['value'].copy() # share of cabins electrified
soce[soce >1] = 1
# Select the years before 1960 by value rather than with the slice
# [:1960-TimeStart], whose stop turns negative when TimeStart is later than 1960.
soce[np.array(MyYears) < 1960] = 0
soce[soce <0] = 0
# Weight every cohort from TimeStart onwards by its share and sum over cohorts
el_cabins = np.einsum('tc,c->t',s_c[cabin_offset:,cabin_offset:],soce)
all_dwellings = dwellings+el_cabins

In [None]:
# One row per model year with the total number of dwellings and its metadata
df_d = pd.DataFrame({
    'time': MyYears,
    'value': all_dwellings,
    'unit': 'dwellings',
    'source': 'calculated using a dwelling sub-model',
})

In [None]:
# Write the pre-calculated number of dwellings to sheet 'D' when overwriting is enabled
if overwrite:
    df_to_excel_overlay(excel,df_d,'D')