In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import datetime
import seaborn as sns

In [3]:
DATA_FOLDER = 'data'

data_file = os.path.join(DATA_FOLDER, 'timeuse.csv')
df = pd.read_csv(data_file, sep=";")

print(df.head())

                         Activity  Time used hours,minutes Men 1979  \
0       01     Gainful employment                              4.00   
1  02     Travel to and from work                              0.18   
2             03     Housekeeping                              0.34   
3         04     Maintenance work                              0.27   
4      05     Other domestic work                              0.11   

   Time used hours,minutes Men 1987  Time used hours,minutes Men 1999  \
0                              4.09                              4.08   
1                              0.20                              0.19   
2                              0.35                              0.40   
3                              0.27                              0.25   
4                              0.12                              0.11   

   Time used hours,minutes Men 2009  Time used hours,minutes Women 1979  \
0                              3.07                        

In [6]:
activities = df["Activity"].unique()
print(activities)

['01     Gainful employment' '02     Travel to and from work'
 '03     Housekeeping' '04     Maintenance work'
 '05     Other domestic work' '06     Childcare'
 '07     Shopping and services' '08     Travel related to domestic work'
 '09     Sleep' '10     Meals' '11     Washing and dressing'
 '12     School or university' '13     Travel related to study'
 '14     Free time study (incl. related travel)'
 '15     Participatory activity' '16     Sports and outdoor activities'
 '17     Entertainment and culture' '18     Reading'
 '19     Listening to radio' '20     Watching television'
 '21     Socialising with family' '22     Socialising with friends'
 '23     Hobbies' '24     Other free time'
 '25     Travel related to free time']


In [8]:
genders = \
    np.unique([c_name.split(" ")[-2] 
               for c_name in df.columns 
               if c_name not in ("Activity", )])
print(genders)

['Men' 'Women']


In [10]:
year = \
    np.unique([c_name.split(" ")[-1] 
               for c_name in df.columns 
               if c_name not in ("Stage in family cycle", "Activity")])
print(year)

['1979' '1987' '1999' '2009']


In [12]:
act_names = [act.split("     ")[-1] for act in activities]
print(act_names)

['Gainful employment', 'Travel to and from work', 'Housekeeping', 'Maintenance work', 'Other domestic work', 'Childcare', 'Shopping and services', 'Travel related to domestic work', 'Sleep', 'Meals', 'Washing and dressing', 'School or university', 'Travel related to study', 'Free time study (incl. related travel)', 'Participatory activity', 'Sports and outdoor activities', 'Entertainment and culture', 'Reading', 'Listening to radio', 'Watching television', 'Socialising with family', 'Socialising with friends', 'Hobbies', 'Other free time', 'Travel related to free time']


In [13]:
old_act_names = activities.tolist()
print(old_act_names)

['01     Gainful employment', '02     Travel to and from work', '03     Housekeeping', '04     Maintenance work', '05     Other domestic work', '06     Childcare', '07     Shopping and services', '08     Travel related to domestic work', '09     Sleep', '10     Meals', '11     Washing and dressing', '12     School or university', '13     Travel related to study', '14     Free time study (incl. related travel)', '15     Participatory activity', '16     Sports and outdoor activities', '17     Entertainment and culture', '18     Reading', '19     Listening to radio', '20     Watching television', '21     Socialising with family', '22     Socialising with friends', '23     Hobbies', '24     Other free time', '25     Travel related to free time']


In [16]:
vi = ["gender", "year"]

new_df = pd.DataFrame(columns=vi + act_names)
    
for i, row in df.iterrows():
    act = row["Activity"]
    act_shortened = act_names[old_act_names.index(act)]
    for g in genders:
        for y in year:
            c_name = f"Time used hours,minutes {g} {y}"
            value = row[c_name]
            str_value = str(value)
            h, m = str_value.split(".")
            try:
                time = int(h) * 60 + int(m)
            except ValueError:
                time = np.nan
                
            is_g = new_df.gender == g
            is_y = new_df.year == y
            is_cond = new_df[is_g & is_y].index
            if len(is_cond):
                new_df.at[is_cond[0], act_shortened] = m

            else:
                new_df = new_df.append(
                    {
                        "gender": g,
                        "year": y,
                        act_shortened: m
                    }, ignore_index=True)

for act in act_names:
    new_df[act] = new_df[act].replace("", np.nan).astype("float")

new_df.to_csv(os.path.join(DATA_FOLDER, "clean_timeuse.csv"), index=False)

print(new_df.head())

  gender  year  Gainful employment  Travel to and from work  Housekeeping  \
0    Men  1979                 0.0                     18.0          34.0   
1    Men  1987                 9.0                      2.0          35.0   
2    Men  1999                 8.0                     19.0           4.0   
3    Men  2009                 7.0                     15.0          47.0   
4  Women  1979                53.0                     14.0          18.0   

   Maintenance work  Other domestic work  Childcare  Shopping and services  \
0              27.0                 11.0        8.0                   19.0   
1              27.0                 12.0        9.0                    2.0   
2              25.0                 11.0        9.0                   21.0   
3              22.0                 14.0       11.0                   22.0   
4               1.0                  1.0       24.0                   25.0   

   Travel related to domestic work  ...  Sports and outdoor activiti