In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import datetime
import seaborn as sns

In [16]:
DATA_FOLDER = 'data'

data_file = os.path.join(DATA_FOLDER, 'stageinfamilycycle.csv')
df = pd.read_csv(data_file, sep=";")

print(df.head())

                    Activity  \
0  01     Gainful employment   
1  01     Gainful employment   
2  01     Gainful employment   
3  01     Gainful employment   
4  01     Gainful employment   

                               Stage in family cycle Men 1987-1988  \
0  Aged under 45, single, no children, living wit...          2.22   
1  Aged under 45, single, no children, living els...          4.18   
2                                      Single parent             .   
3  Aged under 45, married or cohabiting, no children          5.08   
4     Married or cohabiting, youngest child aged 0-6          5.21   

  Men 1999-2000 Men 2009-2010  Women 1987-1988  Women 1999-2000  \
0          1.16          0.56             1.10             0.27   
1          3.32          3.14             3.40             3.05   
2             .             .             3.35             2.39   
3          4.19          3.33             4.01             3.10   
4          5.14          4.54             2.33     

In [17]:
activities = df["Activity"].unique()
print(activities)

['01     Gainful employment' '02     Travel to and from work'
 '03     Housekeeping' '04     Maintenance work'
 '05     Other domestic work' '06     Childcare'
 '07     Shopping and services' '08     Travel related to domestic work'
 '09     Sleep' '10     Meals' '11     Washing and dressing'
 '12     School or university' '13     Travel related to study'
 '14     Free time study (incl. related travel)'
 '15     Participatory activity' '16     Sports and outdoor activities'
 '17     Entertainment and culture' '18     Reading'
 '19     Listening to radio' '20     Watching television'
 '21     Socialising with family' '22     Socialising with friends'
 '23     Hobbies' '24     Other free time'
 '25     Travel related to free time']


In [18]:
stages = df["Stage in family cycle"].unique()
print(stages)

['Aged under 45, single, no children, living with parents'
 'Aged under 45, single, no children, living elsewhere' 'Single parent'
 'Aged under 45, married or cohabiting, no children'
 'Married or cohabiting, youngest child aged 0-6'
 'Married or cohabiting, youngest child aged 7-17'
 'Aged over 45, married or cohibiting, no children'
 'Aged over 45, single, no children']


In [19]:
genders = \
    np.unique([c_name.split(" ")[0] 
               for c_name in df.columns 
               if c_name not in ("Stage in family cycle", "Activity")])
print(genders)

['Men' 'Women']


In [29]:
years = \
    np.unique([c_name.split(" ")[1] 
               for c_name in df.columns 
               if c_name not in ("Stage in family cycle", "Activity")])
print(years)

['1987-1988' '1999-2000' '2009-2010']


In [52]:
vi = ["stage", "gender", "years"]
act_names = [act.split("     ")[-1] for act in activities]
old_act_names = activities.tolist()
new_df = pd.DataFrame(columns=vi + act_names)
    
for i, row in df.iterrows():
    stage = row["Stage in family cycle"]
    act = row["Activity"]
    act_shortened = act_names[old_act_names.index(act)]
    for g in genders:
        for y in years:
            c_name = f"{g} {y}"
            value = row[c_name]
            str_value = str(value)
            h, m = str_value.split(".")
            try:
                time = int(h) * 60 + int(m)
            except ValueError:
                time = np.nan
                
            is_st = new_df.stage == stage
            is_g = new_df.gender == g
            is_y = new_df.years == y
            is_cond = new_df[is_st & is_g & is_y].index
            if len(is_cond):
                new_df.at[is_cond[0], act_shortened] = m

            else:
                new_df = new_df.append(
                    {
                        "stage": stage,
                        "gender": g,
                        "years": y,
                        act_shortened: m
                    }, ignore_index=True)

for act in act_names:
    new_df[act] = new_df[act].replace("", np.nan).astype("float")

new_df.to_csv(os.path.join(DATA_FOLDER, "clean_stageinfamilycycle.csv"), index=False)

print(new_df.head())

                                               stage gender      years  \
0  Aged under 45, single, no children, living wit...    Men  1987-1988   
1  Aged under 45, single, no children, living wit...    Men  1999-2000   
2  Aged under 45, single, no children, living wit...    Men  2009-2010   
3  Aged under 45, single, no children, living wit...  Women  1987-1988   
4  Aged under 45, single, no children, living wit...  Women  1999-2000   

   Gainful employment  Travel to and from work  Housekeeping  \
0                22.0                     10.0          18.0   
1                16.0                      6.0          23.0   
2                56.0                      4.0          18.0   
3                 1.0                      7.0          48.0   
4                27.0                      4.0          39.0   

   Maintenance work  Other domestic work  Childcare  Shopping and services  \
0              24.0                 10.0        1.0                   15.0   
1             