# Posts (whole_text) as basis for trad hh chores

In [1]:
import pandas as pd
from bertopic import BERTopic
import preprocessing_BERTopic as preprocessing
import plotly.io as pio
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import plotly.express as px

from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

import re
from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL

In [2]:
re.__version__

'2.2.1'

# Load data and models with reduced outlier docs

In [2]:
# Countvectorizer remove stopwords after training BERTopic
stopwords = list(stopwords.words('english')) + ['http', 'https', "www", "oh", "com", "reddit", "Reddit", "haha", "lol", "daddit", "mommit"]

def reduce_outlier_topic(model, text, stopwords):
    topics=model.topics_
    new_distribution=model.reduce_outliers(text, topics, strategy="distributions")

    # initialize count vectorizer
    vectorizer_model = CountVectorizer(stop_words=stopwords, ngram_range=(1, 3))

    # update topics
    model.update_topics(text, topics=new_distribution, vectorizer_model=vectorizer_model)

    topic_info=model.get_topic_info()
    doc_info=model.get_document_info(text)

    return model, topic_info, doc_info

In [3]:
def load_dataset(csv:str, model_name:str):
    """"
    loads the mommit and daddit data and their corresponding topic model to create the merged df for both    
    """
    # load in dataset
    data= pd.read_csv(csv, sep = ';').iloc[:,1:]
    # filter out the rows that fall between the given dates
    mask = (data['date_time'] < '2020-06-01 00:00:00') | ((data['date_time'] > '2020-06-30 23:59:59')& (data['date_time'] < '2020-10-01 00:00:00')) | (data['date_time'] > '2020-10-31 23:59:59')
    data_corr_times = data.loc[mask]

    # reset index after filtering
    data_corr_times=data_corr_times.reset_index()
    # create a list of the text column
    text=data_corr_times["whole_text"].to_list()

    # load model
    model = BERTopic.load(model_name)

    # outlier reduction
    model_reducedOutlier, topic_info, doc_info=reduce_outlier_topic(model, text, stopwords)

    # merge the results from the topic model to the dataframe
    merged_df=pd.merge(data_corr_times, doc_info, left_index=True, right_index=True)

    return merged_df, text, model_reducedOutlier, topic_info

In [4]:
# load data and model for r/Mommit
# min topic size =30
# replace mommit_clea with mommit_text_6
data_mommit, text_mommit, model_mommit, topic_info_mommit=load_dataset(csv="mommit_text_6.csv", model_name="topic_models\Ansatz_march_2021+2022\mommit\BERTopic_mommit_dec19_march22")
topic_info_mommit

Unnamed: 0,Topic,Count,Name
0,-1,6,-1_หน_broogie_งใหม เร_brugz
1,0,5792,0_sleep_bed_night_sleeping
2,1,3542,1_thank_sorry_thank much_comments
3,2,2441,2_food_eat_cheese_meal
4,3,1357,3_cute_photos_photo_picture
...,...,...,...
223,222,123,222_sneeze_pee_cough_legs
224,223,62,223_smoke_smoking_around smoke_neighbors
225,224,166,224_kisses_mouth_kiss_face
226,225,119,225_cake_smash_whipped_chocolate


In [5]:
# load data and model for r/daddit
data_daddit, text_daddit, model_daddit, topic_info_daddit=load_dataset(csv="daddit_clean.csv", model_name="topic_models\Ansatz_march_2021+2022\daddit\BERTopic_daddit_dec19_march22")
print(len(data_daddit)) # 78963 = len_data_corr_times
print(len(topic_info_daddit))
topic_info_daddit

78963
217


Unnamed: 0,Topic,Count,Name
0,-1,24,-1_num_num num_aaaaayyyyyyy_yuuuup
1,0,2672,0_dad_dads_father_good
2,1,1800,1_eat_food_cheese_snack
3,2,1979,2_photos_photo_smile_picture
4,3,3303,3_sleep_night_bed_sleeping
...,...,...,...
212,211,182,211_car_cars_one car_two cars
213,212,215,212_balls_poop_tar_poop balls
214,213,143,213_wedding_weddings_common_community
215,214,114,214_stroller_strollers_car_double stroller


In [6]:
# filter for documents containing keywords related with household chores
keywords=[
            # category meal prep/ cooking/ food usw.
            "grocery list", "meal prep", "healthy eating", "recipes", "recipe", "meal planning", "grocery shopping", "grocery", "groceries", "cooking techniques", "kitchen appliances",
            "intolerance", "cooking", "cook", "baking", "bake", "dinner", "lunch", "pan", "carrot", "banana", "cups", "tbsp", "veggies", "lunch boxes", "lunch box", "oatmeal", "diet",
            "breakfast", "snack", "cheese", "nutrition", "ingredients", "set table", "meal", "oven", 
            
            # category Cleaning
            "wash", "housekeeping", "cleaning products", "vacuuming", "dusting", "mopping", "cleaning", "clean", "dishes", "dish washing", "dishwasher", "sweeping", "making beds",
            "rinse", "wipe", "carpet", 
            
            # catgory Laundry
            "closet",  "folding clothes", "washing machine", "dryer", "laundry detergent", "ironing board", "iron", "laundry", "folded", "hand wash",

            # category 'Child_care':
            "feeding", "bathing", "homework", "playtime", "discipline", "take care", "taking care", "playtime", "bedtime story", "care", "playground", "daycare", "day-care",
            "security", "safety", "diaper", "school", "soothing", "coloring", "book", "kindergarden", "nap", "bedtime",

            # category: Planning_Organizing_Running_errands':
            "household", "calendar", "scheduling", "schedule", "reminders", "appointments", "activities", "birthday", "present", "doctor", "cake",  "shopping", "prescription pick-up",
            "prescription", "post office", "dry cleaning", "bank", "errands", "planning", "organization", "supply", "supplies", "anticipate", "prepare","shopping", "Dr", "ped", "pediatrician", 

            # category 'Finances':
            "budgeting", "bill paying", "saving", "investing", "debt management", "dollar", "money", "spending", "cheap", "expensive", "pricey", "cost",
    
            # category 'House_maintanance': [ 
            "interior design", "furniture", "home repairs", "landscaping", "outdoor maintenance",  "plants", "planting", "weeding", "pruning", "lawn care",
            "decoration", "garden", "pets", "cat", "dog",
                
            # category 'Mental_load':
            "busy", "hectic", "organized", "domestic", "routine", "multitasking", "tiring", "fulfilling", "tired", "worry", "care",  "emotional", "fulfilling", "anticipate",
            "comfort", "support child", "anticipate",   "prepare", "comfort child", "comfort husband", "comfort family", "sick child", "ill child", "household", "chores"

            #### van Tienoven ####
            # COVIDTUS Routine --> only changed everything to small letters
            "serving food", "set the table", "cooking", "preparing food", "baking", "making coffee", "making drinks", "storing", "arranging", "preserving food", "washing dishes", "clearing table",
            "loading dishwasher", "unloading dishwasher", "vacuuming", "dusting", "sweeping", "mopping", "toilet cleaning", "clean the toilet", "window cleaning",  "cleaning up", "putting things away",
            "changing sheets", "changing bedclothes", "making the bed", "outside cleaning" "clean terrace", "clean driveway", "clean paths", "clear up rubbish", "put out rubbish", "garbage", "rubbish",
            "recycling of waste", "disposal of waste", "arranging household goods and materials", "groceries away", "write grocery list", "light a ﬁre", "handwashing clothing", "handwashing textiles", 
            "packing for a trip", "washing into washing machine", "removing washing from washing machine", "dryer", "sorting clothes", "hanging up washing", "taking in washing", "ironing", "mangling", 
            "folding clothes", "arranging clothes", "folding textiles", "arranging textiles",  "mending", "adjusting clothes", "sewing on buttons", "handiwork", "polishing shoes", "paying bills", 
            "taxes", "mortgages", "insurance", "household administration", "prepare day", "prepare journey", "prepare party", "plan day", "plan journey", "plan party", "making shopping lists",
            "daily planning", "weekly planning", "shopping groceries", "shop food", "shop clothes", "shop gifts", "shop household goods", "shop electronics", "shop plant", "comfort baby", "wash baby",
            "dress baby", "put baby to bed", "feeding baby", "wash child", "dress child", "put child to bed", "feed children", "supervise eating", "cut meat", "cut food", "help with homework",
            "supervise child with homework", 
            "self-study", "home reading", "talking about school", "taking a child to school", "accompanying a child to school","taking a child to childcare", "accompanying a child to childcare", 
            "talking to teacher", "taking baby to a doctor", "taking child to a doctor", "accompanying baby to a doctor", "accompanying child to a doctor", "taking baby to a hospital visit",
            "taking child to a hospital visit", "accompanying baby to a hospital visit", "accompanying child to a hospital visit",  "taking child to sport activities", "accompanying child to sport activities",
            "taking child to cultural activities", "accompanying child to cultural activities", "taking child to activities", "taking baby to activities", "accompanying child to activities",
            "accompanying baby to activities", "physical care", "cutting hair", "cut hair", "toileting", "take medical care",

            # BTUSI3 Routine
            # only what has not been in Covidtus list
            "cooking", "preparing food", "set table", "washing dishes", "clearing table", "dusting", "sweeping", "packing away", "mopping", "cleaning", "washing car",  "handwashing", 
            "arranging clothes", "arranging textiles", 

            # COVIDTUS non-routine
            "tending pet", "pet", "caring for pets", "clean cage", "clean stable", "clean aquarium",  "indoor plant care", "watering",  "mowing grass", "working in the garden",
            "taking care outdoor plants", "veterinarian", "walking the dog", "walk the dog", "playing with pet", "playing with pets", "construction renovation", "painting",
            "wallpapering", "plumbing", "electrical repairs", "decor changes", "carpentry", "rooﬁng", "maintenance of household equipment", "repair of household equipment",
            "maintenance of household equipment", "maintenance of household appliances","maintenance of household appliances", "maintenance of car", "repair of car",
            "maintenance of bike", "repair of bike", "washing car", "washing bike", "washing scooter", "picking up", "laundromat", "shoemaker", "refueling",  "bank",
            "post ofﬁce",  "council", "centrelink", "tax ofﬁce", "police", "government department", "broker", "insurance", "notary public", "visit garage", "visit mechanic",
            "car wash", "warrant check", "pharmacy", "reading to child", "playing with child", "talking to child ", "going for walks with children", "swimming with children",
            "bike ride", "outdoor games",  "supervision", "give guidance", "teach", 

            # BTUSI3 non-routine
            # only what has not been in Covidtus list
            "online banking",


          # added by me
          "leftovers", "food storage", "kitchen organization", "food preparation", "spices and seasoning", "grilling", "freezing food", "food preservation", "slow cooker",

          "stain removal", "garbage disposal", "cleaning schedule", "cleaning supplies", "steam cleaning", "organizing closets", "decluttering", "household maintenance",

          "dry cleaning", "delicate fabrics", "laundry sorting", "stain treatment", "laundry routine", "laundry symbols", "ironing clothes", "laundry storage",

          "childproofing", "bath time", "potty training", "storytelling", "craft activities", "outdoor play", "child development", "playdate", "children's health",

          "time management", "meal planning", "appointment scheduling", "gift", "party planning", "holiday preparations", "travel arrangements", "home inventory",
          "home improvement projects",

          "budget management", "expense tracking", "bill organization", "financial planning", "investment strategies", "tax preparation", "insurance policies",
          "savings goals",

          "home security", "pest control", "HVAC maintenance", "roof maintenance", "gutter cleaning", "paint touch-ups", "furniture repairs", "home decor updates",

          "time for self-care", "stress management", "family communication", "work-life balance", "self-care routine", "relaxation techniques", "mindfulness practices",
          "prioritizing tasks"
          ]

In [7]:
def build_chores_df(df, column:str):
    # initialize an empty list to store the filtered dataframes
    filtered_dfs = []

    # iterate through the list of keywords and filter the dataframe for each keyword
    for keyword in keywords:
        filtered_df = df[df[column].str.contains(fr'\b{keyword}\b', regex=True)] # only takes EXACT keywords.
        # E.g. if the keyword is "supervise child with homework", documents only containing "child" dont fall into the filter
        filtered_dfs.append(filtered_df)

    # combine the filtered dataframes
    chores_dataframe = pd.concat(filtered_dfs)
    chores_dataframe=chores_dataframe.reset_index()

    return chores_dataframe

# hh chores & care work in the postings

## In how many Reddit postings can the keywords be found, based on the whole_text column?
- more relevant for RQ3
- development of topics over time

In [8]:
# build chores dataframe for mommit and daddit based on keywords being present in the whole_text column
chores_mommit= build_chores_df(data_mommit, "whole_text")
chores_daddit= build_chores_df(data_daddit, "whole_text")

print(len(chores_mommit)) # 43498
print(len(chores_daddit)) # 31700

43989
31700


In [9]:
# some rows are duplicates because every time a keyword is found it gets written into the new df
# --> the same document may have several keyword matches
# almost all topics remain in the dataset

chores_mommit_clean=chores_mommit.drop_duplicates()
unique_values_mommit= chores_mommit_clean["Name"].unique()

chores_daddit_clean=chores_daddit.drop_duplicates()
unique_values_daddit= chores_daddit_clean["Name"].unique()

print(len(chores_mommit_clean))
print(len(unique_values_mommit)) # 227 topics

print(len(chores_daddit_clean))
print(len(unique_values_daddit)) # 216 topics

# 24009
# 227
# 18282
# 216

24009
227
18282
216


In [10]:
# relative values 
# 30 % of postings in Mommit contain household chores while 23% of posts in daddit contain household chores
relative_mommit=len(chores_mommit_clean) / len(data_mommit)
relative_daddit=len(chores_daddit_clean) / len(data_daddit)

print(relative_mommit)
print(relative_daddit)

# 0.30939432989690724
# 0.2315261578207515

0.30939432989690724
0.2315261578207515


## manually label data


In [11]:
len(chores_mommit_clean)

24009

In [11]:
# keyword list
import numpy as np
keywords = {'Meals_(prep and planning)': ["recipes", "recipe", "grocery list", "grocery", "healthy eating", "grocery shopping", "cooking techniques",
                                            "kitchen appliances", "intolerance", "cooking", "baking", "groceries", "cook", "bake", "dinner", "lunch", "cake", "table",
                                            "pan", "carrot", "banana", "cups", "tbsp", "veggies", "lunch boxes", "lunch box", "oatmeal", "diet", "breakfast", "snack", "cheese", "nutrition",
                                            "ingredients", "meal", "oven", "serving food", "serve food", "prepare food", "preparing food", "make coffee", "making coffee",
                                            "preserving food",
                                            "leftovers", "food storage", "kitchen organization", "food preparation", "spices and seasoning", "grilling", "freezing food", "food preservation", "slow cooker",
], # meals (prep and planning)
                                            
             'Cleaning & Laundry': ["wash", "washing", "housekeeping", "vacuuming", "dusting", "mopping", "cleaning", "dishes", "dish washing", "sweeping", "dishwasher", "clean", "making beds","cleaning products",
                          "rinse", "wipe", "carpet", "putting things away", "put things away", "changing sheets", "changing bedclothes", "making the bed", "clear up rubbish", "put out rubbish",
                         "garbage", "rubbish", "recycling of waste", "disposal of waste", "washing car",
                          "stain removal", "organizing closets", "decluttering", "household maintenance", # cleaning
                        "hand wash", "washing machine", "dryer", "ironing board", "iron", "laundry", "closet", "clothes", "folding", "folded", "laundry detergent", "handwashing",
                        "sorting clothes", "hanging up washing", "taking in washing", "ironing", "mangling", "arranging clothes", "folding textiles", "arranging textiles", 
                        "mending", "adjusting clothes", "sewing on buttons", "handiwork", "laundromat",
                        "dry cleaning", "delicate fabrics", "stain treatment", "ironing clothes"], # laundry

             'Child_care': ["feeding", "bathing", "homework", "playtime", "discipline", "care", "playground", "daycare", "day-care", "childcare", "child-care", "support",
                            "security", "safety","bedtime story", "playtime", "take care", "taking care", "diaper", "coloring", "book", "school", "soothing",
                            "wash baby", "dress baby", "put baby to bed", "feeding baby", "wash child", "dress child", "put child to bed", "feed children",
                            "supervise eating","cut meat", "cut food", "self-study", "home reading", "teacher", "taking baby to a doctor",
                            "taking child to a doctor", "accompanying baby to a doctor", "accompanying child to a doctor", "taking baby to a hospital visit",
                            "taking child to a hospital visit", "accompanying baby to a hospital visit","accompanying child to a hospital visit",  "sport activities",
                            "cultural activities","activities", "physical care", "hair", "toileting", "take medical care", "diaper", "diapers", "reading to", "play",
                            "playing", "plays", "go for walk", "swim", "swimming", "bike", "game", "games", "supervision", "give guidance", "teach", "kindergarden",
                            "childproofing", "bath time", "potty", "storytelling", "craft activities", "outdoor play", "child development", "playdate",
                            "children's health", "nap", "bedtime"], # child care
                             
             'Planning_Organizing_Running_errands': ["calendar", "scheduling", "schedule", "reminders", "appointments", "activities", "birthday", "present", "doctor", "prescription",
                        "household", "post office", "dry cleaning", "bank", "errands", "planning", "organization", "supply", "supplies", "anticipate", "prepare", "shopping","cake",
                        "Dr", "ped", "pediatrician", "storing", "packing for a trip", "prepare day", "prepare journey", "prepare party", "plan day", "plan journey", "plan party",
                        "making shopping lists", "daily planning", "weekly planning", "shop", "picking up", "pick up", "shoemaker", "refueling", "council",
                        "tax ofﬁce", "police", "government department","pharmacy",                           
                        "time management", "meal planning", "appointment scheduling", "gift", "party planning", "holiday preparations", "travel arrangements",
                        "home inventory", "home improvement projects", "meal planning", "meal prep"], # planning/ organizing/ running errands
             
             'Finances': ["dollar", "budgeting", "bill paying", "paying bills", "saving", "investing", "debt", "money", "spending", "cheap", "expensive", "pricey", "cost",
                        "taxes", "mortgages", "insurance", "household administration", "broker", "notary public", "warrant check", "online banking", 
                        "budget management", "expense tracking", "bill organization", "financial planning", "investment strategies", "tax preparation", "insurance policies",
                        "savings goals"], # Finances
            
             '(House)-maintanance': ["interior design", "furniture", "home repairs", "landscaping", "outdoor maintenance", "plant", "plants", "planting", "watering", "weeding", "pruning", "decoration",
                        "lawn care", "landscaping", "garden", "pets", "cat", "dog", "outside cleaning" "clean terrace", "clean driveway", "clean paths", "arranging household goods and materials",
                        "light a ﬁre", "polishing shoes", "clean cage", "clean stable", "clean aquarium", "mowing grass", "veterinarian", "walking the dog", "walk the dog", "playing with pet",
                        "playing with pets", "construction renovation", "painting", "wallpapering", "plumbing", "electrical repairs", "decor changes", "carpentry", "rooﬁng",
                        "maintenance of household equipment", "repair of household equipment", "maintenance of household equipment", "maintenance of household appliances",
                        "maintenance of household appliances", "maintenance of car", "repair of car",  "maintenance of bike", "repair of bike", "washing car", "washing bike",
                        "washing scooter", "visit garage", "visit mechanic", "home security", "pest control", "HVAC maintenance", "roof maintenance", "gutter cleaning", "paint touch-ups",
                         "furniture repairs", "home decor updates"], # house maintanance
                    
             'Mental_load': ["care", "worry", "emotional", "busy", "hectic", "organized", "domestic", "routine", "multitasking", "tiring", "tired", "fulfilling", "anticipate", "comfort",
                            "support child", "anticipate",   "prepare", "comfort child", "comfort husband", "comfort family", "sick child", "ill child", "household", "chores", "comfort baby",
                            "self-care", "stress management", "family communication", "work-life balance", "self-care routine", "relaxation techniques", "mindfulness practices",
                            "prioritizing tasks"] # mental load         
                }

In [12]:
# group keywords into 3 categories
keywords_broader_groups={'household chores': ["recipes", "recipe", "meal planning", "meal prep", "grocery list", "grocery", "healthy eating", "grocery shopping", "cooking techniques",
                                            "kitchen appliances", "intolerance", "cooking", "baking", "groceries", "cook", "bake", "dinner", "lunch", "cake", "table",
                                            "pan", "carrot", "banana", "cups", "tbsp", "veggies", "lunch boxes", "lunch box", "oatmeal", "diet", "breakfast", "snack", "cheese", "nutrition",
                                            "ingredients", "meal", "oven", "serving food", "serve food", "prepare food", "preparing food", "make coffee", "making coffee",
                                            "preserving food", "leftovers", "food storage", "kitchen organization", "food preparation", "spices and seasoning", "grilling", "freezing food",
                                            "food preservation", "slow cooker",
                                            
                                            "wash", "washing", "housekeeping", "vacuuming", "dusting", "mopping", "cleaning", "dishes", "dish washing", "sweeping", "dishwasher", "clean", "making beds","cleaning products",
                                            "rinse", "wipe", "carpet", "putting things away", "put things away", "changing sheets", "changing bedclothes", "making the bed", "clear up rubbish", "put out rubbish",
                                            "garbage", "rubbish", "recycling of waste", "disposal of waste", "washing car",
                                            "stain removal", "organizing closets", "decluttering", "household maintenance", # cleaning
                                            "hand wash", "washing machine", "dryer", "ironing board", "iron", "laundry", "closet", "clothes", "folding", "folded", "laundry detergent", "handwashing",
                                            "sorting clothes", "hanging up washing", "taking in washing", "ironing", "mangling", "arranging clothes", "folding textiles", "arranging textiles", 
                                            "mending", "adjusting clothes", "sewing on buttons", "handiwork", "laundromat",
                                            "dry cleaning", "delicate fabrics", "stain treatment", "ironing clothes", # laundry

                                             "interior design", "furniture", "home repairs", "landscaping", "outdoor maintenance", "plant", "plants", "planting", "watering", "weeding", "pruning", "decoration",
                                             "lawn care", "landscaping", "garden", "pets", "cat", "dog", "outside cleaning" "clean terrace", "clean driveway", "clean paths", "arranging household goods and materials",
                                             "light a ﬁre", "polishing shoes", "clean cage", "clean stable", "clean aquarium", "mowing grass", "veterinarian", "walking the dog", "walk the dog", "playing with pet",
                                             "playing with pets", "construction renovation", "painting", "wallpapering", "plumbing", "electrical repairs", "decor changes", "carpentry", "rooﬁng",
                                             "maintenance of household equipment", "repair of household equipment", "maintenance of household equipment", "maintenance of household appliances",
                                             "maintenance of household appliances", "maintenance of car", "repair of car",  "maintenance of bike", "repair of bike", "washing car", "washing bike",
                                             "washing scooter", "visit garage", "visit mechanic", "home security", "pest control", "HVAC maintenance", "roof maintenance", "gutter cleaning", "paint touch-ups",
                                             "furniture repairs", "home decor updates",

                                            ],
                              'care work':["feeding", "bathing", "homework", "playtime", "discipline", "care", "playground", "daycare", "day-care", "childcare", "child-care", "support",
                                        "security", "safety","bedtime story", "playtime", "take care", "taking care", "diaper", "coloring", "book", "school", "soothing",
                                        "wash baby", "dress baby", "put baby to bed", "feeding baby", "wash child", "dress child", "put child to bed", "feed children",
                                        "supervise eating","cut meat", "cut food", "self-study", "home reading", "teacher", "taking baby to a doctor",
                                        "taking child to a doctor", "accompanying baby to a doctor", "accompanying child to a doctor", "taking baby to a hospital visit",
                                        "taking child to a hospital visit", "accompanying baby to a hospital visit","accompanying child to a hospital visit",  "sport activities",
                                        "cultural activities","activities", "physical care", "hair", "toileting", "take medical care", "diaper", "diapers", "reading to", "play",
                                        "playing", "plays", "go for walk", "swim", "swimming", "bike", "game", "games", "supervision", "give guidance", "teach", "kindergarden",
                                        "childproofing", "bath time", "potty", "storytelling", "craft activities", "outdoor play", "child development", "playdate",
                                        "children's health", "nap", "bedtime"],


                              'Organization/ mental load':["calendar", "scheduling", "schedule", "reminders", "appointments", "activities", "birthday", "present", "doctor", "prescription",
                                                            "household", "post office", "dry cleaning", "bank", "errands", "planning", "organization", "supply", "supplies", "anticipate", "prepare", "shopping","cake",
                                                            "Dr", "ped", "pediatrician", "storing", "packing for a trip", "prepare day", "prepare journey", "prepare party", "plan day", "plan journey", "plan party",
                                                            "making shopping lists", "daily planning", "weekly planning", "shop", "picking up", "pick up", "shoemaker", "refueling", "council",
                                                            "tax ofﬁce", "police", "government department","pharmacy",                           
                                                            "time management", "meal planning", "appointment scheduling", "gift", "party planning", "holiday preparations", "travel arrangements",
                                                            "home inventory", "home improvement projects", "meal planning", "meal prep", # organization
                                  
                                  
                                                            "care", "worry", "emotional", "busy", "hectic", "organized", "domestic", "routine", "multitasking", "tiring", "tired", "fulfilling", "anticipate", "comfort",
                                                            "support child", "anticipate",   "prepare", "comfort child", "comfort husband", "comfort family", "sick child", "ill child", "household", "chores", "comfort baby",
                                                            "self-care", "stress management", "family communication", "work-life balance", "self-care routine", "relaxation techniques", "mindfulness practices",
                                                            "prioritizing tasks", # mental load
                                                            
                                                            
                                                            "dollar", "budgeting", "bill paying", "paying bills", "saving", "investing", "debt", "money", "spending", "cheap", "expensive", "pricey", "cost",
                                                            "taxes", "mortgages", "insurance", "household administration", "broker", "notary public", "warrant check", "online banking", 
                                                            "budget management", "expense tracking", "bill organization", "financial planning", "investment strategies", "tax preparation", "insurance policies",
                                                            "savings goals"] # Finances
                              
               
}

In [13]:
# for whole_text column and for data where keywords found in whole_text colummn
# chores_mommit_clean

def assign_label(text_mommit):
    labels = []
    for key, value in keywords_broader_groups.items(): # change keyword list here
        for word in value:
            if re.search(r'\b{}\b'.format(word), text_mommit.lower()):
                labels.append(key)
    if not labels:
        return None  # Return None if no label is assigned
    return labels


def label_to_chores(df):
    """
    df = chores_daddit_clean / chores_mommit_clean
    """
    # Apply the label function to the whole_text column and store the result in a new column called 'label'
    df["label"] = df["whole_text"].apply(lambda x: assign_label(x) or None)

    # Create a new DataFrame with a column for each label
    label_df = pd.get_dummies(df['label'].apply(pd.Series).stack()).sum(level=0)

    # Merge the new DataFrame with the original DataFrame
    df = pd.concat([df, label_df], axis=1)

    # Replace NaN values with 0
    df = df.fillna(0)

    # Optional: Rename the columns to remove spaces and make them lowercase
    df.columns = [col.lower().replace(' ', '_') for col in df.columns]

    return df

In [14]:
# mit keywords_broader_groups keyword list
chores_label_mommit=label_to_chores(chores_mommit_clean)
chores_label_daddit=label_to_chores(chores_daddit_clean)

In [15]:
# label 1-3 as soon as any label is detected one time (or more) it gets written into label columns
labels_reduce= ["organization/_mental_load", "care_work", "household_chores"]

for i, label in enumerate(labels_reduce): # change here label vs labels reduced
    column_name = 'label_' + str(i + 1)
    chores_label_daddit[column_name] = np.where(chores_label_daddit[label] > 0, label, '')
    chores_label_mommit[column_name] = np.where(chores_label_mommit[label] > 0, label, '')

len(chores_label_mommit)

24009

In [16]:
# 3 categories
chores_label_reduced_mommit=label_to_chores(chores_mommit_clean)
chores_label_reduced_daddit=label_to_chores(chores_daddit_clean)

In [17]:
# bring 3 label columns into long format with colum name topic_label
# each document occurs 3 times
# The score column counts how often keywords for the respective label are found --> meaning if 3 keywords are found and labelled as child care --> score there will be 3

def long_df(df):
    long_chores = pd.melt(
        df,
        id_vars=["date_time", "author", "whole_text"],
        # value_vars=['child_care', 'finances', "(house)-maintanance", "cleaning_&_laundry", "meals_(prep_and_planning)", "mental_load", "planning_organizing_running_errands"],
        value_vars=["organization/_mental_load", "care_work", "household_chores"],
        var_name='topic_label',
        value_name='score'
    )
    return long_chores

long_chores_mommit=long_df(chores_label_mommit)
long_chores_daddit=long_df(chores_label_daddit)

In [18]:
# binary_score: as soon as score >= 1, binary_score is 1 
## Mommit
long_chores_mommit['binary_score'] = long_chores_mommit['score'].apply(lambda x: 1 if x >= 1 else 0)
## daddit
long_chores_daddit['binary_score'] = long_chores_daddit['score'].apply(lambda x: 1 if x >= 1 else 0)

## visualize hh chores with labels

In [19]:
# count how often each label occurs by week and by month
def monthly_weekly_labels(df):
    df['date_time'] = pd.to_datetime(df['date_time'])
    weekly = df.groupby([pd.Grouper(key='date_time', freq='W-SUN'), 'topic_label'])['binary_score'].sum().reset_index()
    monthly=df.groupby([pd.Grouper(key='date_time', freq='M'), 'topic_label'])['binary_score'].sum().reset_index()
    #result = grouped.to_frame().reset_index()

    return weekly, monthly

mommit_weekly_labels, mommit_monthly_labels=monthly_weekly_labels(long_chores_mommit)
daddit_weekly_labels, daddit_monthly_labels=monthly_weekly_labels(long_chores_daddit)

mommit_monthly_labels

Unnamed: 0,date_time,topic_label,binary_score
0,2019-12-31,care_work,717
1,2019-12-31,household_chores,445
2,2019-12-31,organization/_mental_load,895
3,2020-01-31,care_work,776
4,2020-01-31,household_chores,523
5,2020-01-31,organization/_mental_load,829
6,2020-02-29,care_work,736
7,2020-02-29,household_chores,458
8,2020-02-29,organization/_mental_load,843
9,2020-03-31,care_work,890


### absolute 

- in a document at least one keyword that is grouped into a broader category is found
- a doucment can occur in more than one category

In [20]:
# by category weekly
dataframes=[mommit_weekly_labels, daddit_weekly_labels]
titles = ['Mommit weekly household chores with labels', 'Daddit weekly household chores with labels']  # Specify the titles for each dataframe

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="binary_score", color='topic_label', title=f"{titles[i]}")
    fig.add_vline(x="2020-03-11", line_color="black", line_width=3)
    fig.update_layout(yaxis=dict(range=[-100, 2500]))
    fig.show()

In [21]:
# by category monthly
dataframes=[mommit_monthly_labels, daddit_monthly_labels]
titles = ['Mommit monthly household chores with labels', 'Daddit monthly household chores with labels']  # Specify the titles for each dataframe

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="binary_score", color='topic_label', title=f"{titles[i]}")
    fig.add_vline(x="2020-03-11", line_color="black", line_width=3)
    fig.update_layout(yaxis=dict(range=[-100, 9000]))

    fig.show()

### relative

#### relative to whole df

In [22]:
def monthly_weekly_basis(df):
    # create monthly data for the whole mommit data
    df['date_time'] = pd.to_datetime(df['date_time'])

    monthly_data = df.groupby([pd.Grouper(key='date_time', freq='M'), 'category']).size().reset_index(name='count') # monthly 
    monthly_output = monthly_data.pivot(index='date_time', columns='category', values='count').fillna(0).astype(int).reset_index()
    monthly_output["date_time"]=monthly_output["date_time"].astype("str")

    weekly_data = df.groupby([pd.Grouper(key='date_time', freq='W-SUN'), 'category']).size().reset_index(name='count') # weekly
    weekly_output = weekly_data.pivot(index='date_time', columns='category', values='count').fillna(0).astype(int).reset_index()
    weekly_output["date_time"]=weekly_output["date_time"].astype("str")

    monthly_output["sum_postings"]=monthly_output["comments"]+monthly_output["submissions"]
    weekly_output["sum_postings"]=weekly_output["comments"]+weekly_output["submissions"]

    return monthly_output, weekly_output

In [23]:
mommit_monhtly, mommit_weekly=monthly_weekly_basis(data_mommit)
daddit_monhtly, daddit_weekly=monthly_weekly_basis(data_daddit)
mommit_monhtly

category,date_time,comments,submissions,sum_postings
0,2019-12-31,3972,726,4698
1,2020-01-31,4269,716,4985
2,2020-02-29,4556,809,5365
3,2020-03-31,4261,860,5121
4,2020-04-30,3913,798,4711
5,2021-03-31,5411,910,6321
6,2022-03-31,43883,2516,46399


In [24]:
def relative_chores(df, df_orig_monthly, df_orig_weekly, subreddit):
    """
    df: chores dataframe long
    df_orig_weekly: original dataset (as loaded from csv in the beginning) aggregated by week
    df_orig_monthly: original dataset (as loaded from csv in the beginning) aggregated by month
    """
    single_chores=df.drop_duplicates(subset=["date_time", "whole_text"])

    # only keep cases, where the binary score is 1 and thus indicates that the posting includes the grouping lable
    single_chores=single_chores[single_chores["binary_score"]!=0]

    
    # count number of documents per week
    single_chores['date_time'] = pd.to_datetime(df['date_time'])
    per_week = single_chores.groupby([pd.Grouper(key='date_time', freq='W-SUN')]).size().reset_index(name='count_chores')

    # count number of documents per month
    per_month = single_chores.groupby([pd.Grouper(key='date_time', freq='M')]).size().reset_index(name='count_chores')

    # turn date_time into a string (same way as output is)
    per_month["date_time"]=per_month["date_time"].astype("str")
    per_week["date_time"]=per_week["date_time"].astype("str")

    # drop rows where count = 0
    per_month_final = per_month[per_month['count_chores'] != 0]
    per_week_final = per_week[per_week['count_chores'] != 0]

    # merge with monthly_output --> da steht drin wie viele posts pro monat/ pro week
    relative_basis_month = pd.merge(per_month_final, df_orig_monthly, on='date_time')
    relative_basis_week = pd.merge(per_week_final, df_orig_weekly, on='date_time')


    # calculate relative share of household related posts, 3 decimals
    relative_basis_month["share_chores_to_all"]=round((relative_basis_month["count_chores"]/relative_basis_month["sum_postings"])*100,3)
    relative_basis_week["share_chores_to_all"]=round((relative_basis_week["count_chores"]/relative_basis_week["sum_postings"])*100,3)

    relative_basis_month["subreddit"]=f"{subreddit}"
    relative_basis_week["subreddit"]=f"{subreddit}"

    return relative_basis_month, relative_basis_week, single_chores


In [25]:
relative_monhtly_mommit, relative_weekly_mommit,single_chores_mommit =relative_chores(df=long_chores_mommit, df_orig_monthly=mommit_monhtly, df_orig_weekly=mommit_weekly, subreddit="Mommit")
relative_monhtly_daddit, relative_weekly_daddit, single_chores_daddit=relative_chores(df=long_chores_daddit, df_orig_monthly=daddit_monhtly, df_orig_weekly=daddit_weekly, subreddit="daddit")

In [26]:
# create df with monthly both subreddits and weekly both subreddits
relative_monhtly_subreddits = pd.concat([relative_monhtly_mommit, relative_monhtly_daddit], ignore_index=True)
relative_weekly_subreddits = pd.concat([relative_weekly_mommit, relative_weekly_daddit], ignore_index=True)
relative_weekly_subreddits

#relative_monhtly_subreddits.to_csv('vis_basis/monthly_subreddits_whole_text.csv', index=False)
#relative_weekly_subreddits.to_csv(vis_basis/weekly_subreddits_whole_text.csv', index=False)

Unnamed: 0,date_time,count_chores,comments,submissions,sum_postings,share_chores_to_all,subreddit
0,2019-12-01,49,102,19,121,40.496,Mommit
1,2019-12-08,217,988,153,1141,19.018,Mommit
2,2019-12-15,214,959,187,1146,18.674,Mommit
3,2019-12-22,214,984,160,1144,18.706,Mommit
4,2019-12-29,147,694,164,858,17.133,Mommit
...,...,...,...,...,...,...,...
61,2022-03-06,727,5601,432,6033,12.050,daddit
62,2022-03-13,1005,6763,449,7212,13.935,daddit
63,2022-03-20,1300,7622,439,8061,16.127,daddit
64,2022-03-27,1489,7847,413,8260,18.027,daddit


#### visualize

##### overall

- at least one occurance of hh chores in a posting
- how many does not matter

In [27]:
dataframes=[relative_monhtly_mommit, relative_monhtly_daddit]
titles = ['r/Mommit: Share of postings on household chores out of all posts, monthly in %', 'r/daddit: Share of postings on household chores out of all posts, monthly in %']  # Specify the titles for each dataframe

# Find the maximum value across all dataframes
max_value = max(df['share_chores_to_all'].max() for df in dataframes)

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="share_chores_to_all", title=f"{titles[i]}")
    fig.add_vline(x="2020-03-08", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs
    fig.update_layout(yaxis=dict(range=[0, max_value+10]))
    fig.show()

In [28]:
dataframes=[relative_weekly_mommit, relative_weekly_daddit]
titles = ['Mommit: Share of postings on household chores out of all posts, weekly in %', 'Daddit: Share of postings on household chores out of all posts, weekly in %']  # Specify the titles for each dataframe

# Find the maximum value across all dataframes
max_value = max(df['share_chores_to_all'].max() for df in dataframes)

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="share_chores_to_all", title=f"{titles[i]}")
    fig.add_vline(x="2020-03-08", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs
    fig.update_layout(yaxis=dict(range=[0, max_value+10]))
    fig.show()

##### grouped

- 3 labels --> at least one occurance of a keyword belonging to a category is found in a posting
- only yes or no counts not how many times a keyword was found
- the same document can therefore occur in one, two or all three categories

In [29]:
def relative_grouped_chores(long_df, weekly_orig, monthly_orig):
    # count number of documents per week
    long_df['date_time'] = pd.to_datetime(long_df['date_time'])
    per_week = long_df.groupby([pd.Grouper(key='date_time', freq='W-SUN'), 'topic_label']).agg({'binary_score': 'sum'}).reset_index()

    # count number of documents per month
    per_month = long_df.groupby([pd.Grouper(key='date_time', freq='M'), 'topic_label']).agg({'binary_score': 'sum'}).reset_index()

    # turn date_time into a string (same way as output is)
    per_month["date_time"]=per_month["date_time"].astype("str")
    per_week["date_time"]=per_week["date_time"].astype("str")

    # merge with monthly_output --> da steht drin wie viele posts pro monat/ pro week
    relative_basis_month = pd.merge(per_month, monthly_orig, on='date_time')
    relative_basis_week = pd.merge(per_week, weekly_orig, on='date_time')

    # # calculate relative share of household related posts, 3 decimals
    relative_basis_month["share_chores_to_all"]=round((relative_basis_month["binary_score"]/relative_basis_month["sum_postings"])*100,3)
    relative_basis_week["share_chores_to_all"]=round((relative_basis_week["binary_score"]/relative_basis_week["sum_postings"])*100,3)

    return relative_basis_week, relative_basis_month

In [30]:
# whole df as basis
relative_basis_week_mommit, relative_basis_month_mommit=relative_grouped_chores(long_chores_mommit, mommit_weekly, mommit_monhtly)
relative_basis_week_daddit, relative_basis_month_daddit=relative_grouped_chores(long_chores_daddit, daddit_weekly, daddit_monhtly)

In [30]:
relative_basis_month_mommit

Unnamed: 0,date_time,topic_label,binary_score,comments,submissions,sum_postings,share_chores_to_all
0,2019-12-31,care_work,717,3972,726,4698,15.262
1,2019-12-31,household_chores,445,3972,726,4698,9.472
2,2019-12-31,organization/_mental_load,895,3972,726,4698,19.051
3,2020-01-31,care_work,776,4269,716,4985,15.567
4,2020-01-31,household_chores,523,4269,716,4985,10.491
5,2020-01-31,organization/_mental_load,829,4269,716,4985,16.63
6,2020-02-29,care_work,736,4556,809,5365,13.719
7,2020-02-29,household_chores,458,4556,809,5365,8.537
8,2020-02-29,organization/_mental_load,843,4556,809,5365,15.713
9,2020-03-31,care_work,890,4261,860,5121,17.379


In [31]:
dataframes=[relative_basis_week_mommit, relative_basis_week_daddit]
titles = ['r/Mommit: Share of postings on household chores out of all posts, weekly in %', 'r/daddit: Share of postings on household chores out of all posts, weekly in %']  # Specify the titles for each dataframe

# Find the maximum value across all dataframes
max_value = max(df['share_chores_to_all'].max() for df in dataframes)

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="share_chores_to_all", color="topic_label", title=f"{titles[i]}")
    fig.add_vline(x="2020-03-11", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs
    fig.update_layout(yaxis=dict(range=[0, max_value+10]))
    fig.show()

In [32]:
dataframes=[relative_basis_month_mommit, relative_basis_month_daddit]
titles = ['r/Mommit: Share of postings on household chores out of all posts, monthly in %', 'r/daddit: Share of postings on household chores out of all posts, monthly in %']  # Specify the titles for each dataframe

# Find the maximum value across all dataframes
max_value = max(df['share_chores_to_all'].max() for df in dataframes)

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="share_chores_to_all", color="topic_label", title=f"{titles[i]}")
    fig.add_vline(x="2020-03-11", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs
    fig.update_layout(yaxis=dict(range=[0, max_value+10]))
    fig.show()

### relatives all in one

In [33]:
import plotly.graph_objects as go
 
# Define the color mapping for each category
color_mapping = {
    'care_work': 'rgb(31, 119, 180)',
    'household_chores': 'rgb(44, 160, 44)',
    'organization/_mental_load': 'rgb(214, 39, 40)',
}

# visualize the broad categories based on top_n_words
dataframes = [relative_basis_week_mommit, relative_basis_week_daddit, relative_basis_month_mommit, relative_basis_month_daddit]
titles = ['r/Mommit: Posts associated with home responsibilities',
          'r/daddit: Posts associated with home responsibilities',
          '', 
          '']  # Specify the titles for each dataframe

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="share_chores_to_all", color="topic_label",
                  title=f"{titles[i]}", height=500, width=1300, markers=True)

    # Assign colors to categories using the color_mapping dictionary
    for category, color in color_mapping.items():
        fig.for_each_trace(lambda t: t.update(line=dict(color=color)) if t.name == category else None)

    #fig.add_vline(x="2020-03-11", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs
    fig.update_layout(yaxis=dict(range=[0, 40]))

    # Add vertical line --> start pandemic
    fig.add_shape(
        type="line",
        x0="2020-03-11", x1="2020-03-11",
        y0=0, y1=2500,
        line=dict(color="black", width=2),
        name="Start of pandemic"
    )

    # Add custom legend items as annotations
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='lines',
            marker=dict(color="black", size=10),
            showlegend=True,
            name="Start of pandemic"
        )
    )

    fig.update_layout(
        legend=dict(
            title='',
            title_font=dict(size=14),
            font=dict(size=18),

        ),
        xaxis=dict(
            title="",
            showgrid=True,  # Display x-axis grid lines
            gridcolor='lightgrey',  # Set the color of the x-axis grid lines
            gridwidth=0.5  # Set the width of the x-axis grid lines
            
        ),
        yaxis=dict(
            showgrid=True,  # Display y-axis grid lines
            gridcolor='lightgrey',  
            gridwidth=0.5 
        )
    )

    # Define the new labels for each topic_label
    new_labels = {
        'care_work': 'Care work',
        'household_chores': 'Household chores',
        'organization/_mental_load': 'Organization & mental load',
    }

    for category, color in color_mapping.items():
        fig.for_each_trace(lambda t: t.update(line=dict(color=color)) if t.name == category else None)
        # Change the legend label for each category
        fig.update_traces(name=new_labels[category], selector=dict(name=category))


    # Updating layout and axis labels
    fig.update_layout(
        plot_bgcolor='white',
        title={
            'font': {'size': 32, 'family': 'Calibri'} 
        },
        xaxis_title='',
        yaxis_title='Share (%)',
        xaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},  
            'tickfont': {'size': 24, 'family': 'Calibri'},  
            "range":["2019-12-01", "2022-04-30"],
            "dtick":"M2",
            "ticklabelmode":"period",
            "tickvals":["2019-12-31", "2020-01-31", "2020-02-28", "2020-03-31", "2020-04-30", "2021-03-31", "2022-03-31"],
            "rangebreaks": [
            {"bounds": ["2020-06-01", "2021-02-28"]},  
            {"bounds": ["2021-05-01", "2022-02-28"]}
        ]
        },
        yaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},  
            'tickfont': {'size': 24, 'family': 'Calibri'},  
        }
    )

    fig.show()


not grouped

In [37]:
import plotly.graph_objects as go
 

# visualize the broad categories based on top_n_words
dataframes = [relative_weekly_mommit, relative_weekly_daddit, relative_monhtly_mommit, relative_monhtly_daddit]
titles = ['r/Mommit: Share of postings associated with household chores, weekly in %',
          'r/daddit: Share of postings associated with household chores, weekly in %',
          'r/Mommit: Share of postings associated with household chores, monthly in %', 
          'r/daddit: Share of postings associated with household chores, monthly in %']  # Specify the titles for each dataframe

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="share_chores_to_all",
                  title=f"{titles[i]}", height=500, width=1200)

    #fig.add_vline(x="2020-03-11", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs
    fig.update_layout(yaxis=dict(range=[0, 50]))

    # Add vertical line --> start pandemic
    fig.add_shape(
        type="line",
        x0="2020-03-11", x1="2020-03-11",
        y0=0, y1=2500,
        line=dict(color="black", width=2),
        name="Start of pandemic"
    )

    # Add custom legend items as annotations
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='lines',
            marker=dict(color="black", size=10),
            showlegend=True,
            name="Start of pandemic"
        )
    )

    fig.update_layout(
        legend=dict(
            title='',
            title_font=dict(size=14)
        ),
        xaxis=dict(
            showgrid=True,  # Display x-axis grid lines
            gridcolor='lightgrey',  # Set the color of the x-axis grid lines
            gridwidth=0.5  # Set the width of the x-axis grid lines
        ),
        yaxis=dict(
            showgrid=True,  # Display y-axis grid lines
            gridcolor='lightgrey',  # Set the color of the y-axis grid lines
            gridwidth=0.5  # Set the width of the y-axis grid lines
        )
    )


    # Updating layout and axis labels
    fig.update_layout(
        plot_bgcolor='white',
        title={
            'font': {'size': 32, 'family': 'Calibri'}  
        },
        xaxis_title='Date',
        yaxis_title='Frequency in %',
        xaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}}, 
            'tickfont': {'size': 24, 'family': 'Calibri'}  
        },
        yaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},  
            'tickfont': {'size': 24, 'family': 'Calibri'},  
        }
    )

    fig.show()
