# Household chores in topics
This notebook uses each topic's top_n_words as the basis for identifying traditional household (hh) chores in the BERTopic topics.

In [1]:
import pandas as pd
import numpy as np
from bertopic import BERTopic
import preprocessing_BERTopic as preprocessing
import plotly.io as pio
import gensim
import gensim.corpora as corpora
from gensim.models.coherencemodel import CoherenceModel
import plotly.express as px
import plotly.graph_objects as go

from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords


from docx import Document
from docx.enum.table import WD_TABLE_ALIGNMENT, WD_ALIGN_VERTICAL

# Load data & BERTopic models

- csv file with the clean data
- BERTopic model -- model with no restriction on the number of topics
    - only set min_topic_size to 30
- do outlier reduction so that most documents are assigned to a topic
- remove stopwords and few additional words from the words representative for the topics

In [2]:
# Countvectorizer remove stopwords after training BERTopic
# The nltk corpus reader is imported under an alias here: the list below is bound to
# the name `stopwords` (load_dataset reads that name), which would otherwise shadow
# the reader and make a second run of this cell fail with AttributeError.
from nltk.corpus import stopwords as nltk_stopwords

stopwords = list(nltk_stopwords.words('english')) + ['http', 'https', "www", "oh", "com", "reddit", "Reddit", "haha", "lol", "daddit", "mommit"]

def reduce_outlier_topic(model, text, stopwords):
    """Reassign outlier documents and rebuild the topic representations.

    model: fitted BERTopic model; it is updated in place and also returned
    text: list of documents, passed to the model's reduce/update/info calls
    stopwords: stop word list handed to the CountVectorizer

    Returns (model, topic_info, doc_info), where topic_info comes from
    model.get_topic_info() and doc_info from model.get_document_info(text).
    """
    # new topic assignment per document, using the "distributions" strategy
    reassigned_topics = model.reduce_outliers(text, model.topics_, strategy="distributions")

    # uni- to tri-grams, with the stop words removed from the topic words
    ngram_vectorizer = CountVectorizer(stop_words=stopwords, ngram_range=(1, 3))
    model.update_topics(text, topics=reassigned_topics, vectorizer_model=ngram_vectorizer)

    return model, model.get_topic_info(), model.get_document_info(text)

In [3]:
def load_dataset(csv:str, model_name:str):
    """
    Load one subreddit's postings and its topic model, and join them per posting.

    csv: path of the ';'-separated file with the clean data (its first column is dropped)
    model_name: path handed to BERTopic.load

    Returns (merged_df, text, model_reducedOutlier, topic_info):
    the postings joined with the per-document topic info, the list of texts,
    the model after outlier reduction, and the topic overview table.
    """
    data = pd.read_csv(csv, sep = ';').iloc[:,1:]

    # keep only postings outside June 2020 and outside October 2020
    stamp = data['date_time']
    before_june = stamp < '2020-06-01 00:00:00'
    july_to_september = (stamp > '2020-06-30 23:59:59') & (stamp < '2020-10-01 00:00:00')
    after_october = stamp > '2020-10-31 23:59:59'
    keep = before_june | july_to_september | after_october

    # fresh 0..n-1 index; the previous index is kept as a column
    data_corr_times = data.loc[keep].reset_index()
    text = data_corr_times["whole_text"].to_list()

    # outlier reduction uses the module-level `stopwords` list
    model_reducedOutlier, topic_info, doc_info = reduce_outlier_topic(
        BERTopic.load(model_name), text, stopwords
    )

    # both frames are in document order, so they are joined on the row index
    merged_df = pd.merge(data_corr_times, doc_info, left_index=True, right_index=True)

    return merged_df, text, model_reducedOutlier, topic_info

In [4]:
# load data and model for r/Mommit
# min topic size =30
# replace mommit_clean with mommit_text_6 (or vice versa)
# The model path uses forward slashes: the backslash version contained invalid
# escape sequences (\A, \m, \B) and only resolved on Windows.
data_mommit, text_mommit, model_mommit, topic_info_mommit = load_dataset(
    csv="mommit_text_6.csv",
    model_name="topic_models/Ansatz_march_2021+2022/mommit/BERTopic_mommit_dec19_march22",
)
topic_info_mommit

Unnamed: 0,Topic,Count,Name
0,-1,6,-1_หน_broogie_งใหม เร_brugz
1,0,5792,0_sleep_bed_night_sleeping
2,1,3542,1_thank_sorry_thank much_comments
3,2,2441,2_food_eat_cheese_meal
4,3,1357,3_cute_photos_photo_picture
...,...,...,...
223,222,123,222_sneeze_pee_cough_legs
224,223,62,223_smoke_smoking_around smoke_neighbors
225,224,166,224_kisses_mouth_kiss_face
226,225,119,225_cake_smash_whipped_chocolate


In [10]:
len(data_mommit) # 77600

77600

In [11]:
# load data and model for r/daddit
# The model path uses forward slashes: the backslash version contained invalid
# escape sequences (\A, \d, \B) and only resolved on Windows.
data_daddit, text_daddit, model_daddit, topic_info_daddit = load_dataset(
    csv="daddit_clean.csv",
    model_name="topic_models/Ansatz_march_2021+2022/daddit/BERTopic_daddit_dec19_march22",
)
print(len(data_daddit)) # 78963 = len_data_corr_times
print(len(topic_info_daddit)) #216 topics + -1
topic_info_daddit

78963
217


Unnamed: 0,Topic,Count,Name
0,-1,24,-1_num_num num_aaaaayyyyyyy_yuuuup
1,0,2672,0_dad_dads_father_good
2,1,1800,1_eat_food_cheese_snack
3,2,1979,2_photos_photo_smile_picture
4,3,3303,3_sleep_night_bed_sleeping
...,...,...,...
212,211,182,211_car_cars_one car_two cars
213,212,215,212_balls_poop_tar_poop balls
214,213,143,213_wedding_weddings_common_community
215,214,114,214_stroller_strollers_car_double stroller


In [12]:
# store both merged frames as parquet files for quick access later on
for parquet_path, merged_frame in (("data_mommit.parquet", data_mommit),
                                   ("data_daddit.parquet", data_daddit)):
    merged_frame.to_parquet(parquet_path)

# typical female chores at home:

- one way to analyze gender roles is by looking at the distribution of household chores
- Daminger 2019: Distinction between physical and cognitive work in the household
    - cognitive work/ mental load: "No meal is made, no dentist appointment scheduled, and no daycare center selected without some amount of foresight, planning, and deciding."

- van Tienoven (2023) : COVIDTUS study and Belgian Time Use survey -> collection of household chores

- some keywords added manually after screening the posts

In [14]:
# filter for documents containing keywords related with household chores
# Keyword list used to detect household chores / care work in the topic words.
# Fixed in this list: missing commas after "chores" and "outside cleaning" (Python
# silently concatenated the neighbouring literals into one keyword), the "ﬁ"
# ligature characters copied from the source PDF, and a trailing space in
# "talking to child". Duplicate entries are kept as they were.
keywords=[
            # category meal prep/ cooking/ food...
            "grocery list", "meal prep", "healthy eating", "recipes", "recipe", "meal planning", "grocery shopping", "grocery", "groceries", "cooking techniques",
            "kitchen appliances","intolerance", "cooking", "cook", "baking", "bake", "dinner", "lunch", "pan", "carrot", "banana", "cups", "tbsp", "veggies",
             "lunch boxes", "lunch box", "oatmeal", "diet","breakfast", "snack", "cheese", "nutrition", "ingredients", "set table", "meal", "oven",

            # category Cleaning
            "wash", "housekeeping", "cleaning products", "vacuuming", "dusting", "mopping", "cleaning", "clean", "dishes", "dish washing", "dishwasher", "sweeping",
            "making beds","rinse", "wipe", "carpet",

            # catgory Laundry
            "closet",  "folding clothes", "washing machine", "dryer", "laundry detergent", "ironing board", "iron", "laundry", "folded", "hand wash",

            # category 'Child_care':
            "feeding", "bathing", "homework", "playtime", "discipline", "take care", "taking care", "playtime", "bedtime story", "care", "playground", "daycare", "day-care",
            "security", "safety", "diaper", "school", "soothing", "coloring", "book", "kindergarden", "nap", "bedtime",

            # category: Planning_Organizing_Running_errands':
            "household", "calendar", "scheduling", "schedule", "reminders", "appointments", "activities", "birthday", "present", "doctor", "cake",  "shopping",
            "prescription pick-up","prescription", "post office", "dry cleaning", "bank", "errands", "planning", "organization", "supply", "supplies", "anticipate",
            "prepare","shopping", "Dr", "ped", "pediatrician",

            # category 'Finances':
            "budgeting", "bill paying", "saving", "investing", "debt management", "dollar", "money", "spending", "cheap", "expensive", "pricey", "cost",

            # category 'House_maintanance': [
            "interior design", "furniture", "home repairs", "landscaping", "outdoor maintenance",  "plants", "planting", "weeding", "pruning", "lawn care",
            "decoration", "garden", "pets", "cat", "dog",

            # category 'Mental_load':
            "busy", "hectic", "organized", "domestic", "routine", "multitasking", "tiring", "fulfilling", "tired", "worry", "care",  "emotional", "fulfilling", "anticipate",
            "comfort", "support child", "anticipate",   "prepare", "comfort child", "comfort husband", "comfort family", "sick child", "ill child", "household", "chores",

            #### van Tienoven ####
            # COVIDTUS Routine --> only changed everything to small letters
            "serving food", "set the table", "cooking", "preparing food", "baking", "making coffee", "making drinks", "storing", "arranging", "preserving food", "washing dishes",
            "clearing table", "loading dishwasher", "unloading dishwasher", "vacuuming", "dusting", "sweeping", "mopping", "toilet cleaning", "clean the toilet", "window cleaning",
            "cleaning up", "putting things away", "changing sheets", "changing bedclothes", "making the bed", "outside cleaning", "clean terrace", "clean driveway", "clean paths",
            "clear up rubbish", "put out rubbish","garbage", "rubbish", "recycling of waste", "disposal of waste", "arranging household goods and materials", "groceries away",
            "write grocery list", "light a fire","handwashing clothing", "handwashing textiles", "packing for a trip", "washing into washing machine",
            "removing washing from washing machine", "dryer", "sorting clothes", "hanging up washing", "taking in washing", "ironing", "mangling", "folding clothes",
            "arranging clothes", "folding textiles", "arranging textiles","mending", "adjusting clothes", "sewing on buttons", "handiwork", "polishing shoes", "paying bills",
            "taxes", "mortgages", "insurance", "household administration","prepare day", "prepare journey", "prepare party", "plan day", "plan journey", "plan party",
            "making shopping lists", "daily planning", "weekly planning","shopping groceries", "shop food", "shop clothes", "shop gifts", "shop household goods", "shop electronics",
            "shop plant", "comfort baby", "wash baby","dress baby", "put baby to bed", "feeding baby", "wash child", "dress child", "put child to bed", "feed children",
            "supervise eating", "cut meat", "cut food", "help with homework","supervise child with homework", "self-study", "home reading", "talking about school",
            "taking a child to school", "accompanying a child to school","taking a child to childcare", "accompanying a child to childcare", "talking to teacher",
            "taking baby to a doctor", "taking child to a doctor", "accompanying baby to a doctor", "accompanying child to a doctor", "taking baby to a hospital visit",
            "taking child to a hospital visit", "accompanying baby to a hospital visit", "accompanying child to a hospital visit",  "taking child to sport activities",
            "accompanying child to sport activities", "taking child to cultural activities", "accompanying child to cultural activities", "taking child to activities",
            "taking baby to activities", "accompanying child to activities", "accompanying baby to activities", "physical care", "cutting hair", "cut hair", "toileting",
            "take medical care",

            # BTUSI3 Routine
            # only what has not been in Covidtus list
            "cooking", "preparing food", "set table", "washing dishes", "clearing table", "dusting", "sweeping", "packing away", "mopping", "cleaning", "washing car",
            "handwashing", "arranging clothes", "arranging textiles",

            # COVIDTUS non-routine
            "tending pet", "pet", "caring for pets", "clean cage", "clean stable", "clean aquarium",  "indoor plant care", "watering",  "mowing grass", "working in the garden",
            "taking care outdoor plants", "veterinarian", "walking the dog", "walk the dog", "playing with pet", "playing with pets", "construction renovation", "painting",
            "wallpapering", "plumbing", "electrical repairs", "decor changes", "carpentry", "roofing", "maintenance of household equipment", "repair of household equipment",
            "maintenance of household equipment", "maintenance of household appliances","maintenance of household appliances", "maintenance of car", "repair of car",
            "maintenance of bike", "repair of bike", "washing car", "washing bike", "washing scooter", "picking up", "laundromat", "shoemaker", "refueling",  "bank",
            "post office",  "council", "centrelink", "tax office", "police", "government department", "broker", "insurance", "notary public", "visit garage", "visit mechanic",
            "car wash", "warrant check", "pharmacy", "reading to child", "playing with child", "talking to child", "going for walks with children", "swimming with children",
            "bike ride", "outdoor games",  "supervision", "give guidance", "teach",

            # BTUSI3 non-routine
            # only what has not been in Covidtus list
            "online banking",


          # added by me
          "leftovers", "food storage", "kitchen organization", "food preparation", "spices and seasoning", "grilling", "freezing food", "food preservation", "slow cooker",

          "stain removal", "garbage disposal", "cleaning schedule", "cleaning supplies", "steam cleaning", "organizing closets", "decluttering", "household maintenance",

          "dry cleaning", "delicate fabrics", "laundry sorting", "stain treatment", "laundry routine", "laundry symbols", "ironing clothes", "laundry storage",

          "childproofing", "bath time", "potty training", "storytelling", "craft activities", "outdoor play", "child development", "playdate", "children's health",

          "time management", "meal planning", "appointment scheduling", "gift", "party planning", "holiday preparations", "travel arrangements", "home inventory",
          "home improvement projects",

          "budget management", "expense tracking", "bill organization", "financial planning", "investment strategies", "tax preparation", "insurance policies",
          "savings goals",

          "home security", "pest control", "HVAC maintenance", "roof maintenance", "gutter cleaning", "paint touch-ups", "furniture repairs", "home decor updates",

          "time for self-care", "stress management", "family communication", "work-life balance", "self-care routine", "relaxation techniques", "mindfulness practices",
          "prioritizing tasks"
          ]

In [15]:
def build_chores_df(df, column:str, keyword_list=None):
    """Collect the rows of ``df`` whose ``column`` text contains a chores keyword.

    df: dataframe with one row per posting
    column: name of the text column that is searched
    keyword_list: keywords to search for; defaults to the module-level ``keywords``

    Every keyword is matched as a whole word/phrase (``\\b`` on both sides), e.g. the
    keyword "supervise child with homework" does not match a text that only contains
    "child". Matching is case-sensitive.

    Returns the concatenation of the per-keyword matches with a fresh index; the
    previous index is kept as a column. A row that matches several keywords appears
    once per matching keyword, so callers drop duplicates afterwards.
    """
    import re  # local import keeps this cell independent of the import cell

    if keyword_list is None:
        keyword_list = keywords

    text = df[column]
    # re.escape: keywords are literal phrases, not patterns ("a.b" must not match "axb")
    # na=False: rows without text count as "no match" instead of breaking the mask
    filtered_dfs = [
        df[text.str.contains(fr'\b{re.escape(keyword)}\b', regex=True, na=False)]
        for keyword in keyword_list
    ]

    if not filtered_dfs:
        # pd.concat raises on an empty list; return an empty frame of the same layout
        return df.iloc[0:0].reset_index()

    chores_dataframe = pd.concat(filtered_dfs).reset_index()
    return chores_dataframe

# hh chores and care work in the topics found by bertopic

## In how many topics can the keywords be found, based on the top_n_words ?

- are traditional hh chores represented in the topics?

In [16]:
# build chores dataframe for mommit and daddit, based on whether keywords are found in the Top_n_words 
top_n_chores_mommit= build_chores_df(data_mommit, "Top_n_words")
top_n_chores_daddit= build_chores_df(data_daddit, "Top_n_words")

print(len(top_n_chores_mommit)) # 35230
print(len(top_n_chores_daddit)) # 29290

35230
29290


In [17]:
# drop duplicates, when multiple keywords are found in one posting
## drop duplicates in Mommit
top_n_clean_mommit=top_n_chores_mommit.drop_duplicates()
print(len(top_n_clean_mommit)) # mommit: 24555 

## topics in top_n_clean_mommit
topics_chores_mommit= top_n_clean_mommit["Name"].unique()
print(len(topics_chores_mommit)) # 66 topics where the top n words are in the household chores keywords

## drop duplicates in daddit
top_n_clean_daddit=top_n_chores_daddit.drop_duplicates()
print(len(top_n_clean_daddit)) # daddit: 14867

## topics in top_n_clean_daddit
topics_chores_daddit= top_n_clean_daddit["Name"].unique()
print(len(topics_chores_daddit)) # 45 topics where the top n words are in the household chores keywords

24555
66
14867
45


In [18]:
# relation all topics to household related topics
# 28.94 % of the topics in r/Mommit have household chores in their top n words
# 20.737% of the topics in r/daddit have household chores in their top n words
# NOTE(review): the denominator is len(topic_info_*), which (see the "topics + -1"
# remark where the daddit model is loaded) also counts the -1 outlier row -- confirm
# whether the outlier row should be part of the share.
relation_topics_mommit=len(topics_chores_mommit)/len(topic_info_mommit)
relation_topics_daddit=len(topics_chores_daddit)/len(topic_info_daddit)

print(relation_topics_mommit)
print(relation_topics_daddit)

0.2894736842105263
0.2073732718894009


### Create a table with topic, Count, Top_n_word and an example document

In [19]:
def add_info_topics_chores(topic_chores, topic_info, data):
    """Attach Count and Top_n_words to the topics that contain chores keywords.

    topic_chores: topic names (``Name``) whose top_n_words contain a keyword
    topic_info: BERTopic topic overview with the columns Topic, Name and Count
    data: one row per posting with its topic assignment; supplies Top_n_words

    Returns one row per distinct (topic, Top_n_words) combination; the ``index``
    column holds the row position from the intermediate per-posting join.
    """
    is_chores_topic = topic_info['Name'].isin(topic_chores)
    words_per_posting = data[["Topic", "Name", "Top_n_words"]]

    # joining on the postings repeats every topic once per posting ...
    per_posting = topic_info[is_chores_topic].merge(words_per_posting, on=['Topic', 'Name'])
    # ... so collapse the repeats again
    return per_posting.drop_duplicates().reset_index()

In [20]:
# add Count and Top_n_words for the topics that have keywords on hh chores/ childcare 
# NOTE: topics_chores_mommit / topics_chores_daddit are rebound here from the arrays of
# topic names to the resulting dataframes. Re-running this cell without first re-running
# the cell that builds the arrays passes a dataframe as `topic_chores`.
topics_chores_mommit=add_info_topics_chores(topics_chores_mommit, topic_info_mommit, data_mommit)
topics_chores_daddit=add_info_topics_chores(topics_chores_daddit, topic_info_daddit, data_daddit)
topics_chores_mommit

Unnamed: 0,index,Topic,Count,Name,Top_n_words
0,0,0,5792,0_sleep_bed_night_sleeping,sleep - bed - night - sleeping - nap - crib - ...
1,5792,2,2441,2_food_eat_cheese_meal,food - eat - cheese - meal - chicken - foods -...
2,8233,8,652,8_book_books_read_series,book - books - read - series - pout - library ...
3,8885,12,1237,12_formula_breastfeeding_milk_breast,formula - breastfeeding - milk - breast - fed ...
4,10122,29,464,29_pediatrician_doctor_appointment_doctors,pediatrician - doctor - appointment - doctors ...
...,...,...,...,...,...
61,24053,203,70,203_taxes_file_tax_irs,taxes - file - tax - irs - filed - 2019 - stim...
62,24123,215,111,215_break_breaks_need break_reading,break - breaks - need break - reading - candle...
63,24234,218,86,218_playground_playgrounds_slide_park,playground - playgrounds - slide - park - tram...
64,24320,225,119,225_cake_smash_whipped_chocolate,cake - smash - whipped - chocolate - cakes - i...


In [21]:
def example_document_df(topic_info, merged_df):
    """
    Build a table with one example document per chores topic.

    topic_info: topics_chores_mommit / topics_chores_daddit --> df with the columns
        Topic, Name and Count for the topics that have keywords in their top_n_words
    merged_df: one row per posting with its topic assignment

    The example is the first posting of a topic that is flagged as representative
    document and has an assigned probability of 1. Topics without such a posting
    are left out of the result.
    """
    is_exemplar = (merged_df["Representative_document"] == True) & (merged_df["Probability"] == 1)
    exemplars = merged_df[is_exemplar]

    rows = []
    for topic in topic_info["Topic"].unique():
        candidates = exemplars[exemplars["Topic"] == topic]
        if candidates.empty:
            continue

        first_hit = candidates.iloc[0]
        same_topic = topic_info["Topic"] == topic
        rows.append({
            "Topic": topic,
            "Name": topic_info.loc[same_topic, "Name"].iloc[0],
            "Count": topic_info.loc[same_topic, "Count"].iloc[0],
            "Top_n_words": first_hit["Top_n_words"],
            "Document": first_hit["Document"],
        })

    return pd.DataFrame(rows)

In [22]:
topics_chores_daddit=example_document_df(topics_chores_daddit, data_daddit)
topics_chores_mommit=example_document_df(topics_chores_mommit, data_mommit)

In [23]:
def create_table_topics(df, docname):
    """Write ``df`` as a bordered table into a new Word document.

    df: dataframe to export; its column names become the header row
    docname: target path without extension, '.docx' is appended
    """
    doc = Document()

    # start with the header row only; data rows are appended below
    table = doc.add_table(rows=1, cols=len(df.columns))
    table.style = 'Table Grid'  # table style with visible lines

    header_cells = table.rows[0].cells
    for header_cell, column_name in zip(header_cells, df.columns):
        header_cell.text = column_name

    # one table row per dataframe row, every value written as text
    for _, record in df.iterrows():
        row_cells = table.add_row().cells
        for row_cell, value in zip(row_cells, record):
            row_cell.text = str(value)

    # centre the table on the page and the content vertically inside each cell
    table.alignment = WD_TABLE_ALIGNMENT.CENTER
    for table_row in table.rows:
        for cell in table_row.cells:
            cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER

    doc.save(f'{docname}.docx')

In [18]:
# create a table with example for all topics including a keyword on home responsibilities - r/daddit (appendix)
# create_table_topics(topics_chores_daddit, "tables/chores_topics_daddit") 

In [None]:
# create a table with example for all topics including a keyword on home responsibilities - r/Mommit (appendix)
#create_table_topics(topics_chores_mommit, "tables/chores_topics_mommit") 

### group topics into broader categories

- inductive content analysis

categories by Daminger
- logistics/ scheduling
- care for children
- social relationships
- cleaning/ laundry
- shopping
- food
- travel / leisure
- finances
- home/ car maintenance

In [24]:
# start with damingers list and then expand/ change it
# basis are the topics, where keywords are found in the top_n_words (=representative for a topic)
# Each dict maps a broader category label to the numeric topic ids that were grouped
# under it; the ids refer to the topic model of the respective subreddit.

keywords_broader_daddit= {'Meal planning & Shopping': [1, 203],                          
                    'Cleaning, laundry & house maintanance': [32, 86, 108, 156], 
                    'Childcare':[6, 9, 25, 39, 54, 57, 88, 104, 115,127, 154, 157, 200],
                    'Child health': [12, 120, 159, 165, 188, 206],
                    'Finances':[35, 49, 79, 112, 140],
                    '(Social) Activities':[7, 26, 55, 66, 103, 106, 138, 153, 175, 194, 205],
                    'Mom health & support':[139, 207], 
                    'Other':[34, 212]
}

# NOTE(review): topic 12 is listed under both 'Childcare' and 'Finances'. assign_category
# returns the first matching category in dict order, so 12 resolves to 'Childcare' and the
# 'Finances' entry has no effect -- confirm which grouping was intended.
keywords_broader_mommit = {'Meal planning & Shopping': [2, 109, 131, 164, 165, 186, 225],                          
                    'Cleaning, laundry & house maintanance': [32, 38, 88, 133, 177, 184, 192, 193], 
                    'Childcare':[0, 12, 33, 36, 39, 40, 43, 50, 59, 85, 108, 127, 134,157, 163, 169, 182, 199],
                    'Child health': [29, 30, 54, 90, 115, 124, 130, 146],
                    'Finances':[99, 12, 128, 203],
                    '(Social) Activities':[8, 31, 35, 41, 52,62, 65, 66, 79, 105, 151, 218, 226],
                    'Mom health & support':[63, 141,144, 156, 190, 194, 195, 215],
                    'Other':[142]
}

In [25]:
def assign_category(topic, category_dict):
    """Return the broader category label of a topic.

    topic: numeric topic id
    category_dict: mapping of category label -> collection of topic ids

    The first category (in dict order) that lists the topic wins;
    'Unknown' is returned when no category lists it.
    """
    matching_labels = (label for label, members in category_dict.items() if topic in members)
    return next(matching_labels, 'Unknown')

# Add the broader category of every posting's topic as a new 'Category' column,
# using the grouping that belongs to the respective subreddit
for chores_frame, topic_groups in ((top_n_clean_daddit, keywords_broader_daddit),
                                   (top_n_clean_mommit, keywords_broader_mommit)):
    chores_frame['Category'] = chores_frame['Topic'].apply(assign_category, args=(topic_groups,))

In [26]:
print(len(top_n_clean_mommit))
print(len(top_n_clean_daddit))

24555
14867


In [28]:
# Collapse the broader categories into three coarse groups. The same mapping is used
# for both subreddits (this replaces two copy-pasted nested np.where blocks).
SMALL_CATEGORY_BY_CATEGORY = {
    'Meal planning & Shopping': 'Household chores',
    'Cleaning, laundry & house maintanance': 'Household chores',
    'Childcare': 'Care work',
    'Child health': 'Care work',
    'Finances': 'Organization & mental load',
    '(Social) Activities': 'Organization & mental load',
    'Mom health & support': 'Organization & mental load',
}


def to_small_categories(categories):
    """Map a Series of broader category labels to the coarse groups.

    Labels that are not part of the mapping (e.g. 'Other') are carried over unchanged.
    """
    return categories.map(SMALL_CATEGORY_BY_CATEGORY).fillna(categories)


top_n_clean_mommit["small_categories"] = to_small_categories(top_n_clean_mommit['Category'])
top_n_clean_daddit["small_categories"] = to_small_categories(top_n_clean_daddit['Category'])

#### table of grouped topics

In [29]:
# broader categories
def grouping_table(top_n_chores):
    """List every topic once with its assigned category and its top_n_words.

    - for appendix --> to comprehend grouping

    top_n_chores: dataframe with (at least) the columns Category, Topic and Top_n_words

    Returns the distinct (Category, Topic, Top_n_words) rows, sorted by the fixed
    category order below; categories missing from that order are placed last.
    """
    category_order = ['Childcare', 'Child health', 'Cleaning, laundry & house maintanance', 'Meal planning & Shopping',
                  '(Social) Activities', "Mom health & support", 'Finances', 'Other']
    position_of = {label: position for position, label in enumerate(category_order)}

    def _category_position(categories):
        # sort key: position of each label in category_order
        return categories.map(position_of)

    distinct_topics = top_n_chores[["Category", "Topic", "Top_n_words"]].drop_duplicates()
    return distinct_topics.sort_values(by="Category", key=_category_position)

In [30]:
top_n_clean_mommit_reduced=grouping_table(top_n_clean_mommit)
top_n_clean_daddit_reduced=grouping_table(top_n_clean_daddit)

In [31]:
top_n_clean_mommit_reduced

Unnamed: 0,Category,Topic,Top_n_words
15563,Childcare,108,teacher - school - teachers - 504 - iep - stud...
11526,Childcare,169,year old - shower - old - naked - weird - year...
15561,Childcare,157,kindergarten - school - grade - preschool - st...
12241,Childcare,85,daycare - daycares - provider - home - home da...
9856,Childcare,12,formula - breastfeeding - milk - breast - fed ...
...,...,...,...
24651,Mom health & support,156,ppd - ppa - ppd ppa - depression - doctor - he...
31582,Finances,203,taxes - file - tax - irs - filed - 2019 - stim...
31655,Finances,99,insurance - bill - medicaid - pay - hallway - ...
13245,Finances,128,childcare - daycare - pay - paying - assistanc...


In [32]:
# create a table
# create_table_topics(top_n_clean_mommit_reduced, "tables/grouping_topics_mommit")  

In [None]:
# create_table_topics(top_n_clean_daddit_reduced, "tables/grouping_topics_daddit") 

#### visualize grouped topics

In [33]:
# append the top_n_clean dfs to better visualize in one graph
top_n_clean_append = pd.concat([top_n_clean_mommit, top_n_clean_daddit])
len(top_n_clean_append)

39422

In [34]:
# broad categories
grouped_df = top_n_clean_append.groupby(['Category', 'subreddit']).size().reset_index(name='Count')

In [35]:
# small categories
grouped_df_small = top_n_clean_append.groupby(['small_categories', 'subreddit']).size().reset_index(name='Count')
grouped_df_small

Unnamed: 0,small_categories,subreddit,Count
0,Care work,Mommit,13911
1,Care work,daddit,6277
2,Household chores,Mommit,4856
3,Household chores,daddit,2794
4,Organization & mental load,Mommit,5608
5,Organization & mental load,daddit,5268
6,Other,Mommit,180
7,Other,daddit,528


plot categories

In [36]:
# broader categories
# Grouped bar chart: number of postings per broader category, one bar per subreddit.
category_order = ['Childcare', 'Child health', 'Cleaning, laundry & house maintanance', 'Meal planning & Shopping',
                  '(Social) Activities', "Mom health & support", 'Finances', 'Other']

colors = ["rgb(241, 133, 64)", "rgb(88, 176, 95)"]


fig = px.bar(grouped_df, x='Category', y="Count", color="subreddit", barmode="group", category_orders={'Category': category_order}, height=800, width=1000) # compare with 500 & 1200

# Assign the i-th entry of `colors` to the trace of the i-th subreddit
# (subreddits in order of first appearance in grouped_df)
for i, color in enumerate(colors):
    fig.update_traces(selector=dict(name=grouped_df['subreddit'].unique()[i]), marker_color=color)


fig.update_layout(
    legend=dict(
        title='Subreddit',
        font=(dict(size=18)),
        title_font=dict(size=22)
    ),
    xaxis=dict(
        showgrid=True,  # Display x-axis grid lines
        gridcolor='lightgrey',  # Set the color of the x-axis grid lines
        gridwidth=0.5  # Set the width of the x-axis grid lines
    ),
    yaxis=dict(
        showgrid=True,  # Display y-axis grid lines
        gridcolor='lightgrey',  # Set the color of the y-axis grid lines
        gridwidth=0.5  # Set the width of the y-axis grid lines
    )
)

# Updating layout and axis labels
fig.update_layout(
    plot_bgcolor='white',
    title={
        'text': 'Posts with topics on home responsibilities by category',
        'font': {'size': 32, 'family': 'Calibri'}  # Adjust the size and family of the title font as desired
    },
    xaxis_title='',
    yaxis_title='Count',
    xaxis={
        'title': {'font': {'size': 24, 'family': 'Calibri'}},  # Adjust the size and family of the x-axis label as desired
        'tickfont': {'size': 24, 'family': 'Calibri'}  # Adjust the size and family of the x-axis tick labels as desired
    },
    yaxis={
        'title': {'font': {'size': 24, 'family': 'Calibri'}},  # Adjust the size and family of the y-axis label as desired
        'tickfont': {'size': 24, 'family': 'Calibri'},  # Adjust the size and family of the y-axis tick labels as desired
    }
)


fig.show()


In [37]:
# smaller categories
# Grouped bar chart: number of postings per small category, one bar per subreddit.
# NOTE: this cell rebinds `category_order`, `colors` and `fig` from the previous plot cell.
category_order = ["Care work", 'Household chores', 'Organization & mental load']

colors = ["rgb(241, 133, 64)", "rgb(88, 176, 95)"]


fig = px.bar(grouped_df_small, x='small_categories', y="Count", color="subreddit", barmode="group", category_orders={'small_categories': category_order}, height=800, width=800)

# Assign the i-th entry of `colors` to the trace of the i-th subreddit
# (subreddits in order of first appearance in grouped_df_small)
for i, color in enumerate(colors):
    fig.update_traces(selector=dict(name=grouped_df_small['subreddit'].unique()[i]), marker_color=color)


fig.update_layout(
    legend=dict(
        title='Subreddit',
        title_font=dict(size=14)
    ),
    xaxis=dict(
        showgrid=True,  # Display x-axis grid lines
        gridcolor='lightgrey',  # Set the color of the x-axis grid lines
        gridwidth=0.5  # Set the width of the x-axis grid lines
    ),
    yaxis=dict(
        showgrid=True,  # Display y-axis grid lines
        gridcolor='lightgrey',  # Set the color of the y-axis grid lines
        gridwidth=0.5  # Set the width of the y-axis grid lines
    )
)

# Updating layout and axis labels
fig.update_layout(
    plot_bgcolor='white',
    title={
        'text': 'Postings by category and subreddit',
        'font': {'size': 32, 'family': 'Calibri'}  # Adjust the size and family of the title font as desired
    },
    xaxis_title='',
    yaxis_title='Count',
    xaxis={
        'title': {'font': {'size': 24, 'family': 'Calibri'}},  # Adjust the size and family of the x-axis label as desired
        'tickfont': {'size': 24, 'family': 'Calibri'}  # Adjust the size and family of the x-axis tick labels as desired
    },
    yaxis={
        'title': {'font': {'size': 24, 'family': 'Calibri'}},  # Adjust the size and family of the y-axis label as desired
        'tickfont': {'size': 24, 'family': 'Calibri'},  # Adjust the size and family of the y-axis tick labels as desired
    }
)


fig.show()


### visualize over time

In [38]:
# count chore occurrences per category, weekly and monthly
def weekly_monthly_chores(df):
    """
    Count postings per small category for every week and every month.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'date_time' column (anything pd.to_datetime can parse) and a
        'small_categories' column. The frame is NOT modified: the datetime
        conversion happens on a copy, so re-running the cell is idempotent and
        slices of other frames do not trigger SettingWithCopyWarning.

    Returns
    -------
    (weekly, monthly) : tuple of pandas.DataFrame
        Columns 'date_time', 'small_categories', 'Count'. 'date_time' is the
        bin label: the Sunday closing the week, or the last day of the month.
        Categories that do not occur in a bin produce no row.
    """
    dated = df.assign(date_time=pd.to_datetime(df['date_time']))

    def _count_per_category(freq):
        # value_counts() orders the categories by frequency within each bin
        return (
            dated.groupby(pd.Grouper(key='date_time', freq=freq))['small_categories']
            .value_counts()
            .reset_index(name='Count')
        )

    weekly = _count_per_category('W-SUN')
    # An explicit MonthEnd offset is equivalent to the 'M' alias but keeps
    # working on pandas >= 2.2, where that alias is deprecated in favour of 'ME'.
    monthly = _count_per_category(pd.offsets.MonthEnd())

    return weekly, monthly

# Per-category weekly / monthly counts, computed separately for each subreddit's top_n_clean frame
mommit_weekly_chores, mommit_monthly_chores=weekly_monthly_chores(top_n_clean_mommit)
daddit_weekly_chores, daddit_monthly_chores=weekly_monthly_chores(top_n_clean_daddit)

In [39]:
daddit_weekly_chores

Unnamed: 0,date_time,small_categories,Count
0,2019-12-01,Care work,23
1,2019-12-01,Organization & mental load,8
2,2019-12-01,Household chores,3
3,2019-12-08,Care work,153
4,2019-12-08,Organization & mental load,149
...,...,...,...
126,2022-03-27,Other,95
127,2022-04-03,Care work,310
128,2022-04-03,Organization & mental load,206
129,2022-04-03,Household chores,100


In [40]:
mommit_weekly_chores

Unnamed: 0,date_time,small_categories,Count
0,2019-12-01,Organization & mental load,32
1,2019-12-01,Care work,10
2,2019-12-01,Household chores,4
3,2019-12-08,Care work,160
4,2019-12-08,Organization & mental load,133
...,...,...,...
122,2022-03-27,Other,14
123,2022-04-03,Care work,958
124,2022-04-03,Household chores,488
125,2022-04-03,Organization & mental load,250


In [41]:
daddit_monthly_chores

Unnamed: 0,date_time,small_categories,Count
0,2019-12-31,Care work,579
1,2019-12-31,Organization & mental load,469
2,2019-12-31,Household chores,149
3,2019-12-31,Other,35
4,2020-01-31,Care work,795
5,2020-01-31,Organization & mental load,531
6,2020-01-31,Household chores,262
7,2020-01-31,Other,41
8,2020-02-29,Care work,603
9,2020-02-29,Organization & mental load,518


In [42]:
print(daddit_weekly_chores["Count"].sum())
print(mommit_weekly_chores["Count"].sum())

14867
24555


In [43]:
# sanity check: total of the weekly r/Mommit "Care work" counts
mommit_carework=mommit_weekly_chores[mommit_weekly_chores["small_categories"]=="Care work"]
print(mommit_carework["Count"].sum()) # should be 13911

13911


#### absolute

Postings whose BERTopic topic has household (hh) keywords among its top_n_words, manually grouped into broader categories:
- absolute
- relative

In [44]:
# absolute weekly

# visualize the broad categories based on top_n_words
dataframes=[mommit_weekly_chores, daddit_weekly_chores]
titles = ['r/Mommit: Absolute postings associated with household chores, weekly',
          'r/daddit: Absolute postings associated with household chores, weekly'
          ]  # Specify the titles for each dataframe

# Find the maximum value across all dataframes. It is computed from the data
# (the previous hard-coded max_value was never used and the range was a second
# hard-coded number), so the shared axis can neither clip nor drift from the data.
max_value = int(max(dataframe["Count"].max() for dataframe in dataframes))

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="Count", color="small_categories", title=f"{titles[i]}")
    # Vertical marker at 2020-03-11 (labelled "Start of pandemic" in the later figures)
    fig.add_vline(x="2020-03-11", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs, with 5 % headroom above the largest count
    fig.update_layout(yaxis=dict(range=[0, max_value * 1.05]))
    fig.show()

In [45]:
# absolute monthly

# visualize the broad categories based on top_n_words
dataframes=[mommit_monthly_chores,daddit_monthly_chores]
titles = ['r/Mommit: Absolute postings associated with household chores, monthly',
          'r/daddit: Absolute postings associated with household chores, monthly',
          ]  # Specify the titles for each dataframe

# Find the maximum value across all dataframes. The former hard-coded limit of
# 7000 cut off the largest monthly counts (r/Mommit "Care work" reaches 8082 in
# March 2022), so the limit is now derived from the data itself.
max_value = int(max(dataframe["Count"].max() for dataframe in dataframes))

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="Count", color="small_categories", title=f"{titles[i]}")
    # Vertical marker at 2020-03-11 (labelled "Start of pandemic" in the later figures)
    fig.add_vline(x="2020-03-11", line_color="black", line_width=3)

    # Set the same y-axis scale for both graphs, with 5 % headroom above the largest count
    fig.update_layout(yaxis=dict(range=[0, max_value * 1.05]))
    fig.show()

#### relative

##### grouping

In [46]:
def weekly_monthly_grouping(df):
    """
    Count the number of documents (rows) of df per week and per month.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a 'date_time' column (anything pd.to_datetime can parse). The
        frame is NOT modified: the datetime conversion happens on a copy.

    Returns
    -------
    (per_week, per_month) : tuple of pandas.DataFrame
        Columns 'date_time' and 'grouped_count'. 'date_time' is the bin label:
        the Sunday closing the week, or the last day of the month. Empty bins
        between the first and last date are kept with grouped_count == 0.
    """
    dated = df.assign(date_time=pd.to_datetime(df['date_time']))

    def _rows_per_bin(freq):
        return dated.groupby([pd.Grouper(key='date_time', freq=freq)]).size().reset_index(name='grouped_count')

    per_week = _rows_per_bin('W-SUN')
    # An explicit MonthEnd offset is equivalent to the "M" alias but keeps
    # working on pandas >= 2.2, where that alias is deprecated in favour of "ME".
    per_month = _rows_per_bin(pd.offsets.MonthEnd())

    return per_week, per_month

In [47]:
# Baseline (denominator) for the relative shares: number of postings per week / month
# (presumably data_mommit / data_daddit hold all postings, not only the chore-related ones -- confirm)
weekly_mommit2, monthly_mommit2=weekly_monthly_grouping(data_mommit)
weekly_daddit2, monthly_daddit2=weekly_monthly_grouping(data_daddit)
weekly_mommit2

Unnamed: 0,date_time,grouped_count
0,2019-12-01,121
1,2019-12-08,1141
2,2019-12-15,1146
3,2019-12-22,1144
4,2019-12-29,858
...,...,...
118,2022-03-06,6593
119,2022-03-13,11057
120,2022-03-20,11038
121,2022-03-27,11618


In [48]:
def relative_broad_categories(grouped_df, chores_df):
    """
    Join two frames on 'date_time' and express 'Count' as a percentage of 'grouped_count'.

    One frame must provide the 'Count' column and the other the 'grouped_count'
    column. Because the join is an inner merge, it does not matter which frame
    supplies which column; only the column order follows the argument order
    (columns of grouped_df first). The calls in this notebook pass the
    per-category counts as grouped_df and the totals as chores_df.

    Returns the merged frame with an extra 'relative chores' column
    (Count / grouped_count * 100).
    """
    joined = grouped_df.merge(chores_df, on="date_time")
    share_percent = joined["Count"] / joined["grouped_count"] * 100
    return joined.assign(**{"relative chores": share_percent})

# merge weekly grouped data with hh chores
# NOTE: the per-category chores counts are passed first (parameter `grouped_df`) and the
# per-week totals second (parameter `chores_df`), i.e. swapped relative to the parameter names.
relative_weeks_mommit=relative_broad_categories(mommit_weekly_chores, weekly_mommit2)
relative_weeks_daddit=relative_broad_categories(daddit_weekly_chores, weekly_daddit2)

relative_weeks_mommit

Unnamed: 0,date_time,small_categories,Count,grouped_count,relative chores
0,2019-12-01,Organization & mental load,32,121,26.446281
1,2019-12-01,Care work,10,121,8.264463
2,2019-12-01,Household chores,4,121,3.305785
3,2019-12-08,Care work,160,1141,14.022787
4,2019-12-08,Organization & mental load,133,1141,11.656442
...,...,...,...,...,...
122,2022-03-27,Other,14,11618,0.120503
123,2022-04-03,Care work,958,6093,15.722961
124,2022-04-03,Household chores,488,6093,8.009191
125,2022-04-03,Organization & mental load,250,6093,4.103069


In [49]:
# merge monthly grouped data with hh chores
relative_months_mommit=relative_broad_categories(mommit_monthly_chores, monthly_mommit2)
relative_months_daddit=relative_broad_categories(daddit_monthly_chores, monthly_daddit2)
relative_months_daddit

Unnamed: 0,date_time,small_categories,Count,grouped_count,relative chores
0,2019-12-31,Care work,579,7589,7.629464
1,2019-12-31,Organization & mental load,469,7589,6.179997
2,2019-12-31,Household chores,149,7589,1.963368
3,2019-12-31,Other,35,7589,0.461194
4,2020-01-31,Care work,795,8930,8.902576
5,2020-01-31,Organization & mental load,531,8930,5.946249
6,2020-01-31,Household chores,262,8930,2.933931
7,2020-01-31,Other,41,8930,0.459127
8,2020-02-29,Care work,603,7966,7.569671
9,2020-02-29,Organization & mental load,518,7966,6.502636


In [50]:
# sanity check
relative_weeks_mommit["Count"].sum()

24555

In [53]:
# to parquet
relative_months_mommit.to_parquet("figure5_percentages.parquet")
relative_months_daddit.to_parquet("figure6_percentages.parquet")

# Figures 5 & 6

In [51]:
# Figures 5 & 6: share of postings per small category over time, one figure per
# dataframe (weekly and monthly, r/Mommit and r/daddit).

# Define the desired order for the legend values
legend_order = ["Care work", "Household chores", "Organization & mental load", "Other"]
# Fixed colour per category so all figures use the same colour coding
colors = {
    'Care work': 'rgb(31, 119, 180)',
    'Household chores': 'rgb(44, 160, 44)',
    'Organization & mental load': 'rgb(214, 39, 40)',
    'Other':'rgb(127, 127, 127)'
}

dataframes = [relative_weeks_mommit, relative_weeks_daddit, relative_months_mommit, relative_months_daddit]
# One title per dataframe, in the same order as `dataframes`
titles = ['r/Mommit: Posts with topics associated with home responsibilities',
          'r/daddit: Posts with topics associated with home responsibilities',
          '',  # monthly r/Mommit figure has no title
          '']  # monthly r/daddit figure has no title

for i, df in enumerate(dataframes):
    fig = px.line(df, x='date_time', y="relative chores", color="small_categories",
                  title=f"{titles[i]}", height=500, width=1200, category_orders={'small_categories': legend_order}, color_discrete_map=colors, markers=True)


    # Set the same y-axis scale for all graphs
    fig.update_layout(yaxis=dict(range=[0, 30]))

    # Add vertical line --> start pandemic
    # (y1 is far above the fixed y-range, so the line always spans the full plot height)
    fig.add_shape(
        type="line",
        x0="2020-03-11", x1="2020-03-11",
        y0=0, y1=2500,
        line=dict(color="black", width=2),
        name="Start of pandemic"
    )

    # Empty proxy trace so the vertical line gets a legend entry. A 'lines' trace is
    # styled through `line`, not `marker`; with `marker` the legend sample was drawn
    # in a default colour instead of black.
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='lines',
            line=dict(color="black", width=2),
            showlegend=True,
            name="Start of pandemic"
        )
    )

    fig.update_layout(
        legend=dict(
            title='',
            title_font=dict(size=14)
        ),
        xaxis=dict(
            showgrid=True,  # Display x-axis grid lines
            gridcolor='lightgrey',  # Set the color of the x-axis grid lines
            gridwidth=0.5  # Set the width of the x-axis grid lines
        ),
        yaxis=dict(
            showgrid=True,  # Display y-axis grid lines
            gridcolor='lightgrey',  # Set the color of the y-axis grid lines
            gridwidth=0.5  # Set the width of the y-axis grid lines
        )
    )

    # Updating layout and axis labels
    fig.update_layout(
        plot_bgcolor='white',
        title={
            'font': {'size': 32, 'family': 'Calibri'} 
        },
        xaxis_title='',
        yaxis_title='Share (%)',
        xaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},  
            'tickfont': {'size': 24, 'family': 'Calibri'},  
            "range":["2019-12-01", "2022-04-30"],
            "dtick":"M2",
            "ticklabelmode":"period",
            # NOTE(review): the monthly data points sit on month ends, i.e. 2020-02-29 for
            # February 2020, while the tick below is placed on 2020-02-28 -- confirm this is intended.
            "tickvals":["2019-12-31", "2020-01-31", "2020-02-28", "2020-03-31", "2020-04-30", "2021-03-31", "2022-03-31"],
            # Cut the stretches between the observed periods out of the x-axis
            "rangebreaks": [
            {"bounds": ["2020-06-01", "2021-02-28"]},  
            {"bounds": ["2021-05-01", "2022-02-28"]}
        ]
            
        },
        yaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},  
            'tickfont': {'size': 24, 'family': 'Calibri'}, 
            'tickmode': 'linear',  # Set the tick mode to linear
            'dtick': 5  # Set the tick step size to 5

        },

        legend=dict(
            title='',
            title_font=dict(size=14),
            font=dict(size=18),
            traceorder='normal',  # Preserve the order of legend items
            itemsizing='trace'  # Adjust the size of legend items based on traces
        )
    )
    fig.update_xaxes(tickangle= 45)  


    fig.show()

##### no grouping

Show the documents whose topics contain the top_n_words keywords over time, relative to *all* postings, i.e. relative to data_mommit / data_daddit grouped weekly / monthly:
- weekly
- monthly

In [54]:
# hh chores frames (top_n_clean_*): number of postings per week / month
weekly_mommit, monthly_mommit=weekly_monthly_grouping(top_n_clean_mommit)
weekly_daddit, monthly_daddit=weekly_monthly_grouping(top_n_clean_daddit)


# full frames (data_*): number of postings per week / month, used as the denominator;
# this is the same call as the one that produced weekly_*2 / monthly_*2 earlier
weekly_mommit_orig, monthly_mommit_orig=weekly_monthly_grouping(data_mommit)
weekly_daddit_orig, monthly_daddit_orig=weekly_monthly_grouping(data_daddit)
monthly_mommit_orig

Unnamed: 0,date_time,grouped_count
0,2019-12-31,4698
1,2020-01-31,4985
2,2020-02-29,5365
3,2020-03-31,5121
4,2020-04-30,4711
5,2020-05-31,0
6,2020-06-30,0
7,2020-07-31,0
8,2020-08-31,0
9,2020-09-30,0


In [55]:
monthly_mommit.head()

Unnamed: 0,date_time,grouped_count
0,2019-12-31,1576
1,2020-01-31,1728
2,2020-02-29,1619
3,2020-03-31,2013
4,2020-04-30,1586


In [56]:
def rel_orig_groups(monthly_orig, monthly_chores, weekly_orig, weekly_chores, subreddit):
    """
    Relate the chores counts to the counts of the original data, per month and per week.

    All four frames need the columns 'date_time' and 'grouped_count'. For each
    granularity the two frames are inner-merged on 'date_time', yielding
    'grouped_count_orig' and 'grouped_count_chores' (both cast to int), plus
    'relative' (chores / orig * 100) and 'subreddit' (the given label).
    Missing values are replaced by 0 and rows whose original count is 0 are
    dropped; the remaining rows keep their index from the merge.

    Returns (merged_months, merged_weeks).
    """

    def _share_of_original(orig_counts, chores_counts):
        # Same pipeline for both granularities: merge, cast, ratio, label, filter
        merged = pd.merge(orig_counts, chores_counts, on="date_time", suffixes=('_orig', '_chores'))
        for count_column in ("grouped_count_chores", "grouped_count_orig"):
            merged[count_column] = merged[count_column].astype(int)

        merged["relative"] = merged["grouped_count_chores"] / merged["grouped_count_orig"] * 100
        merged = merged.fillna(0)
        merged["subreddit"] = f"{subreddit}"

        # drop rows where count = 0
        has_postings = merged['grouped_count_orig'] != 0
        return merged[has_postings]

    merged_months = _share_of_original(monthly_orig, monthly_chores)
    merged_weeks = _share_of_original(weekly_orig, weekly_chores)

    return merged_months, merged_weeks

In [57]:
merged_months_mommit, merged_weeks_mommit=rel_orig_groups(monthly_mommit_orig, monthly_mommit, weekly_mommit_orig, weekly_mommit, "Mommit")
merged_months_daddit, merged_weeks_daddit=rel_orig_groups(monthly_daddit_orig, monthly_daddit, weekly_daddit_orig, weekly_daddit, "daddit")
merged_weeks_mommit
merged_months_mommit

Unnamed: 0,date_time,grouped_count_orig,grouped_count_chores,relative,subreddit
0,2019-12-31,4698,1576,33.54619,Mommit
1,2020-01-31,4985,1728,34.663992,Mommit
2,2020-02-29,5365,1619,30.177074,Mommit
3,2020-03-31,5121,2013,39.308729,Mommit
4,2020-04-30,4711,1586,33.665888,Mommit
15,2021-03-31,6321,1856,29.362443,Mommit
27,2022-03-31,46399,14177,30.554538,Mommit


In [58]:
# create df with monthly both subreddits and weekly both subreddits
relative_monhtly_subreddits = pd.concat([merged_months_mommit, merged_months_daddit], ignore_index=True)
relative_weekly_subreddits = pd.concat([merged_weeks_mommit, merged_weeks_daddit], ignore_index=True)
relative_weekly_subreddits

#relative_monhtly_subreddits.to_csv('basis tabellen für rq3/monthly_subreddits_top_n_words.csv', index=False)
#relative_weekly_subreddits.to_csv('basis tabellen für rq3/weekly_subreddits_top_n_words.csv', index=False)

Unnamed: 0,date_time,grouped_count_orig,grouped_count_chores,relative,subreddit
0,2019-12-01,121,46,38.016529,Mommit
1,2019-12-08,1141,339,29.710780,Mommit
2,2019-12-15,1146,368,32.111693,Mommit
3,2019-12-22,1144,418,36.538462,Mommit
4,2019-12-29,858,320,37.296037,Mommit
...,...,...,...,...,...
61,2022-03-06,6033,842,13.956572,daddit
62,2022-03-13,7212,1856,25.734886,daddit
63,2022-03-20,8061,1704,21.138817,daddit
64,2022-03-27,8260,2039,24.685230,daddit


In [59]:
# visualize the share of chore-related postings among all postings (no category split),
# one figure per dataframe
dataframes = [merged_months_mommit,merged_months_daddit, merged_weeks_mommit, merged_weeks_daddit]
titles = ['Mommit: Postings with topics associated with household chores, monthly in %',
          'Daddit: Postings with topics associated with household chores, monthly in %',
          'Mommit: Postings with topics associated with household chores, weekly in %',  
          'Daddit: Postings with topics associated with household chores, weekly in %']  # Specify the titles for each dataframe

for i, dataframe in enumerate(dataframes):
    fig = px.line(dataframe, x='date_time', y="relative",
                  title=f"{titles[i]}", height=500, width=1200)
    fig.update_traces(line_color="rgb(158, 158, 158)")

    # Add vertical line --> start pandemic
    # (y1 is far above the y-range, so the line always spans the full plot height)
    fig.add_shape(
        type="line",
        x0="2020-03-11", x1="2020-03-11",
        y0=0, y1=2500,
        line=dict(color="black", width=2),
        name="Start of pandemic"
    )

    # Empty proxy trace so the vertical line gets a legend entry. A 'lines' trace is
    # styled through `line`, not `marker`; with `marker` the legend sample was drawn
    # in a default colour instead of black.
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='lines',
            line=dict(color="black", width=2),
            showlegend=True,
            name="Start of pandemic"
        )
    )

    fig.update_layout(
        legend=dict(
            title='',
            title_font=dict(size=14)
        ),
        xaxis=dict(
            showgrid=True,  # Display x-axis grid lines
            gridcolor='lightgrey',  # Set the color of the x-axis grid lines
            gridwidth=0.5  # Set the width of the x-axis grid lines
        ),
        yaxis=dict(
            showgrid=True,  # Display y-axis grid lines
            gridcolor='lightgrey',  # Set the color of the y-axis grid lines
            gridwidth=0.5  # Set the width of the y-axis grid lines
        )
    )

    # Updating layout and axis labels
    fig.update_layout(
        plot_bgcolor='white',
        title={
            'font': {'size': 32, 'family': 'Calibri'}  
        },
        xaxis_title='Date',
        yaxis_title='Frequency in %',
        xaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},  
            'tickfont': {'size': 24, 'family': 'Calibri'}  
        },
        yaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},  
            'tickfont': {'size': 24, 'family': 'Calibri'}, 
            # Same y-axis scale for all graphs (an earlier [0, 80] range that this
            # setting overrode has been removed)
            'range': [-5, 80]
        }
    )

    fig.show()

### weekly and monthly grouping together

In [60]:
relative_weeks_mommit

Unnamed: 0,date_time,small_categories,Count,grouped_count,relative chores
0,2019-12-01,Organization & mental load,32,121,26.446281
1,2019-12-01,Care work,10,121,8.264463
2,2019-12-01,Household chores,4,121,3.305785
3,2019-12-08,Care work,160,1141,14.022787
4,2019-12-08,Organization & mental load,133,1141,11.656442
...,...,...,...,...,...
122,2022-03-27,Other,14,11618,0.120503
123,2022-04-03,Care work,958,6093,15.722961
124,2022-04-03,Household chores,488,6093,8.009191
125,2022-04-03,Organization & mental load,250,6093,4.103069


In [61]:
relative_months_mommit

Unnamed: 0,date_time,small_categories,Count,grouped_count,relative chores
0,2019-12-31,Care work,864,4698,18.390805
1,2019-12-31,Organization & mental load,435,4698,9.259259
2,2019-12-31,Household chores,269,4698,5.725841
3,2019-12-31,Other,8,4698,0.170285
4,2020-01-31,Care work,961,4985,19.277834
5,2020-01-31,Household chores,391,4985,7.843531
6,2020-01-31,Organization & mental load,367,4985,7.362086
7,2020-01-31,Other,9,4985,0.180542
8,2020-02-29,Care work,929,5365,17.315937
9,2020-02-29,Organization & mental load,341,5365,6.356011


In [62]:
# Tag both r/Mommit frames with their time granularity, stack them, and build one
# label per (category, granularity) pair so each combination becomes its own line.
for tagged_frame, granularity in ((relative_weeks_mommit, "weekly"), (relative_months_mommit, "monthly")):
    tagged_frame["time"] = granularity

grouping_mommit = pd.concat([relative_weeks_mommit, relative_months_mommit])
line_labels = grouping_mommit["small_categories"] + " " + grouping_mommit["time"]
grouping_mommit["line"] = line_labels
grouping_mommit

Unnamed: 0,date_time,small_categories,Count,grouped_count,relative chores,time,line
0,2019-12-01,Organization & mental load,32,121,26.446281,weekly,Organization & mental load weekly
1,2019-12-01,Care work,10,121,8.264463,weekly,Care work weekly
2,2019-12-01,Household chores,4,121,3.305785,weekly,Household chores weekly
3,2019-12-08,Care work,160,1141,14.022787,weekly,Care work weekly
4,2019-12-08,Organization & mental load,133,1141,11.656442,weekly,Organization & mental load weekly
...,...,...,...,...,...,...,...
23,2021-03-31,Other,6,6321,0.094922,monthly,Other monthly
24,2022-03-31,Care work,8082,46399,17.418479,monthly,Care work monthly
25,2022-03-31,Organization & mental load,3273,46399,7.054031,monthly,Organization & mental load monthly
26,2022-03-31,Household chores,2735,46399,5.894524,monthly,Household chores monthly


In [63]:
# Combined r/Mommit figure: monthly shares as solid lines, weekly shares as thin,
# semi-transparent dotted lines in the same colour per category.
# Colours are assigned to the "line" values in order of first appearance.
colors = ['rgb(31, 119, 180)', 'rgb(214, 39, 40)','rgb(44, 160, 44)', 'rgb(127, 127, 127)']

fig = px.line(grouping_mommit, x='date_time', y='relative chores', color="line", height=500, width=1200,
              title="Weekly and monthly share of posts<br>with home responsibilites in the topic representations",
              color_discrete_sequence=colors)


# Add vertical line --> start pandemic
# (drawn once; the former additional add_vline call put a second line on the same spot)
fig.add_shape(
    type="line",
    x0="2020-03-11", x1="2020-03-11",
    y0=0, y1=2500,
    line=dict(color="black", width=2),
    name="Start of pandemic"
)

# Empty proxy trace so the vertical line gets a legend entry. A 'lines' trace is
# styled through `line`, not `marker`; with `marker` the legend sample was drawn
# in a default colour instead of black.
fig.add_trace(
    go.Scatter(
        x=[None],
        y=[None],
        mode='lines',
        line=dict(color="black", width=2),
        showlegend=True,
        name="Start of pandemic"
    )
)

fig.update_layout(
    legend=dict(
        title='',
        title_font=dict(size=14)
    ),
    xaxis=dict(
        showgrid=True,  # Display x-axis grid lines
        gridcolor='lightgrey',  # Set the color of the x-axis grid lines
        gridwidth=0.5  # Set the width of the x-axis grid lines
    ),
    yaxis=dict(
        showgrid=True,  # Display y-axis grid lines
        gridcolor='lightgrey',  # Set the color of the y-axis grid lines
        gridwidth=0.5  # Set the width of the y-axis grid lines
    )
)

# Updating layout and axis labels
fig.update_layout(
    plot_bgcolor='white',
    title={
        'font': {'size': 32, 'family': 'Calibri'}  
    },
    xaxis_title='Date',
    yaxis_title='Frequency in %',
    xaxis={
        'title': "", #{'font': {'size': 24, 'family': 'Calibri'}}, 
        'tickfont': {'size': 24, 'family': 'Calibri'},  
        "range":["2019-12-01", "2022-04-30"],
        "dtick":"M2",
        "ticklabelmode":"period"
    },
    yaxis={
        'title': {'font': {'size': 24, 'family': 'Calibri'}}, 
        'tickfont': {'size': 24, 'family': 'Calibri'}, 
        # Final y-axis range (an earlier [0, 80] range that this setting overrode has been removed)
        'range': [-1,30]
    }
)
# Restyle the weekly traces. All four now use rgba(); two of them used
# "rgb(r, g, b, a)", which is not a valid way to pass an alpha value.
fig.update_traces(patch={"line": {"color": "rgba(214, 39, 40, 0.30)", "width": 1.5, "dash": 'dot'}}, selector={"legendgroup": "Organization & mental load weekly"}) 
fig.update_traces(patch={"line": {"color": "rgba(31, 119, 180, 0.30)", "width": 1.5, "dash": 'dot'}}, selector={"legendgroup": "Care work weekly"}) 
fig.update_traces(patch={"line": {"color": "rgba(44, 160, 44, 0.30)", "width": 1.5, "dash": 'dot'}}, selector={"legendgroup": "Household chores weekly"}) 
fig.update_traces(patch={"line": {"color": "rgba(127, 127, 127, 0.30)", "width": 1.5, "dash": 'dot'}}, selector={"legendgroup": "Other weekly"}) 

fig.update_xaxes(tickangle= 45)  

fig.show()
