# This notebook is used for data exploration and visualization

In [1]:
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objects as go

## original unprocessed data


In [2]:
def load_dataset(csv:str):
    """
    Load a semicolon-separated mommit/daddit export and drop June 2020 and October 2020.

    Args:
        csv: path (or file-like object) handed straight to ``pd.read_csv``.
            The first column of the file is discarded. A ``date_time`` column is
            required; it is compared as text against ISO dates, so it is expected
            to hold ``YYYY-MM-DD[ HH:MM:SS]`` strings.

    Returns:
        DataFrame without the rows dated June 2020 or October 2020. ``reset_index()``
        is called without ``drop=True``, so the pre-filter row labels are kept in an
        ``index`` column.
    """
    # load in dataset; the first column is skipped (presumably a saved row index -- TODO confirm)
    data = pd.read_csv(csv, sep=';').iloc[:, 1:]
    stamps = data['date_time']

    # Half-open month boundaries: unlike '<= ... 23:59:59' style limits they also
    # classify date-only values and values with fractional seconds correctly.
    keep = (
        (stamps < '2020-06-01')
        | ((stamps >= '2020-07-01') & (stamps < '2020-10-01'))
        | (stamps >= '2020-11-01')
    )
    data_corr_times = data.loc[keep]

    # reset index after filtering (the old labels stay available in the 'index' column)
    data_corr_times = data_corr_times.reset_index()

    return data_corr_times

In [3]:
# Load the original (unprocessed) exports; load_dataset drops June and October 2020
mommit_orig=load_dataset(csv="mommit_subs_comments_final.csv") # 96545 rows
daddit_orig=load_dataset(csv="daddit_subs_comments_final.csv") # 109775 rows

### Number of posts over time

In [5]:
def count_over_time(df, category):
    """
    Count the posts of one category per day and per month.

    Args:
        df: DataFrame with a ``category`` column and a ``date_time`` column that
            ``pd.to_datetime`` can parse. The frame itself is not modified.
        category: value of ``df["category"]`` to keep (the notebook passes
            "submissions" or "comments").

    Returns:
        tuple ``(count_per_day, count_per_month, average_count)``:
        - count_per_day: columns ``date_time`` / ``count``, one row per calendar day
          between the first and last matching post (days without posts have count 0)
        - count_per_month: same layout, one row per month, labelled with the month end
        - average_count: mean daily count over the days that have at least one post
    """
    # Work on a copy of the matching rows so the caller's 'date_time' column keeps its dtype
    df_category = df[df["category"] == category].copy()
    df_category["date_time"] = pd.to_datetime(df_category["date_time"])

    # group by day
    count_per_day = (
        df_category.groupby(pd.Grouper(key="date_time", freq="D"))
        .size()
        .reset_index(name="count")
    )
    # group by month; the MonthEnd offset object is used instead of the 'M' alias,
    # which newer pandas releases deprecate in favour of 'ME'
    count_per_month = (
        df_category.groupby(pd.Grouper(key="date_time", freq=pd.offsets.MonthEnd()))
        .size()
        .reset_index(name="count")
    )

    # Average per day, ignoring zero-count days (those belong to months not in this analysis)
    active_days = count_per_day.loc[count_per_day["count"] > 0, "count"]
    average_count = active_days.mean()

    return count_per_day, count_per_month, average_count

In [6]:
def graph_over_time(count_day, average, subreddit, y1, range_y, category):
    """
    Visualizes daily comments/submissions for subreddit of choice
    Args:
        count_day= dataframe with daily counted posts (columns 'date_time' and 'count')
        average= average count, drawn as a dashed red horizontal line
        subreddit= which subreddit (inserted into the title)
        y1= upper end of the vertical line marking the start of the pandemic
        range_y= upper limit of the y axis
        category= "submissions" or "comments" (inserted into the title and the legend)
    Returns nothing; the figure is displayed with fig.show().
    """
    # The legend text follows the plotted category instead of always saying "submissions"
    average_label = f"Average {category} per day"

    # line plot (the title is set together with its font in the layout below)
    fig = px.line(count_day, x='date_time', y="count", height=500, width=1200)

    # Grey line for the daily counts
    fig.update_traces(line_color="rgb(158, 158, 158)")

    # Add vertical line --> start pandemic
    fig.add_shape(
        type="line",
        x0="2020-03-08", x1="2020-03-08",
        y0=0, y1=y1,
        line=dict(color="black", width=2),
        name="Start of pandemic"
    )

    # Add horizontal line for the average count per day
    fig.add_shape(
        type="line",
        x0=count_day['date_time'].min(), x1=count_day['date_time'].max(),
        y0=average, y1=average,
        line=dict(color="red", width=2, dash='dash'),
        name=average_label
    )

    # Custom legend items: empty traces styled like the two shapes above
    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='lines',
            line=dict(color="black", width=2),
            showlegend=True,
            name="<span style='font-size: 24px;'>Start of pandemic</span>"
        )
    )

    fig.add_trace(
        go.Scatter(
            x=[None],
            y=[None],
            mode='lines',
            line=dict(color="red", width=2, dash='dash'),
            showlegend=True,
            name=f"<span style='font-size: 24px;'>{average_label}</span>",
        )
    )

    # Legend and light grey grid lines on both axes
    fig.update_layout(
        legend=dict(
            title='',
            title_font=dict(size=14),
            font={'size': 20, 'family': 'Calibri'}
        ),
        xaxis=dict(showgrid=True, gridcolor='lightgrey', gridwidth=0.5),
        yaxis=dict(showgrid=True, gridcolor='lightgrey', gridwidth=0.5)
    )

    # Titles, fonts and axis ranges
    fig.update_layout(
        xaxis_title="",
        yaxis_title="Count",
        title={
            'text': f"Number of {category} counted per Day - {subreddit}",
            'font': {'size': 32, 'family': 'Calibri'}
        },
        margin=dict(l=50, r=50, t=80, b=50),
        plot_bgcolor='white',
        xaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},
            'tickfont': {'size': 24, 'family': 'Calibri'},
            "range": ["2019-12-01", "2022-04-30"],  # fixed window shared by all daily plots
            "dtick": "M2",  # one tick every two months
            "ticklabelmode": "period"
        },
        yaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},
            'tickfont': {'size': 24, 'family': 'Calibri'},
            'range': [0, range_y]
        }
    )

    fig.update_xaxes(tickangle=45)

    fig.show()

### r/Mommit

In [7]:
# submissions counted per day and per month, plus the daily average, for r/Mommit
day_m, month_m, average_m=count_over_time(mommit_orig, category="submissions")

# comments counted per day and per month, plus the daily average, for r/Mommit
day_m_comments, month_m_comments, average_m_comments=count_over_time(mommit_orig, category="comments")

average_m_comments

416.44392523364485

In [8]:
# visualize submissions over time (daily) for r/Mommit
graph_over_time(day_m, average_m, "r/Mommit", y1=150, range_y=150, category="submissions")

In [9]:
# visualize comments over time (daily) for r/Mommit
graph_over_time(day_m_comments, average_m_comments, "r/Mommit", y1=2500, range_y=2500, category="comments")

### r/daddit

In [10]:
# submissions counted per day and per month, plus the daily average, for r/daddit
day_d, month_d, average_d=count_over_time(daddit_orig, category="submissions")

# comments counted per day and per month, plus the daily average, for r/daddit
day_d_comments, month_d_comments, average_d_comments=count_over_time(daddit_orig, category="comments")
average_d_comments

466.9252336448598

In [11]:
# visualize submissions over time (daily) for r/daddit
graph_over_time(day_d, average_d, "r/daddit", y1=150, range_y=150, category="submissions")

In [12]:
# visualize comments over time (daily) for r/daddit
graph_over_time(day_d_comments, average_d_comments, "r/daddit", y1=2500, range_y=2500, category="comments")

### Number of unique users over time


In [13]:
# Convert the 'date_time' column to datetime (modifies mommit_orig in place)
mommit_orig['date_time'] = pd.to_datetime(mommit_orig['date_time'])

# Add calendar month and year columns
mommit_orig['month'] = mommit_orig['date_time'].dt.month
mommit_orig['year'] = mommit_orig['date_time'].dt.year

# One row per (year, month, author); 'count' is the number of rows that author has in that month.
# An author is therefore unique within a month but can appear again in another month
# (e.g. in January 2020 and again in March 2020).
result = mommit_orig.groupby(['year', 'month', 'author']).size().reset_index(name='count')

result

Unnamed: 0,year,month,author,count
0,2019,12,-_-k,1
1,2019,12,-cc0unt-nt,1
2,2019,12,-leeson,1
3,2019,12,-lust4life-,1
4,2019,12,-mooncake-,1
...,...,...,...,...
28795,2022,3,zuks28,3
28796,2022,3,zulham_134,1
28797,2022,3,zulupanda,1
28798,2022,3,zypher0_,1


In [14]:
# unique authors over all months
unique_author_count = result['author'].nunique()
unique_author_count

21699

In [15]:
# Index the per-(year, month, author) rows by the first day of their month.
# A new frame is built so that `result` itself is left unchanged and the cell can be re-run.
result_by_date = result.assign(
    date=pd.to_datetime(result[['year', 'month']].assign(day=1))
).set_index('date')

# Number of author rows per month; `result` has one row per author and month, so this is
# the number of distinct authors. Months without any rows show up with 0.
# The MonthEnd offset object replaces the 'M' alias, which newer pandas releases deprecate.
count_per_month = result_by_date['author'].resample(pd.offsets.MonthEnd()).count().reset_index()

# Display the counts per month
count_per_month

Unnamed: 0,date,author
0,2019-12-31,2274
1,2020-01-31,2479
2,2020-02-29,2615
3,2020-03-31,2724
4,2020-04-30,2590
5,2020-05-31,0
6,2020-06-30,0
7,2020-07-31,0
8,2020-08-31,0
9,2020-09-30,0


# Cleaned data (after preprocessing)

In [16]:
# read in the preprocessed data (= cleaned data --> end result of preprocessing_BERT_modularisiert.py)
mommit_text=load_dataset('mommit_clean.csv')
daddit_text=load_dataset('daddit_clean.csv')
print(len(mommit_text))
print(len(daddit_text))

77599
78963


## Upvotes

In [17]:
def count_score(df):
    """
    Tabulate how many rows of ``df`` share each value of the 'score' column.

    Returns a DataFrame with the columns 'score' and 'count', in the order
    produced by ``value_counts`` (most frequent score first).
    """
    frequency_table = (
        df['score']
        .value_counts()
        .rename_axis('score')         # the distinct scores become the 'score' column
        .reset_index(name='count')    # their frequencies become the 'count' column
    )
    return frequency_table

In [18]:
def descriptive_stats_upvotes(df):
    """
    Summarise the 'score' column of ``df``.

    Returns a tuple ``(maximum, minimum, mean, counted_scores)`` where
    ``counted_scores`` is the frequency table produced by ``count_score(df)``.
    """
    scores = df["score"]
    highest = scores.max()
    lowest = scores.min()
    average = scores.mean()

    return highest, lowest, average, count_score(df)

In [19]:
# descriptive statistics r/Mommit
max_m, min_m, mean_m, counted_scores_m = descriptive_stats_upvotes(mommit_text)
mean_m

12.94439361331976

In [20]:
# what is the most frequent upvote --> 1
# (sorted ascending by frequency, so the most frequent score is in the last row)
counted_scores_m=counted_scores_m.sort_values(by="count")
counted_scores_m

Unnamed: 0,score,count
760,980,1
533,309,1
534,493,1
535,560,1
536,464,1
...,...,...
4,5,3264
3,4,3485
2,3,8536
1,2,17030


In [21]:
# descriptive statistics r/daddit
max_d, min_d, mean_d, counted_scores_d = descriptive_stats_upvotes(daddit_text)
mean_d

17.21874802122513

## manually group the upvotes into categories

In [22]:
def upvote_categories(df):
    """
    Group the upvotes ('score' column) into categories and show their histogram.

    Side effect: adds (or overwrites) the categorical column 'Upvote_Category' on ``df``.
    The first value of ``df['subreddit']`` is used in the plot title.

    Bins are closed on the left, so each label names its lower edge:
    'Negative' is score < 0, '0' is exactly 0 for integer scores, '1-5' is 1 <= score < 5,
    ..., '500-1000' is 500 <= score < 1000 and '1000+' is score >= 1000.
    """
    # Lower edges of the bins; the final inf edge keeps scores of 1000 and more in '1000+'
    categories = [-float('inf'), 0, 1, 5, 10, 50, 100, 200, 500, 1000, float('inf')]
    labels = ['Negative', '0', '1-5', '5-10', '10-50', '50-100', '100-200', '200-500', '500-1000', '1000+']

    # create new variable "Upvote_Category"; right=False lines the labels up with the edges
    df['Upvote_Category'] = pd.cut(df['score'], bins=categories, labels=labels, right=False)

    # histogram
    fig = px.histogram(df['Upvote_Category'], nbins=50)
    fig.update_traces(marker_color="rgb(158, 158, 158)", showlegend=False)

    fig.update_layout(
        plot_bgcolor="white",
        xaxis_title='Upvote category',
        yaxis_title='Frequency',
        title={
            # positional lookup: works even when the frame's index does not start at 0
            "text": f"Grouped upvotes, r/{df['subreddit'].iloc[0]}",
            "font": {'size': 32, 'family': 'Calibri'}
        },
        xaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},
            'tickfont': {'size': 24, 'family': 'Calibri'}
        },
        yaxis={
            'title': {'font': {'size': 24, 'family': 'Calibri'}},
            'tickfont': {'size': 24, 'family': 'Calibri'},
            'range': [0, 50000]  # fixed so the plots of both subreddits share one scale
        }
    )
    # Show the categories in their logical order rather than in order of appearance
    fig.update_xaxes(categoryorder='array', categoryarray=labels)

    fig.show()

In [23]:
# most upvotes are b/w 0 and 5
upvote_categories(mommit_text)
upvote_categories(daddit_text)

## Number of submissions and comments per day by subreddit

### r/Mommit

In [24]:
# number of submissions
## r/Mommit
mommit_submissions=mommit_text[mommit_text["category"]=="submissions"]
print(len(mommit_submissions))

## r/daddit
daddit_submissions=daddit_text[daddit_text["category"]=="submissions"]
print(len(daddit_submissions))

# number of comments
## r/Mommit
mommit_comments=mommit_text[mommit_text["category"]=="comments"]
print(len(mommit_comments))

## r/daddit
daddit_comments=daddit_text[daddit_text["category"]=="comments"]
print(len(daddit_comments))



7334
9740
70265
69223


In [25]:
# counted submissions per day, month and the average for r/Mommit -- cleaned dataset
day_m_clean, month_m_clean, average_m_clean=count_over_time(mommit_text, category="submissions")

# counted comments per day, month and the average for r/Mommit -- cleaned dataset
day_m_comments_clean, month_m_comments_clean, average_m_comments_clean=count_over_time(mommit_text, category="comments")

print(average_m_clean)
print(average_m_comments_clean)

34.271028037383175
328.3411214953271


In [26]:
# drop the zero-count days (months with no information); note this overwrites day_m_clean
day_m_clean = day_m_clean[day_m_clean['count'] != 0]
day_m_clean


Unnamed: 0,date_time,count
0,2019-12-01,19
1,2019-12-02,14
2,2019-12-03,26
3,2019-12-04,29
4,2019-12-05,17
...,...,...
847,2022-03-27,83
848,2022-03-28,96
849,2022-03-29,81
850,2022-03-30,92


In [27]:
graph_over_time(day_m_clean, average_m_clean, "r/Mommit", y1=150, range_y=150, category="submissions")

In [28]:
# drop 0 rows for months with no information
day_m_comments_clean = day_m_comments_clean[day_m_comments_clean['count'] != 0]

graph_over_time(day_m_comments_clean, average_m_comments_clean, "r/Mommit", y1=2500, range_y=2500, category="comments")

### r/daddit

In [29]:
# submissions counted per day and per month, plus the daily average, for r/daddit -- cleaned dataset
day_d_clean, month_d_clean, average_d_clean=count_over_time(daddit_text, category="submissions")

# comments counted per day and per month, plus the daily average, for r/daddit -- cleaned dataset
day_d_comments_clean, month_d_comments_clean, average_d_comments_clean=count_over_time(daddit_text, category="comments")

print(average_d_clean)
print(average_d_comments_clean)

45.51401869158879
323.47196261682245


In [30]:
# drop 0 rows for months with no information
day_d_clean = day_d_clean[day_d_clean['count'] != 0]

# r/daddit submissions
graph_over_time(day_d_clean, average_d_clean, "r/daddit", y1=150, range_y=150, category="submissions")

In [31]:
# drop 0 rows for months with no information
day_d_comments_clean = day_d_comments_clean[day_d_comments_clean['count'] != 0]

# r/daddit comments
graph_over_time(day_d_comments_clean, average_d_comments_clean, "r/daddit", y1=2500, range_y=2500, category="comments")

### Combined monthly graph of r/Mommit and r/daddit submissions and comments

In [32]:
# submissions combined: r/daddit is the left frame (count_x), r/Mommit the right one (count_y)
submissions_monthly= pd.merge(month_d_clean, month_m_clean, on="date_time", how="outer")
submissions_monthly= submissions_monthly.rename(columns={'count_x': 'r/daddit Submissions', 'count_y': 'r/Mommit Submissions'})

In [33]:
# comments combined: r/daddit is the left frame (count_x), r/Mommit the right one (count_y)
comments_monthly= pd.merge(month_d_comments_clean, month_m_comments_clean, on="date_time", how="outer")
comments_monthly= comments_monthly.rename(columns={'count_x': 'r/daddit Comments', 'count_y': 'r/Mommit Comments'})

In [34]:
# comments and submissions for both subreddits
all_postings=pd.merge(submissions_monthly, comments_monthly, on="date_time", how="outer")

In [35]:
# posts = submissions + comments, per subreddit and month
all_postings["Posts r/daddit"]=all_postings["r/daddit Submissions"]+ all_postings["r/daddit Comments"]
all_postings["Posts r/Mommit"]=all_postings["r/Mommit Submissions"] + all_postings["r/Mommit Comments"]

# keep only the months in which r/Mommit has posts (r/daddit is not checked here)
all_postings = all_postings[all_postings['Posts r/Mommit'] != 0]
all_postings

Unnamed: 0,date_time,r/daddit Submissions,r/Mommit Submissions,r/daddit Comments,r/Mommit Comments,Posts r/daddit,Posts r/Mommit
0,2019-12-31,1376,726,6213,3972,7589,4698
1,2020-01-31,1416,716,7514,4269,8930,4985
2,2020-02-29,1329,809,6637,4556,7966,5365
3,2020-03-31,1315,860,6115,4261,7430,5121
4,2020-04-30,1196,798,5098,3913,6294,4711
15,2021-03-31,1099,910,6353,5411,7452,6321
27,2022-03-31,2009,2515,31293,43883,33302,46398


In [36]:
# turn into long format - posts (submissions + comments)
df_melted_posts = pd.melt(all_postings, id_vars='date_time', value_vars=['Posts r/daddit', "Posts r/Mommit" ],
                    var_name='Variable', value_name='Value')
df_melted_posts

Unnamed: 0,date_time,Variable,Value
0,2019-12-31,Posts r/daddit,7589
1,2020-01-31,Posts r/daddit,8930
2,2020-02-29,Posts r/daddit,7966
3,2020-03-31,Posts r/daddit,7430
4,2020-04-30,Posts r/daddit,6294
5,2021-03-31,Posts r/daddit,7452
6,2022-03-31,Posts r/daddit,33302
7,2019-12-31,Posts r/Mommit,4698
8,2020-01-31,Posts r/Mommit,4985
9,2020-02-29,Posts r/Mommit,5365


In [37]:
# turn into long format - submissions
df_melted_submission = pd.melt(all_postings, id_vars='date_time', value_vars=['r/daddit Submissions', 'r/Mommit Submissions' ],
                    var_name='Variable', value_name='Value')

In [38]:
df_melted_submission

Unnamed: 0,date_time,Variable,Value
0,2019-12-31,r/daddit Submissions,1376
1,2020-01-31,r/daddit Submissions,1416
2,2020-02-29,r/daddit Submissions,1329
3,2020-03-31,r/daddit Submissions,1315
4,2020-04-30,r/daddit Submissions,1196
5,2021-03-31,r/daddit Submissions,1099
6,2022-03-31,r/daddit Submissions,2009
7,2019-12-31,r/Mommit Submissions,726
8,2020-01-31,r/Mommit Submissions,716
9,2020-02-29,r/Mommit Submissions,809


In [39]:
# plot - monthly submissions, restricted to the months before May 2020
colors = ["rgb(88, 176, 95)", "rgb(241, 133, 64)"]
early_submissions = df_melted_submission[df_melted_submission["date_time"]<"2020-05-01"]

fig = px.line(early_submissions, x='date_time', y='Value', color='Variable', height=500, width=1200, markers=True)

# One colour per line: green for the first 'Variable' value, orange for the second
variable_names = df_melted_submission['Variable'].unique()
for i, color in enumerate(colors):
    fig.update_traces(selector={"name": variable_names[i]}, line_color=color)

# Vertical line --> start pandemic
pandemic_start = "2020-03-08"
fig.add_shape(
    type="line",
    x0=pandemic_start, x1=pandemic_start,
    y0=0, y1=2500,
    line={"color": "black", "width": 2},
    name="Start of pandemic"
)

# Legend entry for the vertical line (an empty trace)
pandemic_legend_item = go.Scatter(
    x=[None],
    y=[None],
    mode='lines',
    marker={"color": "black", "size": 10},
    showlegend=True,
    name="Start of pandemic"
)
fig.add_trace(pandemic_legend_item)

# Legend fonts and light grey grid lines on both axes
grid_style = {"showgrid": True, "gridcolor": 'lightgrey', "gridwidth": 0.5}
fig.update_layout(
    legend={"title": '', "title_font": {"size": 14}, "font": {"size": 18}},
    xaxis=dict(grid_style),
    yaxis=dict(grid_style)
)

# Title, axis labels, fonts and the visible date range
label_font = {'size': 24, 'family': 'Calibri'}
fig.update_layout(
    plot_bgcolor='white',
    title={'text': 'Monthly submissions', 'font': {'size': 32, 'family': 'Calibri'}},
    xaxis_title='',
    yaxis_title='Frequency',
    xaxis={
        'title': {'font': dict(label_font)},
        'tickfont': dict(label_font),
        "range": ["2019-12-20", "2020-05-01"],
        "dtick": "M1",
        "ticklabelmode": "period"
    },
    yaxis={
        'title': {'font': dict(label_font)},
        'tickfont': dict(label_font),
    }
)
fig.update_xaxes(tickangle=45)

fig.show()


In [40]:
# long format - comments
df_melted_comments = pd.melt(all_postings, id_vars='date_time', value_vars=['r/daddit Comments', 'r/Mommit Comments'],
                    var_name='Variable', value_name='Value')

In [41]:
# plot - monthly comments
colors = ["rgb(88, 176, 95)", "rgb(241, 133, 64)"]

fig = px.line(df_melted_comments, x='date_time', y='Value', color='Variable', height=500, width=1200)

# One colour per line: green for the first 'Variable' value, orange for the second
variable_names = df_melted_comments['Variable'].unique()
for i, color in enumerate(colors):
    fig.update_traces(selector={"name": variable_names[i]}, line_color=color)

# Vertical line --> start pandemic
pandemic_start = "2020-03-08"
fig.add_shape(
    type="line",
    x0=pandemic_start, x1=pandemic_start,
    y0=0, y1=40000,
    line={"color": "black", "width": 2},
    name="Start of pandemic"
)

# Legend entry for the vertical line (an empty trace)
pandemic_legend_item = go.Scatter(
    x=[None],
    y=[None],
    mode='lines',
    marker={"color": "black", "size": 10},
    showlegend=True,
    name="Start of pandemic"
)
fig.add_trace(pandemic_legend_item)

# Legend fonts and light grey grid lines on both axes
grid_style = {"showgrid": True, "gridcolor": 'lightgrey', "gridwidth": 0.5}
fig.update_layout(
    legend={"title": '', "title_font": {"size": 14}, "font": {"size": 18}},
    xaxis=dict(grid_style),
    yaxis=dict(grid_style)
)

# Title, axis labels, fonts and the visible date range (the y range is left automatic)
label_font = {'size': 24, 'family': 'Calibri'}
fig.update_layout(
    plot_bgcolor='white',
    title={'text': 'Monthly comments', 'font': {'size': 32, 'family': 'Calibri'}},
    xaxis_title='',
    yaxis_title='Frequency',
    xaxis={
        'title': {'font': dict(label_font)},
        'tickfont': dict(label_font),
        "range": ["2019-12-01", "2022-04-30"],
        "dtick": "M2",
        "ticklabelmode": "period"
    },
    yaxis={
        'title': {'font': dict(label_font)},
        'tickfont': dict(label_font),
    }
)

fig.update_xaxes(tickangle=45)
fig.show()

# Figure 1
Combine submissions and comments into posts -- overall post volume

In [44]:
# plot - posts (submissions + comments) per month, Figure 1
colors = ["rgb(88, 176, 95)", "rgb(241, 133, 64)"]


fig = px.line(df_melted_posts, x='date_time', y='Value', color='Variable', height=500, width=1200, markers=True)
# One colour per line: green for the first 'Variable' value, orange for the second
for i, color in enumerate(colors):
    fig.update_traces(selector=dict(name=df_melted_posts['Variable'].unique()[i]), line_color=color)

# Add vertical line --> start pandemic
fig.add_shape(
    type="line",
    x0="2020-03-08", x1="2020-03-08",
    y0=0, y1=40000,
    line=dict(color="black", width=2),
    name="Start of pandemic"
)


# Legend entry for the vertical line (an empty trace)
fig.add_trace(
    go.Scatter(
        x=[None],
        y=[None],
        mode='lines',
        marker=dict(color="black", size=10),
        showlegend=True,
        name="Start of pandemic"
    )
)


fig.update_layout(
    legend=dict(
        title='',
        title_font=dict(size=14),
        font=dict(size=18)

    ),
    xaxis=dict(
        showgrid=True,  # Display x-axis grid lines
        gridcolor='lightgrey',  # Set the color of the x-axis grid lines
        gridwidth=0.5  # Set the width of the x-axis grid lines
    ),
    yaxis=dict(
        showgrid=True,  # Display y-axis grid lines
        gridcolor='lightgrey',  # Set the color of the y-axis grid lines
        gridwidth=0.5  # Set the width of the y-axis grid lines
    )
)


# Updating layout and axis labels
fig.update_layout(
    plot_bgcolor='white',

    title={
        'text': 'Monthly posts',
        'font': {'size': 32, 'family': 'Calibri'}
    },
    xaxis_title='',
    yaxis_title='Count',
    xaxis={
        'title': {'font': {'size': 24, 'family': 'Calibri'}},
        'tickfont': {'size': 24, 'family': 'Calibri'},
        "range":["2019-12-01", "2022-04-30"],
        "dtick":"M2",
        "ticklabelmode":"period",
        # One tick per plotted month end; February 2020 ends on the 29th (leap year)
        "tickvals":["2019-12-31", "2020-01-31", "2020-02-29", "2020-03-31", "2020-04-30", "2021-03-31", "2022-03-31"],
        # Cut the stretches without data out of the axis
        "rangebreaks": [
            {"bounds": ["2020-06-01", "2021-02-28"]},
            {"bounds": ["2021-05-01", "2022-02-28"]}
        ]
    },
    yaxis={
        'title': {'font': {'size': 24, 'family': 'Calibri'}},
        'tickfont': {'size': 24, 'family': 'Calibri'},

    }
)

fig.update_xaxes(tickangle= 45)
fig.show()

# How many words do comments have on average

In [42]:
# filter for comments
mommit_text_comments= mommit_text[mommit_text["category"]=="comments"].reset_index()
daddit_text_comments= daddit_text[daddit_text["category"]=="comments"].reset_index()

In [43]:
# reduce df to text and date columns
mommit_comments_short= mommit_text_comments[["whole_text", "date_time"]]
daddit_comments_short= daddit_text_comments[["whole_text", "date_time"]]

In [44]:
# average length of comments, in whitespace-separated words
count_words = lambda text: len(text.split())

# Apply the lambda function to "whole_text" and compute the mean
average_words_m = mommit_comments_short['whole_text'].apply(count_words).mean() # 58.14
average_words_d = daddit_comments_short['whole_text'].apply(count_words).mean() # 47.37

Find the comment with the largest number of words

In [45]:
# Word count of every r/Mommit comment, computed once
word_counts_m = mommit_comments_short['whole_text'].apply(lambda text: len(text.split()))

# idxmax returns an index label, so the row is looked up with .loc rather than .iloc
longest_label = word_counts_m.idxmax()

# (comment text, number of words) of the longest comment
max_row = (
    mommit_comments_short.loc[longest_label, 'whole_text'],
    int(word_counts_m.loc[longest_label]),
)
max_row[1] # count words; max_row[0] holds the comment itself

1228

# how many words do submissions have on average

In [46]:
mommit_text_submissions= mommit_text[mommit_text["category"]=="submissions"].reset_index()
daddit_text_submissions= daddit_text[daddit_text["category"]=="submissions"].reset_index()

# keep only text and date col
mommit_submissions_short= mommit_text_submissions[["whole_text", "date_time"]]
daddit_submissions_short= daddit_text_submissions[["whole_text", "date_time"]]

In [47]:
# find average length of submissions, in whitespace-separated words
count_words = lambda text: len(text.split())

# Apply the lambda function to the 'whole_text' column and compute the mean
# NOTE(review): this reuses the names average_words_m / average_words_d, so the comment averages are overwritten
average_words_m = mommit_submissions_short['whole_text'].apply(count_words).mean() # 81.55
average_words_d = daddit_submissions_short['whole_text'].apply(count_words).mean() # 56.30

# Percentage of submissions with no comments

- Note: comments with fewer than 10 words were dropped from this dataset.
- However, all comments count in this calculation.

In [48]:
# mommit
zero_comments= mommit_text_submissions[mommit_text_submissions["num_comments"]==0]
percentage_comments=(len(zero_comments) / len(mommit_text_submissions))*100
percentage_comments

28.275391956373554

In [49]:
# daddit
zero_comments_d= daddit_text_submissions[daddit_text_submissions["num_comments"]==0]
percentage_comments_d=(len(zero_comments_d) / len(daddit_text_submissions))*100
percentage_comments_d

19.50718685831622

# Average number of comments per submission

In [50]:
mean_comments_mommit=mommit_text_submissions["num_comments"].mean()
mean_comments_mommit

12.106066802999319

In [51]:
mean_comments_daddit=daddit_text_submissions["num_comments"].mean()
mean_comments_daddit

10.046303901437371

# Percentage of submissions with picture

In [52]:
# Mommit
with_image= mommit_text_submissions[mommit_text_submissions["image"]==1]
percentage_image=(len(with_image) / len(mommit_text_submissions))*100
percentage_image

17.70961145194274

In [53]:
# daddit
with_image_d= daddit_text_submissions[daddit_text_submissions["image"]==1]
percentage_image_d=(len(with_image_d) / len(daddit_text_submissions))*100
percentage_image_d

39.753593429158116

# Unique users
- per month
- or for whole sample?

whole sample

In [54]:
## Mommit
# Convert the 'date_time' column to datetime (modifies mommit_text in place)
mommit_text['date_time'] = pd.to_datetime(mommit_text['date_time'])

# Add calendar month and year columns
mommit_text['month'] = mommit_text['date_time'].dt.month
mommit_text['year'] = mommit_text['date_time'].dt.year

# One row per author; 'count' is the number of rows (posts) by that author,
# so the number of rows in `result` is the number of unique authors
result = mommit_text.groupby(['author']).size().reset_index(name='count')
result

Unnamed: 0,author,count
0,--00empty00--,1
1,--eight,3
2,-CrazyLikeASloth-,2
3,-DontPanic42-,8
4,-Flossie-,1
...,...,...
19630,zurisadai,1
19631,zyathlith,6
19632,zypher0_,1
19633,zzsleepytinizz,2


In [55]:
## daddit
# Convert the 'date_time' column to datetime (modifies daddit_text in place)
daddit_text['date_time'] = pd.to_datetime(daddit_text['date_time'])

# Add calendar month and year columns
daddit_text['month'] = daddit_text['date_time'].dt.month
daddit_text['year'] = daddit_text['date_time'].dt.year

# One row per author; 'count' is the number of rows (posts) by that author,
# so the number of rows in `result_d` is the number of unique authors
result_d = daddit_text.groupby(['author']).size().reset_index(name='count')
result_d

Unnamed: 0,author,count
0,------me,1
1,---BeepBoop---,6
2,--0o0o0--,1
3,--Azazel--,1
4,-40-,1
...,...,...
21045,zwaymire,1
21046,zz_moe,3
21047,zzctdi,2
21048,zztr,1


per month

In [56]:
# Group the dataframe by month and author, and count the unique values
# author = unique per month (means author can be unique in january 2020 but also again in march 2020)
result_months = mommit_text.groupby(['year', 'month', 'author']).size().reset_index(name='count') # by month
result_months

Unnamed: 0,year,month,author,count
0,2019,12,-_-k,1
1,2019,12,-cc0unt-nt,1
2,2019,12,-leeson,1
3,2019,12,-lust4life-,1
4,2019,12,-mooncake-,1
...,...,...,...,...
25624,2022,3,zuks28,3
25625,2022,3,zulham_134,1
25626,2022,3,zulupanda,1
25627,2022,3,zypher0_,1


In [57]:
# Group the dataframe by month and author, and count the unique values
# author = unique per month (means author can be unique in january 2020 but also again in march 2020)
result_months_d = daddit_text.groupby(['year', 'month', 'author']).size().reset_index(name='count') # by month
result_months_d

Unnamed: 0,year,month,author,count
0,2019,12,-HiggsBoson-,2
1,2019,12,-WizeGuy-,1
2,2019,12,-heathcliffe-,1
3,2019,12,-paradoxwizard-,1
4,2019,12,-taradactyl-,1
...,...,...,...,...
30275,2022,3,zr0skyline,3
30276,2022,3,zrail,5
30277,2022,3,zsloth79,19
30278,2022,3,zugman,1


In [58]:
# unique authors over all months --> == result (len)
unique_author_count = result_months['author'].nunique()
unique_author_count

19635

In [59]:
# unique authors over all months --> == result (len)
unique_author_count_d = result_months_d['author'].nunique()
unique_author_count_d

21050