In [169]:
import pandas as pd
import plotly.express as px
from raceplotly.plots import barplot

In [170]:
df = pd.read_feather('sample2012.feather').explode('Tags')

df = df[(df.CreationDate > "2018-01-01") & (df.CreationDate < "2022-08-30")]

In [171]:
tags = (
    df
    .groupby([df.CreationDate.dt.to_period("M"), "Tags"])
    .agg({"ViewCount": "sum", "AnswerCount": "sum", "index": "count","CommentCount": "sum"})
    .reset_index()
)

tags = tags.rename(columns={"index": "PostCount"})

# Only baking and bread

In [172]:
mylist = ['baking','bread','sourdough']

mytags = tags[tags.Tags.isin(mylist)].copy()

mytags.CommentCount = mytags.CommentCount.astype(int)

In [173]:

fig = px.scatter(mytags,
           x=mytags.CreationDate.dt.to_timestamp(),
           y="PostCount",
           color="Tags",
           title="Post count per month on StackOverflow (size = comment count)",
           #trendline="ols",
           size="CommentCount",
           )
           
fig.show()

In [174]:
fig.write_html('tags.html')

# Pandemic

In [175]:
pandemic = mytags[
    (mytags.CreationDate > "2018-01-01") & (mytags.CreationDate < "2022-08-30")
].copy()

pandemic.set_index("CreationDate", inplace=True)

pandemic

Unnamed: 0_level_0,Tags,ViewCount,AnswerCount,PostCount,CommentCount
CreationDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-02,baking,89965,21,13,22
2018-02,bread,591,3,2,3
2018-03,baking,25623,24,19,37
2018-03,bread,18576,24,18,37
2018-03,sourdough,3543,5,4,1
...,...,...,...,...,...
2022-05,bread,4079,8,4,19
2022-05,sourdough,527,1,1,3
2022-06,baking,464,4,3,2
2022-06,sourdough,3850,2,1,2


In [176]:
rollavg = (
    pandemic.groupby(["Tags"])
    .rolling(3, center=True, min_periods=2)
    .mean().reset_index(level=0)
)


In [184]:
comment = px.line(
    rollavg,
    x=rollavg.index.to_timestamp(),
    y="CommentCount",
    title="3 months rolling average of comments",
    color="Tags",
)

comment.add_vline(
    x="2020-03-11",
    line_width=2,
    line_dash="dash",
    line_color="gray"
)

# add text annotation vertically

comment.add_annotation(
    x="2020-03-11",
    y=60,
    align="right",
    text="COVID-19",
    showarrow=False,
    font=dict(
        family="Courier New, monospace",
        size=16,
        color="black")
)

In [185]:
comment.write_html('comment.html')

In [178]:
covid = px.line(
    rollavg,
    x=rollavg.index.to_timestamp(),
    y="PostCount",
    color="Tags",
    title="3 months rolling average of comments",
)

covid.add_vline(
    x="2020-03-11",
    line_width=2,
    line_dash="dash",
    line_color="gray"
)

covid.add_annotation(
    x="2020-03-11",
    y=35,
    align="right",
    text="COVID-19",
    showarrow=False,
    font=dict(
        family="Courier New, monospace",
        size=16,
        color="black")
)

In [179]:
covid.write_html('covid.html')

# Post explorer

In [180]:

mydf = df[df.Tags.isin(mylist)].copy()

mydf.loc[:, ["ViewCount", "AnswerCount", "CommentCount"]] = mydf.loc[
    :, ["ViewCount", "AnswerCount", "CommentCount"]
].astype(int)



In a future version, `df.iloc[:, i] = newvals` will attempt to set the values inplace instead of always setting a new array. To retain the old behavior, use either `df[df.columns[i]] = newvals` or, if columns are non-unique, `df.isetitem(i, newvals)`



In [181]:
# create column with year
mydf['Year'] = mydf['CreationDate'].dt.year
mydf['Month'] = mydf['CreationDate'].dt.month

In [182]:
fig = px.scatter(mydf,
              x=mydf['Month'],
              y="CommentCount",
              color="Tags",
              size="AnswerCount",
              hover_name="Title",
              hover_data=["Month"],
              log_y=True,
              animation_frame=mydf['Year'],
              #animation_group="Tags",
)

fig.show()

In [183]:
fig.write_html('scatter.html')