In [1]:
import pandas as pd

In [2]:
posts_tag_df = pd.read_csv("posts_tag.csv")

In [3]:
# Show sample data
posts_tag_df

Unnamed: 0,Id,CreationYearMonth,Tag
0,4,2008-07,c#
1,4,2008-07,floating-point
2,4,2008-07,type-conversion
3,4,2008-07,double
4,4,2008-07,decimal
...,...,...,...
168684444,78091263,2024-03,react-hooks
168684445,78091263,2024-03,setstate
168684446,78091259,2024-03,drupal
168684447,78091259,2024-03,drupal-9


In [4]:
# Calculate number of posts (both questions and answers) per tag `Tag` and year-month.
tag_year_month_count_df = posts_tag_df.\
    groupby(['Tag', 'CreationYearMonth'])['Id'].\
    nunique().\
    reset_index(name='Count').\
    sort_values(by=['Tag', 'CreationYearMonth'])
tag_year_month_count_df

Unnamed: 0,Tag,CreationYearMonth,Count
0,.a,2010-01,4
1,.a,2010-03,2
2,.a,2010-04,5
3,.a,2011-02,1
4,.a,2011-05,4
...,...,...,...
3650769,zyte,2023-06,2
3650770,zyte,2023-09,1
3650771,zyte,2023-12,3
3650772,zyte,2024-01,1


In [5]:
# Calculate monothonically increasing number of posts per tag `Tag` and year-month.
tag_year_month_count_df['TotalCount'] = tag_year_month_count_df.groupby('Tag')['Count'].\
                        expanding().\
                        sum().\
                        reset_index(level=0, drop=True)
# Show sample data
tag_year_month_count_df

Unnamed: 0,Tag,CreationYearMonth,Count,TotalCount
0,.a,2010-01,4,4.0
1,.a,2010-03,2,6.0
2,.a,2010-04,5,11.0
3,.a,2011-02,1,12.0
4,.a,2011-05,4,16.0
...,...,...,...,...
3650769,zyte,2023-06,2,28.0
3650770,zyte,2023-09,1,29.0
3650771,zyte,2023-12,3,32.0
3650772,zyte,2024-01,1,33.0


In [6]:
# Calculate total number of posts (both questions and answers) per year-month.
# Calculate number of posts (both questions and answers) per tag `Tag` and year-month.
year_month_count_df = posts_tag_df.\
    groupby(['CreationYearMonth'])['Id'].\
    nunique().\
    reset_index(name='Count').\
    sort_values(by=['CreationYearMonth'])

# Show sample data
year_month_count_df

Unnamed: 0,CreationYearMonth,Count
0,2008-07,6
1,2008-08,14709
2,2008-09,61826
3,2008-10,58874
4,2008-11,47493
...,...,...
184,2023-11,130075
185,2023-12,110803
186,2024-01,125348
187,2024-02,127588


In [7]:
# Calculate monothonically increasing number of total posts per year-month.
year_month_count_df['TotalCount'] = year_month_count_df['Count'].expanding().sum()
# Show sample data
year_month_count_df

Unnamed: 0,CreationYearMonth,Count,TotalCount
0,2008-07,6,6.0
1,2008-08,14709,14715.0
2,2008-09,61826,76541.0
3,2008-10,58874,135415.0
4,2008-11,47493,182908.0
...,...,...,...
184,2023-11,130075,56520009.0
185,2023-12,110803,56630812.0
186,2024-01,125348,56756160.0
187,2024-02,127588,56883748.0


In [8]:
# Join per Tag created posts dataframe with all created posts dataframe per year-month
all_year_month_count_df = pd.merge(
    tag_year_month_count_df,
    year_month_count_df,
    left_on='CreationYearMonth',
    right_on='CreationYearMonth',
    how='inner',
    suffixes=('_Tag', '_All')
)

all_year_month_count_df

Unnamed: 0,Tag,CreationYearMonth,Count_Tag,TotalCount_Tag,Count_All,TotalCount_All
0,.a,2010-01,4,4.0,145866,1560787.0
1,.a,2010-03,2,6.0,160711,1862493.0
2,.a,2010-04,5,11.0,150604,2013097.0
3,.a,2011-02,1,12.0,236699,3929356.0
4,.a,2011-05,4,16.0,281657,4772127.0
...,...,...,...,...,...,...
3650769,zyte,2023-06,2,28.0,161929,55801992.0
3650770,zyte,2023-09,1,29.0,135820,56252733.0
3650771,zyte,2023-12,3,32.0,110803,56630812.0
3650772,zyte,2024-01,1,33.0,125348,56756160.0


In [9]:
# Calculate tag relative size for tag based on cummulative count of created post
# up till year-month.
all_year_month_count_df['CountPercantage'] = (all_year_month_count_df['TotalCount_Tag'] / all_year_month_count_df['TotalCount_All']) * 100
all_year_month_count_df

Unnamed: 0,Tag,CreationYearMonth,Count_Tag,TotalCount_Tag,Count_All,TotalCount_All,CountPercantage
0,.a,2010-01,4,4.0,145866,1560787.0,0.000256
1,.a,2010-03,2,6.0,160711,1862493.0,0.000322
2,.a,2010-04,5,11.0,150604,2013097.0,0.000546
3,.a,2011-02,1,12.0,236699,3929356.0,0.000305
4,.a,2011-05,4,16.0,281657,4772127.0,0.000335
...,...,...,...,...,...,...,...
3650769,zyte,2023-06,2,28.0,161929,55801992.0,0.000050
3650770,zyte,2023-09,1,29.0,135820,56252733.0,0.000052
3650771,zyte,2023-12,3,32.0,110803,56630812.0,0.000057
3650772,zyte,2024-01,1,33.0,125348,56756160.0,0.000058


In [None]:
# Calculate tag rank at Year-Month based on relative size 
all_year_month_count_df['Rank'] = all_year_month_count_df.groupby("CreationYearMonth")["CountPercantage"].rank(method="first", ascending=False)
all_year_month_count_df

Unnamed: 0,Tag,CreationYearMonth,Count_Tag,TotalCount_Tag,Count_All,TotalCount_All,CountPercantage,Rank
0,.a,2010-01,4,4.0,145866,1560787.0,0.000256,9138.0
1,.a,2010-03,2,6.0,160711,1862493.0,0.000322,9492.0
2,.a,2010-04,5,11.0,150604,2013097.0,0.000546,8539.0
3,.a,2011-02,1,12.0,236699,3929356.0,0.000305,11605.0
4,.a,2011-05,4,16.0,281657,4772127.0,0.000335,12028.0
...,...,...,...,...,...,...,...,...
3650769,zyte,2023-06,2,28.0,161929,55801992.0,0.000050,20477.0
3650770,zyte,2023-09,1,29.0,135820,56252733.0,0.000052,19279.0
3650771,zyte,2023-12,3,32.0,110803,56630812.0,0.000057,18069.0
3650772,zyte,2024-01,1,33.0,125348,56756160.0,0.000058,18929.0


In [20]:
# Save `posts_tag_df` as intermidiate result
all_year_month_count_df.to_csv('tags_trends_df.csv', index=False)