In [1]:
import pandas as pd

In [2]:
# Read the posts dump from Posts.csv into a single DataFrame.
all_posts_df = pd.read_csv(filepath_or_buffer="Posts.csv")

In [3]:
# Show the column names and their dtypes (this lists dtypes, not the frame's shape).
all_posts_df.dtypes

Tags             object
ParentId        float64
CreationDate     object
Id                int64
DeletionDate    float64
PostTypeId        int64
ClosedDate       object
dtype: object

In [4]:
# Display the frame as the cell output to eyeball a sample of the raw rows.
all_posts_df

Unnamed: 0,Tags,ParentId,CreationDate,Id,DeletionDate,PostTypeId,ClosedDate
0,<c#><floating-point><type-conversion><double><...,,2008-07-31T21:42:52.667,4,,1,
1,<html><css><internet-explorer-7>,,2008-07-31T22:08:08.620,6,,1,
2,,4.0,2008-07-31T22:17:57.883,7,,2,
3,<c#><.net><datetime>,,2008-07-31T23:40:59.743,9,,1,
4,<c#><datetime><time><datediff><relative-time-s...,,2008-07-31T23:55:37.967,11,,1,
...,...,...,...,...,...,...,...
59749044,<wifi><gstreamer><esp32><audio-streaming><mult...,,2024-03-02T02:52:48.793,78091308,,1,
59749045,<jquery><woocommerce><hide><show>,,2024-03-02T02:53:20.573,78091309,,1,
59749046,,18727766.0,2024-03-02T02:53:29.510,78091310,,2,
59749047,<r>,,2024-03-02T02:54:12.030,78091311,,1,


In [5]:
# Closed and deleted posts are considered irrelevant and are excluded.
# 'DeletionDate' and 'ClosedDate' are populated only for deleted / closed posts,
# so a post is kept only when both of those columns are missing.
all_posts_count = len(all_posts_df)
non_deleted_closed_posts_df = all_posts_df.loc[
    all_posts_df['DeletionDate'].isna() & all_posts_df['ClosedDate'].isna()
]
non_deleted_closed_posts_count = len(non_deleted_closed_posts_df)
print(all_posts_count)

non_deleted_closed_posts_percentage = round(
    (non_deleted_closed_posts_count / all_posts_count) * 100,
    2,
)
print(f'Number of NOT deleted or closed posts: {non_deleted_closed_posts_count}, which is {non_deleted_closed_posts_percentage}% of all data')

# After filtering, both date columns are entirely empty, so drop them.
posts_df = non_deleted_closed_posts_df.drop(columns=['DeletionDate', 'ClosedDate'])
print('Filtered posts dataframe shape:')
posts_df.dtypes

59749049
Number of NOT deleted or closed posts: 58652191, which is 98.16% of all data
Filtered posts dataframe shape:


Tags             object
ParentId        float64
CreationDate     object
Id                int64
PostTypeId        int64
dtype: object

In [None]:
# Trends are calculated at per-month granularity, so reduce every creation
# timestamp to a 'YYYY-MM' string and drop the raw 'CreationDate' column.
# Doing it here, before tags are exploded into one row per tag, touches fewer rows.
#
# The guard keeps the cell idempotent: once 'CreationDate' has been consumed,
# re-running the cell is a no-op instead of raising KeyError.
if 'CreationDate' in posts_df.columns:
    creation_year_month = pd.to_datetime(posts_df['CreationDate']).dt.strftime('%Y-%m')
    # Drop first, then attach the new column to the fresh frame returned by drop(),
    # so the frame produced by the previous cell is never mutated in place.
    posts_df = posts_df.drop(columns=['CreationDate'])
    posts_df['CreationYearMonth'] = creation_year_month

In [None]:
# Display the filtered posts frame as the cell output to eyeball a sample of rows.
posts_df

In [None]:
# Partition `posts_df` into two frames: questions and answers.
# `PostTypeId` tells them apart: `1` is a question, `2` is an answer.
# Only questions carry tags; answers have none of their own and are meant to
# share the tags of the question they belong to.
# - Questions lose `ParentId`: it is always null for them, because questions
#   are themselves the parent posts that answers point to.
# - Answers lose `Tags`: it is always null for them, because only questions are tagged.
# See readme.txt for more details.
questions_df = posts_df.loc[posts_df['PostTypeId'].eq(1)].drop(columns=['PostTypeId', 'ParentId'])
answers_df = posts_df.loc[posts_df['PostTypeId'].eq(2)].drop(columns=['PostTypeId', 'Tags'])

posts_count = len(posts_df)
questions_count = len(questions_df)
answers_count = len(answers_df)

# Shares are relative to all filtered posts, not just questions + answers.
questions_percentage = round((questions_count / posts_count) * 100, 2)
answers_percentage = round((answers_count / posts_count) * 100, 2)

print(f'Number of answers: {answers_count}, which is {answers_percentage}% of all data')
print(f'Number of questions: {questions_count}, which is {questions_percentage}% of all data')

In [None]:
# Parse the `Tags` column. It holds the tags in an XML-like format,
# for instance: `<c#><.net><datetime>`.
# To work with it, turn each value into a proper Python list of tag names.


def _parse_tags(raw_tags):
    """Turn a raw tag string such as `<c#><.net><datetime>` into `['c#', '.net', 'datetime']`.

    Parameters:
        raw_tags: the raw cell value from the `Tags` column.

    Returns:
        A list of tag names. Values that are not strings (e.g. NaN for a
        missing value) yield an empty list instead of raising, and empty
        fragments produced by the split are discarded.
    """
    if not isinstance(raw_tags, str):
        return []
    # Drop every '<', then cut on the remaining '>' delimiters.
    return [tag for tag in raw_tags.replace('<', '').split('>') if tag]


# The guard keeps the cell idempotent: once `Tags` has been replaced by
# `TagsParsed`, re-running the cell is a no-op instead of raising KeyError.
if 'Tags' in questions_df.columns:
    tags_parsed = questions_df['Tags'].map(_parse_tags)
    # The raw `Tags` column is not needed once it has been parsed.
    questions_df = questions_df.drop(columns=['Tags'])
    questions_df['TagsParsed'] = tags_parsed

In [None]:
# Show the column names and their dtypes (this lists dtypes, not the frame's shape).
questions_df.dtypes

In [None]:
# Display the questions frame as the cell output to eyeball a sample of rows.
questions_df

In [None]:
# Produce one row per (question, tag) pair: the list column 'TagsParsed'
# becomes a scalar column named 'Tag'.
# A single tag per row is what makes the later per-tag aggregations possible.
questions_tag_df = (
    questions_df
    .rename(columns={'TagsParsed': 'Tag'})
    .explode('Tag')
)

In [None]:
# Show the column names and their dtypes (this lists dtypes, not the frame's shape).
questions_tag_df.dtypes

In [None]:
# Display the exploded questions frame as the cell output to eyeball a sample of rows.
questions_tag_df

In [None]:
# Attach tags to answers by joining each answer's 'ParentId' to the 'Id' of
# its question in the exploded questions frame.
# Answers have no tags of their own; they implicitly inherit the tags of the
# parent question, and this join materialises that for later aggregations.
# Only the answer's own month and id are kept, renamed back to the plain
# column names used by the questions frame.
answers_tag_df = (
    questions_tag_df
    .merge(
        answers_df,
        how='inner',
        left_on='Id',
        right_on='ParentId',
        suffixes=('_Question', '_Answer'),
    )
    .loc[:, ['CreationYearMonth_Answer', 'Tag', 'Id_Answer']]
    .rename(columns={'CreationYearMonth_Answer': 'CreationYearMonth', 'Id_Answer': 'Id'})
)

In [None]:
# Show the column names and their dtypes (this lists dtypes, not the frame's shape).
answers_tag_df.dtypes

In [None]:
# Display the tagged answers frame as the cell output to eyeball a sample of rows.
answers_tag_df

In [None]:
# Stack the tagged questions and tagged answers into one frame that lists
# every post created for a particular tag, with a fresh 0..n-1 index.
# NOTE(review): an earlier comment here said the 'Id' column is dropped, but the
# code keeps it — confirm whether downstream consumers rely on 'Id'.
posts_tag_df = pd.concat(
    objs=[questions_tag_df, answers_tag_df],
    ignore_index=True,
)

In [None]:
# Show the column names and their dtypes (this lists dtypes, not the frame's shape).
posts_tag_df.dtypes

In [None]:
# Display the combined per-tag posts frame as the cell output to eyeball a sample of rows.
posts_tag_df

In [None]:
# Persist `posts_tag_df` as an intermediate result (row index is not written).
posts_tag_df.to_csv(path_or_buf='posts_tag.csv', index=False)