### Read Data

In [14]:
# import libraries
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt

In [2]:
# Coordinates of the Azure Machine Learning workspace and the datastore folder to read
subscription = '58bb8a15-5d27-4d02-a5ca-772d24ae37a8'
resource_group = 'project-rg'
workspace = 'group-02-aml'
datastore_name = 'workspaceblobstore'
path_on_datastore = 'filtered-submissions-all2'

# Assemble the long-form datastore URI from its key/value path segments
uri = 'azureml://' + '/'.join(
    [
        'subscriptions', subscription,
        'resourcegroups', resource_group,
        'workspaces', workspace,
        'datastores', datastore_name,
    ]
)

# Echo the URI and the folder path, one per line
print(uri, path_on_datastore, sep='\n')

azureml://subscriptions/58bb8a15-5d27-4d02-a5ca-772d24ae37a8/resourcegroups/project-rg/workspaces/group-02-aml/datastores/workspaceblobstore
filtered-submissions-all


In [3]:
from azureml.fsspec import AzureMachineLearningFileSystem

# create the filesystem rooted at the datastore URI
fs = AzureMachineLearningFileSystem(uri)

# Collect the parquet part files; sorting keeps the row order stable between runs
parquet_paths = sorted(fs.glob(f'{path_on_datastore}/*.parquet'))
if not parquet_paths:
    # Fail with a clear message instead of pandas' "No objects to concatenate"
    raise FileNotFoundError(
        f'No parquet files found under {path_on_datastore!r} on datastore {datastore_name!r}'
    )

# read each parquet part into its own data frame
dflist = []
for path in parquet_paths:
    with fs.open(path) as f:
        dflist.append(pd.read_parquet(f))

# Concatenate the parts; ignore_index=True gives the combined frame a fresh,
# unique 0..n-1 index instead of repeating each part's own index labels
reddit_df = pd.concat(dflist, ignore_index=True)

In [7]:
# Display the (rows, columns) dimensions of the concatenated frame
reddit_df.shape

(107656, 68)

In [8]:
# Print every column name of the frame as one plain Python list
print(list(reddit_df.columns))

['adserver_click_url', 'adserver_imp_pixel', 'archived', 'author', 'author_cakeday', 'author_flair_css_class', 'author_flair_text', 'author_id', 'brand_safe', 'contest_mode', 'created_utc', 'crosspost_parent', 'crosspost_parent_list', 'disable_comments', 'distinguished', 'domain', 'domain_override', 'edited', 'embed_type', 'embed_url', 'gilded', 'hidden', 'hide_score', 'href_url', 'id', 'imp_pixel', 'is_crosspostable', 'is_reddit_media_domain', 'is_self', 'is_video', 'link_flair_css_class', 'link_flair_text', 'locked', 'media', 'media_embed', 'mobile_ad_url', 'num_comments', 'num_crossposts', 'original_link', 'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'post_hint', 'preview', 'promoted', 'promoted_by', 'promoted_display_name', 'promoted_url', 'retrieved_on', 'score', 'secure_media', 'secure_media_embed', 'selftext', 'spoiler', 'stickied', 'subreddit', 'subreddit_id', 'suggested_sort', 'third_party_trackers', 'third_party_tracking', 'third_party_tracking_2', 'thumbnail'

### Bubble Plot

In [18]:
# Parse the submission timestamp column into pandas datetimes
reddit_df['created_utc'] = pd.to_datetime(reddit_df['created_utc'])

# Derive the calendar year and month of each submission from the parsed timestamp
reddit_df['year'], reddit_df['month'] = (
    reddit_df['created_utc'].dt.year,
    reddit_df['created_utc'].dt.month,
)


In [30]:
# Both aggregates below share the same (year, month, subreddit) grouping
group_keys = ['year', 'month', 'subreddit']
monthly_groups = reddit_df.groupby(group_keys)

# Rows per group -> number of submissions
submissions_count = monthly_groups.size().reset_index(name='Num_Submissions')
# Sum of num_comments per group -> total comments on those submissions
comments_sum_by_year = monthly_groups['num_comments'].sum().reset_index(name='Total_Comments')

# Join the two aggregates on the grouping keys
combined_df = pd.merge(submissions_count, comments_sum_by_year, on=group_keys)

# Preview the first rows of the combined frame
print(combined_df.head())

   year  month     subreddit  Num_Submissions  Total_Comments
0  2021      1           nyc             2365           44872
1  2021      1  washingtondc             2260           41403
2  2021      2           nyc             2184           34772
3  2021      2  washingtondc             1277           17946
4  2021      3           nyc             2621           44468


In [28]:
# Animated bubble chart: one frame per year, one bubble per (month, subreddit).
# Bubble height is the number of submissions; bubble size is the total comments.
fig = px.scatter(
    combined_df,
    x='month',
    y='Num_Submissions',
    size='Total_Comments',
    color='subreddit',
    hover_name='subreddit',
    animation_frame='year',
    size_max=60,
    # Fix both axis ranges so they do not depend on which year's frame is shown;
    # 2023 only has three months of data, but all twelve months stay on the axis.
    range_x=[0, 13],
    range_y=[combined_df['Num_Submissions'].min(), combined_df['Num_Submissions'].max()],
)

# Label every month on the x-axis. 'month' is an integer column, so the tick
# positions are given as integers (not zero-padded strings) to match the data.
month_ticks = list(range(1, 13))
fig.update_xaxes(
    title='Month',
    tickmode='array',
    tickvals=month_ticks,
    ticktext=[str(m) for m in month_ticks]
)

# The figure title is set once here; an earlier title passed to px.scatter was
# always overwritten by this one, so it has been dropped.
fig.update_layout(
    height=700,  # set the height
    width=1200,  # set the width
    title_text='Monthly Number of Submissions by Subreddits from 2021-2023 ',
    title_x=0.5  # Center the title
)

# Update the y-axis title
fig.update_yaxes(title='Number of Submissions')

# Show the figure
fig.show()

### Table showing average, median, min, max number of comments per year

In [55]:
# Summary statistics of num_comments for each year: mean, median, min and max
agg_df = reddit_df.groupby('year')['num_comments'].agg(['mean', 'median', 'min', 'max']).reset_index()

# Give the columns readable headers. The labels carry no trailing whitespace,
# so each column can be selected by exactly the name that is displayed.
agg_df.columns = ['Year', 'Average Number of Comments', 'Median Number of Comments', 'Min Number of Comments', 'Max Number of Comments']

agg_df

Unnamed: 0,year,average,median,min,max
0,2021,18.932682,2.0,0,3387
1,2022,22.912386,2.0,0,5503
2,2023,21.994144,3.0,0,1105


### Table showing average, median, min, max number of Reddit Scores per year

In [None]:
# Per-year summary of the Reddit score: mean, median, min and max,
# with the statistic columns relabelled for presentation
agg_df2 = (
    reddit_df
    .groupby('year')['score']
    .agg(['mean', 'median', 'min', 'max'])
    .rename(columns={
        'mean': 'Average Reddit Score',
        'median': 'Median Reddit Score',
        'min': 'Min Reddit Score',
        'max': 'Max Reddit Score',
    })
    .reset_index()
)

agg_df2

### Feature Engineering: Create low, medium, high labels based on Reddit Score

The average Reddit Score is around 84. 

**Low: 0 <= Reddit Score < 84**

**Medium: 84 <= Reddit Score < 200**

**High: 200 <= Reddit Score**

In [None]:
# Create labels from the Reddit score using half-open bins:
#   low:    0   <= score < 84
#   medium: 84  <= score < 200
#   high:   200 <= score
# Scores outside these ranges (negative or missing) are left unlabelled (NaN).
# The previous nested np.where sent everything not in [0, 200) to 'high',
# so negative and missing scores were labelled 'high'.
reddit_df['score_label'] = pd.cut(
    reddit_df['score'],
    bins=[0, 84, 200, np.inf],
    right=False,  # left-closed bins: 84 is 'medium', 200 is 'high'
    labels=['low', 'medium', 'high'],
).astype(object)  # keep plain string labels rather than a categorical column