## Read in Data

In [1]:
import pandas as pd
import numpy as np
import plotly.express as px

In [2]:
# Identifiers that locate the Azure Machine Learning datastore and the data folder on it
subscription = '58bb8a15-5d27-4d02-a5ca-772d24ae37a8'
resource_group = 'project-rg'
workspace = 'group-02-aml'
datastore_name = 'workspaceblobstore'
path_on_datastore = 'filtered-submissions-all2'

# Build the long-form datastore URI from the identifiers above
uri = f'azureml://subscriptions/{subscription}/resourcegroups/{resource_group}/workspaces/{workspace}/datastores/{datastore_name}'

# Echo the URI and the folder path, one per line, as a quick sanity check
print(uri, path_on_datastore, sep='\n')

azureml://subscriptions/58bb8a15-5d27-4d02-a5ca-772d24ae37a8/resourcegroups/project-rg/workspaces/group-02-aml/datastores/workspaceblobstore
filtered-submissions-all2


In [3]:
pip install -U azureml-fsspec mltable

Requirement already up-to-date: azureml-fsspec in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (1.2.0)
Requirement already up-to-date: mltable in /anaconda/envs/azureml_py38/lib/python3.8/site-packages (1.5.0)
Note: you may need to restart the kernel to use updated packages.


In [4]:
import pandas as pd
from azureml.fsspec import AzureMachineLearningFileSystem

# Create the filesystem for the datastore URI
fs = AzureMachineLearningFileSystem(uri)

# Collect the parquet files in the datastore folder
parquet_paths = list(fs.glob(f'{path_on_datastore}/*.parquet'))
if not parquet_paths:
    # pd.concat([]) would fail with the opaque "No objects to concatenate";
    # raise the same exception type with a message that names the folder.
    raise ValueError(f'No parquet files found under {path_on_datastore!r} on {uri}')

# Read each parquet file into its own data frame
dflist = []
for path in parquet_paths:
    with fs.open(path) as f:
        dflist.append(pd.read_parquet(f))

# Concatenate the per-file data frames (each file's own row index is kept)
reddit_df = pd.concat(dflist)

In [5]:
# By default only a cell's last expression is rendered, so the shape is
# displayed explicitly; the two-row preview remains the cell's output.
display(reddit_df.shape)
reddit_df.head(2)


(217394, 68)

Unnamed: 0,adserver_click_url,adserver_imp_pixel,archived,author,author_cakeday,author_flair_css_class,author_flair_text,author_id,brand_safe,contest_mode,...,suggested_sort,third_party_trackers,third_party_tracking,third_party_tracking_2,thumbnail,thumbnail_height,thumbnail_width,title,url,whitelist_status
0,,,False,[deleted],,,,,,False,...,,,,,default,,,Should I move to D.C. or commute from NoVa?,,all_ads
1,,,False,thewheisk,,,,,,False,...,,,,,default,,,ChatGPT - what should happen to a sitting memb...,https://www.reddit.com/r/Seattle/comments/119d...,all_ads


In [7]:
# List the distinct subreddit names to confirm every expected subreddit was loaded
reddit_df['subreddit'].unique()

array(['washingtondc', 'Seattle', 'Atlanta', 'nyc'], dtype=object)

## Bar graph of subreddit posts by year

In [14]:
# Parse the post timestamps so the .dt accessor can be used on them
created_ts = pd.to_datetime(reddit_df['created_utc'])
reddit_df['created_utc'] = created_ts

# Store the calendar year of each post as a string column
# (presumably so Plotly treats it as a discrete colour category -- confirm)
reddit_df['year'] = created_ts.dt.year.astype(str)

In [15]:
# Number of posts for every (subreddit, year) pair, as a flat table with a 'count' column
posts_by_city_year = reddit_df.groupby(['subreddit', 'year'])
post_counts = posts_by_city_year.size().reset_index(name='count')
post_counts

Unnamed: 0,subreddit,year,count
0,Atlanta,2021,18194
1,Atlanta,2022,17451
2,Atlanta,2023,4170
3,Seattle,2021,30388
4,Seattle,2022,31957
5,Seattle,2023,7578
6,nyc,2021,28515
7,nyc,2022,27564
8,nyc,2023,6154
9,washingtondc,2021,18872


In [23]:
# Plotly bar chart of post counts per subreddit, coloured by year;
# the figure size is passed directly to px.bar
fig = px.bar(
    post_counts,
    x="subreddit",
    y="count",
    color="year",
    template='plotly_white',
    labels={"year": "Year", "count": "Number of Posts", 'subreddit': 'City Subreddit'},
    title="Number of Reddit Posts per City, per Year",
    height=500,
    width=800,
)
fig

## Bar graph of subreddit posts across all years



In [25]:
# Total number of posts per subreddit over the whole dataset
posts_by_city = reddit_df.groupby(['subreddit'])
subreddit_counts = posts_by_city.size().reset_index(name='count')
subreddit_counts

Unnamed: 0,subreddit,count
0,Atlanta,39815
1,Seattle,69923
2,nyc,62233
3,washingtondc,45423


In [29]:
# Plotly bar chart of total posts per subreddit, one colour per subreddit
fig2 = px.bar(
    subreddit_counts,
    x="subreddit",
    y="count",
    color='subreddit',
    template='plotly_white',
    labels={"count": "Number of Posts", 'subreddit': 'City Subreddit'},
    title="Number of Reddit Posts per City in Entire Dataset",
)
# Set the figure size, then hide the legend (the x-axis already names each subreddit)
fig2.update_layout(height=500, width=800).update_traces(showlegend=False)

## Table showing average, median, min, max number of comments per city

In [32]:
# Per-subreddit summary of num_comments: mean, median, min and max,
# with the columns renamed to presentation-friendly headers
comment_stats = ['mean', 'median', 'min', 'max']
agg_df = (
    reddit_df.groupby('subreddit')['num_comments']
    .agg(comment_stats)
    .reset_index()
    .rename(columns={
        'subreddit': 'Subreddit',
        'mean': 'Average Number of Comments',
        'median': 'Median Number of Comments',
        'min': 'Min Number of Comments',
        'max': 'Max Number of Comments',
    })
)

In [33]:
# Display the per-subreddit comment-count summary table
agg_df

Unnamed: 0,Subreddit,Average Number of Comments,Median Number of Comments,Min Number of Comments,Max Number of Comments
0,Atlanta,10.178099,1,0,883
1,Seattle,25.399868,6,0,2761
2,nyc,21.989748,1,0,5503
3,washingtondc,19.796865,5,0,1991


## Table showing average, median, min, max Reddit score per city

In [35]:
# List all column names of the data frame
reddit_df.columns

Index(['adserver_click_url', 'adserver_imp_pixel', 'archived', 'author',
       'author_cakeday', 'author_flair_css_class', 'author_flair_text',
       'author_id', 'brand_safe', 'contest_mode', 'created_utc',
       'crosspost_parent', 'crosspost_parent_list', 'disable_comments',
       'distinguished', 'domain', 'domain_override', 'edited', 'embed_type',
       'embed_url', 'gilded', 'hidden', 'hide_score', 'href_url', 'id',
       'imp_pixel', 'is_crosspostable', 'is_reddit_media_domain', 'is_self',
       'is_video', 'link_flair_css_class', 'link_flair_text', 'locked',
       'media', 'media_embed', 'mobile_ad_url', 'num_comments',
       'num_crossposts', 'original_link', 'over_18', 'parent_whitelist_status',
       'permalink', 'pinned', 'post_hint', 'preview', 'promoted',
       'promoted_by', 'promoted_display_name', 'promoted_url', 'retrieved_on',
       'score', 'secure_media', 'secure_media_embed', 'selftext', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'sugge

In [36]:
# Per-subreddit summary of the Reddit score: mean, median, min and max,
# with the columns renamed to presentation-friendly headers
score_stats = ['mean', 'median', 'min', 'max']
agg_df2 = (
    reddit_df.groupby('subreddit')['score']
    .agg(score_stats)
    .reset_index()
    .rename(columns={
        'subreddit': 'Subreddit',
        'mean': 'Average Reddit Score',
        'median': 'Median Reddit Score',
        'min': 'Min Reddit Score',
        'max': 'Max Reddit Score',
    })
)

In [37]:
# Display the per-subreddit Reddit-score summary table
agg_df2

Unnamed: 0,Subreddit,Average Reddit Score,Median Reddit Score,Min Reddit Score,Max Reddit Score
0,Atlanta,22.601155,1,0,3134
1,Seattle,83.430388,2,0,57618
2,nyc,68.069497,1,0,8363
3,washingtondc,53.056998,1,0,3789


---

## Feature Engineering: Create low, medium, high engagement labels based on number of comments

Since the average number of comments per post is roughly 10–25 across the four subreddits, I am going to define low, medium, and high engagement labels as follows:

- Low: 0 <= num_comments < 20
- Medium: 20 <= num_comments < 100
- High: 100 <= num_comments 

In [38]:
# Count missing values in num_comments before deriving labels from it
reddit_df['num_comments'].isnull().sum()

0

In [49]:
# Create engagement labels from half-open comment-count bins:
#   low: [0, 20)    medium: [20, 100)    high: [100, inf)
# pd.cut leaves any value outside these bins (negative or missing) as NaN,
# whereas a nested np.where fall-through would silently label such rows 'high'.
reddit_df['engagement_label'] = pd.cut(
    reddit_df['num_comments'],
    bins=[0, 20, 100, np.inf],
    labels=['low', 'medium', 'high'],
    right=False,  # left-closed bins, so 20 is 'medium' and 100 is 'high'
).astype(object)  # keep plain string labels rather than a categorical dtype

In [50]:
# Spot-check: the first two rows labelled 'high', with their comment counts
is_high = reddit_df['engagement_label'] == 'high'
reddit_df.loc[is_high, ['num_comments', 'engagement_label']].head(2)

Unnamed: 0,num_comments,engagement_label
4,143,high
12,158,high


In [51]:
# Spot-check: the first two rows labelled 'medium', with their comment counts
is_medium = reddit_df['engagement_label'] == 'medium'
reddit_df.loc[is_medium, ['num_comments', 'engagement_label']].head(2)

Unnamed: 0,num_comments,engagement_label
5,43,medium
6,50,medium


In [52]:
# Spot-check: the first two rows labelled 'low', with their comment counts
is_low = reddit_df['engagement_label'] == 'low'
reddit_df.loc[is_low, ['num_comments', 'engagement_label']].head(2)

Unnamed: 0,num_comments,engagement_label
0,0,low
1,6,low


### Create bar graph showing distribution of labels

In [53]:
# Number of posts for every (subreddit, engagement label) pair
posts_by_city_label = reddit_df.groupby(['subreddit', 'engagement_label'])
engagement_counts = posts_by_city_label.size().reset_index(name='count')
engagement_counts

Unnamed: 0,subreddit,engagement_label,count
0,Atlanta,high,1010
1,Atlanta,low,35055
2,Atlanta,medium,3750
3,Seattle,high,3855
4,Seattle,low,52403
5,Seattle,medium,13665
6,nyc,high,3588
7,nyc,low,51408
8,nyc,medium,7237
9,washingtondc,high,2023


In [55]:
# Plotly bar chart of post counts per subreddit, coloured by engagement label;
# the figure size is passed directly to px.bar
fig3 = px.bar(
    engagement_counts,
    x="subreddit",
    y="count",
    color="engagement_label",
    template='plotly_white',
    labels={"engagement_label": "Engagement Label", "count": "Number of Posts", 'subreddit': 'City Subreddit'},
    title="Engagement Level for Posts by Subreddit",
    height=500,
    width=800,
)
fig3