In [12]:
# creating paths to src and data folders in the repo
import sys
import pathlib
src_path = pathlib.Path().absolute().parent.parent / "src"
data_path = pathlib.Path().absolute().parent.parent / "data"

# add src path to sys.path so it is searched in import statements
sys.path.append(str(src_path))

# basic imports for data retreival and manipulation
import requests
import pandas as pd
import numpy as np
import datetime as dt
import time

# importing api wrappers for reddit data
import praw
from psaw import PushshiftAPI

# retrieve api credentials from .gitignore'd text file
secrets_path = src_path / 'api_credentials.txt'
secrets_txt = open(secrets_path, 'r')

my_id = secrets_txt.readline().split('=')[1].rstrip()
my_secret = secrets_txt.readline().split('=')[1].rstrip()

secrets_txt.close()

# create a praw and pushshitft instances
reddit = praw.Reddit(
     client_id=my_id,
     client_secret=my_secret,
     user_agent="test_script by u/Mizule_RL"
 )

s_api = PushshiftAPI(reddit)


In [15]:
import re

I'm choosing to limit my classes to: Digital, Paint, Ink, Pencil, Charcoal, Sculpture

In [22]:
def get_wordlists():
    '''
    returns a dictionary containing the identifying words for each class
    '''
    wordlists = {'digital': ['digital', 'adobe', 'photoshop', 'procreate', 'wacom', 'tablet', 'illustrator', '3d', 'vector'],
                 'paint': ['acrylic', 'oil', 'watercolor', 'water color'],
                 'ink': ['pen', 'marker', 'ink'],
                 'pencil': ['pencil', 'colored pencil', 'coloredpencil', 'graphite'],
                 'charcoal': ['charcoal'],
                 'sculpture': ['clay', 'string', 'sculpture', 'wire', 'nail', 'glass', 'yarn', 'metal', 'copper']}
    return wordlists

In [32]:
def validate_submission(post):
    '''
    function for determining if a post is formated in a way that can be parsed properly
    
    Parameters:
    -- post: praw.submission object to be validated
    
    Returns:
    -- Boolean: True if post has valid formatting, False if not
    '''
    
    link_format = 'https://i\.redd\.it/.{13}\.(jpg|png)'
    corpus = []
    wordlists = get_wordlists()
    for value in wordlists.values():
        corpus.extend(value)
    title = post.title.lower().split(',')
    if len(title) == 4:
        medium = title[2].strip()
        if medium == 'me':
            medium = title[1].strip()
        if re.match(link_format, post.url):
            for word in corpus:
                if medium.find(word) >= 0:
                    return True
    return False
    
def extract_medium_from_title(title):
    '''
    identifies the medium given the title of the post
    '''
    medium_counter = 0
    medium_type = 'other'
    title_split = title.lower().split(',')
    medium = title_split[2].strip()
    if medium == 'me':
        medium = title_split[1].strip()
    wordlists = get_wordlists()
    for item in wordlists.items():
        for word in item[1]:
            if medium.find(word) >= 0:
                medium_type = item[0]
                medium_counter += 1
                break
    if medium_counter > 1:
        return 'mixed_medium'
    else:
        return medium_type

In [38]:
def make_post_dict(post):
    '''
    creates and returns a dict containing relevant data from a post
    '''
    post_dict = {'title': post.title,
                 'medium': extract_medium_from_title(post.title),
                 'url': post.url, 
                 'id': post.id,
                 'unix_time': int(post.created_utc)}
    return post_dict

In [29]:
def fetch_submissions(min_posts, date = int(dt.datetime.now().timestamp())):
    start_epoch = date
    data_size = min_posts

    collected_posts = []
    while len(collected_posts) < data_size:
        print(f'polling pushshift for {data_size - len(collected_posts)} more posts before {start_epoch}')
        batch = list(s_api.search_submissions(before=start_epoch, subreddit='Art', limit=1000))
        for post in batch:
            if validate_submission(post):
                collected_posts.append(post)
        start_epoch = int(batch[-1].created_utc)
    return collected_posts

In [30]:
submissions_raw = fetch_submissions(10000)

polling pushshift for 10000 more posts before 1605647143
polling pushshift for 9959 more posts before 1605599241
polling pushshift for 9917 more posts before 1605585967
polling pushshift for 9862 more posts before 1605574629
polling pushshift for 9822 more posts before 1605565187
polling pushshift for 9783 more posts before 1605559306
polling pushshift for 9739 more posts before 1605551360
polling pushshift for 9694 more posts before 1605543522
polling pushshift for 9651 more posts before 1605537366
polling pushshift for 9602 more posts before 1605527150
polling pushshift for 9556 more posts before 1605512188
polling pushshift for 9512 more posts before 1605499991
polling pushshift for 9468 more posts before 1605488196
polling pushshift for 9413 more posts before 1605478748
polling pushshift for 9357 more posts before 1605470480
polling pushshift for 9305 more posts before 1605463098
polling pushshift for 9256 more posts before 1605455457
polling pushshift for 9197 more posts before 16

polling pushshift for 3074 more posts before 1603929914
polling pushshift for 3017 more posts before 1603922634
polling pushshift for 2971 more posts before 1603917654
polling pushshift for 2917 more posts before 1603910415
polling pushshift for 2868 more posts before 1603902956
polling pushshift for 2827 more posts before 1603895308
polling pushshift for 2779 more posts before 1603881442
polling pushshift for 2733 more posts before 1603859298
polling pushshift for 2687 more posts before 1603847110
polling pushshift for 2638 more posts before 1603837388
polling pushshift for 2594 more posts before 1603828840
polling pushshift for 2548 more posts before 1603822944
polling pushshift for 2499 more posts before 1603813353
polling pushshift for 2448 more posts before 1603804778
polling pushshift for 2394 more posts before 1603791495
polling pushshift for 2353 more posts before 1603774306
polling pushshift for 2310 more posts before 1603763542
polling pushshift for 2266 more posts before 160

In [39]:
submissions_dict = [make_post_dict(post) for post in submissions_raw]

In [40]:
submissions_dict

[{'title': 'Diya, me, 3D, 2020',
  'medium': 'digital',
  'url': 'https://i.redd.it/dk2s8oj6hsz51.jpg',
  'id': 'jvrq4w',
  'unix_time': 1605614029},
 {'title': 'Hexes&Exes, Webtoon, Digital Art, 2020',
  'medium': 'digital',
  'url': 'https://i.redd.it/o3m1hkvnesz51.png',
  'id': 'jvrk3v',
  'unix_time': 1605613143},
 {'title': 'Onward, me, ink on bristol, 2020',
  'medium': 'ink',
  'url': 'https://i.redd.it/0c4926h8dsz51.jpg',
  'id': 'jvrgof',
  'unix_time': 1605612655},
 {'title': 'Explosion, Me, Digital, 2020',
  'medium': 'digital',
  'url': 'https://i.redd.it/3txx4lcqbsz51.jpg',
  'id': 'jvrd6y',
  'unix_time': 1605612143},
 {'title': 'Lily, Me, Pen, 2020',
  'medium': 'ink',
  'url': 'https://i.redd.it/jxhc3pnmbsz51.jpg',
  'id': 'jvrczf',
  'unix_time': 1605612111},
 {'title': 'Imposter syndrome, me, digital, 2020',
  'medium': 'digital',
  'url': 'https://i.redd.it/buyuzygebsz51.jpg',
  'id': 'jvrcgp',
  'unix_time': 1605612035},
 {'title': 'White Rhino Track, Me, acrylic in

In [41]:
submissions_df = pd.DataFrame(submissions_dict)

In [42]:
submissions_df

Unnamed: 0,title,medium,url,id,unix_time
0,"Diya, me, 3D, 2020",digital,https://i.redd.it/dk2s8oj6hsz51.jpg,jvrq4w,1605614029
1,"Hexes&Exes, Webtoon, Digital Art, 2020",digital,https://i.redd.it/o3m1hkvnesz51.png,jvrk3v,1605613143
2,"Onward, me, ink on bristol, 2020",ink,https://i.redd.it/0c4926h8dsz51.jpg,jvrgof,1605612655
3,"Explosion, Me, Digital, 2020",digital,https://i.redd.it/3txx4lcqbsz51.jpg,jvrd6y,1605612143
4,"Lily, Me, Pen, 2020",ink,https://i.redd.it/jxhc3pnmbsz51.jpg,jvrczf,1605612111
...,...,...,...,...,...
10031,"Untitled, me, acrylic on canvas board, 2020",paint,https://i.redd.it/hkzkihqtyau51.jpg,jewxsy,1603222750
10032,"Wire wrapped crystal pendant, me, argentium si...",sculpture,https://i.redd.it/fp8aqleqyau51.jpg,jewxdr,1603222716
10033,"“Lost”, Artist me, Digital Painting, 2020",digital,https://i.redd.it/5zrygut4yau51.jpg,jewv1b,1603222527
10034,"The great egret, me, watercolor, 2020",paint,https://i.redd.it/3pw64tibxau51.jpg,jewro8,1603222239


In [43]:
submissions_df.medium.value_counts()

digital         5094
paint           2484
mixed_medium    1145
ink              949
pencil           158
charcoal         104
sculpture        102
Name: medium, dtype: int64