Natural Observer

A project to collect the thousands of observations of the natural world from Reddit (and maybe eventually other social media). Photos, identification, and any location information are collated to create a usable dataset for citizen science networks such as eBird and iNaturalist. We hope...

Created by:
Lindsey Parkinson, Thomas Oliver, and Roman Grisch 

In [1]:
import praw
import json
import pandas as pd
import numpy as np
from collections import defaultdict
import re
import datetime

#redditkeys.json contains all the information necessary to use the Reddit API

with open("/../redditkeys.json") as infile:
    credentials = json.load(infile)
reddit = praw.Reddit(client_id = credentials["client_id"],
                     client_secret = credentials["client_secret"],
                     user_agent=credentials["user_agent"],
                     username=credentials["username"],
                     password=credentials["password"])

print(reddit.user.me())

FileNotFoundError: [Errno 2] No such file or directory: '/../redditkeys.json'

In [2]:
date_list = []
author_list = []
id_list = []
link_flair_text_list = []
#num_comments_list = []
#score_list = []
title_list = []
#upvote_ratio_list = []
url_list = []


#subreddits we want to scrape information from
subreddit_list=  ['whatisthisfish',
                  'whatsthisfish'
                  #'whatsthisbug',
                  #'whatsthisbird',
                  #'whatsthissnake',
                 ]

#What information we want from each subreddit post
for subred in subreddit_list:
    subreddit = reddit.subreddit(subred)
    top_post = subreddit.top(limit = 5)  
    
    for sub in top_post: 
        #if link_flair_text_list.append(sub.link_flair_text != 'Solved'):
            #continue
        
        date_list.append(datetime.datetime.fromtimestamp(sub.created_utc))
        author_list.append(sub.author)
        id_list.append(sub.id)        
        link_flair_text_list.append(sub.link_flair_text)
        #num_comments_list.append(sub.num_comments)
        #score_list.append(sub.score)
        title_list.append(sub.title)
        #upvote_ratio_list.append(sub.upvote_ratio) 
        url_list.append(sub.url)
        
    print(subred, 'completed; ', end='')
    print('total', len(author_list), 'posts has been scraped')
    


whatisthisfish completed; total 5 posts has been scraped
whatsthisfish completed; total 10 posts has been scraped


In [3]:
def convert(row, col = "URL"):
    """
    This function will convert strings into hyperlinks, makes it easier to pull images
    """
    return "<a href='{}'>{}</a>".format(row[col], row.name)

In [4]:
df = pd.DataFrame({'Date': date_list,
                   'ID':id_list, 
                   'Author':author_list, 
                   'Title':title_list,
                   #'Count_of_Comments':num_comments_list,
                   #'Upvote_Count':score_list,
                   #'Upvote_Ratio':upvote_ratio_list,
                   'Flair':link_flair_text_list,
                   'URL':url_list
                  })



In [5]:
df['URL'] = df.apply(convert, axis = 1)

In [6]:
df

Unnamed: 0,Date,ID,Author,Title,Flair,URL
0,2020-02-17 19:45:23,f5denb,officialjoeyacnl,found at a pet store any ideas? image is a kno...,Solved,<a href='https://i.redd.it/f6ntf9ys4jh41.jpg'>...
1,2020-04-07 04:46:15,fwcjf9,Shinobi-butterstick,what type of fish is this?,Solved,<a href='https://i.redd.it/yxwvpygg7br41.jpg'>...
2,2020-07-27 00:07:36,hyfvpv,xItsMeGradyMC,"What's this fish? Oh that, that's a bluegill",Solved,<a href='https://i.redd.it/k6rb9y0wy9d51.png'>...
3,2020-02-17 14:48:48,f58z6m,jcole315315,"Caught in New Jersey, what is this fish?",Unsolved,<a href='https://i.redd.it/ljqoyz43ohh41.jpg'>...
4,2020-07-29 23:44:45,i09fvp,PhotonicBoom21,FAQ: Common sunfish in North America,,<a href='https://i.redd.it/brfh2rkh9vd51.jpg'>...
5,2020-06-15 00:58:58,h93riw,oceanjunkie,"Caught in key west, Florida. Never seen anythi...","Identified, high confidence",<a href='https://i.redd.it/j86d87vnhy451.jpg'>...
6,2020-05-11 17:59:31,ghq9mn,edgy_edgy_edgy,I found this video and the fish is hella cute ...,,<a href='https://v.redd.it/lvximb7tr5y41'>6</a>
7,2020-03-13 11:38:36,fhxmse,wirapori,Came across this jellyfish at my dad's saltwat...,Identification question,<a href='https://v.redd.it/c6k4lgvk4fm41'>7</a>
8,2020-06-12 15:25:32,h7kn6s,rnotjimmy,"Caught deep sea fishing off the coast of Looe,...",,<a href='https://i.redd.it/klx4m6aodh451.jpg'>...
9,2019-05-06 03:06:08,bl5kr6,hay-guise,What's this fish skeleton? Found in a friend's...,,<a href='https://i.redd.it/t6h2o3tcqhw21.jpg'>...


In [7]:
comments = defaultdict(list)
for blue in id_list:
    submission = reddit.submission(str(blue))
    for top_level_comment in submission.comments:
        comments[submission.title].append(top_level_comment.body)

In [8]:
top_comment = list()
for x in comments.values():
    top_comment.append(x[0])

In [9]:
df["Top Comment"] = top_comment

In [10]:
df

Unnamed: 0,Date,ID,Author,Title,Flair,URL,Top Comment
0,2020-02-17 19:45:23,f5denb,officialjoeyacnl,found at a pet store any ideas? image is a kno...,Solved,<a href='https://i.redd.it/f6ntf9ys4jh41.jpg'>...,Flowerhorn
1,2020-04-07 04:46:15,fwcjf9,Shinobi-butterstick,what type of fish is this?,Solved,<a href='https://i.redd.it/yxwvpygg7br41.jpg'>...,"Scarus coeruleus, blue parrotfish! They are in..."
2,2020-07-27 00:07:36,hyfvpv,xItsMeGradyMC,"What's this fish? Oh that, that's a bluegill",Solved,<a href='https://i.redd.it/k6rb9y0wy9d51.png'>...,I wonder if it calls me pink lung.
3,2020-02-17 14:48:48,f58z6m,jcole315315,"Caught in New Jersey, what is this fish?",Unsolved,<a href='https://i.redd.it/ljqoyz43ohh41.jpg'>...,Seaweed blenny.
4,2020-07-29 23:44:45,i09fvp,PhotonicBoom21,FAQ: Common sunfish in North America,,<a href='https://i.redd.it/brfh2rkh9vd51.jpg'>...,Recently we have been seeing a *lot* of sunfis...
5,2020-06-15 00:58:58,h93riw,oceanjunkie,"Caught in key west, Florida. Never seen anythi...","Identified, high confidence",<a href='https://i.redd.it/j86d87vnhy451.jpg'>...,"Took me awhile, but its a swallow-tailed bass ..."
6,2020-05-11 17:59:31,ghq9mn,edgy_edgy_edgy,I found this video and the fish is hella cute ...,,<a href='https://v.redd.it/lvximb7tr5y41'>6</a>,Looks like spotted porcupinefish also known as...
7,2020-03-13 11:38:36,fhxmse,wirapori,Came across this jellyfish at my dad's saltwat...,Identification question,<a href='https://v.redd.it/c6k4lgvk4fm41'>7</a>,Hey it looks like a [Lobonema Smithii](https:/...
8,2020-06-12 15:25:32,h7kn6s,rnotjimmy,"Caught deep sea fishing off the coast of Looe,...",,<a href='https://i.redd.it/klx4m6aodh451.jpg'>...,I’m going with Tub Gurnard. https://en.m.wiki...
9,2019-05-06 03:06:08,bl5kr6,hay-guise,What's this fish skeleton? Found in a friend's...,,<a href='https://i.redd.it/t6h2o3tcqhw21.jpg'>...,Raphael catfish or close relative


In [26]:
#Import and download NLP tools

import nltk
#nltk.download('punkt')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /home/lindsey/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.
[nltk_data] Downloading package words to /home/lindsey/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/lindsey/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

In [22]:
practice = df["Title"].copy
print(practice)

print(df["Title"].to_list())

<bound method NDFrame.copy of 0    found at a pet store any ideas? image is a kno...
1                           what type of fish is this?
2         What's this fish? Oh that, that's a bluegill
3             Caught in New Jersey, what is this fish?
4                 FAQ: Common sunfish in North America
5    Caught in key west, Florida. Never seen anythi...
6    I found this video and the fish is hella cute ...
7    Came across this jellyfish at my dad's saltwat...
8    Caught deep sea fishing off the coast of Looe,...
9    What's this fish skeleton? Found in a friend's...
Name: Title, dtype: object>
['found at a pet store any ideas? image is a known image too', 'what type of fish is this?', "What's this fish? Oh that, that's a bluegill", 'Caught in New Jersey, what is this fish?', 'FAQ: Common sunfish in North America', 'Caught in key west, Florida. Never seen anything like it.', 'I found this video and the fish is hella cute but I dont know what kind of fish it is', "Came across this

In [27]:
#A function to pull location information from Reddit post titles

#with open('sample.txt', 'r') as f:
out_str = ","
sample = out_str.join(df['Title'])


sentences = nltk.sent_tokenize(sample)
tokenized_sentences = [nltk.word_tokenize(sentence) for sentence in sentences]
tagged_sentences = [nltk.pos_tag(sentence) for sentence in tokenized_sentences]
chunked_sentences = nltk.ne_chunk_sents(tagged_sentences, binary=True)

def extract_entity_names(t):
    entity_names = []

    if hasattr(t, 'label') and t.label:
        if t.label() == 'NE':
            entity_names.append(' '.join([child[0] for child in t]))
        else:
            for child in t:
                entity_names.extend(extract_entity_names(child))

    return entity_names

entity_names = []
for tree in chunked_sentences:
    # Print results per sentence
    # print extract_entity_names(tree)

    entity_names.extend(extract_entity_names(tree))

# Print all entity names
#print entity_names

# Print unique entity names
print(entity_names)
    

['Caught', 'New Jersey', 'FAQ', 'North America', 'Caught', 'Florida', 'Came', 'Brunei', 'Mentiri', 'Caught', 'Looe', 'Cornwall', 'Southern CA']


In [None]:
#The function grabs all the location words but is also attracted to the word "caught"