In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import json # to work with json file format
from bs4 import BeautifulSoup

pd.options.display.float_format = '{:,.2f}'.format

In [3]:
file = '../data/bigfoot_data.json'

In [4]:
df = pd.read_json(file, lines = True)

In [5]:
df.head()

Unnamed: 0,_id,html,time,url
0,{'$oid': '5939abbd2acdf6607095366c'},"\r\n<!doctype html public ""-//w3c//dtd html 4....",Thu Jun 8 19:55:40 2017,show_report.asp?id=13038
1,{'$oid': '5939abbd2acdf6607095366d'},"\r\n<!doctype html public ""-//w3c//dtd html 4....",Thu Jun 8 19:55:41 2017,show_report.asp?id=8792
2,{'$oid': '5939abbd2acdf6607095366e'},"\r\n<!doctype html public ""-//w3c//dtd html 4....",Thu Jun 8 19:55:41 2017,show_report.asp?id=1255
3,{'$oid': '5939abbd2acdf6607095366f'},"\r\n<!doctype html public ""-//w3c//dtd html 4....",Thu Jun 8 19:55:41 2017,show_report.asp?id=11616
4,{'$oid': '5939abbe2acdf66070953670'},"\r\n<!doctype html public ""-//w3c//dtd html 4....",Thu Jun 8 19:55:42 2017,show_report.asp?id=637


In [6]:
df.shape

(4857, 4)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4857 entries, 0 to 4856
Data columns (total 4 columns):
_id     4857 non-null object
html    4857 non-null object
time    4857 non-null object
url     4857 non-null object
dtypes: object(4)
memory usage: 151.9+ KB


In [8]:
df['length_html_all'] = df['html'].str.len()

In [9]:
df['length_html_all'].min()

11171

## No Duplicates in the HTML or URL columns!!


In [10]:
df[df.duplicated('html')]

Unnamed: 0,_id,html,time,url,length_html_all


# Get info on tags

In [11]:
test_vals = df['html'][0]

In [12]:
soup = BeautifulSoup(test_vals, 'html.parser')

In [13]:
soup.find_all('span', {'class': 'field'})

[<span class="field">Submitted  by  witness   on Saturday, November 12, 2005.</span>,
 <span class="field">Snowmobiler has encounter in deep snow near Potter, AK</span>,
 <span class="field">YEAR:</span>,
 <span class="field">SEASON:</span>,
 <span class="field">MONTH:</span>,
 <span class="field">STATE:</span>,
 <span class="field">COUNTY:</span>,
 <span class="field">LOCATION DETAILS:</span>,
 <span class="field">NEAREST TOWN:</span>,
 <span class="field">NEAREST ROAD:</span>,
 <span class="field">OBSERVED:</span>,
 <span class="field">ALSO NOTICED:</span>,
 <span class="field">OTHER WITNESSES:</span>,
 <span class="field">OTHER STORIES:</span>,
 <span class="field">TIME AND CONDITIONS:</span>,
 <span class="field">ENVIRONMENT:</span>]

In [14]:
spans = soup.find_all('span', {'class': 'field'})

In [21]:
data = dict()

In [24]:
data['submitted'] = spans[0].text
data['title'] = spans[1].text

In [25]:
data

{'submitted': 'Submitted  by  witness   on Saturday,\xa0November\xa012,\xa02005.',
 'title': 'Snowmobiler has encounter in deep snow near Potter, AK'}

In [31]:

for sentence in soup.find_all("p"):
    if any(span in sentence for span in spans):
        text = sentence.text
        #print (text)
        #print('------\n')
        list_info = text.split(': ',1)
        print(list_info)
        data[list_info[0]] = list_info[1]
        
#if any(ext in url_string for ext in extensionsToCheck):

'''for span in spans:
    print(span["class"],span.text)'''

['YEAR', '2004']
['SEASON', 'Winter']
['MONTH', 'February']
['STATE', 'Alaska']
['COUNTY', 'Anchorage County']
['LOCATION DETAILS', 'Up near powerline clearings east of Potter Marsh in Anchorage.']
['NEAREST TOWN', 'Anchorage / Hillside']
['NEAREST ROAD', 'No real roads in the area']
['OBSERVED', 'I and two of my friends were bored one night so we decided to do a little snowmachining. Though it was illegal to snowmachine in Anchorage, there were some good trails to ride on a little north of my house.  We took off at probably 11 pm, rode up the road about a quarter mile, and cut off on the trails. It had snowed about 10 inches a few days before so there was fresh snow, with no tracks.  I was leading the way for about a half hour, then we stopped and talked for a little bit.  We took off again and kept cruising on some sort of game trail that led to an opening in the woods.  I rode off into the opening with my friends following about fifty yards behind me.  I came over this little mound 

'for span in spans:\n    print(span["class"],span.text)'

In [32]:
data

{'submitted': 'Submitted  by  witness   on Saturday,\xa0November\xa012,\xa02005.',
 'title': 'Snowmobiler has encounter in deep snow near Potter, AK',
 'YEAR': '2004',
 'SEASON': 'Winter',
 'MONTH': 'February',
 'STATE': 'Alaska',
 'COUNTY': 'Anchorage County',
 'LOCATION DETAILS': 'Up near powerline clearings east of Potter Marsh in Anchorage.',
 'NEAREST TOWN': 'Anchorage / Hillside',
 'NEAREST ROAD': 'No real roads in the area',
 'OBSERVED': 'I and two of my friends were bored one night so we decided to do a little snowmachining. Though it was illegal to snowmachine in Anchorage, there were some good trails to ride on a little north of my house.  We took off at probably 11 pm, rode up the road about a quarter mile, and cut off on the trails. It had snowed about 10 inches a few days before so there was fresh snow, with no tracks.  I was leading the way for about a half hour, then we stopped and talked for a little bit.  We took off again and kept cruising on some sort of game trail t

In [33]:
data.keys()

dict_keys(['submitted', 'title', 'YEAR', 'SEASON', 'MONTH', 'STATE', 'COUNTY', 'LOCATION DETAILS', 'NEAREST TOWN', 'NEAREST ROAD', 'OBSERVED', 'ALSO NOTICED', 'OTHER WITNESSES', 'OTHER STORIES', 'TIME AND CONDITIONS', 'ENVIRONMENT'])

In [35]:
html_df = pd.DataFrame(columns = data.keys())

In [41]:
cols = ['submitted', 'title', 'YEAR', 'SEASON', 'MONTH',
                                  'STATE', 'COUNTY', 'LOCATION DETAILS', 'NEAREST TOWN',
                                  'NEAREST ROAD', 'OBSERVED', 'ALSO NOTICED',
                                  'OTHER WITNESSES', 'OTHER STORIES', 'TIME AND CONDITIONS',
                                  'ENVIRONMENT']

html_df = pd.DataFrame(columns = cols)

In [42]:
html_df

Unnamed: 0,submitted,title,YEAR,SEASON,MONTH,STATE,COUNTY,LOCATION DETAILS,NEAREST TOWN,NEAREST ROAD,OBSERVED,ALSO NOTICED,OTHER WITNESSES,OTHER STORIES,TIME AND CONDITIONS,ENVIRONMENT


In [47]:
for doc in df.html:
    doc_dict = dict()
    souped_doc = BeautifulSoup(doc, 'html.parser')
    spans = souped_doc.find_all('span', {'class': 'field'})
    
    try:
        doc_dict['submitted'] = spans[0].text
        doc_dict['title'] = spans[1].text
    except:
        continue
        
    for sentence in souped_doc.find_all("p"):
        if any(span in sentence for span in spans):
            text = sentence.text
            #print (text)
            #print('------\n')
            list_info = text.split(': ',1)
            #print(list_info)
            doc_dict[list_info[0]] = list_info[1]
    
    html_df = html_df.append({k:doc_dict[k] for k in cols if k in doc_dict}, ignore_index=True)

KeyboardInterrupt: 

In [None]:

doc_dict = dict()
souped_doc = BeautifulSoup(doc, 'html.parser')
spans = souped_doc.find_all('span', {'class': 'field'})

doc_dict['submitted'] = spans[0].text
doc_dict['title'] = spans[1].text

for sentence in souped_doc.find_all("p"):
    if any(span in sentence for span in spans):
        text = sentence.text
        #print (text)
        #print('------\n')
        list_info = text.split(': ',1)
        #print(list_info)
        doc_dict[list_info[0]] = list_info[1]

html_df = html_df.append({k:doc_dict[k] for k in cols if k in doc_dict}, ignore_index=True)


In [48]:
html_df

Unnamed: 0,submitted,title,YEAR,SEASON,MONTH,STATE,COUNTY,LOCATION DETAILS,NEAREST TOWN,NEAREST ROAD,OBSERVED,ALSO NOTICED,OTHER WITNESSES,OTHER STORIES,TIME AND CONDITIONS,ENVIRONMENT
0,"Submitted by witness on Saturday, November...",Snowmobiler has encounter in deep snow near Po...,2004,Winter,February,Alaska,Anchorage County,Up near powerline clearings east of Potter Mar...,Anchorage / Hillside,No real roads in the area,I and two of my friends were bored one night s...,"Some tracks in the snow, and a clearing in the...",My two friends were snowmachining behind me bu...,I have not heard of any other incidents in Anc...,Middle of the night. The only light was the he...,"In the middle of the woods, in a clearing cove..."
1,"Submitted by witness on Thursday, June 3, ...",Four nocturnal hikers get pelted with snow nea...,2003,Winter,December,Alaska,Anchorage County,"Few houses on the way, a power relay station. ...",Anchorage,Dowling,"Me and a couple of friends had been bored, whe...","We smelled of colonge and after shave, and one...","4. Me, w-man, warren and sean. We were at my h...",no,"Started at 11, ended at about 3-3:30. Weather ...","A pine forest, with a bog or swamp on the righ..."
2,"Submitted by witness A. M. on Tuesday, Octob...",Creature observed walking back and forth by wi...,1998,Fall,September,Alaska,Bethel County,"45 miles by air west of Lake Iliamna, Alaska i...",,,My hunting buddy and I were sitting on a ridge...,nothing unusual,Scouting for caribou with high quality binocul...,,,Call Iliamna Air taxi for lat & Long of Long L...
3,"Submitted by witness on Friday, May 6, 2005.",Fishermen find footprints east of Egegik,2004,Summer,July,Alaska,Bristol Bay County,"Approximately 95 miles east of Egegik, Alaska....",Egegik,,"To whom it may concern, I am a commercial fish...",Just these foot prints and how obvious it was ...,"One other witness, and he was fishing prior to...","I've only heard of one other story, from an ol...","Approximately 12:30 pm, partially coudy/sunny.","Lake front,creek spit, gravel and sand, alder ..."
4,"Submitted by witness on Monday, November 1...",Campers' encounter just after dark in the Wran...,2000,Summer,June,Alaska,Cordova-McCarthy County,"On the main trail toward the glacier, before t...","Kennikot, Alaska",not sure,My hiking partner and I arrived late to the Ke...,I did hear what appeared to be grunting in the...,"I was the only witness, there was one other in...",,About 12:00 Midnight / full moon / clear / dim...,This sighting was located at approximately 1 t...
5,"Submitted by witness on Thursday, Septembe...",Daytime sighting of reddish-colored bigfoot ne...,2009,Summer,July,Alaska,Fairbanks County,The intersection of Auburn and Farmers Loop is...,Fairbanks Alaska,Auburn Dr a mile north of Farmers Loop,"It was the month of July, 2009 in Fairbanks Al...",I saw some people the next day who saw some ki...,No other witnesses,I saw some people the next day who saw some ki...,around 6pmLighting was excellent and weather w...,The forest was made up of mostly birch trees w...
6,"Submitted by witness on Sunday, May 31, 1998.","Family on their way home see a large, hairy cr...",1998,Spring,May,Alaska,Fairbanks County,Fort Wainwright,,,I personally did not see it but a Noncommissio...,,They had just picked the buddy up from work.,,,"On a road called Trainor Gate road, on the mor..."
7,"Submitted by witness on Monday, March 19, ...",Hunter hears footsteps late at night in Goldst...,1997,Fall,August,Alaska,Fairbanks County,I would prefer you don't include this in my po...,Fairbanks,Jones Road,This happened in late August of 1997 in a sid...,I covered pretty much everything in my narrative.,Just myself.,Most of the native peoples of Alaska seem to h...,"Middle of the night, clear weather, thumbnail ...","Upland Aspen/birch forest, above O'connor Creek."
8,"Submitted by witness on Monday, June 15, 1...",Sighting by Army personnel on manuevers,Late 1970's,Summer,,Alaska,Fairbanks County,"Black Rapids Glacier, Alaska Richardson Hiway ...",,,I was part of a group of about a dozen Army pe...,,Just sitting around relaxing,I was telling a friend about this and he said ...,,High rugged mountains
9,Submitted by witness Gerald Chenard on Thurs...,"Witness observers 6 foot, shaggy, dark brown c...",1964,Fall,September,Alaska,Fairbanks County,"This incident occurred back in 1964, and I hav...",,,Something that I would not believe unless I sa...,I believe I covered that in my earlier statement,I WAS JUST WALKING DOWN THIS DIRT ROAD RABBIT ...,,,The type of terrain was land was smooth and ge...
