# Data Stories
## Create the raw data used to make the plots (or, even better, make the plots right here)

In [1]:
%matplotlib inline
## Python packages - you may have to pip install sqlalchemy, sqlalchemy_utils, and psycopg2.

# Standard library
import csv
import operator
import os
# For haversine formula
from math import radians, cos, sin, asin, sqrt
from os import path

# Third-party
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import psycopg2
from nltk.stem import WordNetLemmatizer
from PIL import Image
## For word clouds
from sklearn.feature_extraction.text import TfidfVectorizer
from sqlalchemy import create_engine
from sqlalchemy_utils import database_exists, create_database
from wordcloud import WordCloud, STOPWORDS

wordnet_lemmatizer = WordNetLemmatizer()

In [2]:
# Database connection details.
dbname = 'beer_final'
username = 'postgres'
# Let the password be supplied through the environment so the real credential
# does not have to live in the notebook; 'simple' stays as the fallback so the
# existing local setup keeps working unchanged.
mypassword = os.environ.get('BEER_DB_PASSWORD', 'simple')

## Here, we're using postgres, but sqlalchemy can connect to other things too.
# 'postgresql' is the dialect name SQLAlchemy documents; the old 'postgres'
# alias is rejected by SQLAlchemy 1.4 and later.
engine = create_engine('postgresql://%s:%s@localhost/%s'%(username,mypassword,dbname))
con = psycopg2.connect(database = dbname, user = username,host='/var/run/postgresql',password=mypassword)
# Report the target without the password: printing engine.url can embed it,
# and cell output is saved (and shared) together with the notebook.
print("Connecting to postgresql://%s@localhost/%s" % (username, dbname))
cur = con.cursor()

# One row per review, joined to the brewery/beer record it belongs to.
# Later cells read the columns by position, so keep the SELECT order stable:
#   0 beer_name_key, 1 brewery_name, 2 style_name, 3 beer_name, 4 city,
#   5 latittude, 6 longitude, 7 hop_mean, 8 hop_stddev,
#   9 review_text, 10 stemmed_review_text
beer_query_both = """
SELECT 
    breweries.beer_name_key,
    breweries.brewery_name,
    breweries.style_name,
    breweries.beer_name,
    breweries.city,
    breweries.latittude,
    breweries.longitude,
    breweries.hop_mean,
    breweries.hop_stddev,
    reviews.review_text,
    reviews.stemmed_review_text
FROM 
    breweries,reviews
WHERE
    breweries.beer_name_key = reviews.beer_key
"""
cur.execute(beer_query_both)
# fetchall() returns the complete result set as a list of tuples.
results = cur.fetchall()
print("%d results" % len(results))

Connecting to postgres://postgres:simple@localhost/beer_final
33827 results


In [3]:
def haversine(lon1, lat1, lon2, lat2, radius=6371):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    lon1, lat1 -- longitude and latitude of the first point
    lon2, lat2 -- longitude and latitude of the second point
    radius     -- sphere radius; the result is in the same unit. The default
                  6371 is the Earth's radius in kilometers (use 3956 for miles).

    Returns the distance as a float, in the unit of `radius`.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = [radians(value) for value in (lon1, lat1, lon2, lat2)]

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2.0)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2.0)**2
    # Floating-point rounding can push `a` a hair outside [0, 1] (e.g. for
    # near-antipodal points), which would make sqrt/asin raise a domain
    # error; clamping leaves every in-range value untouched.
    a = max(0.0, min(1.0, a))
    c = 2 * asin(sqrt(a))
    return c * radius

In [4]:
# Spot-check haversine() on one pair of (lon, lat) points; the result is in km.
print(haversine(-117.169593, 33.518543, -118.41557312, 33.9176902771))

123.49037117


In [8]:
def test_distance(threshold_distance, ref, test):
    '''
    Report whether two points lie within threshold_distance of each other.

    threshold_distance is the cutoff distance (in km). ref and test are both
    dicts of the form {'lat': <latitude coordinate>, 'long': <longitude coordinate>}.
    The great-circle distance between them is computed with haversine().

    Returns True when the distance is at or below threshold_distance and
    False when it is above it. A NaN distance (for example from a NaN
    coordinate) also returns False, so such points never count as "nearby".
    '''
    distance = haversine(ref['long'],ref['lat'],test['long'],test['lat'])

    # `<=` rather than `not (distance > ...)`: every comparison with NaN is
    # False, so this form rejects NaN instead of silently accepting it.
    return distance <= threshold_distance

In [9]:
# Reference coordinates (decimal degrees) for the three regions compared in
# this notebook, keyed the way test_distance() expects: 'lat' and 'long'.
LA = dict(lat=34.1504034640726, long=-118.274230049679)
SF = dict(lat=37.7611701723497, long=-122.3883919409)
SD = dict(lat=32.896564, long=-117.137428)

In [10]:
# Bucket every query row by the region reference point(s) it lies close to.
# The three checks are independent, so one row may land in more than one list.
REGION_RADIUS_KM = 70.0  # cutoff passed to test_distance(), in km

LA_list = []
SF_list = []
SD_list = []

for result in results:
    # Columns 5 and 6 of the query are the latittude and longitude.
    try:
        lat = float(result[5])
        lon = float(result[6])
    except (TypeError, ValueError):
        # Missing (None) or non-numeric coordinates: the row cannot be placed.
        # Only conversion errors are caught, so real problems (and Ctrl-C)
        # still surface instead of being swallowed.
        continue
    test = {'lat': lat, 'long': lon}
    for region_ref, region_rows in ((LA, LA_list), (SF, SF_list), (SD, SD_list)):
        if test_distance(REGION_RADIUS_KM, region_ref, test):
            region_rows.append(result)

print("%d Los Angeles beers" % len(LA_list))
print("%d Bay Area beers" % len(SF_list))
print("%d San Diego beers" % len(SD_list))


4277 Los Angeles beers
4316 Bay Area beers
7727 San Diego beers


In [None]:
def get_frac_hoppy(results):
    for result in results:
        hoppiness = -1
        hop_dev = -1
        try:
            hoppiness = float(result[7])
            hop_dev = float(result[8])
        except:
            continue
        
            
            