# Metis Data Science Bootcamp
## San Francisco, Winter 2020
### Project 4: Election Reporting Sentiment Analysis

### Publisher Location Identification

For part of this project I wanted to explore whether the location of the publisher had any correlation to the sentiment of their reporting.  This notebook uses GoogleMap Search API to identify the locations of the publishers and plots them on a map.  The color of the markers for the locations corresponds to the sentiment of that publications articles.

In [2]:
import sys
import re
import os.path
import requests
import time
import pandas as pd
import pprint

from bs4 import BeautifulSoup
import urllib
import json

from os import path

from pymongo import MongoClient
import numpy as np

In [3]:
places_search_url2 = "https://maps.googleapis.com/maps/api/geocode/json"

In [4]:
def search_address(place_name, region) :
    #
    # get access to google map, this is an environment variable to protect it.
    #
    api_key = os.environ.get('GoogleAPIKey')
    engine_id = os.environ.get('GoogleAPIEngineId')
    
    # predefine these strings in case of request failure
    found_name = ''
    found_address = ""
    found_id = ""
    found_lat = ""
    found_zip = ""
    found_long = ""

    # set the parameter dict for the request
    param_load = {"address" : place_name, "key" : api_key, "cx" : engine_id}

    # call the request
    r = requests.get(places_search_url2, params=param_load).json()

    #
    # if we successfully read the google response data then 
    # extract the information we want from the json data structure
    #

    if(r["status"] == 'OK') :
        found_address = r["results"][0]["formatted_address"]
        found_lat = r["results"][0]["geometry"]["location"]["lat"]
        found_long = r["results"][0]["geometry"]["location"]["lng"]
        found_id = r["results"][0]["place_id"]
        add_comp = r["results"][0]["address_components"]
        #
        # Look for the zip code in the address components
        #
        for comp in add_comp :
            if("postal_code" == comp["types"]) :
                foundZip = comp["long_name"]

    # put the extracted data in the format we want for each brewery
    location_data = (found_address, found_zip, found_id, found_lat, found_long)
    return(location_data)


In [5]:
# Open a link to the database and initialize the cursors for the collections
db_client = MongoClient()
db_news = db_client['news_search']
db_news_col = db_news['search_result']
db_news_sources = db_news['pub_locations']
db_news_content = db_news['news_content']

Make a collection for the publisher information

In [7]:
cursor = db_news_col.find({}, {'_id': 1, 'base_url':1})    

for publisher in list(cursor) :
    if db_news_sources.count_documents({'name' : publisher['base_url']}) == 0 :
        pub_dat = list(search_address(publisher['base_url'], ''))

        if(len(pub_dat[0]) > 0) :
            if pub_dat[1] == '' :
                tokens = pub_dat[0].split(',')
                if len(tokens) >=4 :

                    state_zip = tokens[2]
                    zip_tokens = state_zip.split()

                    if len(zip_tokens) >= 2 :
                        pub_dat[1] = zip_tokens[1]

            pub_dict = {'name': publisher['base_url'], 'address': pub_dat[0], 'zip' : pub_dat[1], \
                        'googID' : pub_dat[2], 'lat': pub_dat[3], 'lng' : pub_dat[4]}
            db_news_sources.insert_one(pub_dict)
                

In [6]:
#db_news.drop_collection('pub_locations')

{'nIndexesWas': 1, 'ns': 'news_search.pub_locations', 'ok': 1.0}

Search for the locations of the publishers

In [9]:
lat_list = []
lng_list = []
publishers = []
sentiment_polarity_list = []
sentiment_subjectivity_list = []
sent_polarity_aves = []
sent_subjectivity_aves = []
sent_colors = []

cursor = db_news_sources.find({}, {'_id':1, 'name':1, 'address':1, 'zip':1, 'googID': 1, 'lat':1, 'lng':1})
pub_list = list(cursor)

for pub in pub_list :
    lat_list.append(pub['lat'])
    lng_list.append(pub['lng'])
    publishers.append(pub['name'])
    article_cur = db_news_content.find({'base_url':pub['name']}, {'_id':0, 'sentiment':1})
    for article in list(article_cur) :
        sentiment_polarity_list = article['sentiment'][0]
        sentiment_subjectivity_list = article['sentiment'][1]
    sent_polarity_aves.append(np.mean(sentiment_polarity_list))
    sent_subjectivity_aves.append(np.mean(sentiment_subjectivity_list))

color_pick = lambda y : 'red' if y < -0.1 else ('green' if y > 0.5 else 'black')
sent_colors = [color_pick(z) for z in sent_subjectivity_aves]

Use Folium to plot the locations

In [10]:
import folium
from folium.plugins import HeatMap
import numpy as np

def generateBaseMap(default_location=[np.mean(lat_list), np.mean(lng_list)], default_zoom_start=4):
    base_map = folium.Map(location=default_location, control_scale=True, zoom_start=default_zoom_start)
    return base_map

basemap = generateBaseMap()

# Visualize physical subway locations
for lat,lon,publisher,color in zip(lat_list,lng_list,publishers, sent_colors):
     folium.CircleMarker([lat, lon],
                            popup=publisher,
                            color=color,
                            fill=True,
                            fill_opacity=0.7,
                            radius=4,
                           ).add_to(basemap)
basemap



In [11]:
db_news_sources.count_documents({'base_url' : publisher['base_url']})

0