In [1]:
from pymongo import MongoClient
import pandas as pd
import json
import dataPrep as f

# Data cleaning y obtención de datos

#### Lanzamiento de solicitudes mediante API, pequeña limpieza y primeros filtros en los diferentes dataset de lugares de interés, con el objetivo de preparación para la toma de decisiones y plasmar los resultados en un mapa de Folium.

#### COMPAÑIAS

- Importamos base de datos de compañías

In [2]:
client = MongoClient("mongodb://localhost/companies")

In [3]:
db = client.get_database()

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

- Filtramos por empresas vivas y convertimos en DataFrame para poder operar

In [5]:
query = {'deadpooled_year':{'$eq':None}}

In [6]:
data = list(db["companies"].find(query,{"_id":0,"name":1,"category_code":1,"number_of_employees":1,"founded_year":1,"deadpooled_year":1,"total_money_raised":1,"offices":1}))

df = pd.DataFrame(data)

In [7]:
df.head()

Unnamed: 0,name,category_code,number_of_employees,founded_year,deadpooled_year,total_money_raised,offices
0,Postini,web,,1999.0,,$0,"[{'description': None, 'address1': '959 Skyway..."
1,Digg,news,60.0,2004.0,,$45M,"[{'description': None, 'address1': '135 Missis..."
2,Flektor,games_video,,,,$0,"[{'description': None, 'address1': '8536 Natio..."
3,Fox Interactive Media,web,0.0,1979.0,,$0,"[{'description': '', 'address1': '407 N Maple ..."
4,Geni,web,18.0,2006.0,,$16.5M,"[{'description': 'Headquarters', 'address1': '..."


- Aplicamos la función que expande la columna de 'offices' y genera una nueva columna 'location' con el formato correcto de las coordenadas

In [8]:
df2 = f.officesClean(df)

In [9]:
df2.head()

Unnamed: 0,name,category_code,number_of_employees,founded_year,deadpooled_year,total_money_raised,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location
0,Postini,web,,1999.0,,$0,,"959 Skyway Road, Suite 200",,94070,San Carlos,CA,USA,37.506885,-122.247573,"{'type': 'Point', 'coordinates': [-122.247573,..."
1,Digg,news,60.0,2004.0,,$45M,,135 Mississippi St,,94107,San Francisco,CA,USA,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,..."
2,Flektor,games_video,,,,$0,,"8536 National Blvd, Suite A",,90232,Culver City,CA,USA,34.025958,-118.379768,"{'type': 'Point', 'coordinates': [-118.379768,..."
3,Fox Interactive Media,web,0.0,1979.0,,$0,,407 N Maple Dr,,90210,Beverly Hills,CA,USA,34.076179,-118.39417,"{'type': 'Point', 'coordinates': [-118.39417, ..."
4,Geni,web,18.0,2006.0,,$16.5M,Headquarters,9229 W. Sunset Blvd.,,90069,West Hollywood,CA,USA,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,..."


- Exportamos como json para hacer consultas/modificar información

In [None]:
df2.to_json("../OUTPUT/first_order", orient="records")

- Importamos de nuevo el primer json limpio y filtramos solo por localizaciones con coordenadas

In [10]:
query2 = {'location':{'$ne':None}}

In [11]:
data2 = list(db["first_order"].find(query2,{"_id":0}))

first_order = pd.DataFrame(data2)

In [13]:
first_order.location.isnull().sum()

0

- Categorizamos las compañías según actividad para quedarnos solo con las tecnológicas

In [14]:
tech_comp = ["web","software","mobile","games_video","ecommerce","network_hosting","hardware","biotech","cleantech","analytics","semiconductor","photo_video","software","nanotech"]

first_order["tech_company"] = df["category_code"].apply(lambda x: "yes" if x in tech_comp else "no")

In [17]:
first_order.head()

Unnamed: 0,name,category_code,number_of_employees,founded_year,deadpooled_year,total_money_raised,description,address1,address2,zip_code,city,state_code,country_code,latitude,longitude,location,tech_company
0,Fox Interactive Media,web,0.0,1979.0,,$0,,407 N Maple Dr,,90210,Beverly Hills,CA,USA,34.076179,-118.39417,"{'type': 'Point', 'coordinates': [-118.39417, ...",yes
1,Flektor,games_video,,,,$0,,"8536 National Blvd, Suite A",,90232,Culver City,CA,USA,34.025958,-118.379768,"{'type': 'Point', 'coordinates': [-118.379768,...",no
2,Postini,web,,1999.0,,$0,,"959 Skyway Road, Suite 200",,94070,San Carlos,CA,USA,37.506885,-122.247573,"{'type': 'Point', 'coordinates': [-122.247573,...",yes
3,Geni,web,18.0,2006.0,,$16.5M,Headquarters,9229 W. Sunset Blvd.,,90069,West Hollywood,CA,USA,34.090368,-118.393064,"{'type': 'Point', 'coordinates': [-118.393064,...",yes
4,Digg,news,60.0,2004.0,,$45M,,135 Mississippi St,,94107,San Francisco,CA,USA,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,...",yes


- Comprobamos que la mayoría de las empresas tecnológicas se sitúan en San Francisco, por tanto, ésta será la ciudad en la que buscaremos donde posicionarnos

In [26]:
tech = first_order.loc[first_order['tech_company'] == 'yes']

tech.city.value_counts().head(5)

San Francisco    399
New York         335
London           184
                 153
Austin            90
Name: city, dtype: int64

- Dejamos filtrado el dataset por las compañías ubicadas en San Francisco cuya actividad se centra en el entorno tecnológico para guardar el fichero final

In [27]:
tech = tech.loc[tech['city'] == 'San Francisco']

In [None]:
tech.to_json("../OUTPUT/tech_companies", orient="records")

#### STARBUCKS

- Importamos el csv que contiene todos los starbucks

In [17]:
starbucks = pd.read_csv('../INPUT/starbucks_us_locations.csv')

In [18]:
starbucks.columns = ['longitude','latitude','city','adress']
starbucks.head()

Unnamed: 0,longitude,latitude,city,adress
0,-149.905495,61.195339,Starbucks - AK - Anchorage 00002,Carrs-Anchorage #1805_1650 W Northern Lights B...
1,-149.7522,61.2297,Starbucks - AK - Anchorage 00003,Elmendorf AFB_Bldg 5800 Westover Avenue_Anchor...
2,-149.864336,61.195251,Starbucks - AK - Anchorage 00004,Fred Meyer - Anchorage #11_1000 E Northern Lig...
3,-149.837973,61.137514,Starbucks - AK - Anchorage 00005,Fred Meyer - Anchorage #656_2300 Abbott Road_A...
4,-149.909279,61.139947,Starbucks - AK - Anchorage 00006,Fred Meyer - Anchorage (Dimond) #71_2000 W Dim...


- Convertimos la columna 'location' en string para poder aplicar el filtro de localizar los starbucks de San Francisco

In [19]:
starbucks['city'] = starbucks['city'].astype(str)

In [20]:
starbucks['San_Francisco'] = starbucks['city'].apply(lambda x: 'yes' if 'San Francisco' in x else 'no')

- Comprobamos que hay 81 starbucks en San Francisco. Dejamos el dataset filtrado por éstos y preparamos las coordenadas para las geoqueries en MongoDB.

In [21]:
sf = starbucks.loc[starbucks['San_Francisco'] == 'yes']
sf.city.value_counts().sum()

81

In [23]:
sf['location'] = sf[['latitude','longitude']].apply(lambda x:f.coorFormat(x.latitude,x.longitude), axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [24]:
sf.drop('San_Francisco', axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [25]:
sf = sf.reset_index()

In [26]:
sf.drop('index', axis=1, inplace=True)

In [35]:
sf.to_json('../OUTPUT/starbucks.json', orient="records")

#### PUBS

- Obtenemos las coordenadas de los pubs de San Francisco mediante la API Google Places.

In [2]:
pubs = f.googlePlaces('pubs in san francisco')

- Sacamos las coordenadas y los nombres y convertimos en DataFrame para adaptar las coordenadas.

In [5]:
df_pubs = f.cleaningPoints(pubs)
df_pubs.head()

Unnamed: 0,name,latitude,longitude,location
0,The Chieftain Irish Pub & Restaurant,37.781464,-122.405287,"{'type': 'Point', 'coordinates': [-122.4052868..."
1,Black Horse London Pub,37.7986,-122.424482,"{'type': 'Point', 'coordinates': [-122.4244825..."
2,Edinburgh Castle Pub,37.786127,-122.418957,"{'type': 'Point', 'coordinates': [-122.4189571..."
3,Johnny Foley's Irish House,37.786208,-122.408753,"{'type': 'Point', 'coordinates': [-122.4087528..."
4,The Pub,37.806256,-122.422714,"{'type': 'Point', 'coordinates': [-122.4227139..."


In [8]:
df_pubs.to_json("../OUTPUT/pubs.json", orient="records")

#### ESCUELAS

-  Obtenemos las coordenadas de los colegios en San Francisco mediante la API Google Places.

In [6]:
schools = f.googlePlaces('schools in san francisco')

- Sacamos las coordenadas y los nombres y convertimos en DataFrame para adaptar las coordenadas.

In [7]:
df_schools = f.cleaningPoints(schools)
df_schools.head()

Unnamed: 0,name,latitude,longitude,location
0,San Francisco University High School,37.79089,-122.445465,"{'type': 'Point', 'coordinates': [-122.4454646..."
1,International High School,37.775382,-122.421637,"{'type': 'Point', 'coordinates': [-122.4216371..."
2,Urban School of San Francisco,37.770934,-122.445942,"{'type': 'Point', 'coordinates': [-122.445942,..."
3,The Bay School of San Francisco,37.800779,-122.455646,"{'type': 'Point', 'coordinates': [-122.455646,..."
4,San Francisco School,37.732782,-122.411353,"{'type': 'Point', 'coordinates': [-122.4113534..."


In [9]:
df_schools.to_json("../OUTPUT/schools.json", orient="records")