# Filtrado inicial de compañías

In [34]:
from pymongo import MongoClient
import pymongo
import pandas as pd
import src.tools.mongo as mongo
import numpy as np

In [2]:
# Open the connection through the project helper, then fetch the companies
# as a DataFrame and show its (rows, columns) shape.
mongo_client = mongo.localConection()
df_companies = mongo.getCompanies(mongo_client)
df_companies.shape

(18801, 42)

In [3]:
# List the available columns before choosing which ones to drop.
df_companies.columns

Index(['_id', 'name', 'permalink', 'crunchbase_url', 'homepage_url',
       'blog_url', 'blog_feed_url', 'twitter_username', 'category_code',
       'number_of_employees', 'founded_year', 'deadpooled_year', 'tag_list',
       'alias_list', 'email_address', 'phone_number', 'description',
       'created_at', 'updated_at', 'overview', 'image', 'products',
       'relationships', 'competitions', 'providerships', 'total_money_raised',
       'funding_rounds', 'investments', 'acquisition', 'acquisitions',
       'offices', 'milestones', 'video_embeds', 'screenshots',
       'external_links', 'partners', 'founded_month', 'founded_day',
       'deadpooled_month', 'deadpooled_day', 'deadpooled_url', 'ipo'],
      dtype='object')

In [4]:
# Drop the columns that are not needed by the filtering steps below.
# `errors="ignore"` keeps this cell re-runnable: because it overwrites
# `df_companies`, a second execution would otherwise raise KeyError for
# columns that have already been removed.
df_companies = df_companies.drop(
    columns=[
        'permalink', 'crunchbase_url', 'homepage_url',
        'blog_url', 'blog_feed_url', 'twitter_username', 'tag_list',
        'alias_list', 'email_address', 'phone_number', 'created_at',
        'updated_at', 'overview', 'image', 'products',
        'relationships', 'competitions', 'providerships', 'total_money_raised',
        'milestones', 'video_embeds', 'screenshots',
        'external_links', 'partners',
    ],
    errors="ignore",
)

### Eliminación de empresas que hayan cerrado hace varios años.

In [5]:
# Most recent closing year among the companies that have one recorded
# (missing values are dropped before taking the maximum).
print("La empresa que mas recientemente ha cerrado es en el año:")
df_companies["deadpooled_year"].dropna().max()

La empresa que mas recientemente ha cerrado es en el año:


2014.0

Elimino todas las empresas que ya han cerrado: el cierre más reciente fue en 2014, hace muchos años, por lo que no puedo obtener trabajadores de ellas.

In [6]:
# Keep only the companies with no recorded closing year.
still_open = df_companies["deadpooled_year"].isna()
df_companies = df_companies.loc[still_open]
df_companies.shape

(17872, 18)

## Eliminar compañías que no se dedican a campos relacionados

In [7]:
# Category codes kept by the category filter in the next cell.
# (Run `df_companies["category_code"].value_counts()` to inspect all codes.)
category = [
    "web",
    "software",
    "games_video",
    "network_hosting",
    "hardware",
    "analytics",
    "music",
    "cleantech",
    "photo_video",
    "design",
]

In [8]:
# Keep only the companies whose category code is in `category`.
in_selected_category = df_companies["category_code"].isin(category)
df_companies = df_companies.loc[in_selected_category]
df_companies.shape

(8412, 18)

# Filtrar con un query de mongo

In [9]:
# Pair every company's `offices` value with that company's `_id`, so each
# office can be linked back to its company after being flattened.
# Each entry is a two-element list: [offices, _id].
offices = [
    [company_offices, company_id]
    for company_offices, company_id in zip(df_companies["offices"], df_companies["_id"])
]
        

In [10]:
# `pd.io.json.json_normalize` was deprecated in pandas 1.0 in favour of the
# top-level `pd.json_normalize`. Prefer the new location and fall back to the
# old one only when running on a pandas version that predates it.
_json_normalize = getattr(pd, "json_normalize", None) or pd.io.json.json_normalize

# Flatten each company's offices into a small DataFrame and tag every row
# with the owning company's `_id`.
office_list = []
for company_offices, company_id in offices:
    office_frame = _json_normalize(company_offices)
    office_frame["_id"] = company_id
    office_list.append(office_frame)


In [11]:
# Stack the per-company office frames into one table; `sort=True` orders the
# columns when the individual frames do not share the same column layout.
df_office = pd.concat(objs=office_list, sort=True)
df_office.head(5)

Unnamed: 0,_id,address1,address2,city,country_code,description,latitude,longitude,state_code,zip_code
0,52cdef7c4bab8bd675297d90,"959 Skyway Road, Suite 200",,San Carlos,USA,,37.5069,-122.248,CA,94070.0
0,52cdef7c4bab8bd675297d92,"8536 National Blvd, Suite A",,Culver City,USA,,34.026,-118.38,CA,90232.0
0,52cdef7c4bab8bd675297d91,9229 W. Sunset Blvd.,,West Hollywood,USA,Headquarters,34.0904,-118.393,CA,90069.0
0,52cdef7c4bab8bd675297d93,407 N Maple Dr,,Beverly Hills,USA,,34.0762,-118.394,CA,90210.0
0,52cdef7c4bab8bd675297d96,,,Menlo Park,USA,,37.4841,-122.169,CA,


In [12]:
# Remove the rows that have a null latitude or a null longitude.
df_office = df_office.dropna(subset=["latitude", "longitude"], how="any")

In [13]:
# Clean the city names: convert each value to text and remove every
# occurrence of " City" (e.g. "Culver City" becomes "Culver").
df_office["city_clean"] = df_office["city"].map(lambda name: str(name).replace(" City", ""))

In [14]:
# Swap the raw `city` column for its cleaned counterpart.
df_office = df_office.drop(columns=["city"]).rename(columns={"city_clean": "city"})

In [15]:
from geopy.geocoders import Nominatim
import time

def getCityFromCoord(coord, geolocator=None, delay=1.5):
    """Return the second address field that a geocoder reports for ``coord``.

    ``coord`` is passed unchanged to ``geolocator.geocode``; the first item of
    the result (the address text) is split on commas and its second field,
    stripped of surrounding whitespace, is returned.

    Args:
        coord: Query string, e.g. ``"37.5069,-122.248"``.
        geolocator: Object exposing ``geocode(query)``. When omitted, a
            ``Nominatim`` client with user agent ``"companyLocation"`` is
            created for this call; pass one in to reuse a single client.
        delay: Seconds to sleep before the request. Defaults to 1.5,
            presumably to stay within the service's rate limit.

    Returns:
        The extracted field, or ``None`` when the lookup returns nothing or
        fails; in that case ``coord`` is printed so it can be reviewed.
    """
    if geolocator is None:
        geolocator = Nominatim(user_agent="companyLocation")
    time.sleep(delay)

    city = None
    try:
        location = geolocator.geocode(coord)
        if location is not None:
            city = location[0].split(",")[1].strip()
    except Exception:
        # Best effort: a failed lookup is reported below instead of aborting
        # the caller. `Exception` (not a bare `except`) lets
        # KeyboardInterrupt/SystemExit propagate so a long run can be stopped.
        city = None

    if city is None:
        print(coord)
    return city


In [16]:
from geopy.geocoders import Nominatim
# Manual check of the lookup on one known coordinate pair: take the address
# text, split it on commas and show the second field.
geolocator = Nominatim(user_agent="SharkAttack")
location = geolocator.geocode("37.5069,-122.248")
address_fields = location[0].split(",")
address_fields[1].strip()

'San Carlos'

In [17]:

#for i in range(len(df_office)):
#    if df_office.iloc[i].city=="" or df_office.iloc[i].city=="None":
#        df_office.iloc[i].city=getCityFromCoord(str(df_office.iloc[i].latitude)+","+str(df_office.iloc[i].longitude))

In [18]:
# Size of the office table before removing rows with an empty/"None" city.
df_office.shape

(5403, 10)

In [19]:
# Discard the offices whose cleaned city is empty or the literal text "None"
# (the text that `str()` produces for a missing city).
filter_void = df_office["city"] == ""
filter_None = df_office["city"] == "None"
df_office = df_office[~filter_void & ~filter_None]

In [20]:
# Join every office to its company on `_id`, then drop the key column.
companies2 = pd.merge(df_companies, df_office, on="_id").drop(columns=["_id"])

In [21]:
# Category codes used when counting companies per city.
category1 = [
    "web",
    "software",
    "games_video",
]


In [22]:
# Count the offices per city among the `category1` companies and keep the
# names of the cities that reach at least 30.
in_main_categories = companies2["category_code"].isin(category1)
cities = companies2.loc[in_main_categories, "city"].value_counts()
Goodcities = cities[cities >= 30].index.tolist()


In [24]:
# Keep only the offices located in one of the selected cities.
# `.copy()` makes the result an independent frame, so the `geopoint` column
# added in a later cell does not trigger pandas' SettingWithCopyWarning.
companies2 = companies2[companies2["city"].isin(Goodcities)].copy()
aux = companies2.copy()
# Last expression of the cell, so the shape is actually displayed.
companies2.shape

In [25]:
def geopoint(long, lat):
    """Build a GeoJSON-style ``Point`` dictionary.

    Args:
        long: Longitude; stored first in the coordinate pair.
        lat: Latitude; stored second in the coordinate pair.

    Returns:
        ``{'type': 'Point', 'coordinates': [long, lat]}``.
    """
    return dict(type='Point', coordinates=[long, lat])

In [37]:
# Build one point dictionary per office row.
# A plain comprehension replaces `np.vectorize`, which is only a convenience
# loop, raises on empty input unless `otypes` is given, and would store numpy
# scalars instead of plain Python floats inside each dictionary.
companies2["geopoint"] = [
    geopoint(longitude, latitude)
    for longitude, latitude in zip(companies2["longitude"], companies2["latitude"])
]


In [50]:
# NOTE(review): only the last expression of a cell is displayed, so the
# `head()` result is computed but not shown; only the shape is rendered.
companies2.head()
companies2.shape


(1778, 27)

In [None]:
# Export the filtered offices as a JSON array with one object per row.
# NOTE(review): assumes the `Output/` directory already exists — confirm.
companies2.to_json(path_or_buf='Output/offices.json', orient="records")

In [51]:
# Store the filtered offices in the `offices` collection of the `companies`
# database and create a 2dsphere index on the `geopoint` field.
# `geopoint` is already defined in an earlier cell and is not used here, so
# its duplicate definition has been removed from this cell.
# NOTE(review): re-running this cell inserts the same documents again, because
# `_id` was dropped from `companies2` and the server assigns new ids.
client = mongo.localConection()
db = client.companies
coll = db["offices"]
# "records" is the documented orient name; the abbreviation 'record' was
# deprecated and is rejected by recent pandas releases.
coll.insert_many(companies2.to_dict("records"))
coll.create_index([("geopoint", pymongo.GEOSPHERE)])

'geopoint_2dsphere'