# Advanced Querying Mongo

Importing libraries and setting up connection

In [1]:
#pip install pymongo

from pymongo import MongoClient

client = MongoClient

In [2]:
cursor=MongoClient()

cursor

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [3]:
str_conn='mongodb://localhost:27017'

cursor=MongoClient(str_conn)

cursor

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [4]:
cursor.list_database_names()  # lista de bases de datos

['admin', 'companies', 'config', 'local']

In [5]:
db=cursor.companies

In [6]:
colec=db.companies

In [7]:
list(colec.find().limit(1))[0].keys() #para tener referencia de todas las 'columnas/keys' de mis documentos en mongo

dict_keys(['_id', 'name', 'permalink', 'crunchbase_url', 'homepage_url', 'blog_url', 'blog_feed_url', 'twitter_username', 'category_code', 'number_of_employees', 'founded_year', 'founded_month', 'founded_day', 'deadpooled_year', 'tag_list', 'alias_list', 'email_address', 'phone_number', 'description', 'created_at', 'updated_at', 'overview', 'image', 'products', 'relationships', 'competitions', 'providerships', 'total_money_raised', 'funding_rounds', 'investments', 'acquisition', 'acquisitions', 'offices', 'milestones', 'video_embeds', 'screenshots', 'external_links', 'partners'])

### 1. All the companies whose name match 'Babelgum'. Retrieve only their `name` field.

In [8]:
query= {'name': 'Babelgum'}

filtro = {'name': True, '_id':0}

list(colec.find(query, filtro))


[{'name': 'Babelgum'}]

### 2. All the companies that have more than 5000 employees. Limit the search to 20 companies and sort them by **number of employees**.

In [9]:
query= {'number_of_employees':{'$gt': 5000}}

filtro = {'name': True,'number_of_employees':True, '_id':0}

list(colec.find(query, filtro).sort('number_of_employees', -1).limit(1))

[{'name': 'Siemens', 'number_of_employees': 405000}]

### 3. All the companies founded between 2000 and 2005, both years included. Retrieve only the `name` and `founded_year` fields.

In [10]:
query={'$and': [{'founded_year': {'$lte': 2005}},
                {'founded_year': {'$gte': 2000}}]}

filtro= {'name': True,'founded_year':True, '_id':0}

list(colec.find(query, filtro).limit(1))


[{'name': 'Wetpaint', 'founded_year': 2005}]

### 4. All the companies that had a Valuation Amount of more than 100.000.000 and have been founded before 2010. Retrieve only the `name` and `ipo` fields.

In [11]:
query={'$and': [{'founded_year': {'$lt': 2010}},
                {'ipo.valuation_amount': {'$gt': 100000000}}]}

filtro= {'name': True,'ipo':True, '_id':0}

list(colec.find(query, filtro).limit(1))

[{'name': 'Facebook',
  'ipo': {'valuation_amount': 104000000000,
   'valuation_currency_code': 'USD',
   'pub_year': 2012,
   'pub_month': 5,
   'pub_day': 18,
   'stock_symbol': 'NASDAQ:FB'}}]

### 5. All the companies that have less than 1000 employees and have been founded before 2005. Order them by the number of employees and limit the search to 10 companies.

In [12]:
query={'$and': [{'number_of_employees':{'$lt': 1000}},
                {'founded_year': {'$lt': 2005}}]}

filtro= {'name': 1,'number_of_employees': 1, '_id':0}

list(colec.find(query, filtro).sort('number_of_employees', -1).limit(2))

[{'name': 'Infinera Corporation', 'number_of_employees': 974},
 {'name': 'NorthPoint Communications Group', 'number_of_employees': 948}]

### 6. All the companies that don't include the `partners` field.

In [13]:
query= {'partners':{'$exists': 'false'}}

filtro = {'name': True, '_id':0}

list(colec.find(query, filtro).limit(1))

[{'name': 'Wetpaint'}]

### 7. All the companies that have a null type of value on the `category_code` field.

In [14]:

query= {'category_code':None}

filtro = {'name': True, '_id':0}

list(colec.find(query, filtro).limit(1))

[{'name': 'Collective'}]

### 8. All the companies that have at least 100 employees but less than 1000. Retrieve only the `name` and `number of employees` fields.

In [15]:
query={'$and': [{'number_of_employees': {'$lt': 1000}},
                {'number_of_employees': {'$gte': 100}}]}

filtro = {'name': True, 'number_of_employees':True, '_id':0}

list(colec.find(query, filtro).limit(1))

[{'name': 'AdventNet', 'number_of_employees': 600}]

### 9. Order all the companies by their IPO price in a descending order.

In [16]:
query= {'ipo.valuation_amount':{'$exists': 'true'}}

filtro = {'name': True,'ipo.valuation_amount':True, '_id':0}

list(colec.find(query, filtro).sort('ipo', -1).limit(1))

[{'name': 'GREE', 'ipo': {'valuation_amount': 108960000000}}]

### 10. Retrieve the 10 companies with more employees, order by the `number of employees`

In [17]:
query= {'number_of_employees':{'$exists': 'true'}}

filtro = {'name': True, 'number_of_employees': True, '_id':0}

list(colec.find(query, filtro).sort('number_of_employees', -1).limit(10))

[{'name': 'Siemens', 'number_of_employees': 405000},
 {'name': 'IBM', 'number_of_employees': 388000},
 {'name': 'Toyota', 'number_of_employees': 320000},
 {'name': 'PayPal', 'number_of_employees': 300000},
 {'name': 'Nippon Telegraph and Telephone Corporation',
  'number_of_employees': 227000},
 {'name': 'Samsung Electronics', 'number_of_employees': 221726},
 {'name': 'Accenture', 'number_of_employees': 205000},
 {'name': 'Tata Consultancy Services', 'number_of_employees': 200300},
 {'name': 'Flextronics International', 'number_of_employees': 200000},
 {'name': 'Safeway', 'number_of_employees': 186000}]

### 11. All the companies founded on the second semester of the year. Limit your search to 1000 companies.

In [18]:
query={'$and': [{'founded_month': {'$lte': 12}},
                {'founded_month': {'$gt': 6}}]}

filtro = {'name': True, 'founded_month':True, '_id':0}

list(colec.find(query, filtro).limit(1))

[{'name': 'Wetpaint', 'founded_month': 10}]

### 12. All the companies founded before 2000 that have an acquisition amount of more than 10.000.00

In [19]:
query={'$and': [{'founded_year': {'$lt': 2000}},
                {'acquisition.price_amount': {'$gt': 10000000}}]}

filtro = {'name': True, 'acquisition.price_amount':True, '_id':0}

list(colec.find(query, filtro).limit(1))

[{'name': 'Postini', 'acquisition': {'price_amount': 625000000}}]

### 13. All the companies that have been acquired after 2010, order by the acquisition amount, and retrieve only their `name` and `acquisition` field.

In [20]:
query={'acquisition.acquired_year': {'$gt': 2010}}
                

filtro = {'name': True, 'acquisition.price_amount':True, '_id':0}

list(colec.find(query, filtro).sort('acquisition.price_amount', -1).limit(1))

[{'name': 'T-Mobile', 'acquisition': {'price_amount': 39000000000}}]

### 14. Order the companies by their `founded year`, retrieving only their `name` and `founded year`.

In [21]:
query= {'founded_year':{'$ne': None}}

filtro = {'name': True, 'founded_year': True, '_id':0}

list(colec.find(query, filtro).sort('founded_year', 1).limit(10))

[{'name': 'US Army', 'founded_year': 1800},
 {'name': 'SmallWorlds', 'founded_year': 1800},
 {'name': 'Alstrasoft', 'founded_year': 1800},
 {'name': 'DuPont', 'founded_year': 1802},
 {'name': 'Bachmann Industries', 'founded_year': 1833},
 {'name': 'McKesson', 'founded_year': 1833},
 {'name': 'Bertelsmann', 'founded_year': 1835},
 {'name': 'Accuity', 'founded_year': 1836},
 {'name': 'CENTRA', 'founded_year': 1839},
 {'name': 'WeGame', 'founded_year': 1840}]

### 15. All the companies that have been founded on the first seven days of the month, including the seventh. Sort them by their `acquisition price` in a descending order. Limit the search to 10 documents.

In [22]:
query= {'founded_day':{'$lte': 7}}

filtro = {'name': True, '_id':0}

list(colec.find(query, filtro).sort('acquisition.price_amount', -1).limit(10))

[{'name': 'Netscape'},
 {'name': 'PayPal'},
 {'name': 'Zappos'},
 {'name': 'Alibaba'},
 {'name': 'Postini'},
 {'name': 'Danger'},
 {'name': 'Clearwell Systems'},
 {'name': 'PrimeSense'},
 {'name': 'Amobee'},
 {'name': 'BlueLithium'}]

### 16. All the companies on the 'web' `category` that have more than 4000 employees. Sort them by the amount of employees in ascending order.

In [23]:
query={'$and': [{'category_code': 'web'},
                {'number_of_employees': {'$gt':4000}}]}

filtro = {'name': True, '_id':0}

list(colec.find(query, filtro).sort('number_of_employees', 1).limit(1))

[{'name': 'Expedia'}]

### 17. All the companies whose acquisition amount is more than 10.000.000, and currency is 'EUR'.

In [24]:
query={'$and': [{'acquisition.price_currency_code': 'EUR'},
                {'acquisition.price_amount': {'$gt':10000000}}]}

filtro = {'name': True, '_id':0}

list(colec.find(query, filtro).limit(1))

[{'name': 'ZYB'}]

### 18. All the companies that have been acquired on the first trimester of the year. Limit the search to 10 companies, and retrieve only their `name` and `acquisition` fields.

In [25]:
query={'$and': [{'acquisition.acquired_month': {'$lte': 3}}]}

filtro = {'name': True, 'founded_month':True, '_id':0}

list(colec.find(query, filtro).limit(10))

[{'name': 'Kyte', 'founded_month': 12},
 {'name': 'NetRatings', 'founded_month': None},
 {'name': 'blogTV', 'founded_month': 5},
 {'name': 'Livestream', 'founded_month': 5},
 {'name': 'iContact', 'founded_month': 7},
 {'name': 'Coghead', 'founded_month': 5},
 {'name': 'Dailymotion', 'founded_month': 3},
 {'name': 'Netvibes', 'founded_month': 9},
 {'name': 'Flickr', 'founded_month': None},
 {'name': 'BabyCenter', 'founded_month': None}]

# Bonus
### 19. All the companies that have been founded between 2000 and 2010, but have not been acquired before 2011.

In [57]:
query={'$and': [{'founded_year': {'$lte': 2010, '$gte': 2000}},
                
        {'$or': [{'acquisition.acquired_year': {'$gt': 2011}, 'acquisition.acquired_year':{'$eq': None}}]}]}

filtro= {'name': True, '_id':0}

list(colec.find(query, filtro).limit(1))


])

[{'name': 'Zoho'}]

### 20. All the companies that have been 'deadpooled' after the third year.

In [36]:


query= {'deadpooled_year':{'$exists': 'true'},'deadpooled_year': {'$ne':None},}

filtro = {'name': True,'deadpooled_year': True,'founded_year':True, '_id':0}



list(colec.find(query, filtro).limit(10))


[{'name': 'Wetpaint', 'founded_year': 2005, 'deadpooled_year': 1},
 {'name': 'AdventNet', 'founded_year': 1996, 'deadpooled_year': 2},
 {'name': 'Zoho', 'founded_year': 2005, 'deadpooled_year': 3},
 {'name': 'Digg', 'founded_year': 2004, 'deadpooled_year': None},
 {'name': 'Facebook', 'founded_year': 2004, 'deadpooled_year': None},
 {'name': 'Omnidrive', 'founded_year': 2005, 'deadpooled_year': 2008},
 {'name': 'Postini', 'founded_year': 1999, 'deadpooled_year': None},
 {'name': 'Geni', 'founded_year': 2006, 'deadpooled_year': None},
 {'name': 'Flektor', 'founded_year': None, 'deadpooled_year': None},
 {'name': 'Fox Interactive Media',
  'founded_year': 1979,
  'deadpooled_year': None}]

In [37]:
import pandas as pd


In [39]:
df= df=pd.DataFrame(colec.find(query, filtro))



Unnamed: 0,name,founded_year,deadpooled_year
0,Wetpaint,2005.0,1.0
1,AdventNet,1996.0,2.0
2,Zoho,2005.0,3.0
3,Digg,2004.0,
4,Facebook,2004.0,


In [60]:

df['deadpooltime']= df.founded_year - df.deadpooled_year

#df.drop([0, 1, 2], axis=1, inplace=True)

df.head()

Unnamed: 0,name,founded_year,deadpooled_year,deadpooltimae,deadpooltime
3,Digg,2004.0,,,
4,Facebook,2004.0,,,
5,Omnidrive,2005.0,2008.0,-3.0,-3.0
6,Postini,1999.0,,,
7,Geni,2006.0,,,
