# Advanced Querying Mongo

Importing libraries and setting up connection

In [1]:
from pymongo import MongoClient
client = MongoClient("mongodb://localhost:27017")
import pandas as pd

### 1. All the companies whose name match 'Babelgum'. Retrieve only their `name` field.

In [25]:
db=client.companies
db

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'companies')

In [28]:
colec = db.colection

In [29]:
list(colec.find().limit(1))

[{'_id': ObjectId('52cdef7c4bab8bd675297d8a'),
  'name': 'Wetpaint',
  'permalink': 'abc2',
  'crunchbase_url': 'http://www.crunchbase.com/company/wetpaint',
  'homepage_url': 'http://wetpaint-inc.com',
  'blog_url': 'http://digitalquarters.net/',
  'blog_feed_url': 'http://digitalquarters.net/feed/',
  'twitter_username': 'BachelrWetpaint',
  'category_code': 'web',
  'number_of_employees': 47,
  'founded_year': 2005,
  'founded_month': 10,
  'founded_day': 17,
  'deadpooled_year': 1,
  'tag_list': 'wiki, seattle, elowitz, media-industry, media-platform, social-distribution-system',
  'alias_list': '',
  'email_address': 'info@wetpaint.com',
  'phone_number': '206.859.6300',
  'description': 'Technology Platform Company',
  'created_at': datetime.datetime(2007, 5, 25, 6, 51, 27),
  'updated_at': 'Sun Dec 08 07:15:44 UTC 2013',
  'overview': '<p>Wetpaint is a technology platform company that uses its proprietary state-of-the-art technology and expertise in social media to build and mon

In [30]:
query = {'name':'Babelgum'}

select={'name': 1, '_id': 0}

res = list(colec.find(query, select))

res

[{'name': 'Babelgum'}]

### 2. All the companies that have more than 5000 employees. Limit the search to 20 companies and sort them by **number of employees**.

In [35]:
query = {'number_of_employees':{'$gt':5000}}

select={'name': 1,'number_of_employees':1, '_id': 0}

res = colec.find(query, select).sort('number_of_employees', -1).limit(20)

pd.DataFrame(res)

Unnamed: 0,name,number_of_employees
0,Siemens,405000
1,IBM,388000
2,Toyota,320000
3,PayPal,300000
4,Nippon Telegraph and Telephone Corporation,227000
5,Samsung Electronics,221726
6,Accenture,205000
7,Tata Consultancy Services,200300
8,Flextronics International,200000
9,Safeway,186000


### 3. All the companies founded between 2000 and 2005, both years included. Retrieve only the `name` and `founded_year` fields.

In [37]:
query = {'$and':[{'founded_year':{'$gte':2000}},
               {'founded_year':{'$lte':2005}}]}

select={'name': 1,'founded_year':1, '_id': 0}

res = colec.find(query, select).sort('founded_year', -1)

pd.DataFrame(res)

Unnamed: 0,name,founded_year
0,Wetpaint,2005
1,Zoho,2005
2,Omnidrive,2005
3,Helio,2005
4,Jingle Networks,2005
...,...,...
3729,Vigilos,2000
3730,Block Shield,2000
3731,Netrake,2000
3732,EnterSys Group,2000


### 4. All the companies that had a Valuation Amount of more than 100.000.000 and have been founded before 2010. Retrieve only the `name` and `ipo` fields.

In [53]:
query = {'$and':[{'ipo.valuation_amount':{'$gt':10e8}},
               {'founded_year':{'$lt':2010}}]}

select={'name': 1,'ipo.valuation_amount':1, '_id': 0}

res = colec.find(query, select).sort('ipo.valuation_amount', -1).limit(5)

pd.DataFrame(res)

Unnamed: 0,name,ipo
0,GREE,{'valuation_amount': 108960000000.0}
1,Facebook,{'valuation_amount': 104000000000.0}
2,Amazon,{'valuation_amount': 100000000000.0}
3,Twitter,{'valuation_amount': 18100000000.0}
4,Groupon,{'valuation_amount': 12800000000.0}


### 5. All the companies that have less than 1000 employees and have been founded before 2005. Order them by the number of employees and limit the search to 10 companies.

In [8]:
query = {'$and':[{'ipo.valuation_amount':{'$gt':10e8}},
               {'founded_year':{'$lt':2010}}]}

select={'name': 1,'ipo.valuation_amount':1, '_id': 0}

res = colec.find(query, select).sort('ipo.valuation_amount', -1)

pd.DataFrame(res)

### 6. All the companies that don't include the `partners` field.

In [50]:
query = {'$nor':[{'partners':{'$gte':1}}]}

select={'name': 1,'partners':1, '_id': 0}

res = colec.find(query, select)

pd.DataFrame(res)

Unnamed: 0,name,partners
0,Wetpaint,[]
1,AdventNet,[]
2,Zoho,[]
3,Digg,[]
4,Facebook,[]
...,...,...
18796,Adhunk,[]
18797,AfterLogic,[]
18798,goBookmaker,[]
18799,EnteGreat Solutions,[]


### 7. All the companies that have a null type of value on the `category_code` field.

In [52]:
query = {'category_code':None}

select={'name': 1,'category_code':1, '_id': 0}

res = colec.find(query, select)

pd.DataFrame(res)

Unnamed: 0,name,category_code
0,Collective,
1,Snimmer,
2,KoolIM,
3,Level9 Media,
4,VidKing,
...,...,...
2746,Nellix,
2747,Cantimer,
2748,cruisecritic,
2749,Coloroot,


### 8. All the companies that have at least 100 employees but less than 1000. Retrieve only the `name` and `number of employees` fields.

In [54]:
query = {'$and':[{'number_of_employees':{'$gte':100}},
               {'number_of_employees':{'$lte':1000}}]}

select={'name': 1,'number_of_employees':1, '_id': 0}

res = colec.find(query, select).sort('founded_year', -1)

pd.DataFrame(res)

Unnamed: 0,name,number_of_employees
0,Wamba,120
1,Social Gaming Network,100
2,4shared,666
3,Magento,275
4,Integrate,120
...,...,...
937,Netcentives,496
938,Galam,250
939,COA Solutions,500
940,Elite Advanced Laser Corporation,320


### 9. Order all the companies by their IPO price in a descending order.

In [67]:
query = {}

select={'name': 1,'ipo.valuation_amount':1, '_id': 0}

res = colec.find(query, select).sort('ipo.valuation_amount', -1)

pd.DataFrame(res)

Unnamed: 0,name,ipo
0,GREE,{'valuation_amount': 108960000000.0}
1,Facebook,{'valuation_amount': 104000000000.0}
2,Amazon,{'valuation_amount': 100000000000.0}
3,Twitter,{'valuation_amount': 18100000000.0}
4,Groupon,{'valuation_amount': 12800000000.0}
...,...,...
18796,Adhunk,
18797,AfterLogic,
18798,goBookmaker,
18799,EnteGreat Solutions,


### 10. Retrieve the 10 companies with more employees, order by the `number of employees`

In [60]:
query = {}

select={'name': 1,'number_of_employees':1, '_id': 0}

res = colec.find(query, select).sort('number_of_employees', -1).limit(10)

pd.DataFrame(res)

Unnamed: 0,name,number_of_employees
0,Siemens,405000
1,IBM,388000
2,Toyota,320000
3,PayPal,300000
4,Nippon Telegraph and Telephone Corporation,227000
5,Samsung Electronics,221726
6,Accenture,205000
7,Tata Consultancy Services,200300
8,Flextronics International,200000
9,Safeway,186000


### 11. All the companies founded on the second semester of the year. Limit your search to 1000 companies.

In [65]:
query = {'founded_month':{'$gte':7}}

select={'name': 1,'founded_month':1, '_id': 0}

res = colec.find(query, select).limit(1000)

pd.DataFrame(res)

Unnamed: 0,name,founded_month
0,Wetpaint,10
1,Zoho,9
2,Digg,10
3,Omnidrive,11
4,eBay,9
...,...,...
995,LingusTV,12
996,AccountMaven,9
997,Chapatiz,10
998,YOOWALK,10


### 12. All the companies founded before 2000 that have an acquisition amount of more than 10.000.00

In [68]:
query = {'$and':[{'founded_year':{'$lt':2000}},
               {'acquisition.price_amount':{'$gt':10e6}}]}

select={'name': 1,'founded_year':1, 'acquisition.price_amount':1,'_id': 0}

res = colec.find(query, select).sort('acquisition.price_amount', -1)

pd.DataFrame(res)

Unnamed: 0,name,founded_year,acquisition
0,BEA Systems,1995,{'price_amount': 8500000000.0}
1,Navteq,1985,{'price_amount': 8100000000.0}
2,Sun Microsystems,1982,{'price_amount': 7400000000.0}
3,Pixar,1986,{'price_amount': 7400000000.0}
4,LSI,1980,{'price_amount': 6600000000.0}
...,...,...,...
200,Connect3 Systems,1993,{'price_amount': 13450000}
201,Litmus Media,1999,{'price_amount': 13000000}
202,Litmus Media,1999,{'price_amount': 13000000}
203,MIVA,1999,{'price_amount': 11600000}


### 13. All the companies that have been acquired after 2010, order by the acquisition amount, and retrieve only their `name` and `acquisition` field.

In [69]:
query = {'acquisition.acquired_year':{'$gt':2010}}

select={'name': 1, 'acquisition':1,'_id': 0}

res = colec.find(query, select).sort('acquisition.price_amount', -1)

pd.DataFrame(res)

Unnamed: 0,name,acquisition
0,T-Mobile,"{'price_amount': 39000000000.0, 'price_currenc..."
1,Goodrich Corporation,"{'price_amount': 18400000000.0, 'price_currenc..."
2,LSI,"{'price_amount': 6600000000.0, 'price_currency..."
3,National Semiconductor,"{'price_amount': 6500000000.0, 'price_currency..."
4,Ariba,"{'price_amount': 4300000000.0, 'price_currency..."
...,...,...
731,MediaPal,"{'price_amount': None, 'price_currency_code': ..."
732,Vertro,"{'price_amount': None, 'price_currency_code': ..."
733,ALOT,"{'price_amount': None, 'price_currency_code': ..."
734,Celestial Semiconductor,"{'price_amount': None, 'price_currency_code': ..."


### 14. Order the companies by their `founded year`, retrieving only their `name` and `founded year`.

In [70]:
query = {}

select={'name': 1, 'acquisition.acquired_year':1,'_id': 0}

res = colec.find(query, select).sort('acquisition.acquired_year', -1)

pd.DataFrame(res)

Unnamed: 0,name,acquisition
0,Sense Networks,{'acquired_year': 2014}
1,Nullsoft,{'acquired_year': 2014}
2,Alverix,{'acquired_year': 2014}
3,Wetpaint,{'acquired_year': 2013}
4,blogTV,{'acquired_year': 2013}
...,...,...
18796,Oriact,
18797,Adhunk,
18798,AfterLogic,
18799,goBookmaker,


### 15. All the companies that have been founded on the first seven days of the month, including the seventh. Sort them by their `acquisition price` in a descending order. Limit the search to 10 documents.

In [72]:
query = {'acquisition.acquired_day':{'$lte':7}}

select={'name': 1, 'acquisition.acquired_day':1,'acquisition.price_amount':1,'_id': 0}

res = colec.find(query, select).sort('acquisition.price_amount', -1).limit(10)

pd.DataFrame(res)

Unnamed: 0,name,acquisition
0,National Semiconductor,"{'price_amount': 6500000000.0, 'acquired_day': 4}"
1,The Weather Channel,"{'price_amount': 3500000000.0, 'acquired_day': 7}"
2,Interactive Data,"{'price_amount': 3400000000.0, 'acquired_day': 4}"
3,Macromedia,"{'price_amount': 3400000000.0, 'acquired_day': 3}"
4,Qualcomm Atheros,"{'price_amount': 3100000000.0, 'acquired_day': 5}"
5,ExactTarget,"{'price_amount': 2500000000.0, 'acquired_day': 4}"
6,Quest Software,"{'price_amount': 2400000000.0, 'acquired_day': 2}"
7,Legent Corporation,"{'price_amount': 1740000000, 'acquired_day': 1}"
8,Digital Insight,"{'price_amount': 1650000000, 'acquired_day': 2}"
9,Equallogic,"{'price_amount': 1400000000, 'acquired_day': 5}"


### 16. All the companies on the 'web' `category` that have more than 4000 employees. Sort them by the amount of employees in ascending order.

In [75]:
query =  {'$and':[{'category_code':'web'},
               {'number_of_employees':{'$gt':4000}}]}

select={'name': 1, 'category_code':1,'number_of_employees':1,'_id': 0}

res = colec.find(query, select).sort('number_of_employees', 1).limit(10)

pd.DataFrame(res)

Unnamed: 0,name,category_code,number_of_employees
0,Expedia,web,4400
1,AOL,web,8000
2,Webkinz,web,8657
3,Rakuten,web,10000
4,Los Angeles Times Media Group,web,10000
5,Groupon,web,10000
6,Yahoo!,web,13600
7,eBay,web,15000
8,Experian,web,15500


### 17. All the companies whose acquisition amount is more than 10.000.000, and currency is 'EUR'.

In [83]:
query =   {'$and':[{'acquisition.price_amount':{'$gt':10e7}},
               {'acquisition.price_currency_code':'EUR'}]}

select={'name': 1, 'acquisition.price_amount':1,'acquisition.price_currency_code':1,'_id': 0}

res = colec.find(query, select).sort('acquisition.price_amount', -1).limit(10)

pd.DataFrame(res)

Unnamed: 0,name,acquisition
0,Apertio,"{'price_amount': 140000000, 'price_currency_co..."


### 18. All the companies that have been acquired on the first trimester of the year. Limit the search to 10 companies, and retrieve only their `name` and `acquisition` fields.

In [84]:
query = {'acquisition.acquired_month':{'$lte':3}}

select={'name': 1, 'acquisition':1,'_id': 0}

res = colec.find(query, select).limit(10)

pd.DataFrame(res)

Unnamed: 0,name,acquisition
0,Kyte,"{'price_amount': None, 'price_currency_code': ..."
1,NetRatings,"{'price_amount': 327000000, 'price_currency_co..."
2,blogTV,"{'price_amount': None, 'price_currency_code': ..."
3,Livestream,"{'price_amount': None, 'price_currency_code': ..."
4,iContact,"{'price_amount': 169000000, 'price_currency_co..."
5,Coghead,"{'price_amount': None, 'price_currency_code': ..."
6,Dailymotion,"{'price_amount': 168000000, 'price_currency_co..."
7,Netvibes,"{'price_amount': None, 'price_currency_code': ..."
8,Flickr,"{'price_amount': None, 'price_currency_code': ..."
9,BabyCenter,"{'price_amount': None, 'price_currency_code': ..."


# Bonus
### 19. All the companies that have been founded between 2000 and 2010, but have not been acquired before 2011.

In [85]:
query = {'$and':[{'founded_year':{'$gte':2000}},
               {'founded_year':{'$lte':2010}},
                {'acquisition.acquired_year':{'$gte':2011}}]}


select={'name': 1,'founded_year':1, 'acquisition.acquired_year':1,'_id': 0}

res = colec.find(query, select).sort('founded_year', -1)

pd.DataFrame(res)

Unnamed: 0,name,founded_year,acquisition
0,Magento,2010,{'acquired_year': 2011}
1,Trunkt,2009,{'acquired_year': 2012}
2,HyperWeek,2009,{'acquired_year': 2013}
3,Shutl,2009,{'acquired_year': 2013}
4,Honk,2009,{'acquired_year': 2011}
...,...,...,...
481,Colibria,2000,{'acquired_year': 2011}
482,PopCap Games,2000,{'acquired_year': 2011}
483,Moonfruit,2000,{'acquired_year': 2012}
484,Telx,2000,{'acquired_year': 2011}


### 20. All the companies that have been 'deadpooled' after the third year.

In [87]:
# sin terminar
query = {'$expr':{'deadpooled_year', 'founded_year'}}
select={'name': 1,'deadpooled_year':1, 'founded_year':1,'_id': 0}
res = list(colec.find(query, select))

pd.DataFrame(res)

InvalidDocument: cannot encode object: {'founded_year', 'deadpooled_year'}, of type: <class 'set'>