# Advanced Querying Mongo

Importing libraries and setting up connection

In [69]:
%pip install pymongo




In [2]:
from pymongo import MongoClient
import pandas as pd
client = MongoClient("mongodb://localhost:27017/")


### 1. All the companies whose name match 'Babelgum'. Retrieve only their `name` field.

In [3]:
# Your Code

client.list_database_names()

# Seleccionar la base de datos "companies"
db = client["companies"]

# Acceder a la colección "connection"
collection = db["connection"]

# Craer una query para obtener el nombre "Babelgum" y que solo me devuelva el nombre

query = {"name": "Babelgum"}

select={'_id': False, 'name': True}

res = list(collection.find(query,select).limit(5))

# Convertirlo en un DataFrame

df = pd.DataFrame(res)


df




Unnamed: 0,name
0,Babelgum


### 2. All the companies that have more than 5000 employees. Limit the search to 20 companies and sort them by **number of employees**.

In [4]:
# Your Code

# En collection buscamos las compañías que tengan mas de 5000 empleados, limtamos a 20 resultados y ordenamos por numero de empleados de mayor a menor

query = {"number_of_employees": {"$gt": 5000}}

select={'_id': False, 'name': True, 'number_of_employees': True}

res1 = list(collection.find(query,select).limit(20).sort("number_of_employees", -1))

# Convertirlo en un DataFrame de pandas

df1 = pd.DataFrame(res)

df1




Unnamed: 0,name,number_of_employees
0,Siemens,405000
1,IBM,388000
2,Toyota,320000
3,PayPal,300000
4,Nippon Telegraph and Telephone Corporation,227000
5,Samsung Electronics,221726
6,Accenture,205000
7,Tata Consultancy Services,200300
8,Flextronics International,200000
9,Safeway,186000


### 3. All the companies founded between 2000 and 2005, both years included. Retrieve only the `name` and `founded_year` fields.

In [6]:
# Your Code

# Crear una query para obtener las compañías que fueron fundadadas entre 2000 y 2005 incluidos. Solo necesitamos el nombre y el año de fundación

query = {"founded_year": {"$gte": 2000, "$lte": 2005}}

select={'_id': False, 'name': True, 'founded_year': True}

res2 = list(collection.find(query,select))

#convertirlo en un DataFrame

df2 = pd.DataFrame(res2)

df2



Unnamed: 0,name,founded_year
0,Wetpaint,2005
1,Zoho,2005
2,Digg,2004
3,Facebook,2004
4,Omnidrive,2005
...,...,...
3729,EnterSys Group,2000
3730,Axon Solutions,2004
3731,Intergy,2003
3732,AfterLogic,2002


### 4. All the companies that had a Valuation Amount of more than 100.000.000 and have been founded before 2010. Retrieve only the `name` and `ipo` fields.

In [7]:
# Your Code

# Todas las compañías que tienen un "Valuation Amount" mayor que 100.000.000 y fundadas antes de 2010

query = {"$and": [{"founded_year": {"$lt": 2010}}, {"ipo.valuation_amount": {"$gt": 100000000}}]}

select={'_id': False, 'name': True, 'ipo': True}

res3 = list(collection.find(query,select))

# Convertirlo en un DataFrame

df3 = pd.DataFrame(res3)

df3

Unnamed: 0,name,ipo
0,Facebook,"{'valuation_amount': 104000000000.0, 'valuatio..."
1,Twitter,"{'valuation_amount': 18100000000.0, 'valuation..."
2,Yelp,"{'valuation_amount': 1300000000, 'valuation_cu..."
3,LinkedIn,"{'valuation_amount': 9310000000.0, 'valuation_..."
4,Amazon,"{'valuation_amount': 100000000000.0, 'valuatio..."
5,Brightcove,"{'valuation_amount': 290000000, 'valuation_cur..."
6,KIT digital,"{'valuation_amount': 235000000, 'valuation_cur..."
7,Nielsen,"{'valuation_amount': 1600000000, 'valuation_cu..."
8,OpenTable,"{'valuation_amount': 1050000000, 'valuation_cu..."
9,ChannelAdvisor,"{'valuation_amount': 287000000, 'valuation_cur..."


### 5. All the companies that have less than 1000 employees and have been founded before 2005. Order them by the number of employees and limit the search to 10 companies.

In [8]:
# Your Code

# Todas las compañías que tienen un número de empleados < 1000 y fundadas antes de 2005

query = {"$and": [{"founded_year": {"$lt": 2005}}, {"number_of_employees": {"$lt": 1000}}]}

select={'_id': False, 'name': True, 'number_of_employees': True}

# Ahora ordenarlas por número de empleados y limitar a 10 resultados

res4 = list(collection.find(query,select).sort("number_of_employees", -1).limit(10))

# Convertirlo en un DataFrame

df4 = pd.DataFrame(res4)

df4




Unnamed: 0,name,number_of_employees
0,Infinera Corporation,974
1,NorthPoint Communications Group,948
2,888 Holdings,931
3,Forrester Research,903
4,SonicWALL,900
5,Webmetrics,900
6,Cornerstone OnDemand,881
7,Mozilla,800
8,Buongiorno,800
9,Yelp,800


### 6. All the companies that don't include the `partners` field.

In [11]:
# Your Code
# Creamos una query para ver las compañías que no incluyan el campo "partners"

query = {"partners": {"$exists": False}}

select={'_id': False, 'name': True}

res5 = list(collection.find(query,select))

res5



[]

In [13]:
# Ahora buscamos las compañías que tengan la lista de "partners" vacía

query = {"partners": {"$ne":[]}}

select={'_id': False, 'name': True}

res6 = list(collection.find(query,select))

# Convertirlo en un DataFrame

df6 = pd.DataFrame(res6)

df6

Unnamed: 0,name
0,Google
1,Parakey
2,boo-box
3,Echo
4,Pickle
...,...
149,Ethicle
150,Wikitude
151,Digital China Information Technology Services ...
152,Teambox


### 7. All the companies that have a null type of value on the `category_code` field.

In [14]:
# Your Code

# Creamos una query para las compañias que tengan null el campo "category_code"

query = {"category_code": {"$eq": None}}

select={'_id': False, 'name': True, 'category_code': True}

res7 = list(collection.find(query,select))

# Creamos un DataFrame

df7 = pd.DataFrame(res7)

df7

Unnamed: 0,name,category_code
0,Collective,
1,Snimmer,
2,KoolIM,
3,Level9 Media,
4,VidKing,
...,...,...
2746,Nellix,
2747,Cantimer,
2748,cruisecritic,
2749,Coloroot,


### 8. All the companies that have at least 100 employees but less than 1000. Retrieve only the `name` and `number of employees` fields.

In [20]:
# Your Code

# Creamos una query para las compañias que tengan >= 100 empleados y < 1000

query = {"$and": [{"number_of_employees": {"$gte": 100}}, {"number_of_employees": {"$lt": 1000}}]}

select={'_id': False, 'name': True, 'number_of_employees': True}

res8 = list(collection.find(query,select).sort("number_of_employees", -1))

# Creamos un DataFrame

df8 = pd.DataFrame(res8)

df8

Unnamed: 0,name,number_of_employees
0,Datamonitor,984
1,Infinera Corporation,974
2,Box,950
3,NorthPoint Communications Group,948
4,888 Holdings,931
...,...,...
912,ZoomSystems,100
913,Exent,100
914,Mashable,100
915,Applied Language Solutions,100


### 9. Order all the companies by their IPO price in a descending order.

In [17]:
# Your Code

# Creamos una query para todas las compañias por IPO price y ordenamos de mayor a menor

query = {'$and': [{'ipo.valuation_amount': {'$gt': 100000000}}, {'founded_year': {'$lt': 2010}}]}

select = {'_id': False, 'name': True, 'ipo.valuation_amount': True}

res9 = list(collection.find(query,select).sort('ipo.valuation_amount', -1))

# Creamos un DataFrame

df9 = pd.DataFrame(res9)

df9

Unnamed: 0,name,ipo
0,GREE,{'valuation_amount': 108960000000.0}
1,Facebook,{'valuation_amount': 104000000000.0}
2,Amazon,{'valuation_amount': 100000000000.0}
3,Twitter,{'valuation_amount': 18100000000.0}
4,Groupon,{'valuation_amount': 12800000000.0}
5,Tencent,{'valuation_amount': 11000000000.0}
6,Western Digital,{'valuation_amount': 9430000000.0}
7,LinkedIn,{'valuation_amount': 9310000000.0}
8,BMC Software,{'valuation_amount': 6000000000.0}
9,Rackspace,{'valuation_amount': 5440000000.0}


### 10. Retrieve the 10 companies with more employees, order by the `number of employees`

In [18]:
# Your Code

# Creamos una query para las compañias con mayor numero de empleados y ordenamos de mayor a menor y limitamos a 10 resultados

query = {"number_of_employees": {"$gt": 1000}}

select={'_id': False, 'name': True, 'number_of_employees': True}

res10 = list(collection.find(query,select).sort("number_of_employees", -1).limit(10))

# Creamos un DataFrame

df10 = pd.DataFrame(res10)

df10



Unnamed: 0,name,number_of_employees
0,Siemens,405000
1,IBM,388000
2,Toyota,320000
3,PayPal,300000
4,Nippon Telegraph and Telephone Corporation,227000
5,Samsung Electronics,221726
6,Accenture,205000
7,Tata Consultancy Services,200300
8,Flextronics International,200000
9,Safeway,186000


### 11. All the companies founded on the second semester of the year. Limit your search to 1000 companies.

In [22]:
# Your Code

# Creamos una query para compañias fundad en el los ultimos seis meses de cada año

query = {"founded_month": {"$gte": 6}}

select={'_id': False, 'name': True, 'founded_month': True}

# Limitamos a 1000 resultados

res11 = list(collection.find(query,select).limit(1000))

# Creamos un DataFrame

df11 = pd.DataFrame(res11)

df11

Unnamed: 0,name,founded_month
0,Wetpaint,10
1,Zoho,9
2,Digg,10
3,Omnidrive,11
4,Postini,6
...,...,...
995,Openfilm,11
996,uCubd,9
997,MyGreat,7
998,SquareClock,12


### 12. All the companies founded before 2000 that have an acquisition amount of more than 10.000.00

In [24]:
# Your Code

# Creamos una query para las compañias fundadas antes de 2000 que tengan un acquisition amount mayor que 10.000.000

query = {"$and": [{"founded_year": {"$lt": 2000}}, {"acquisition.price_amount": {"$gt": 10000000}}]}

select={'_id': False, 'name': True, 'acquisition.price_amount': True}

res12 = list(collection.find(query,select))

# Creamos un DataFrame

df12 = pd.DataFrame(res12)

df12

Unnamed: 0,name,acquisition
0,Postini,{'price_amount': 625000000}
1,SideStep,{'price_amount': 180000000}
2,Recipezaar,{'price_amount': 25000000}
3,PayPal,{'price_amount': 1500000000}
4,Snapfish,{'price_amount': 300000000}
...,...,...
200,Savvion,{'price_amount': 49000000}
201,Inventa Technologies,{'price_amount': 30000000}
202,Universal Microwave,{'price_amount': 23200000}
203,Advanced Control Components,{'price_amount': 18780000}


### 13. All the companies that have been acquired after 2010, order by the acquisition amount, and retrieve only their `name` and `acquisition` field.

In [25]:
# Your Code

# Creamos una query que han sido adquiridas despues de 2010

query = {"acquisition.acquired_year": {"$gt": 2010}}

select={'_id': False, 'name': True, 'acquisition.acquired_year': True}

# Las ordenamos por acquisition amount de mayor a menor

res13 = list(collection.find(query,select).sort("acquisition.price_amount", -1))

# Creamos un DataFrame

df13 = pd.DataFrame(res13)

df13

Unnamed: 0,name,acquisition
0,T-Mobile,{'acquired_year': 2011}
1,Goodrich Corporation,{'acquired_year': 2011}
2,LSI,{'acquired_year': 2013}
3,National Semiconductor,{'acquired_year': 2011}
4,Ariba,{'acquired_year': 2012}
...,...,...
731,MediaPal,{'acquired_year': 2011}
732,Vertro,{'acquired_year': 2012}
733,ALOT,{'acquired_year': 2012}
734,Celestial Semiconductor,{'acquired_year': 2011}


### 14. Order the companies by their `founded year`, retrieving only their `name` and `founded year`.

In [26]:
# Your Code

# Creamos una query para ordenar las compañias por su founded year de mayor a menor

query = {"founded_year": {"$exists": True}}

select={'_id': False, 'name': True, 'founded_year': True}

res14 = list(collection.find(query, select).sort("founded_year", -1))

# Creamos un DataFrame

df14 = pd.DataFrame(res14)

df14

Unnamed: 0,name,founded_year
0,Fixya,2013.0
1,Wamba,2013.0
2,Advaliant,2013.0
3,Fluc,2013.0
4,iBazar,2013.0
...,...,...
18796,Embedster,
18797,Willdan Group,
18798,Geekdive,
18799,goBookmaker,


### 15. All the companies that have been founded on the first seven days of the month, including the seventh. Sort them by their `acquisition price` in a descending order. Limit the search to 10 documents.

In [28]:
# Your Code

# Creamos una query para las compañias que hayan sido fundadas en los 7 primeros dias del mes e incluido el dia 7

query = {"founded_day": {"$lte": 7}}

select={'_id': False, 'name': True, 'founded_day': True}

# Las ordenaamos aquisition price de mayor a menor y limitamos a 10 resultados

res15 = list(collection.find(query,select).sort("acquisition.price_amount", -1).limit(10))

# Creamos un DataFrame

df15 = pd.DataFrame(res15)

df15

Unnamed: 0,name,founded_day
0,Netscape,4
1,PayPal,1
2,Zappos,1
3,Alibaba,1
4,Postini,2
5,Danger,1
6,Clearwell Systems,6
7,PrimeSense,1
8,Amobee,1
9,BlueLithium,1


### 16. All the companies on the 'web' `category` that have more than 4000 employees. Sort them by the amount of employees in ascending order.

In [30]:
# Your Code

# Creamos una query para las compañias que estan en la categoria web y tienen mas de 4000 empleados

query = {"$and": [{"category_code": "web"}, {"number_of_employees": {"$gt": 4000}}]}

select={'_id': False, 'name': True, 'category_code': True, 'number_of_employees': True}
# Las ordenamos por numero de empleados de mayor a menor

res16 = list(collection.find(query,select).sort("number_of_employees", -1))

# Creamos un DataFrame

df16 = pd.DataFrame(res16)

df16

Unnamed: 0,name,category_code,number_of_employees
0,Experian,web,15500
1,eBay,web,15000
2,Yahoo!,web,13600
3,Rakuten,web,10000
4,Los Angeles Times Media Group,web,10000
5,Groupon,web,10000
6,Webkinz,web,8657
7,AOL,web,8000
8,Expedia,web,4400


### 17. All the companies whose acquisition amount is more than 10.000.000, and currency is 'EUR'.

In [35]:
# Your Code

# Creamos una query para las compañias que tengan un valor de adquisicion mayor que 10.000.000 y la currency sea EUR

query = {"$and": [{"acquisition.price_amount": {"$gt": 10000000}}, {"acquisition.price_currency_code": "EUR"}]}

select={'_id': False, 'name': True, 'acquisition.price_amount': True, 'acquisition.price_currency_code': True}

res17= list(collection.find(query,select))

# Creamos un DataFrame

df17 = pd.DataFrame(res17)

# Importar dtale

import dtale

dtale.show(df17)



### 18. All the companies that have been acquired on the first trimester of the year. Limit the search to 10 companies, and retrieve only their `name` and `acquisition` fields.

In [37]:
# Your Code

# Creamos una query para las compañias que hayan sido adquiridas en los tres primeros meses del año

query = {"acquisition.acquired_month": {"$lte": 3}}

select={'_id': False, 'name': True, 'acquisition.acquired_month': True}

# Limitamos a 10 resultados

res18 = list(collection.find(query,select).limit(10))

# Creamos un DataFrame

df18 = pd.DataFrame(res18)

df18



Unnamed: 0,name,acquisition
0,Kyte,{'acquired_month': 1}
1,NetRatings,{'acquired_month': 2}
2,blogTV,{'acquired_month': 3}
3,Livestream,{'acquired_month': 1}
4,iContact,{'acquired_month': 2}
5,Coghead,{'acquired_month': 2}
6,Dailymotion,{'acquired_month': 2}
7,Netvibes,{'acquired_month': 2}
8,Flickr,{'acquired_month': 3}
9,BabyCenter,{'acquired_month': 3}


# Bonus
### 19. All the companies that have been founded between 2000 and 2010, but have not been acquired before 2011.

In [38]:
# Your Code

# Creamos una query para las compañias fundadas entre 2000 y 2010 pero han sido adquiridas antes de 2011

query = {"$and": [{"founded_year": {"$gte": 2000}}, {"founded_year": {"$lte": 2010}}, {"acquisition.acquired_year": {"$lt": 2011}}]}

select={'_id': False, 'name': True, 'founded_year': True, 'acquisition.acquired_year': True}

res19 = list(collection.find(query,select))

# Creamos un DataFrame

df19 = pd.DataFrame(res19)

df19

Unnamed: 0,name,founded_year,acquisition
0,StumbleUpon,2002,{'acquired_year': 2009}
1,Gizmoz,2003,{'acquired_year': 2009}
2,Helio,2005,{'acquired_year': 2008}
3,Joost,2006,{'acquired_year': 2009}
4,Plaxo,2002,{'acquired_year': 2008}
...,...,...,...
708,Edgeos,2001,{'acquired_year': 2008}
709,Mu-Gahat Enterprises,2005,{'acquired_year': 2009}
710,EnterSys Group,2000,{'acquired_year': 2009}
711,Intergy,2003,{'acquired_year': 2009}


### 20. All the companies that have been 'deadpooled' after the third year.

In [39]:
# Your Code

Unnamed: 0,name,deadpooled_year
0,Omnidrive,2008
1,Babelgum,2013
2,Sparter,2008
3,Thoof,2013
4,Mercora,2008
...,...,...
921,tribalX,2010
922,Prolify,2007
923,Advanced Power Projects,2011
924,OfficialVirtualDJ,2009
