# 4.1 - MongoDB

![mongo](images/mongodb.jpeg)

Instalación de MongoDB Community Server: https://docs.mongodb.com/manual/administration/install-community/

Instalación de MongoDB Compass (interfaz gráfica): https://docs.mongodb.com/compass/current/install/

MongoDB es una base de datos orientada a documentos. Esto quiere decir que en lugar de guardar los datos en registros, guarda los datos en documentos. Estos documentos son almacenados en BSON, que es una representación binaria de JSON.

Una de las diferencias más importantes con respecto a las bases de datos relacionales, es que no es necesario seguir un esquema. Los documentos de una misma colección, concepto similar a una tabla de una base de datos relacional, pueden tener esquemas diferentes.


In [1]:
%pip install pymongo

Note: you may need to restart the kernel to use updated packages.


In [2]:
from pymongo import MongoClient

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Connection string for a MongoDB server on this machine, port 27017.
str_conn='mongodb://localhost:27017'

# NOTE: despite its name, `cursor` holds the MongoClient (the connection
# handle), not a query cursor; later cells use it to reach databases.
cursor=MongoClient(str_conn)

# Bare expression so the notebook displays the client's repr.
cursor

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True)

In [4]:
cursor.list_database_names()   # lista de bases de datos

['admin',
 'arbbot',
 'campus_tools',
 'companies',
 'complete_exchange_matches',
 'config',
 'ironcomes',
 'local',
 'telegram_messages',
 'total_records']

In [5]:
# Attribute access only returns a handle to `nueva_db`: the database listing
# right after this cell still lacks it, and it first shows up once a document
# has been inserted.
db=cursor.nueva_db   # new database

In [6]:
cursor.list_database_names() 

['admin',
 'arbbot',
 'campus_tools',
 'companies',
 'complete_exchange_matches',
 'config',
 'ironcomes',
 'local',
 'telegram_messages',
 'total_records']

In [7]:
# Handle to the `n_colec` collection; list_collection_names() stays empty
# until the first document is inserted into it.
colec=db.n_colec   # new collection

In [8]:
db.list_collection_names() 

[]

In [9]:
dictio={'nombre': 'Pepe', 'edad': 35, '4': [0, 1, 2 ,3]}

colec.insert_one(dictio)

<pymongo.results.InsertOneResult at 0x1067cff10>

In [10]:
cursor.list_database_names() 

['admin',
 'arbbot',
 'campus_tools',
 'companies',
 'complete_exchange_matches',
 'config',
 'ironcomes',
 'local',
 'nueva_db',
 'telegram_messages',
 'total_records']

In [11]:
db.list_collection_names() 

['n_colec']

In [12]:
# Insert several documents at once.
#
# SQL equivalent: insert into colec (columns) values (values);
#
# The list is named `documentos` rather than `json` so that it does not
# shadow the standard-library `json` module name.
documentos=[{'nombre': 'yo', 'edad': 38},
            {'_id': 20, 'actividad': 'natacion', 'lugar': 'aqui'}]   # explicit _id instead of a generated ObjectId

colec.insert_many(documentos)

<pymongo.results.InsertManyResult at 0x105dca850>

In [13]:
# Second batch of documents; named `documentos` (not `json`) so that it does
# not shadow the standard-library `json` module name.
documentos=[{'nombre': 'Ana', 'edad': 24},
            {'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'}]


colec.insert_many(documentos)

<pymongo.results.InsertManyResult at 0x1067cf5b0>

In [14]:
# select * from colec;

colec.find()

<pymongo.cursor.Cursor at 0x1067f8700>

In [15]:
list(colec.find())

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38},
 {'_id': 20, 'actividad': 'natacion', 'lugar': 'aqui'},
 {'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 24},
 {'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'}]

In [16]:
a=list(colec.find())

In [17]:
a[4]

{'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'}

In [18]:
a[0]['4']

[0, 1, 2, 3]

### Queries

In [19]:
list(colec.find())   # select * from colec;

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38},
 {'_id': 20, 'actividad': 'natacion', 'lugar': 'aqui'},
 {'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 24},
 {'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'}]

In [20]:
list(colec.find({'edad': 24}))   # select * from colec where edad=24;

[{'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 24}]

In [21]:
list(colec.find({'nombre': 'yo'}))   # select * from colec where nombre='yo';

[{'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38}]

In [22]:
list(colec.find({'edad': {'$gt': 24}}))   # select * from colec where edad>24;

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38}]

In [23]:
list(colec.find({'edad': {'$gte': 24}}))   # select * from colec where edad>=24;

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38},
 {'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 24}]

In [24]:
list(colec.find({'edad': {'$lt': 38}}))   # select * from colec where edad<38;

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 24}]

In [25]:
list(colec.find({'edad': {'$lte': 35}}))   # select * from colec where edad<=35;

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 24}]

In [26]:
list(colec.find({'edad': {'$ne': 38}}))   # select * from colec where edad!=38;

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': 20, 'actividad': 'natacion', 'lugar': 'aqui'},
 {'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 24},
 {'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'}]

In [27]:
# Both conditions must hold: edad differs from 38 AND nombre equals 'Pepe'.
list(colec.find({'$and':[{'edad': {'$ne': 38}}, 
                         {'nombre': 'Pepe'}]}))   

# select * from colec where edad!=38 and nombre='Pepe';

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]}]

In [28]:
# Open interval: edad strictly greater than 24 and strictly less than 38.
list(colec.find({'$and':[{'edad': {'$lt': 38}}, 
                         {'edad': {'$gt': 24}}]}))   

# select * from colec where edad<38 and edad>24;
# Not the same as `edad between 24 and 38`: SQL BETWEEN is written
# `low AND high` and includes both endpoints, whereas $gt/$lt exclude 24 and 38.

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]}]

In [29]:
query={'$and':[{'edad': {'$lt': 38}}, 
               {'edad': {'$gt': 24}}]}

list(colec.find(query))  

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]}]

In [30]:
query={'edad': {'$ne': 24}}

list(colec.find(query).limit(3))  

# select * from colec where edad!=24 limit 3;

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38},
 {'_id': 20, 'actividad': 'natacion', 'lugar': 'aqui'}]

In [31]:
query={'edad': {'$ne': 24}}

# sort('edad', 1) orders ascending. The $ne filter also matches documents that
# have no `edad` field, and those come first in the ascending order (the two
# `natacion` documents lead the output).
list(colec.find(query).sort('edad', 1).limit(3))   

# select * from colec where edad!=24 order by edad asc limit 3;

[{'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'},
 {'_id': 20, 'actividad': 'natacion', 'lugar': 'aqui'},
 {'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]}]

In [32]:
colec.update_one({'_id': 20}, {'$set': {'actividad': 'atletismo'}})

<pymongo.results.UpdateResult at 0x10680de20>

In [33]:
list(colec.find({'_id': 20}))

[{'_id': 20, 'actividad': 'atletismo', 'lugar': 'aqui'}]

In [34]:
colec.update_many({'nombre': 'Ana'}, {'$set': {'edad': 1}})

<pymongo.results.UpdateResult at 0x1068076a0>

In [35]:
list(colec.find())

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38},
 {'_id': 20, 'actividad': 'atletismo', 'lugar': 'aqui'},
 {'_id': ObjectId('63639f3a822e82b27e3fa23e'), 'nombre': 'Ana', 'edad': 1},
 {'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'}]

In [36]:
# select count(*) from colec;
# The group key is the constant '' (not the `_id` field), so every document
# falls into one single group and {'$sum': 1} adds 1 per document: the result
# is the total number of documents, not one row per _id.

list(colec.aggregate([{'$group': {'_id': '', 'count': {'$sum': 1}}}]))

[{'_id': '', 'count': 5}]

In [37]:
colec.delete_one({'_id': 20})

# delete from colec where _id=20;

<pymongo.results.DeleteResult at 0x10680deb0>

In [38]:
colec.delete_many({'nombre': 'Ana'})

# delete from colec where nombre='Ana';

<pymongo.results.DeleteResult at 0x106807310>

In [39]:
list(colec.find())

[{'_id': ObjectId('63639f3a822e82b27e3fa23c'),
  'nombre': 'Pepe',
  'edad': 35,
  '4': [0, 1, 2, 3]},
 {'_id': ObjectId('63639f3a822e82b27e3fa23d'), 'nombre': 'yo', 'edad': 38},
 {'_id': 24, 'actividad': 'natacion', 'hora': '22:00', 'lugar': 'aculla'}]

In [40]:
colec.drop()

In [41]:
db.list_collection_names()

[]

### DB Companies

In [42]:
db=cursor.companies

In [43]:
colec=db.companies

In [44]:
#list(colec.find().limit(1))

In [45]:
# select name, category_code from colec where category_code in ('web', 'ecommerce') limit 10;

query={'$or': [{'category_code': 'web'},
               {'category_code': 'ecommerce'}]}

filtro={'name': True, 'category_code': True, '_id': False}


list(colec.find(query, filtro).limit(10))

[{'name': 'Wetpaint', 'category_code': 'web'},
 {'name': 'Postini', 'category_code': 'web'},
 {'name': 'Geni', 'category_code': 'web'},
 {'name': 'Fox Interactive Media', 'category_code': 'web'},
 {'name': 'StumbleUpon', 'category_code': 'web'},
 {'name': 'Gizmoz', 'category_code': 'web'},
 {'name': 'eBay', 'category_code': 'web'},
 {'name': 'Viacom', 'category_code': 'web'},
 {'name': 'Plaxo', 'category_code': 'web'},
 {'name': 'Yahoo!', 'category_code': 'web'}]

In [46]:
query={'$and': [{'category_code': 'web'},
                {'founded_year': {'$lte': 2002}}]}


# Projection (second argument of find): return only founded_year and name,
# and suppress _id.
filtro={'founded_year': True, 'name': True, '_id': False}

list(colec.find(query, filtro).limit(10))  



# select founded_year, name from colec where category_code='web' and founded_year<=2002 limit 10;

[{'name': 'Postini', 'founded_year': 1999},
 {'name': 'Fox Interactive Media', 'founded_year': 1979},
 {'name': 'StumbleUpon', 'founded_year': 2002},
 {'name': 'eBay', 'founded_year': 1995},
 {'name': 'Viacom', 'founded_year': 1971},
 {'name': 'Plaxo', 'founded_year': 2002},
 {'name': 'Yahoo!', 'founded_year': 1994},
 {'name': 'Meetup', 'founded_year': 2002},
 {'name': 'Topix', 'founded_year': 2002},
 {'name': 'Steorn', 'founded_year': 2000}]

In [47]:
import pandas as pd


df=pd.DataFrame(list(colec.find(query, filtro)))

df.head()

Unnamed: 0,name,founded_year
0,Postini,1999
1,Fox Interactive Media,1979
2,StumbleUpon,2002
3,eBay,1995
4,Viacom,1971


In [48]:
# Web companies founded in 2002 or earlier. The projection keeps `_id`
# (there is no '_id': False entry) so it can serve as the DataFrame index.
query={'$and': [{'category_code': 'web'},
                {'founded_year': {'$lte': 2002}}]}


filtro={'founded_year': True, 'name': True}

# set_index returns a new frame; chaining it replaces the in-place mutation
# (`inplace=True`) of a frame that was only just created.
df=pd.DataFrame(list(colec.find(query, filtro))).set_index('_id')

df.head()

Unnamed: 0_level_0,name,founded_year
_id,Unnamed: 1_level_1,Unnamed: 2_level_1
52cdef7c4bab8bd675297d90,Postini,1999
52cdef7c4bab8bd675297d93,Fox Interactive Media,1979
52cdef7c4bab8bd675297d95,StumbleUpon,2002
52cdef7c4bab8bd675297d9b,eBay,1995
52cdef7c4bab8bd675297d9f,Viacom,1971


In [49]:
# select name, category_code, founded_year from colec where name in ('WeGame', 'Facebook');

query={'name': {'$in': ['WeGame', 'Facebook']}}

filtro={'name': True, 'category_code': True, 'founded_year': True, '_id': False}

list(colec.find(query, filtro))

[{'name': 'Facebook', 'category_code': 'social', 'founded_year': 2004},
 {'name': 'WeGame', 'category_code': 'web', 'founded_year': 1840}]

In [50]:
# select name, category_code, founded_year from colec where name like 'Face%'; 


query={'name': {'$regex': '^Face'}}

filtro={'name': True, 'category_code': True, 'founded_year': True, '_id': False}

list(colec.find(query, filtro))

[{'name': 'Facebook', 'category_code': 'social', 'founded_year': 2004},
 {'name': 'FaceTec', 'category_code': 'software', 'founded_year': 2002},
 {'name': 'Face Your Manga', 'category_code': 'web', 'founded_year': None},
 {'name': 'Facebookster',
  'category_code': 'public_relations',
  'founded_year': 2003},
 {'name': 'Facebook Causes Application',
  'category_code': None,
  'founded_year': None},
 {'name': 'FaceKoo', 'category_code': 'network_hosting', 'founded_year': 2008},
 {'name': 'FacebookLicious!',
  'category_code': 'games_video',
  'founded_year': 2007},
 {'name': 'FaceTime Strategy',
  'category_code': 'public_relations',
  'founded_year': None}]

### Geoqueries

In [51]:
# Documents whose `offices` is not an empty array; return only the name and
# the offices array.


query={'offices': {'$not': {'$size': 0}}}

filtro={'_id': 0, 'name': 1, 'offices': 1}

# Materialize the results into a list: a pymongo cursor can be consumed only
# once, so keeping the bare cursor would make any re-run of the cell that
# builds the DataFrame from `ofi` produce an empty frame.
ofi=list(colec.find(query, filtro))

In [52]:
df=pd.DataFrame(ofi)

df=df.dropna()

df.head()

Unnamed: 0,name,offices
0,Wetpaint,"[{'description': '', 'address1': '710 - 2nd Av..."
1,AdventNet,"[{'description': 'Headquarters', 'address1': '..."
2,Zoho,"[{'description': 'Headquarters', 'address1': '..."
3,Digg,"[{'description': None, 'address1': '135 Missis..."
4,Facebook,"[{'description': 'Headquarters', 'address1': '..."


In [53]:
df.offices[0]

[{'description': '',
  'address1': '710 - 2nd Avenue',
  'address2': 'Suite 1100',
  'zip_code': '98104',
  'city': 'Seattle',
  'state_code': 'WA',
  'country_code': 'USA',
  'latitude': 47.603122,
  'longitude': -122.333253},
 {'description': '',
  'address1': '270 Lafayette Street',
  'address2': 'Suite 505',
  'zip_code': '10012',
  'city': 'New York',
  'state_code': 'NY',
  'country_code': 'USA',
  'latitude': 40.7237306,
  'longitude': -73.9964312}]

In [54]:
# extraigo la primera oficina y creo geopunto

def get_first(data):
    """Summarize a company's offices and build a GeoJSON point for the first one.

    Parameters
    ----------
    data : row-like object
        Must expose an ``offices`` attribute holding a list of office dicts,
        each with ``latitude`` and ``longitude`` keys.

    Returns
    -------
    dict
        ``total_offices`` (number of offices), ``lat`` and ``lng`` of the first
        office, and ``principal``: a GeoJSON ``Point`` whose coordinates are in
        ``[longitude, latitude]`` order, or ``None`` when either coordinate is
        missing or there are no offices at all.
    """
    offices=data.offices

    # No offices: return an empty summary instead of failing on offices[0].
    if not offices:
        return {'total_offices': 0,
                'lat': None,
                'lng': None,
                'principal': None}

    lat=offices[0]['latitude']
    lng=offices[0]['longitude']

    principal=None

    # Compare against None instead of relying on truthiness: 0.0 is a valid
    # latitude (equator) and a valid longitude (prime meridian).
    if lat is not None and lng is not None:

        principal={'type': 'Point',
                   'coordinates': [lng, lat]}

    return {'total_offices': len(offices),
            'lat': lat,
            'lng': lng,
            'principal': principal}

In [57]:
first_office=df[['offices']].apply(get_first, result_type='expand', axis=1)

first_office.head()

Unnamed: 0,total_offices,lat,lng,principal
0,2.0,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,..."
1,1.0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,..."
2,1.0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,..."
3,1.0,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,..."
4,3.0,37.41605,-122.151801,"{'type': 'Point', 'coordinates': [-122.151801,..."


In [58]:
# dropna() removes every row with a missing value, e.g. the rows where
# get_first left `principal` as None.
first_office=first_office.dropna()

# Put the summary columns next to the company names (concat along columns
# aligns on the row index) and discard the raw `offices` lists.
# NOTE(review): `df` is reassigned without its `offices` column, so this cell
# cannot simply be run a second time — presumably the drop fails on re-run;
# rebuild `df` from the earlier cells first.
df=pd.concat([df, first_office], axis=1).drop('offices', axis=1)

# Companies removed from first_office above have missing values in the joined
# columns; drop them as well.
df=df.dropna()

df.head()

Unnamed: 0,name,total_offices,lat,lng,principal
0,Wetpaint,2.0,47.603122,-122.333253,"{'type': 'Point', 'coordinates': [-122.333253,..."
1,AdventNet,1.0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,..."
2,Zoho,1.0,37.692934,-121.904945,"{'type': 'Point', 'coordinates': [-121.904945,..."
3,Digg,1.0,37.764726,-122.394523,"{'type': 'Point', 'coordinates': [-122.394523,..."
4,Facebook,3.0,37.41605,-122.151801,"{'type': 'Point', 'coordinates': [-122.151801,..."


In [None]:
#df.to_json('../data/oficinas.json')

In [60]:
# Start from an empty collection so that re-running the notebook does not
# append the same companies again: the records carry no `_id`, so each
# insert_many call would otherwise store a fresh copy of every record.
db.first_office.drop()

db.first_office.insert_many(df.to_dict('records'))

<pymongo.results.InsertManyResult at 0x10f73c970>

In [61]:
db.first_office.create_index([('principal', '2dsphere')])

'principal_2dsphere'

In [62]:
def find_near(array, radio=1000):
    """Query `first_office` for documents whose `principal` point is near `array`.

    `array` supplies the GeoJSON coordinates of the reference point (the
    notebook passes [longitude, latitude]); `radio` is forwarded as
    `$maxDistance` (presumably meters; confirm against the MongoDB `$near`
    documentation). Returns the cursor produced by `find`.
    """
    centro={'type': 'Point', 'coordinates': array}

    cerca={'$geometry': centro, '$maxDistance': radio}

    return db.first_office.find({'principal': {'$near': cerca}})

In [63]:
park_avenue=[-73.987308, 40.738935] 

In [66]:
df=pd.DataFrame(find_near(park_avenue))

df.head()

Unnamed: 0,_id,name,total_offices,lat,lng,principal
0,63639fa4822e82b27e3fbff5,SpaBooker,1.0,40.738567,-73.987199,"{'type': 'Point', 'coordinates': [-73.987199, ..."
1,63639fa4822e82b27e3fadcd,HealthiNation,1.0,40.739341,-73.988357,"{'type': 'Point', 'coordinates': [-73.988357, ..."
2,63639fa4822e82b27e3fa9b5,Special Ops Media,1.0,40.737721,-73.987725,"{'type': 'Point', 'coordinates': [-73.987725, ..."
3,63639fa4822e82b27e3fc647,Mashable,1.0,40.740154,-73.986742,"{'type': 'Point', 'coordinates': [-73.9867417,..."
4,63639fa4822e82b27e3fb74c,Return Path,1.0,40.740207,-73.987002,"{'type': 'Point', 'coordinates': [-73.987002, ..."
