In [1]:
import pandas as pd
from pymongo import MongoClient
import csv

In [2]:
# Render every float in DataFrame output with 12 decimal places
# (enough to show the full latitude/longitude precision).
pd.set_option('display.float_format', '{:.12f}'.format)

In [3]:
def isfloat(value):
    """Tell whether ``value`` can be converted with ``float()``.

    Args:
        value: Any object; typically a raw string read from the CSV.

    Returns:
        bool: True when ``float(value)`` succeeds, False otherwise.
        Note that strings such as ``"nan"`` or ``"inf"`` are accepted by
        ``float()`` and therefore yield True.
    """
    try:
        float(value)
    except (TypeError, ValueError):
        # ValueError: unparseable string (e.g. "-").
        # TypeError: non-string, non-number input such as None.
        return False
    return True

def is_valid_date(date_str):
    """Tell whether ``date_str`` is a real calendar date in ``YYYY-MM-DD`` form.

    Args:
        date_str: Scalar value to check; typically a raw string from the CSV.

    Returns:
        bool: True only when pandas parses the value with the exact format
        ``%Y-%m-%d`` into an actual timestamp. Missing values (None, NaN,
        empty string) and unparseable values yield False.
    """
    try:
        parsed = pd.to_datetime(date_str, format="%Y-%m-%d", errors='raise')
    except (TypeError, ValueError):
        return False
    # to_datetime does not raise for missing input: it hands back None / NaT,
    # which must not be reported as a valid date.
    return parsed is not None and parsed is not pd.NaT

In [4]:
# Location of the MapReduce output: a comma-separated file without a header row.
CSV_PATH = '/home/ubuntu/DATASET/outputmapreduce.csv'

# Sentinel stored where a numeric field is missing or unparseable, so that
# those rows can be dropped later when training the classifiers.
MISSING_SENTINEL = -1

# Column names, in file order.
column_names = ["listing_id","name","host_name","neighborhood","latitude",
                "longitude","property_type","room_type","price","review_scores_rating",
                "review_scores_accuracy","review_scores_cleanliness","review_scores_checkin",
                "review_scores_communication","review_scores_location","review_scores_value",
                "review_per_month","date","comment"]

# Review-score columns; they all share the same cleaning rule.
review_score_columns = [
    "review_scores_rating",
    "review_scores_accuracy",
    "review_scores_cleanliness",
    "review_scores_checkin",
    "review_scores_communication",
    "review_scores_location",
    "review_scores_value",
]

# listing_id is parsed as an integer by read_csv itself; every other column is
# read as a raw string and converted explicitly below.
dtype_spec = {column_name: str for column_name in column_names}
dtype_spec["listing_id"] = int


def _float_or_sentinel(value, sentinel=MISSING_SENTINEL):
    """Return ``float(value)``, or ``sentinel`` if it is missing or unparseable."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return sentinel
    # NaN is the only float that differs from itself; treat it as missing.
    return sentinel if number != number else number


def _float_or_none(value):
    """Return None for the "-" placeholder, otherwise ``float(value)``."""
    return None if value == "-" else float(value)


# Dates are kept as strings here (read_csv is not asked to parse them); they
# are validated further down.
df = pd.read_csv(CSV_PATH, sep=',', header=None, names=column_names, dtype=dtype_spec)

# Price: the "-" placeholder becomes the sentinel; anything else must parse.
df['price'] = df['price'].apply(lambda x: MISSING_SENTINEL if x == "-" else float(x))

# Coordinates: the "-" placeholder becomes a missing value.
df['latitude'] = df['latitude'].apply(_float_or_none)
df['longitude'] = df['longitude'].apply(_float_or_none)

# listing_id needs no conversion: read_csv already produced integers, so a
# "-" placeholder in that column would have failed at load time.

# Scores: parse as floats so decimal values such as "9.5" are kept instead of
# being replaced by the sentinel (str.isdigit() rejects them), and so missing
# cells do not raise.
for score_column in review_score_columns:
    df[score_column] = df[score_column].apply(_float_or_sentinel)

df['review_per_month'] = df['review_per_month'].apply(_float_or_sentinel)

# Keep the date string only when it is a valid YYYY-MM-DD date.
df['date'] = df['date'].apply(lambda x: x if is_valid_date(x) else None)


In [5]:
# Preview the first five cleaned rows.
df.head(n=5)

Unnamed: 0,listing_id,name,host_name,neighborhood,latitude,longitude,property_type,room_type,price,review_scores_rating,review_scores_accuracy,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,review_per_month,date,comment
0,9996278,Stylish family house near centre,Imke,OudNoord,52.39156759772,4.926034915463,House,Private room,65.0,90.0,9.0,9.0,7.0,9.0,9.0,9.0,0.11,2016-02-22,My friends and I really enjoyed our stay at Im...
1,9996278,Stylish family house near centre,Imke,OudNoord,52.39156759772,4.926034915463,House,Private room,65.0,90.0,9.0,9.0,7.0,9.0,9.0,9.0,0.11,2016-02-07,I enjoyed Imkes place Despite the fact that th...
2,9996278,Stylish family house near centre,Imke,OudNoord,52.39156759772,4.926034915463,House,Private room,65.0,90.0,9.0,9.0,7.0,9.0,9.0,9.0,0.11,2016-01-24,Probably the coolest house in Amsterdam. All o...
3,9996278,Stylish family house near centre,Imke,OudNoord,52.39156759772,4.926034915463,House,Private room,65.0,90.0,9.0,9.0,7.0,9.0,9.0,9.0,0.11,2015-12-31,Vert nice house We spend a great stay
4,9992899,Cozy Canal House in the centre,Liselotte,CentrumWest,52.375859759374,4.893092078023,Apartment,Private room,79.0,95.0,9.0,10.0,10.0,10.0,10.0,9.0,0.34,2016-02-21,We loved every minute of our stay here. Liselo...


In [6]:
# Show the dtype of each column after the conversions above.
df.dtypes

listing_id                       int64
name                            object
host_name                       object
neighborhood                    object
latitude                       float64
longitude                      float64
property_type                   object
room_type                       object
price                          float64
review_scores_rating           float64
review_scores_accuracy         float64
review_scores_cleanliness      float64
review_scores_checkin          float64
review_scores_communication    float64
review_scores_location         float64
review_scores_value            float64
review_per_month               float64
date                            object
comment                         object
dtype: object

In [8]:
# Connect to the local MongoDB server, then select the project database and
# the collection that holds one document per CSV row.
client = MongoClient('mongodb://localhost:27017/')
db = client.get_database('dsbda_project')
collection = db.get_collection('row')

# List the databases on the server as a connectivity check.
client.list_database_names()

['admin', 'config', 'dsbda_project', 'local']

In [30]:
# Convert the DataFrame into a list of per-row dicts, one future document each.
dictionary = df.to_dict(orient='records')

# insert_many always appends, so re-running this cell on a populated
# collection would duplicate every document. Insert only when it is empty.
result = None
if collection.count_documents({}, limit=1) == 0:
    result = collection.insert_many(dictionary)
    print(f"Inserted {len(result.inserted_ids)} documents")
else:
    print("Collection is not empty: insert skipped to avoid duplicate documents")

In [9]:
# Count every document in `collection` (empty filter = no restriction).
# NOTE(review): the message names the database 'dsbda_project', but the number
# is the size of `collection` (db['row']) only.
document_count = collection.count_documents({})
print(f"Numero di documenti in 'dsbda_project': {document_count}")

Numero di documenti in 'dsbda_project': 431828


In [25]:
# Unfiltered query over the 'row' collection.
cursor = db['row'].find({})

In [26]:
# Print the first 10 documents of the current query.
# Slicing the cursor (cursor[:10]) would set the limit on `cursor` itself and
# fails once it has been iterated; a limited clone leaves `cursor` untouched
# and lets this cell be re-run.
for document in cursor.clone().limit(10):
    print(document)

{'_id': ObjectId('6675ec2320e6600681d48dde'), 'listing_id': 9996278, 'name': 'Stylish family house near centre', 'host_name': 'Imke', 'neighborhood': 'OudNoord', 'latitude': 52.3915675977199, 'longitude': 4.926034915462951, 'property_type': 'House', 'room_type': 'Private room', 'price': 65.0, 'review_scores_rating': 90.0, 'review_scores_accuracy': 9.0, 'review_scores_cleanliness': 9.0, 'review_scores_checkin': 7.0, 'review_scores_communication': 9.0, 'review_scores_location': 9.0, 'review_scores_value': 9.0, 'review_per_month': 0.11, 'date': '2016-02-22', 'comment': 'My friends and I really enjoyed our stay at Imkes house. She was very welcoming and answered all the questions we had about the city. She also provided many recommendations about the city and the area she lives in i.e visiting eating tourist locations etc.... We were all very pleased with our stay and didnt have any complaints leaving. The house was also very easily accessible by bus from the Amsterdam central station. Alt

In [27]:
# Query the listings whose price is strictly greater than 2000.
price_filter = {"price": {"$gt": 2000}}
cursor = db.row.find(price_filter)

In [14]:
# Print the first 10 documents of the current query.
# Slicing the cursor (cursor[:10]) would set the limit on `cursor` itself and
# fails once it has been iterated; a limited clone leaves `cursor` untouched
# and lets this cell be re-run.
for document in cursor.clone().limit(10):
    print(document)

{'_id': ObjectId('6675ec2420e6600681d63fc6'), 'listing_id': 5587500, 'name': 'Crane Hotel Faralda Amsterdam', 'host_name': 'Faralda', 'neighborhood': 'OudNoord', 'latitude': 52.399452260913016, 'longitude': 4.8946751097429555, 'property_type': 'Other', 'room_type': 'Entire homeapt', 'price': 4500.0, 'review_scores_rating': 100.0, 'review_scores_accuracy': 10.0, 'review_scores_cleanliness': 10.0, 'review_scores_checkin': 10.0, 'review_scores_communication': 10.0, 'review_scores_location': 10.0, 'review_scores_value': 9.0, 'review_per_month': 0.05, 'date': '2016-09-16', 'comment': 'The crane wow what an experience... a crazy and yet wonderful experience in a world of same same... this is something incredibly unique and different... the location is uber cool... lots of bars and chill out spaces close by... Brood the bread shop at the ferry terminal has great coffee...and the pastries are to die for. Thank you Faralda for a wonderful experience.'}
{'_id': ObjectId('6675ec2420e6600681d63fc7

In [28]:
# Delete the whole 'dsbda_project' database (the one `db` and `collection`
# point at).
# NOTE(review): in file order this cell comes before the aggregation on db.row
# below, so running the notebook top to bottom drops the data before that
# query executes. Confirm whether this cleanup belongs at the very end.
client.drop_database("dsbda_project")

In [10]:
# List the distinct property types found in the "Bos en Lommer" neighborhood:
# keep only that neighborhood, then group by property_type so that each
# distinct type comes back once as the group's _id.
match_stage = {"$match": {"neighborhood": "Bos en Lommer"}}
group_stage = {"$group": {"_id": "$property_type"}}
ops = [match_stage, group_stage]

cursor = db.row.aggregate(ops)

for document in cursor:
    print(document)

{'_id': 'Boat'}
{'_id': 'Serviced apartment'}
{'_id': 'Hostel'}
{'_id': 'Loft'}
{'_id': 'Bed and breakfast'}
{'_id': 'Cabin'}
{'_id': 'Houseboat'}
{'_id': 'Guest suite'}
{'_id': 'House'}
{'_id': 'Condominium'}
{'_id': 'Townhouse'}
{'_id': 'Apartment'}
