In [None]:
import os
import urllib.parse

import numpy as np
import pandas as pd
import pymongo

In [None]:
# MongoDB Atlas connection settings.
# Credentials are read from environment variables so that real secrets never
# have to be typed into (and saved with) the notebook. The placeholder
# fallbacks keep the cell runnable as a template when the variables are unset.
username = os.environ.get('MONGODB_USERNAME', 'your_username_from_mongodb_atlas')
password = os.environ.get('MONGODB_PASSWORD', 'your_password')
cluster_url = os.environ.get('MONGODB_CLUSTER_URL', 'url@mongodb_atlas')

# Percent-encode the credentials: characters such as '@', ':' or '/' would
# otherwise be misread as URI delimiters.
escaped_username = urllib.parse.quote_plus(username)
escaped_password = urllib.parse.quote_plus(password)

uri = f"mongodb+srv://{escaped_username}:{escaped_password}@{cluster_url}"

# Handles used by the rest of the notebook: client -> database -> collection.
client = pymongo.MongoClient(uri)
db = client["sample_airbnb"]
col = db["listingsAndReviews"]

In [None]:
# Preview the raw document structure. Only a handful of documents are printed:
# iterating the whole collection would flood the cell output without adding
# any information about the schema.
for document in col.find().limit(5):
    print(document)

In [None]:
def _flatten_listing(doc):
    """Return a flat dict holding the fields of interest from one listing document.

    Fields read with ``.get()`` yield ``None`` when the key is absent; every
    other field is accessed with ``[]`` and therefore must be present.
    """
    return {
        'id': doc['_id'],
        'listing_url': doc['listing_url'],
        'name': doc['name'],
        'description': doc['description'],
        'house_rules': doc['house_rules'],
        'property_type': doc['property_type'],
        'room_type': doc['room_type'],
        'bed_type': doc.get('bed_type'),
        'min_nights': doc['minimum_nights'],
        'max_nights': doc['maximum_nights'],
        'cancellation_policy': doc['cancellation_policy'],
        'accommodates': doc['accommodates'],
        'total_bedrooms': doc.get('bedrooms'),
        'total_beds': doc.get('beds'),
        'availability': doc['availability']['availability_365'],
        'price': doc['price'],
        'security_deposit': doc.get('security_deposit'),
        'cleaning_fee': doc.get('cleaning_fee'),
        'extra_people': doc['extra_people'],
        'guests_included': doc['guests_included'],
        'no_of_reviews': doc['number_of_reviews'],
        'review_score': doc.get('review_scores', {}).get('review_scores_rating'),
        # The amenities list is collapsed into one comma-separated string.
        'amenities': ', '.join(doc['amenities']),
        'host_id': doc['host']['host_id'],
        'host_name': doc['host']['host_name'],
        'Host_total_listings': doc['host']['host_total_listings_count'],
        'street': doc['address']['street'],
        'country': doc['address']['country'],
        'country_code': doc['address']['country_code'],
        # coordinates[0] is stored as Longitude, coordinates[1] as Latitude.
        'Longitude': doc['address']['location']['coordinates'][0],
        'Latitude': doc['address']['location']['coordinates'][1],
    }


# One flat record per document in the collection.
wh_data = [_flatten_listing(doc) for doc in col.find()]

In [None]:
# Build the working DataFrame from the flattened records: one row per dict in
# wh_data, one column per dict key.
df = pd.DataFrame(wh_data)
# Bare last expression so the notebook renders the frame.
df


In [None]:
# Inspect column dtypes and non-null counts before any type conversion.
df.info()

In [None]:
# Columns converted to float by going through their string representation.
for float_column in ('price', 'extra_people', 'guests_included'):
    df[float_column] = df[float_column].astype(str).astype(float)

# Optional fees: after the string conversion a missing value reads 'None',
# which is replaced by 0 before the float conversion.
for fee_column in ('cleaning_fee', 'security_deposit'):
    df[fee_column] = df[fee_column].astype(str).replace('None', 0).astype(float)

# Columns converted to pandas' nullable integer dtype.
for int_column in ('min_nights', 'max_nights', 'Host_total_listings'):
    df[int_column] = df[int_column].astype('Int64')

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Treat empty and whitespace-only strings as missing values.
df = df.replace(r'^\s*$', np.nan, regex=True)

# Each column is filled by assigning the result back to df. Calling
# fillna(..., inplace=True) on df[col] operates on an intermediate object and
# is not guaranteed to update df (it does not under pandas Copy-on-Write).

# mode() returns a Series (there may be several modes). Passing that Series to
# fillna aligns on the index and would only ever fill row 0, so take the first
# modal value as a scalar. The Series is empty when the column is all-NaN.
availability_mode = df['availability'].mode()
if not availability_mode.empty:
    df['availability'] = df['availability'].fillna(availability_mode.iloc[0])

df['extra_people'] = df['extra_people'].fillna(df['extra_people'].median())
df['no_of_reviews'] = df['no_of_reviews'].fillna(df['no_of_reviews'].min())
df['total_bedrooms'] = df['total_bedrooms'].fillna(df['total_bedrooms'].median())
df['total_beds'] = df['total_beds'].fillna(df['total_beds'].median())
df['review_score'] = df['review_score'].fillna(df['review_score'].median())

# Host_total_listings is a count (nullable Int64 after the dtype-conversion
# cell). A fractional median such as 2.5 cannot be stored in an integer
# column, so the median is rounded to a whole number before filling. The guard
# skips the fill when the column holds no values, where the median is missing.
host_listings = df['Host_total_listings']
if host_listings.notna().any():
    df['Host_total_listings'] = host_listings.fillna(int(round(host_listings.median())))


In [None]:
# Look for fully duplicated rows: duplicated() flags every repeat of an
# earlier row, and the mask selects those rows for inspection.
is_duplicate = df.duplicated()
duplicate_rows = df.loc[is_duplicate]

print("Duplicate entries in the DataFrame:", duplicate_rows)


In [None]:
# List the column names of the cleaned frame.
df.columns


In [None]:
# Renumber the rows from 0 (discarding the old index) and write the cleaned
# data to CSV without an index column.
df = df.reset_index(drop=True)
df.to_csv(path_or_buf='Airbnb_data.csv', index=False)