## Imports and loading the dataset

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read the raw listings export; the path is relative to the directory the
# notebook is run from.
listings_path = "../Dataset/listings.csv"
dataset = pd.read_csv(listings_path)
# Trailing bare expression so the notebook renders the loaded frame.
dataset

## Remove unwanted columns

In [None]:
# Discard rows that are exact duplicates of an earlier row (all columns
# equal), keeping the first occurrence.
dataset = dataset.drop_duplicates()

In [None]:
# Columns removed before any further cleaning. Every name listed here must
# exist in the frame, otherwise the drop below raises KeyError.
columns_to_drop = [
    # listing identifiers and free-text fields
    'id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
    'space', 'description', 'experiences_offered', 'neighborhood_overview',
    'notes', 'transit', 'access', 'interaction', 'house_rules',
    # picture URLs
    'thumbnail_url', 'medium_url', 'picture_url', 'xl_picture_url',
    # host_* columns
    'host_id', 'host_url', 'host_name', 'host_since', 'host_location',
    'host_about', 'host_response_time', 'host_response_rate',
    'host_acceptance_rate', 'host_is_superhost', 'host_thumbnail_url',
    'host_picture_url', 'host_neighbourhood', 'host_listings_count',
    'host_total_listings_count', 'host_verifications',
    'host_has_profile_pic', 'host_identity_verified',
    # location columns (only 'neighbourhood' is kept)
    'street', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed',
    'country', 'country_code', 'is_location_exact', 'latitude', 'longitude',
    'city', 'state', 'zipcode', 'market', 'smart_location',
    # extra pricing and stay-length columns
    'weekly_price', 'monthly_price', 'security_deposit', 'cleaning_fee',
    'guests_included', 'extra_people', 'minimum_nights', 'maximum_nights',
    # calendar / short-horizon availability columns
    'calendar_updated', 'has_availability', 'availability_30',
    'availability_60', 'availability_90', 'calendar_last_scraped',
    # review columns
    'number_of_reviews', 'first_review', 'last_review',
    'review_scores_accuracy', 'review_scores_cleanliness',
    'review_scores_checkin', 'review_scores_communication',
    'review_scores_location', 'review_scores_value',
    # licensing and booking-requirement columns
    'requires_license', 'license', 'jurisdiction_names', 'instant_bookable',
    'require_guest_profile_picture', 'require_guest_phone_verification',
    'calculated_host_listings_count',
    # remaining columns
    'reviews_per_month', 'review_scores_rating', 'amenities',
]

dataset = dataset.drop(columns=columns_to_drop)
# Trailing bare expression so the notebook renders the trimmed frame.
dataset

## Handling null values

In [None]:
# Per-column count of missing values, taken before any of them are handled.
dataset.isna().sum()

In [None]:
# Drop the square_feet column entirely.
# NOTE(review): presumably because it is mostly empty - confirm against the
# null counts printed before this cell.
dataset = dataset.drop(columns=['square_feet'])

# Fill missing values column by column in a single call:
#   * bathrooms / bedrooms / beds -> 0
#   * property_type               -> 'Other'
# Assigning the result back (instead of `dataset[col].fillna(..., inplace=True)`)
# avoids chained in-place assignment, which emits a FutureWarning on
# pandas 2.x and no longer updates the frame once Copy-on-Write is enabled.
dataset = dataset.fillna({
    'bathrooms': 0,
    'bedrooms': 0,
    'beds': 0,
    'property_type': 'Other',
})

# Remove rows that have no neighbourhood. The explicit .copy() makes the
# result an independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
dataset = dataset[dataset['neighbourhood'].notna()].copy()

# Trailing bare expression so the notebook renders the result.
dataset

In [None]:
# Per-column count of missing values, taken after the null handling step.
dataset.isna().sum()

## Processing the price column

In [None]:
def removeDollar(price):
    """Return *price* as text with every ``$`` and ``,`` character removed.

    The value is passed through ``str()`` first, so non-string inputs are
    accepted as well; a missing value such as NaN simply becomes ``'nan'``
    instead of raising.
    """
    # One translation table deletes both characters in a single pass.
    strip_table = str.maketrans('', '', '$,')
    return str(price).translate(strip_table)
    
# Strip the currency formatting from every price, then cast the cleaned
# strings to float, all in one chained expression.
dataset['price'] = dataset['price'].apply(removeDollar).astype(float)
# Trailing bare expression so the notebook renders the result.
dataset

## Convert Categorical data to numeric

In [None]:
def convertToNumeric(dataset):
    """Label-encode the categorical columns of *dataset*.

    The columns 'neighbourhood', 'property_type', 'room_type', 'bed_type'
    and 'cancellation_policy' are each overwritten with the integer codes
    produced by ``LabelEncoder.fit_transform``.

    The frame is modified in place and also returned.
    """
    categorical_columns = (
        'neighbourhood',
        'property_type',
        'room_type',
        'bed_type',
        'cancellation_policy',
    )
    encoder = LabelEncoder()
    for column in categorical_columns:
        # The single encoder is re-fitted on every column, so each column
        # gets its own independent set of codes.
        dataset[column] = encoder.fit_transform(dataset[column])
    return dataset

# Label-encode the categorical columns and rebind the name to the returned
# frame (convertToNumeric also modifies its argument in place).
dataset = convertToNumeric(dataset)
# Trailing bare expression so the notebook renders the encoded frame.
dataset

In [None]:
# Move the price column to the end of the frame.
column_to_move = dataset.pop("price")
# After the pop, len(dataset.columns) is the index one past the last
# remaining column, i.e. the end of the frame. Computing it here instead of
# hard-coding a position keeps this correct if the set of kept columns
# changes (a fixed index would misplace the column or raise IndexError).
dataset.insert(len(dataset.columns), "price", column_to_move)
# Trailing bare expression so the notebook renders the reordered frame.
dataset

## Exporting the cleaned dataset

In [None]:
# Write the cleaned frame to the current working directory; index=False
# keeps the row index out of the CSV.
cleaned_path = "listings_cleaned.csv"
dataset.to_csv(cleaned_path, index=False)