In [81]:
import numpy as np
import pandas as pd 
import re
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt

In [82]:
df = pd.read_csv('train.csv')

  df = pd.read_csv('train.csv')


## Cleaning

### Purge

In [83]:
df_cleaned = df.drop([
    'id', 
    'host_name', 
    'country', 
    'country_code', 
    'state', 
    'market', 
    'city', 
    'host_neighbourhood', 
    'host_acceptance_rate', 
    'experiences_offered'], axis=1)

### Format

In [84]:
# Clean and format dates
date_features = ['host_since', 'first_review', 'last_review']
for i in range(len(date_features)):
    df_cleaned[date_features[i]] = df_cleaned[date_features[i]].apply(lambda x: \
        datetime.strptime(x, '%Y-%m-%d').timestamp() \
        if x is not np.nan else x)


# Clean bool features
bool_features = [
    'require_guest_phone_verification', 
    'require_guest_profile_picture', 
    'is_business_travel_ready',
    'instant_bookable',
    'host_identity_verified',
    'host_has_profile_pic',
    'host_is_superhost']

for i in range(len(bool_features)):
    df_cleaned[bool_features[i]] = df_cleaned[bool_features[0]] \
        .apply(lambda x: x=='t' or x==True if x is not np.nan else x)
    

# Converting string features containing numeric information to numeric dtypes.
# Converts host_response_rate from strings representing percentages to proportions.
df_cleaned['host_response_rate'] = df_cleaned['host_response_rate'].apply(
    lambda x: float(x.replace('%',''))/100 if x is not np.nan else x)
# Converts extra_people from strings representing prices to floats.
df_cleaned['extra_people'] = df_cleaned['extra_people'].apply(
    lambda x: float(x.replace('$','')) if x is not np.nan else x)


# Removes most doubles from zipcode by correcting formatting 
df_cleaned.zipcode = df_cleaned.zipcode.apply(
    lambda x: str(x).replace('.0', ''))


### Encode

In [85]:
# One-hot encodes the host verifications

# Find unique verifications in host_verifications feature
unique_host_verifications = pd.Series(
    ', '.join(list(df.host_verifications.apply(
        lambda x: x.replace(']', '').replace('[', '').replace("'", '') \
            if x is not np.nan else '').unique())).split(','))\
    .apply(lambda x: x.replace(' ', ''))\
    .unique()
unique_host_verifications = np.delete(unique_host_verifications, 17)


verified_df = {v:[] for v in unique_host_verifications}
# Helper function to check if a verification is in the strings in host_verifications
def word_in_string(word, string):
    pattern = r'\b{}\b'.format(re.escape(word))  # \b is a word boundary
    match = re.search(pattern, string)
    return bool(match)
# For each string in host_verifications, extracts all the verif_strings
for verif_string in df_cleaned.host_verifications:
    if type(verif_string) != str:
        print(verif_string)
        for v in unique_host_verifications:
            verified_df[v].append(False)
        continue
    for v in unique_host_verifications:
        verified_df[v].append(word_in_string(v, verif_string))

verified_df = pd.DataFrame(verified_df)

df_cleaned = df_cleaned.reset_index(drop=True) \
    .merge(verified_df, left_index=True, right_index=True)
df_cleaned = df_cleaned.drop(['host_verifications'], axis=1)

nan
nan
nan
nan
nan


In [87]:
nyc_zip_codes = pd.read_csv('nyc-zip-codes.csv')

z1 = df_cleaned.zipcode.apply(lambda x: str(x).replace('.0', ''))

z2 = z1.apply(lambda x: x if '-' not in x else x.split('-')[0])

z3 = z2.apply(lambda x: x if '\n' not in x else x.split('\n')[0])

def zip_to_int(zipcode):
    try: return int(zipcode)
    except: return np.nan

int_zips = z3.apply(zip_to_int)

valid_zips = int_zips.apply(
    lambda x: x if sum(x==nyc_zip_codes.ZipCode)>=1 else np.nan)

one_hot_zipcodes = pd.get_dummies(valid_zips)
one_hot_zipcodes = one_hot_zipcodes.reindex(columns=nyc_zip_codes.ZipCode, fill_value=0)

df_cleaned = df_cleaned.reset_index(drop=True) \
    .merge(one_hot_zipcodes, left_index=True, right_index=True)
df_cleaned = df_cleaned.drop(['zipcode'], axis=1)

In [88]:
one_hot_features = [
    'room_type', 
    'neighbourhood_group_cleansed', 
    'bed_type', 
    'host_response_time', 
    'cancellation_policy', 
    'property_type',
    'neighbourhood_cleansed']

df_cleaned = pd.get_dummies(df_cleaned, columns=one_hot_features)

In [89]:
# one_hot_features = [
#     'room_type', 
#     'neighbourhood_group_cleansed', 
#     'bed_type', 
#     'host_response_time', 
#     'cancellation_policy', 
#     'property_type',
#     'neighbourhood_cleansed']

# one_hot_encoded_df = pd.get_dummies(
#     df_cleaned[one_hot_features], 
#     columns=one_hot_features)

# df_cleaned = df_cleaned.reset_index(drop=True) \
#     .merge(one_hot_zipcodes, left_index=True, right_index=True)
# df_cleaned = df_cleaned.drop(one_hot_features, axis=1)

### Imputation of Missing Values

In [90]:
# Impute host response time with means
df_cleaned.host_response_rate = df_cleaned.host_response_rate \
    .fillna(df_cleaned.host_response_rate.mean())

# Impute host_since with the mode
df_cleaned.host_since = df_cleaned.host_since \
    .fillna(1.427699e+09)

# Impute host_listings_count with mean
df_cleaned.host_listings_count = df_cleaned.host_listings_count \
    .fillna(df_cleaned.host_listings_count.mean())

# Impute bathrooms with mean
df_cleaned.bathrooms = df_cleaned.bathrooms \
    .fillna(df_cleaned.bathrooms.mean())

# Impute bedrooms with mean
df_cleaned.bedrooms = df_cleaned.bedrooms \
    .fillna(df_cleaned.bedrooms.mean())

# Impute beds with mean
df_cleaned.beds = df_cleaned.beds \
    .fillna(df_cleaned.beds.mean())


## The following are missing a significant number of values ## 
## Consider a stronger method of imputation for these ##

# Impute first_review with mode
df_cleaned.first_review = df_cleaned.first_review \
    .fillna(df_cleaned.first_review.mode())

# Impute last_review with mode
df_cleaned.last_review = df_cleaned.last_review \
    .fillna(df_cleaned.last_review.mode())

# Impute review_scores_rating with mean
df_cleaned.review_scores_rating = df_cleaned.review_scores_rating \
    .fillna(df_cleaned.review_scores_rating.mean())

# Impute review_scores_accuracy with mean
df_cleaned.review_scores_accuracy = df_cleaned.review_scores_accuracy \
    .fillna(df_cleaned.review_scores_accuracy.mean())

# Impute review_scores_cleanliness with mean
df_cleaned.review_scores_cleanliness = df_cleaned.review_scores_cleanliness \
    .fillna(df_cleaned.review_scores_cleanliness.mean())

# Impute review_scores_checkin with mean
df_cleaned.review_scores_checkin = df_cleaned.review_scores_checkin \
    .fillna(df_cleaned.review_scores_checkin.mean())

# Impute review_scores_communication with mean
df_cleaned.review_scores_communication = df_cleaned.review_scores_communication \
    .fillna(df_cleaned.review_scores_communication.mean())

# Impute review_scores_location with mean
df_cleaned.review_scores_location = df_cleaned.review_scores_location \
    .fillna(df_cleaned.review_scores_location.mean())

# Impute review_scores_value with mean
df_cleaned.review_scores_value = df_cleaned.review_scores_value \
    .fillna(df_cleaned.review_scores_value.mean())

# Impute reviews_per_month with mean
df_cleaned.reviews_per_month = df_cleaned.reviews_per_month \
    .fillna(df_cleaned.reviews_per_month.mean())

# Impute last_review with the mode
df_cleaned.last_review = df_cleaned.last_review \
    .fillna(1.540710e+09)

# Impute first_review with the mode
df_cleaned.first_review = df_cleaned.last_review \
    .fillna(1.514794e+09)

# Since almost all of square feet is missing we impute it as a boolean
df_cleaned.square_feet = ~df_cleaned.square_feet.isna()

## Model

In [91]:
from sklearn.model_selection import train_test_split

In [92]:
cols_to_exclude = [
    'name',
    'summary',
    'space',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
    'host_location',
    'host_about',
    'amenities',
    'host_id']

In [93]:
for i, c in enumerate(model_df.columns):
    print(i, c)

model_df

0 host_since
1 host_response_rate
2 host_is_superhost
3 host_listings_count
4 host_has_profile_pic
5 host_identity_verified
6 accommodates
7 bathrooms
8 bedrooms
9 beds
10 square_feet
11 price
12 guests_included
13 extra_people
14 minimum_nights
15 maximum_nights
16 number_of_reviews
17 first_review
18 last_review
19 review_scores_rating
20 review_scores_accuracy
21 review_scores_cleanliness
22 review_scores_checkin
23 review_scores_communication
24 review_scores_location
25 review_scores_value
26 instant_bookable
27 is_business_travel_ready
28 require_guest_profile_picture
29 require_guest_phone_verification
30 calculated_host_listings_count
31 reviews_per_month
32 email
33 phone
34 reviews
35 jumio
36 government_id
37 manual_offline
38 facebook
39 kba
40 offline_government_id
41 sent_id
42 selfie
43 identity_manual
44 work_email
45 google
46 manual_online
47 sesame
48 sesame_offline
49 zhima_selfie
50 weibo
51 10453_x
52 10457_x
53 10460_x
54 10458_x
55 10467_x
56 10468_x
57 10451_x


Unnamed: 0,host_since,host_response_rate,host_is_superhost,host_listings_count,host_has_profile_pic,host_identity_verified,accommodates,bathrooms,bedrooms,beds,...,10310_y,10306_y,10307_y,10308_y,10309_y,10312_y,10301_y,10304_y,10305_y,10314_y
0,1.331536e+09,0.928258,False,1.0,False,False,2,1.000000,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
1,1.379142e+09,1.000000,False,4.0,False,False,8,1.000000,3.0,5.0,...,False,False,False,False,False,False,False,False,False,False
2,1.355213e+09,1.000000,False,1.0,False,False,2,1.000000,0.0,1.0,...,False,False,False,False,False,False,False,False,False,False
3,1.386058e+09,0.928258,False,1.0,False,False,1,1.000000,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
4,1.336720e+09,1.000000,False,1.0,False,False,2,1.000000,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,1.340003e+09,0.928258,False,1.0,False,False,2,1.000000,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
33534,1.440140e+09,0.928258,False,3.0,False,False,8,3.000000,4.0,4.0,...,False,False,False,False,False,False,False,False,False,False
33535,1.336028e+09,0.928258,False,1.0,False,False,1,1.141376,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False
33536,1.404630e+09,1.000000,False,4.0,False,False,2,1.000000,1.0,1.0,...,False,False,False,False,False,False,False,False,False,False


In [94]:
model_df = df_cleaned.drop(cols_to_exclude, axis=1)
model_df = model_df[model_df.columns[0:]]

X = model_df.drop(['price'], axis=1)
y = model_df.price

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (26830, 501)
X_test shape: (6708, 501)
y_train shape: (26830,)
y_test shape: (6708,)


In [95]:
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error

# Create an XGBoost regressor (or classifier based on your problem)
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the training data
y_train_pred = model.predict(X_train)

# Calculate the training RMSE (Root Mean Squared Error)
train_rmse = root_mean_squared_error(y_train, y_train_pred)
print("Training RMSE:", train_rmse)

# Optionally, make predictions on the testing data
y_test_pred = model.predict(X_test)

# Calculate the testing RMSE (Root Mean Squared Error)
test_rmse = root_mean_squared_error(y_test, y_test_pred)
print("Testing RMSE:", test_rmse)

Training RMSE: 57.50920596740549
Testing RMSE: 89.6666716547086


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Create an XGBoost regressor
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Define the grid of hyperparameters to search
param_grid = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]
}

# Perform Grid Search with 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=kf, verbose=1, n_jobs=-1)
grid_result = grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_result.best_params_
best_score = grid_result.best_score_

print("Best Parameters:", best_params)
print("Best Score (RMSE):", (-best_score)**0.5)

# Train the model with the best parameters
best_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
best_model.fit(X_train, y_train)

# Make predictions on the testing data using the best model
y_test_pred = best_model.predict(X_test)

# Calculate the testing RMSE with the best model
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print("Testing RMSE with Best Model:", test_rmse)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


KeyboardInterrupt: 

### Predict Unlabled Dataset

In [60]:
unlabeled_data = pd.read_csv('test.csv')

unlabeled_cleaned = unlabeled_data.drop([
    'id', 
    'host_name', 
    'country', 
    'country_code', 
    'state', 
    'market', 
    'city', 
    'host_neighbourhood', 
    'host_acceptance_rate', 
    'experiences_offered'], axis=1)


# Clean and format dates
date_features = ['host_since', 'first_review', 'last_review']
for i in range(len(date_features)):
    unlabeled_cleaned[date_features[i]] = unlabeled_cleaned[date_features[i]].apply(lambda x: \
        datetime.strptime(x, '%Y-%m-%d').timestamp() \
        if x is not np.nan else x)


# Clean bool features
bool_features = [
    'require_guest_phone_verification', 
    'require_guest_profile_picture', 
    'is_business_travel_ready',
    'instant_bookable',
    'host_identity_verified',
    'host_has_profile_pic',
    'host_is_superhost']

for i in range(len(bool_features)):
    unlabeled_cleaned[bool_features[i]] = unlabeled_cleaned[bool_features[0]] \
        .apply(lambda x: x=='t' or x==True if x is not np.nan else x)
    

# Converting string features containing numeric information to numeric dtypes.
# Converts host_response_rate from strings representing percentages to proportions.
unlabeled_cleaned['host_response_rate'] = unlabeled_cleaned['host_response_rate'].apply(
    lambda x: float(x.replace('%',''))/100 if x is not np.nan else x)
# Converts extra_people from strings representing prices to floats.
unlabeled_cleaned['extra_people'] = unlabeled_cleaned['extra_people'].apply(
    lambda x: float(x.replace('$','')) if x is not np.nan else x)


# Removes most doubles from zipcode by correcting formatting 
unlabeled_cleaned.zipcode = unlabeled_cleaned.zipcode.apply(
    lambda x: str(x).replace('.0', ''))

# One-hot encodes the host verifications

# Find unique verifications in host_verifications feature
unique_host_verifications = pd.Series(
    ', '.join(list(df.host_verifications.apply(
        lambda x: x.replace(']', '').replace('[', '').replace("'", '') \
            if x is not np.nan else '').unique())).split(','))\
    .apply(lambda x: x.replace(' ', ''))\
    .unique()
unique_host_verifications = np.delete(unique_host_verifications, 17)


verified_df = {v:[] for v in unique_host_verifications}
# Helper function to check if a verification is in the strings in host_verifications
def word_in_string(word, string):
    pattern = r'\b{}\b'.format(re.escape(word))  # \b is a word boundary
    match = re.search(pattern, string)
    return bool(match)
# For each string in host_verifications, extracts all the verif_strings
for verif_string in unlabeled_cleaned.host_verifications:
    if type(verif_string) != str:
        print(verif_string)
        for v in unique_host_verifications:
            verified_df[v].append(False)
        continue
    for v in unique_host_verifications:
        verified_df[v].append(word_in_string(v, verif_string))

verified_df = pd.DataFrame(verified_df)

unlabeled_cleaned = unlabeled_cleaned.reset_index(drop=True) \
    .merge(verified_df, left_index=True, right_index=True)
unlabeled_cleaned = unlabeled_cleaned.drop(['host_verifications'], axis=1)



nyc_zip_codes = pd.read_csv('nyc-zip-codes.csv')

z1 = unlabeled_cleaned.zipcode.apply(lambda x: str(x).replace('.0', ''))

z2 = z1.apply(lambda x: x if '-' not in x else x.split('-')[0])

z3 = z2.apply(lambda x: x if '\n' not in x else x.split('\n')[0])

def zip_to_int(zipcode):
    try: return int(zipcode)
    except: return np.nan

int_zips = z3.apply(zip_to_int)

valid_zips = int_zips.apply(
    lambda x: x if sum(x==nyc_zip_codes.ZipCode)>=1 else np.nan)

one_hot_zipcodes = pd.get_dummies(valid_zips)
one_hot_zipcodes = one_hot_zipcodes.reindex(columns=nyc_zip_codes.ZipCode, fill_value=0)

unlabeled_cleaned = unlabeled_cleaned.reset_index(drop=True) \
    .merge(one_hot_zipcodes, left_index=True, right_index=True)
unlabeled_cleaned = unlabeled_cleaned.drop(['zipcode'], axis=1)



one_hot_features = [
    'room_type', 
    'neighbourhood_group_cleansed', 
    'bed_type', 
    'host_response_time', 
    'cancellation_policy', 
    'property_type',
    'neighbourhood_cleansed']

unlabeled_cleaned = pd.get_dummies(unlabeled_cleaned, columns=one_hot_features)

# Impute host response time with means
unlabeled_cleaned.host_response_rate = unlabeled_cleaned.host_response_rate \
    .fillna(unlabeled_cleaned.host_response_rate.mean())

# Impute host_since with the mode
unlabeled_cleaned.host_since = unlabeled_cleaned.host_since \
    .fillna(1.427699e+09)

# Impute host_listings_count with mean
unlabeled_cleaned.host_listings_count = unlabeled_cleaned.host_listings_count \
    .fillna(unlabeled_cleaned.host_listings_count.mean())

# Impute bathrooms with mean
unlabeled_cleaned.bathrooms = unlabeled_cleaned.bathrooms \
    .fillna(unlabeled_cleaned.bathrooms.mean())

# Impute bedrooms with mean
unlabeled_cleaned.bedrooms = unlabeled_cleaned.bedrooms \
    .fillna(unlabeled_cleaned.bedrooms.mean())

# Impute beds with mean
unlabeled_cleaned.beds = unlabeled_cleaned.beds \
    .fillna(unlabeled_cleaned.beds.mean())


## The following are missing a significant number of values ## 
## Consider a stronger method of imputation for these ##

# Impute first_review with mode
unlabeled_cleaned.first_review = unlabeled_cleaned.first_review \
    .fillna(unlabeled_cleaned.first_review.mode())

# Impute last_review with mode
unlabeled_cleaned.last_review = unlabeled_cleaned.last_review \
    .fillna(unlabeled_cleaned.last_review.mode())

# Impute review_scores_rating with mean
unlabeled_cleaned.review_scores_rating = unlabeled_cleaned.review_scores_rating \
    .fillna(unlabeled_cleaned.review_scores_rating.mean())

# Impute review_scores_accuracy with mean
unlabeled_cleaned.review_scores_accuracy = unlabeled_cleaned.review_scores_accuracy \
    .fillna(unlabeled_cleaned.review_scores_accuracy.mean())

# Impute review_scores_cleanliness with mean
unlabeled_cleaned.review_scores_cleanliness = unlabeled_cleaned.review_scores_cleanliness \
    .fillna(unlabeled_cleaned.review_scores_cleanliness.mean())

# Impute review_scores_checkin with mean
unlabeled_cleaned.review_scores_checkin = unlabeled_cleaned.review_scores_checkin \
    .fillna(unlabeled_cleaned.review_scores_checkin.mean())

# Impute review_scores_communication with mean
unlabeled_cleaned.review_scores_communication = unlabeled_cleaned.review_scores_communication \
    .fillna(unlabeled_cleaned.review_scores_communication.mean())

# Impute review_scores_location with mean
unlabeled_cleaned.review_scores_location = unlabeled_cleaned.review_scores_location \
    .fillna(unlabeled_cleaned.review_scores_location.mean())

# Impute review_scores_value with mean
unlabeled_cleaned.review_scores_value = unlabeled_cleaned.review_scores_value \
    .fillna(unlabeled_cleaned.review_scores_value.mean())

# Impute reviews_per_month with mean
unlabeled_cleaned.reviews_per_month = unlabeled_cleaned.reviews_per_month \
    .fillna(unlabeled_cleaned.reviews_per_month.mean())

# Impute last_review with the mode
unlabeled_cleaned.last_review = unlabeled_cleaned.last_review \
    .fillna(1.540710e+09)

# Impute first_review with the mode
unlabeled_cleaned.first_review = unlabeled_cleaned.last_review \
    .fillna(1.514794e+09)

# Since almost all of square feet is missing we impute it as a boolean
unlabeled_cleaned.square_feet = ~unlabeled_cleaned.square_feet.isna()

  unlabeled_data = pd.read_csv('test.csv')


nan
nan
nan


In [61]:
model.predict(unlabeled_cleaned)

ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:name: object, summary: object, space: object, description: object, neighborhood_overview: object, notes: object, transit: object, access: object, interaction: object, house_rules: object, host_location: object, host_about: object, amenities: object

In [62]:
unlabeled_cleaned

Unnamed: 0,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,...,neighbourhood_cleansed_West Village,neighbourhood_cleansed_Westchester Square,neighbourhood_cleansed_Westerleigh,neighbourhood_cleansed_Whitestone,neighbourhood_cleansed_Williamsbridge,neighbourhood_cleansed_Williamsburg,neighbourhood_cleansed_Windsor Terrace,neighbourhood_cleansed_Woodhaven,neighbourhood_cleansed_Woodlawn,neighbourhood_cleansed_Woodside
0,Super Lux 2BR in Downtown Manhattan,Prepare to be WOWED! This spectacularly bright...,"Top of the line Wolf and Sub-Zero appliances, ...",Prepare to be WOWED! This spectacularly bright...,,,,,electronic lock ensures 24 hour check-in,,...,False,False,False,False,False,False,False,False,False,False
1,Vintage Eclectic Brownstone Pad in Brooklyn,"Ideal for romantic, creative types, this is an...","Not your typical New York abode, my apartment ...","Ideal for romantic, creative types, this is an...",Bed Stuy is a diverse historic neighborhood wi...,This is an actual unique living experience whe...,Close to buses and subways there is also free ...,"Entrance hallway, living room, bedroom, kitche...",I'm very social but understand the need for my...,Hi there. Thank you so much for taking the tim...,...,False,False,False,False,False,False,False,False,False,False
2,Spacious Harlem Hideaway,"Postive Vibes . This is our Harlem tree house,...",The private room is very spacious and cozy. Th...,"Postive Vibes . This is our Harlem tree house,...",You are in a Cultural Haven full of restaurant...,We also keep cucumber water in the fridge feel...,"Train, uber or a taxi. (Extremely taxi accessi...","Private Room, Kitchen And Bathroom",Very open to communication. If you have any qu...,If you are cooking we request you wash your di...,...,False,False,False,False,False,False,False,False,False,False
3,Spacius private room in Brooklyn,"Newly renovated apartment, its a 3 bedroom apa...","3 bedroom apartment, 1 full bathroom, living r...","Newly renovated apartment, its a 3 bedroom apa...",,,"There is the Mta 3 train Sutter stop, also the...",,,Clean up after yourself.,...,False,False,False,False,False,False,False,False,False,False
4,*Dg) Delightful Private Room 20 min to Manhattan,Hi my home is only 2 blocks from the subway st...,,Hi my home is only 2 blocks from the subway st...,,,,,,"- no illegal downloads, please do not or it wi...",...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17332,Charming room in Brooklyn,Charming newly renovated 2 bed apartment in Be...,,Charming newly renovated 2 bed apartment in Be...,,,"Though street parking is available, space is 2...",,,Please wash used dishes,...,False,False,False,False,False,False,False,False,False,False
17333,Luxurious 1BR in Herald Square,"-Apartment includes: Hardwood Floors, High Cei...",- ONE proper bed that can sleep 2. - ONE plush...,"-Apartment includes: Hardwood Floors, High Cei...",Smack in the middle of Manhattan. Near the Emp...,,,,Available for help during the day/evening.,No pets/No Smoking/No Parties,...,False,False,False,False,False,False,False,False,False,False
17334,"Master Bedrm, Steam Shr/Jacuzzi, FH","Elegant room w/AC, King Bed, Jacuzzi and Steam...",This stately red brick federal on a quiet stre...,"Elegant room w/AC, King Bed, Jacuzzi and Steam...",It's easy to enjoy all that NYC has to offer f...,Guests are expected to respect others' needs f...,A three block walk to the Forest Hills/71st Rd...,Guests have access to the common areas of the ...,One of us will be present during a good portio...,No smoking. No pets. Alcohol only in moderatio...,...,False,False,False,False,False,False,False,False,False,False
17335,Private rooms starting at $67 a night per person.,Family friendly neighborhood. Caribbean settin...,You have a choice of 1 of 4 bedrooms. Room#4 c...,Family friendly neighborhood. Caribbean settin...,The neighborhood has easy access to Manhattan....,Our place can also be available for $249 a nig...,Buses are 1 block away. A 10 minute ride in th...,"Coffee, tea, cocoa and various juices are prov...",Guest can mingle in the living room and share ...,,...,False,False,False,False,False,False,False,False,False,False


In [63]:
df_cleaned

Unnamed: 0,name,summary,space,description,neighborhood_overview,notes,transit,access,interaction,house_rules,...,sent_id,selfie,identity_manual,work_email,google,manual_online,sesame,sesame_offline,zhima_selfie,weibo
0,Modern and Cozy Large Studio in Brooklyn,Modern large studio with new amenities and app...,Our place is a little quiet sanctuary in the h...,Modern large studio with new amenities and app...,"BAM, Barclays, Brooklyn City Point, Fort Green...",,"Subway: 2,3,4,5,A,C,B,Q,G",Washer/Dryer Dishwasher Internet Gym Roof Top ...,"Depending on the time of your visit, I'll be h...","- Please be respectful of our neighbors, no lo...",...,False,False,False,False,False,False,False,False,False,False
1,Royal Harlem TRIPLEX Home 5 Beds,Harlem is back and so gorgeous! Visit and expl...,Harlem is back and so gorgeous! Visit and expl...,Harlem is back and so gorgeous! Visit and expl...,HARLEM is a piece of real NY history overflowi...,HARLEM RESTAURANTS Red Rooster Harlem -- excel...,PUBLIC TRANSPORTATION: Conveniently near all p...,The WHOLE ENTIRE HOUSE,,"Smoking, pets and unaccounted guests NOT permi...",...,False,False,False,False,False,False,False,False,False,False
2,Sunny East Village Studio,"Clean, hip and well designed sun drenched East...",This is a rare East Village studio with it's h...,"Clean, hip and well designed sun drenched East...",East Village is one of the last remaining neig...,,,You'll have access to the entire space - it's ...,"Very responsive via phone call, text or email.",,...,False,False,False,False,False,False,False,False,False,False
3,"Beautiful, airy, light-filled room","Private, spacious, comfortable room in 2-bed f...","Big closet, two big windows, tall ceiling and ...","Private, spacious, comfortable room in 2-bed f...",One block from Morgan L stop. Super cool area....,,,,,,...,False,False,False,False,False,False,False,False,False,False
4,Private Room in Prime Brooklyn Spot,"Comfy, quiet and big private room in a three b...",This big old apartment that we love and take c...,"Comfy, quiet and big private room in a three b...",I absolutely love this neighborhood - right at...,Just a note about the space: The window in you...,Super convenient to almost all subway lines. A...,Your room has a very comfortable queen sized b...,"We are my husband Joaquin and I, our sweet new...",This house is shoes off. Thank you! No guests ...,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,Private Room Amazing View in EV,"Private room in cozy, sun-drenched & plant fil...",,"Private room in cozy, sun-drenched & plant fil...",,,,,I will not be home when guests are here and ro...,,...,False,False,False,True,False,False,False,False,False,False
33534,Floor 35th in the heart of New York,This is a beautiful four bedroom and three bat...,This is a Duplex Apartment (2 floors) Main Flo...,This is a beautiful four bedroom and three bat...,LOCATION LOCATION LOCATION. Walk to Time Squar...,,Apartment is located at 70 W 45th Street (betw...,Access to the unit is through elevators.,A host is available via cell phone for recomme...,No sub leasing. All guest names need to be reg...,...,False,False,False,False,False,False,False,False,False,False
33535,Cozy room for rent in duplex house,,Room for rent in duplex house located in Astor...,Room for rent in duplex house located in Astor...,,,,,,,...,False,False,False,False,False,False,False,False,False,False
33536,"Beautiful 1 bedroom apart, Washington Heights",Very spacious bright beautiful apartment in Wa...,"Very spacious living room, bright kitchen with...",Very spacious bright beautiful apartment in Wa...,Very close to Times Square but a quiet residen...,,Very Close to subway lines A and 1 trains. Les...,"Living room, bedroom, shared kitchen and bathr...",Help will always be available.,- Must leave the room clean and orderly before...,...,False,False,False,False,False,False,False,False,False,False
