In [1]:
import numpy as np
import pandas as pd 
import re
from datetime import datetime

import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('train.csv')

  df = pd.read_csv('train.csv')


## Cleaning

### Purge

In [3]:
df_cleaned = df.drop([
    'id', 
    'host_name', 
    'country', 
    'country_code', 
    'state', 
    'market', 
    'city', 
    'host_neighbourhood', 
    'host_acceptance_rate', 
    'experiences_offered'], axis=1)

### Format

In [4]:
# Clean and format dates
date_features = ['host_since', 'first_review', 'last_review']
for i in range(len(date_features)):
    df_cleaned[date_features[i]] = df_cleaned[date_features[i]].apply(lambda x: \
        datetime.strptime(x, '%Y-%m-%d').timestamp() \
        if x is not np.nan else x)


# Clean bool features
bool_features = [
    'require_guest_phone_verification', 
    'require_guest_profile_picture', 
    'is_business_travel_ready',
    'instant_bookable',
    'host_identity_verified',
    'host_has_profile_pic',
    'host_is_superhost']

for i in range(len(bool_features)):
    df_cleaned[bool_features[i]] = df_cleaned[bool_features[0]] \
        .apply(lambda x: x=='t' or x==True if x is not np.nan else x)
    

# Converting string features containing numeric information to numeric dtypes.
# Converts host_response_rate from strings representing percentages to proportions.
df_cleaned['host_response_rate'] = df_cleaned['host_response_rate'].apply(
    lambda x: float(x.replace('%',''))/100 if x is not np.nan else x)
# Converts extra_people from strings representing prices to floats.
df_cleaned['extra_people'] = df_cleaned['extra_people'].apply(
    lambda x: float(x.replace('$','')) if x is not np.nan else x)


# Removes most doubles from zipcode by correcting formatting 
df_cleaned.zipcode = df_cleaned.zipcode.apply(
    lambda x: str(x).replace('.0', ''))


### Encode

In [5]:
# One-hot encodes the host verifications

# Find unique verifications in host_verifications feature
unique_host_verifications = pd.Series(
    ', '.join(list(df.host_verifications.apply(
        lambda x: x.replace(']', '').replace('[', '').replace("'", '') \
            if x is not np.nan else '').unique())).split(','))\
    .apply(lambda x: x.replace(' ', ''))\
    .unique()
unique_host_verifications = np.delete(unique_host_verifications, 17)


verified_df = {v:[] for v in unique_host_verifications}
# Helper function to check if a verification is in the strings in host_verifications
def word_in_string(word, string):
    pattern = r'\b{}\b'.format(re.escape(word))  # \b is a word boundary
    match = re.search(pattern, string)
    return bool(match)
# For each string in host_verifications, extracts all the verif_strings
for verif_string in df_cleaned.host_verifications:
    if type(verif_string) != str:
        print(verif_string)
        for v in unique_host_verifications:
            verified_df[v].append(False)
        continue
    for v in unique_host_verifications:
        verified_df[v].append(word_in_string(v, verif_string))

verified_df = pd.DataFrame(verified_df)

df_cleaned = df_cleaned.reset_index(drop=True) \
    .merge(verified_df, left_index=True, right_index=True)
df_cleaned = df_cleaned.drop(['host_verifications'], axis=1)

nan
nan
nan
nan
nan


In [6]:
nyc_zip_codes = pd.read_csv('nyc-zip-codes.csv')

z1 = df_cleaned.zipcode.apply(lambda x: str(x).replace('.0', ''))

z2 = z1.apply(lambda x: x if '-' not in x else x.split('-')[0])

z3 = z2.apply(lambda x: x if '\n' not in x else x.split('\n')[0])

def zip_to_int(zipcode):
    try: return int(zipcode)
    except: return np.nan

int_zips = z3.apply(zip_to_int)

valid_zips = int_zips.apply(
    lambda x: x if sum(x==nyc_zip_codes.ZipCode)>=1 else np.nan)

one_hot_zipcodes = pd.get_dummies(valid_zips)
one_hot_zipcodes = one_hot_zipcodes.reindex(columns=nyc_zip_codes.ZipCode, fill_value=0)

df_cleaned = df_cleaned.reset_index(drop=True) \
    .merge(one_hot_zipcodes, left_index=True, right_index=True)
df_cleaned = df_cleaned.drop(['zipcode'], axis=1)

In [7]:
one_hot_features = [
    'room_type', 
    'neighbourhood_group_cleansed', 
    'bed_type', 
    'host_response_time', 
    'cancellation_policy', 
    'property_type',
    'neighbourhood_cleansed']

df_cleaned = pd.get_dummies(df_cleaned, columns=one_hot_features)

### Imputation of Missing Values

In [8]:
# Impute host response time with means
df_cleaned.host_response_rate = df_cleaned.host_response_rate \
    .fillna(df_cleaned.host_response_rate.mean())

# Impute host_since with the mode
df_cleaned.host_since = df_cleaned.host_since \
    .fillna(1.427699e+09)

# Impute host_listings_count with mean
df_cleaned.host_listings_count = df_cleaned.host_listings_count \
    .fillna(df_cleaned.host_listings_count.mean())

# Impute bathrooms with mean
df_cleaned.bathrooms = df_cleaned.bathrooms \
    .fillna(df_cleaned.bathrooms.mean())

# Impute bedrooms with mean
df_cleaned.bedrooms = df_cleaned.bedrooms \
    .fillna(df_cleaned.bedrooms.mean())

# Impute beds with mean
df_cleaned.beds = df_cleaned.beds \
    .fillna(df_cleaned.beds.mean())


## The following are missing a significant number of values ## 
## Consider a stronger method of imputation for these ##

# Impute first_review with mode
df_cleaned.first_review = df_cleaned.first_review \
    .fillna(df_cleaned.first_review.mode())

# Impute last_review with mode
df_cleaned.last_review = df_cleaned.last_review \
    .fillna(df_cleaned.last_review.mode())

# Impute review_scores_rating with mean
df_cleaned.review_scores_rating = df_cleaned.review_scores_rating \
    .fillna(df_cleaned.review_scores_rating.mean())

# Impute review_scores_accuracy with mean
df_cleaned.review_scores_accuracy = df_cleaned.review_scores_accuracy \
    .fillna(df_cleaned.review_scores_accuracy.mean())

# Impute review_scores_cleanliness with mean
df_cleaned.review_scores_cleanliness = df_cleaned.review_scores_cleanliness \
    .fillna(df_cleaned.review_scores_cleanliness.mean())

# Impute review_scores_checkin with mean
df_cleaned.review_scores_checkin = df_cleaned.review_scores_checkin \
    .fillna(df_cleaned.review_scores_checkin.mean())

# Impute review_scores_communication with mean
df_cleaned.review_scores_communication = df_cleaned.review_scores_communication \
    .fillna(df_cleaned.review_scores_communication.mean())

# Impute review_scores_location with mean
df_cleaned.review_scores_location = df_cleaned.review_scores_location \
    .fillna(df_cleaned.review_scores_location.mean())

# Impute review_scores_value with mean
df_cleaned.review_scores_value = df_cleaned.review_scores_value \
    .fillna(df_cleaned.review_scores_value.mean())

# Impute reviews_per_month with mean
df_cleaned.reviews_per_month = df_cleaned.reviews_per_month \
    .fillna(df_cleaned.reviews_per_month.mean())

# Impute last_review with the mode
df_cleaned.last_review = df_cleaned.last_review \
    .fillna(1.540710e+09)

# Impute first_review with the mode
df_cleaned.first_review = df_cleaned.last_review \
    .fillna(1.514794e+09)

# Since almost all of square feet is missing we impute it as a boolean
df_cleaned.square_feet = ~df_cleaned.square_feet.isna()

## Model

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
cols_to_exclude = [
    'name',
    'summary',
    'space',
    'description',
    'neighborhood_overview',
    'notes',
    'transit',
    'access',
    'interaction',
    'house_rules',
    'host_location',
    'host_about',
    'amenities',
    'host_id']

In [13]:
model_df = df_cleaned.drop(cols_to_exclude, axis=1)
model_df = model_df[model_df.columns[0:]]

X = model_df.drop(['price'], axis=1)
y = model_df.price

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shapes of the training and testing sets
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (26830, 501)
X_test shape: (6708, 501)
y_train shape: (26830,)
y_test shape: (6708,)


In [14]:
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error

# Create an XGBoost regressor (or classifier based on your problem)
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the training data
y_train_pred = model.predict(X_train)

# Calculate the training RMSE (Root Mean Squared Error)
train_rmse = root_mean_squared_error(y_train, y_train_pred)
print("Training RMSE:", train_rmse)

# Optionally, make predictions on the testing data
y_test_pred = model.predict(X_test)

# Calculate the testing RMSE (Root Mean Squared Error)
test_rmse = root_mean_squared_error(y_test, y_test_pred)
print("Testing RMSE:", test_rmse)

Training RMSE: 57.50920596740549
Testing RMSE: 89.6666716547086


In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

# Create an XGBoost regressor
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Define the grid of hyperparameters to search
param_grid = {
    'learning_rate': [0.1, 0.01],
    'max_depth': [3, 5, 7],
    'n_estimators': [50, 100, 200]
}

# Perform Grid Search with 5-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=kf, verbose=1, n_jobs=-1)
grid_result = grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_result.best_params_
best_score = grid_result.best_score_

print("Best Parameters:", best_params)
print("Best Score (RMSE):", (-best_score)**0.5)

# Train the model with the best parameters
best_model = xgb.XGBRegressor(objective='reg:squarederror', **best_params)
best_model.fit(X_train, y_train)

# Make predictions on the testing data using the best model
y_test_pred = best_model.predict(X_test)

# Calculate the testing RMSE with the best model
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)
print("Testing RMSE with Best Model:", test_rmse)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


Best Parameters: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 200}
Best Score (RMSE): 86.02238819820607
Testing RMSE with Best Model: 88.28455816518493




### Predict Unlabled Dataset

#### Clean Unlabled Data

In [15]:
unlabeled_data = pd.read_csv('test.csv')

unlabeled_cleaned = unlabeled_data.drop([
    'id', 
    'host_name', 
    'country', 
    'country_code', 
    'state', 
    'market', 
    'city', 
    'host_neighbourhood', 
    'host_acceptance_rate', 
    'experiences_offered'], axis=1)


# Clean and format dates
date_features = ['host_since', 'first_review', 'last_review']
for i in range(len(date_features)):
    unlabeled_cleaned[date_features[i]] = unlabeled_cleaned[date_features[i]].apply(lambda x: \
        datetime.strptime(x, '%Y-%m-%d').timestamp() \
        if x is not np.nan else x)


# Clean bool features
bool_features = [
    'require_guest_phone_verification', 
    'require_guest_profile_picture', 
    'is_business_travel_ready',
    'instant_bookable',
    'host_identity_verified',
    'host_has_profile_pic',
    'host_is_superhost']

for i in range(len(bool_features)):
    unlabeled_cleaned[bool_features[i]] = unlabeled_cleaned[bool_features[0]] \
        .apply(lambda x: x=='t' or x==True if x is not np.nan else x)
    

# Converting string features containing numeric information to numeric dtypes.
# Converts host_response_rate from strings representing percentages to proportions.
unlabeled_cleaned['host_response_rate'] = unlabeled_cleaned['host_response_rate'].apply(
    lambda x: float(x.replace('%',''))/100 if x is not np.nan else x)
# Converts extra_people from strings representing prices to floats.
unlabeled_cleaned['extra_people'] = unlabeled_cleaned['extra_people'].apply(
    lambda x: float(x.replace('$','')) if x is not np.nan else x)


# Removes most doubles from zipcode by correcting formatting 
unlabeled_cleaned.zipcode = unlabeled_cleaned.zipcode.apply(
    lambda x: str(x).replace('.0', ''))

# One-hot encodes the host verifications

# Find unique verifications in host_verifications feature
unique_host_verifications = pd.Series(
    ', '.join(list(df.host_verifications.apply(
        lambda x: x.replace(']', '').replace('[', '').replace("'", '') \
            if x is not np.nan else '').unique())).split(','))\
    .apply(lambda x: x.replace(' ', ''))\
    .unique()
unique_host_verifications = np.delete(unique_host_verifications, 17)


verified_df = {v:[] for v in unique_host_verifications}
# Helper function to check if a verification is in the strings in host_verifications
def word_in_string(word, string):
    pattern = r'\b{}\b'.format(re.escape(word))  # \b is a word boundary
    match = re.search(pattern, string)
    return bool(match)
# For each string in host_verifications, extracts all the verif_strings
for verif_string in unlabeled_cleaned.host_verifications:
    if type(verif_string) != str:
        print(verif_string)
        for v in unique_host_verifications:
            verified_df[v].append(False)
        continue
    for v in unique_host_verifications:
        verified_df[v].append(word_in_string(v, verif_string))

verified_df = pd.DataFrame(verified_df)

unlabeled_cleaned = unlabeled_cleaned.reset_index(drop=True) \
    .merge(verified_df, left_index=True, right_index=True)
unlabeled_cleaned = unlabeled_cleaned.drop(['host_verifications'], axis=1)



nyc_zip_codes = pd.read_csv('nyc-zip-codes.csv')

z1 = unlabeled_cleaned.zipcode.apply(lambda x: str(x).replace('.0', ''))

z2 = z1.apply(lambda x: x if '-' not in x else x.split('-')[0])

z3 = z2.apply(lambda x: x if '\n' not in x else x.split('\n')[0])

def zip_to_int(zipcode):
    try: return int(zipcode)
    except: return np.nan

int_zips = z3.apply(zip_to_int)

valid_zips = int_zips.apply(
    lambda x: x if sum(x==nyc_zip_codes.ZipCode)>=1 else np.nan)

one_hot_zipcodes = pd.get_dummies(valid_zips)
one_hot_zipcodes = one_hot_zipcodes.reindex(columns=nyc_zip_codes.ZipCode, fill_value=0)

unlabeled_cleaned = unlabeled_cleaned.reset_index(drop=True) \
    .merge(one_hot_zipcodes, left_index=True, right_index=True)
unlabeled_cleaned = unlabeled_cleaned.drop(['zipcode'], axis=1)



one_hot_features = [
    'room_type', 
    'neighbourhood_group_cleansed', 
    'bed_type', 
    'host_response_time', 
    'cancellation_policy', 
    'property_type',
    'neighbourhood_cleansed']

unlabeled_cleaned = pd.get_dummies(unlabeled_cleaned, columns=one_hot_features)
unlabeled_cleaned = unlabeled_cleaned.reindex(columns=X.columns, fill_value=False)




# Impute host response time with means
unlabeled_cleaned.host_response_rate = unlabeled_cleaned.host_response_rate \
    .fillna(unlabeled_cleaned.host_response_rate.mean())

# Impute host_since with the mode
unlabeled_cleaned.host_since = unlabeled_cleaned.host_since \
    .fillna(1.427699e+09)

# Impute host_listings_count with mean
unlabeled_cleaned.host_listings_count = unlabeled_cleaned.host_listings_count \
    .fillna(unlabeled_cleaned.host_listings_count.mean())

# Impute bathrooms with mean
unlabeled_cleaned.bathrooms = unlabeled_cleaned.bathrooms \
    .fillna(unlabeled_cleaned.bathrooms.mean())

# Impute bedrooms with mean
unlabeled_cleaned.bedrooms = unlabeled_cleaned.bedrooms \
    .fillna(unlabeled_cleaned.bedrooms.mean())

# Impute beds with mean
unlabeled_cleaned.beds = unlabeled_cleaned.beds \
    .fillna(unlabeled_cleaned.beds.mean())


## The following are missing a significant number of values ## 
## Consider a stronger method of imputation for these ##

# Impute first_review with mode
unlabeled_cleaned.first_review = unlabeled_cleaned.first_review \
    .fillna(unlabeled_cleaned.first_review.mode())

# Impute last_review with mode
unlabeled_cleaned.last_review = unlabeled_cleaned.last_review \
    .fillna(unlabeled_cleaned.last_review.mode())

# Impute review_scores_rating with mean
unlabeled_cleaned.review_scores_rating = unlabeled_cleaned.review_scores_rating \
    .fillna(unlabeled_cleaned.review_scores_rating.mean())

# Impute review_scores_accuracy with mean
unlabeled_cleaned.review_scores_accuracy = unlabeled_cleaned.review_scores_accuracy \
    .fillna(unlabeled_cleaned.review_scores_accuracy.mean())

# Impute review_scores_cleanliness with mean
unlabeled_cleaned.review_scores_cleanliness = unlabeled_cleaned.review_scores_cleanliness \
    .fillna(unlabeled_cleaned.review_scores_cleanliness.mean())

# Impute review_scores_checkin with mean
unlabeled_cleaned.review_scores_checkin = unlabeled_cleaned.review_scores_checkin \
    .fillna(unlabeled_cleaned.review_scores_checkin.mean())

# Impute review_scores_communication with mean
unlabeled_cleaned.review_scores_communication = unlabeled_cleaned.review_scores_communication \
    .fillna(unlabeled_cleaned.review_scores_communication.mean())

# Impute review_scores_location with mean
unlabeled_cleaned.review_scores_location = unlabeled_cleaned.review_scores_location \
    .fillna(unlabeled_cleaned.review_scores_location.mean())

# Impute review_scores_value with mean
unlabeled_cleaned.review_scores_value = unlabeled_cleaned.review_scores_value \
    .fillna(unlabeled_cleaned.review_scores_value.mean())

# Impute reviews_per_month with mean
unlabeled_cleaned.reviews_per_month = unlabeled_cleaned.reviews_per_month \
    .fillna(unlabeled_cleaned.reviews_per_month.mean())

# Impute last_review with the mode
unlabeled_cleaned.last_review = unlabeled_cleaned.last_review \
    .fillna(1.540710e+09)

# Impute first_review with the mode
unlabeled_cleaned.first_review = unlabeled_cleaned.last_review \
    .fillna(1.514794e+09)

# Since almost all of square feet is missing we impute it as a boolean
unlabeled_cleaned.square_feet = ~unlabeled_cleaned.square_feet.isna()

  unlabeled_data = pd.read_csv('test.csv')


nan
nan
nan


#### Run Predictions

In [16]:
unlabled_predictions = pd.DataFrame().assign(
    Id = unlabeled_data.id,
    Predicted = best_model.predict(unlabeled_cleaned)
)

NameError: name 'best_model' is not defined

In [141]:
unlabled_predictions.to_csv('submission.csv', index=False)