In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time as time
import os

from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_regression

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegressionCV, LogisticRegression, ElasticNet

from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, \
GridSearchCV, RandomizedSearchCV, ParameterGrid, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from skopt import BayesSearchCV

from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier

from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier, \
VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, AdaBoostRegressor,AdaBoostClassifier
import xgboost as xgb
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from skopt.plots import plot_objective, plot_histogram, plot_convergence

from sklearn import impute
import ast
import itertools as it
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import pydotplus

# Directory that the later cells read the CSV inputs from and write the
# submission file to (they all use bare file names). The original local path is
# kept as the default so existing runs are unchanged; set the STAT303_DATA_DIR
# environment variable to run the notebook on another machine.
DATA_DIR = os.environ.get(
    'STAT303_DATA_DIR',
    '/Users/kevin/Downloads/Northwestern University/Data Science/STAT_303-3/Prediction Problems/Datasets',
)
os.chdir(DATA_DIR)

## Step 0) Read data

In [2]:
# Load the training and test tables; file names are resolved relative to the
# current working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_regression.csv', 'test_regression.csv')
)

## Step 1) Data pre-processing

### <font color = 'blue'>Pre-processing training data</font>

In [3]:
# Cleaning the columns
#
# `train` is processed in consecutive ten-column slices. Each slice is copied
# up front so that the column assignments below never write into a view of
# `train` (which is what triggers SettingWithCopyWarning).

# Neighbourhoods with fewer listings than this are pooled into 'Other'.
MIN_NEIGHBOURHOOD_COUNT = 107


first_ten = train.iloc[:, :10].copy()

# Removing: ['id', 'host_id', 'host_location', 'host_neighbourhood']
cleaned_ten = first_ten.drop(columns=['id', 'host_id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']
# Rates: strip the '%' sign and rescale by 1/100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
# 't'/'f' flags become 1/0; any other value (including missing) becomes NaN.
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.Timestamp.now() replaces pd.datetime.now(): the `pd.datetime` alias was
# deprecated in pandas 1.0 and removed in 2.0. The value depends on the run date.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = train.iloc[:, 10:20].copy()

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']
# `cleaned_twenty` starts as an alias of the copied slice (as before), so
# `second_ten` also receives the edits made up to the first drop().
cleaned_twenty = second_ten
neighbourhood_counts = cleaned_twenty['neighbourhood_cleansed'].value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < MIN_NEIGHBOURHOOD_COUNT].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')
# host_verifications holds a Python-literal list as text; keep only its length.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])
# Collapse property_type to two levels: 'Room' when the text mentions "room"
# (case-insensitive), otherwise 'Entire property'.
cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



third_ten = train.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text', 'price']

# Half-bath descriptions contain no digits, so map them to "0.5" before the
# numeric extraction below.
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
# First (possibly decimal) number in the text; rows without a number become NaN.
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])
# Remove '$' and thousands separators before casting the price to float.
cleaned_third['price'] = cleaned_third['price'].str.replace('[$,]', '', regex=True).astype(float)



fourth_ten = train.iloc[:, 30:40].copy()

# Removing: ['first_review']
# Converting: ['has_availability']

cleaned_fourth = fourth_ten.drop(columns=['first_review'])
cleaned_fourth['has_availability'] = cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})



fifth_ten = train.iloc[:, 40:50].copy()

# Removing: ['last_review']
# Converting: ['instant_bookable']

cleaned_fifth = fifth_ten.drop(columns=['last_review'])
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})



# Remaining columns are carried over unchanged.
last_four = train.iloc[:, 50:]

#### <font color = blue>Imputing Missing Values</font>

In [4]:
# Combining the cleaned datasets
# The six cleaned slices are stitched back together column-wise.

train_parts = [cleaned_ten, cleaned_twenty, cleaned_third, cleaned_fourth, cleaned_fifth, last_four]
cleaned_train = pd.concat(train_parts, axis='columns')

In [5]:
# Columns handed to the KNN imputer in the cell further below.
# 'host_is_superhost' is mode-filled before that step, so it enters the imputer
# already complete — presumably as an extra signal for the neighbour search;
# verify intent.
columns_with_missing = ['num_bathrooms', 'reviews_per_month', 'host_is_superhost', 
                        'review_scores_rating', 'host_response_rate', 
                        'host_acceptance_rate', 'beds', 'review_scores_communication', 
                        'review_scores_cleanliness', 'review_scores_accuracy', 
                        'review_scores_value', 'review_scores_location', 'review_scores_checkin']

In [6]:
# Computing the missing values of dummy variables using mode
# The filled column is assigned back explicitly. Calling
# fillna(..., inplace=True) on cleaned_train[col] is chained assignment: pandas
# deprecates it, and under copy-on-write it leaves cleaned_train unchanged.

for column in ['host_is_superhost', 'host_response_time']:
    # mode()[0] is the most frequent non-missing value of the column.
    cleaned_train[column] = cleaned_train[column].fillna(cleaned_train[column].mode()[0])

In [7]:
# Computing the missing values of numeric variables using KNN
# Each missing entry is filled from the 10 nearest rows, measured over the
# columns listed in `columns_with_missing`.

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_train_imputed = knn_imputer.fit_transform(cleaned_train[columns_with_missing])
# Carry over cleaned_train's own index: column assignment aligns on index
# labels, so a fresh RangeIndex would silently write NaN back into any row whose
# label does not match (e.g. after rows were filtered upstream).
cleaned_train_imputed_df = pd.DataFrame(
    cleaned_train_imputed,
    columns=columns_with_missing,
    index=cleaned_train.index,
)
cleaned_train[columns_with_missing] = cleaned_train_imputed_df

#### <font color=blue>Creating predictors and the response variables</font>

In [8]:
# Response is the natural log of price; predictors are every remaining column.
y_train = np.log(cleaned_train['price'])
X_train = cleaned_train.drop(columns=['price'])

#### <font color = blue>Transforming predictors</font>

In [9]:
# Predictors that receive a plain np.log(x) transform below. A zero in any of
# these columns would become -inf (the later checkpoint guards against that).
to_be_logged = ['reviews_per_month','accommodates','beds', 'host_total_listings_count', 'minimum_nights', 'maximum_nights']

In [10]:
# Predictors that receive np.log(1 + x) below, so a value of zero maps to 0
# instead of -inf.
to_be_logged_zero = ['number_of_reviews_ltm', 'number_of_reviews', 'num_bathrooms', \
                     'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']

In [11]:
# Replace each listed predictor with its natural log, building all the
# transformed columns in one pass.
X_train = X_train.assign(**{name: np.log(X_train[name]) for name in to_be_logged})

In [12]:
# Replace each listed predictor with log(1 + x), building all the transformed
# columns in one pass.
X_train = X_train.assign(**{name: np.log(1 + X_train[name]) for name in to_be_logged_zero})

#### <font color = blue>One-hot encoding categorical predictors and applying PolynomialFeatures</font>

In [13]:
# One-hot encode the remaining categorical predictors; numeric columns pass
# through get_dummies unchanged.
X_train_onehot = pd.get_dummies(X_train)

In [14]:
# Checkpoint 1
# No missing values may remain in the encoded training predictors. The original
# `raise Error` referenced an undefined name and would have failed with a
# NameError; a ValueError naming the offending columns is raised instead.
null_counts = X_train_onehot.isnull().sum()
if null_counts.any():
    raise ValueError(
        f"Missing values remain in X_train_onehot: {null_counts[null_counts > 0].to_dict()}"
    )

In [15]:
# Checkpoint 2
# The log transforms must not have produced infinite values (np.log(0) is -inf).
# +inf is checked as well, since neither can be consumed by the later
# scaling/model-fitting steps. The original `raise Error` referenced an
# undefined name; a ValueError naming the offending columns is raised instead.
inf_counts = X_train_onehot.isin([np.inf, -np.inf]).sum()
if inf_counts.any():
    raise ValueError(
        f"Infinite values found in X_train_onehot: {inf_counts[inf_counts > 0].to_dict()}"
    )

In [16]:
# Degree-2 polynomial expansion of the encoded training predictors, without the
# constant (bias) column, wrapped back into a frame with generated feature names.
poly_redundant = PolynomialFeatures(degree=2, include_bias=False)
poly_redundant.fit(X_train_onehot)
X_train_redundant_poly = poly_redundant.transform(X_train_onehot)
X_train_redundant_poly_df = pd.DataFrame(
    X_train_redundant_poly,
    columns=poly_redundant.get_feature_names_out(X_train_onehot.columns),
)

### <font color = 'green'>Pre-processing test data</font>

In [17]:
# Cleaning the test data
#
# Mirrors the training-data cleaning, slice by slice. Each slice is copied up
# front so the column assignments below never write into a view of `test`.

# Neighbourhoods with fewer than this many *training* listings are pooled into
# 'Other' (same cut-off as used when cleaning the training data).
MIN_NEIGHBOURHOOD_COUNT = 107

first_ten = test.iloc[:, :10].copy()

# Removing: ['host_location', 'host_neighbourhood']
# ('id' and 'host_id' are kept here; the id is needed for the submission file.)
# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']

cleaned_ten = first_ten.drop(columns=['host_location', 'host_neighbourhood'])

# Rates: strip the '%' sign and rescale by 1/100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.Timestamp.now() replaces pd.datetime.now(): the `pd.datetime` alias was
# deprecated in pandas 1.0 and removed in 2.0. The value depends on the run date.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = test.iloc[:, 10:20].copy()

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']

# `cleaned_twenty` starts as an alias of the copied slice (as before).
cleaned_twenty = second_ten
# The set of neighbourhoods that keep their own category is taken from the
# TRAINING counts. Previously it was re-derived from the test counts with a
# different cut-off (64), which could yield categories that disagree with the
# dummy columns the model was trained on.
neighbourhood_counts = train['neighbourhood_cleansed'].value_counts()
neighbourhoods_to_keep = neighbourhood_counts[neighbourhood_counts >= MIN_NEIGHBOURHOOD_COUNT].index
# Everything else (including neighbourhoods never seen in training) becomes
# 'Other'; missing values are left missing.
is_rare_neighbourhood = (
    cleaned_twenty['neighbourhood_cleansed'].notna()
    & ~cleaned_twenty['neighbourhood_cleansed'].isin(neighbourhoods_to_keep)
)
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].mask(is_rare_neighbourhood, 'Other')
# host_verifications holds a Python-literal list as text; keep only its length.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])
# Collapse property_type to two levels: 'Room' when the text mentions "room"
# (case-insensitive), otherwise 'Entire property'.
cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



third_ten = test.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text']

# Half-bath descriptions contain no digits, so map them to "0.5" before the
# numeric extraction below.
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
# First (possibly decimal) number in the text; rows without a number become NaN.
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])



fourth_ten = test.iloc[:, 30:40].copy()

# Removing: ['first_review', 'last_review']
# Converting: ['has_availability']

cleaned_fourth = fourth_ten.drop(columns=['first_review', 'last_review'])
cleaned_fourth['has_availability'] = cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})



fifth_ten = test.iloc[:, 40:50].copy()

# Converting: ['instant_bookable']

fifth_ten['instant_bookable'] = fifth_ten['instant_bookable'].map({'t': 1, 'f': 0})

# Remaining columns are carried over unchanged.
last_three = test.iloc[:, 50:]

In [18]:
# Combining the test datasets
# The six cleaned slices are stitched back together column-wise.
test_parts = [cleaned_ten, cleaned_twenty, cleaned_third, cleaned_fourth, fifth_ten, last_three]
cleaned_test = pd.concat(test_parts, axis='columns')

In [19]:
# Work on a copy so `cleaned_test` stays available in its un-imputed form.
copy_ct = cleaned_test.copy()

# Imputing missing values

# Mode-fill the two categorical/flag columns. The result is assigned back
# explicitly: fillna(..., inplace=True) on copy_ct[col] is chained assignment,
# which pandas deprecates and which leaves copy_ct unchanged under copy-on-write.
for column in ['host_is_superhost', 'host_response_time']:
    copy_ct[column] = copy_ct[column].fillna(copy_ct[column].mode()[0])

columns_with_missing = ['num_bathrooms', 'reviews_per_month', 'host_is_superhost', 
                        'review_scores_rating', 'host_response_rate', 
                        'host_acceptance_rate', 'beds', 'review_scores_communication', 
                        'review_scores_cleanliness', 'review_scores_accuracy', 
                        'review_scores_value', 'review_scores_location', 'review_scores_checkin', 'number_of_reviews_ltm']

# NOTE(review): this imputer is fitted on the test rows themselves and uses one
# more column ('number_of_reviews_ltm') than the list used for the training
# data; confirm that fitting separately on test is intended.
knn_imputer = impute.KNNImputer(n_neighbors=10)
copy_ct_imputed = knn_imputer.fit_transform(copy_ct[columns_with_missing])
# Carry over copy_ct's own index: column assignment aligns on index labels, so
# a fresh RangeIndex would silently write NaN back into any non-matching row.
copy_ct_imputed_df = pd.DataFrame(copy_ct_imputed, columns=columns_with_missing, index=copy_ct.index)
copy_ct[columns_with_missing] = copy_ct_imputed_df

In [20]:
# Same two column groups as used for the training predictors:
# `to_be_logged` gets np.log(x); `to_be_logged_zero` gets np.log(1 + x).
to_be_logged = ['reviews_per_month','accommodates','beds', 'host_total_listings_count', 'minimum_nights', 'maximum_nights']

to_be_logged_zero = ['number_of_reviews_ltm', 'number_of_reviews', 'num_bathrooms', \
                     'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']

#### <font color = green>Transforming predictors</font>

In [21]:
# Log-transform the test predictors.
#
# np.log(x, where=mask) WITHOUT an explicit `out=` leaves every entry where the
# mask is False uninitialised (arbitrary memory contents), so non-positive
# values previously turned into unpredictable numbers. An explicit output
# buffer makes the result deterministic: non-positive entries map to 0.0
# (i.e. they are treated like a value of 1) and missing values stay NaN.
for column in to_be_logged:
    values = copy_ct[column].to_numpy(dtype=float)
    log_values = np.where(np.isnan(values), np.nan, 0.0)
    np.log(values, out=log_values, where=values > 0)
    copy_ct[column] = log_values

# log(1 + x) is finite at zero, so these columns need no masking.
for column in to_be_logged_zero:
    copy_ct[column] = np.log(1 + copy_ct[column])

#### <font color = green>One-hot encoding categorical predictors and applying PolynomialFeatures</font>

In [22]:
# One-hot encode the categorical test predictors, then drop the two leading
# columns by position (presumably the 'id'/'host_id' columns that the test
# cleaning step keeps — verify column order).
X_test_redundant = pd.get_dummies(copy_ct).iloc[:, 2:]

In [23]:
# Degree-2 polynomial expansion of the encoded test predictors (no bias column),
# fitted and applied in a single call.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_test_poly = poly.fit_transform(X_test_redundant)

In [24]:
# Wrap the expanded test matrix in a DataFrame with generated feature names so
# columns can later be selected by name.
X_test_non_scaled_poly_df = pd.DataFrame(X_test_poly, columns = poly.get_feature_names_out(X_test_redundant.columns))

## Step 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

I made 30 attempts at tuning the model hyperparameters before I was able to reach a score of under 105 on Kaggle; however, in the end, it was a simple untuned CatBoost model that got me below the threshold. During my 30 attempts, I tried a variety of boosting models, as well as several combinations of models ensembled using `StackingRegressor`. However, none of these models or ensembles were able to reach a score of under 105. The critical change that I made to achieve my score was to improve my variable selection process by log-transforming predictors during the data pre-processing stage. Once I had done this, I was able to achieve my lowest score at the time by simply using an untuned CatBoost model. To get the model under the 105 threshold, I used trial-and-error to determine an appropriate constant factor (the "slope") by which to multiply the predictions made by the untuned CatBoost model.

### Which tuning method did you use (grid search / Bayes search / etc.)?

As I was tuning the boosting models, I used the `RandomizedSearchCV()` search method because of its computational benefits.

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

The main challenge that I faced when tuning the hyperparameters was the uncertainty of whether my actions were actually beneficial. For example, tuning my CatBoost model sometimes decreased model performance, which was extremely frustrating since the tuning process took a long time to complete and I had maintained the belief that tuning would result in a better performance. As a result, I often found myself uncertain of what the best courses of action were. Eventually, this uncertainty reached a point where I ran out of ideas to pursue and decided to review the pre-processing portion of the prediction problem. This is when I realized that I was not log-transforming any of the predictors, which could have been leading to the selection of a poor set of predictors, thus affecting the performance of my boosting models. After visualizing the distribution of the predictors and log-transforming the ones that seemed skewed, I developed an untuned CatBoost model, which performed the best out of all of my attempts and gave me the encouragement that I needed to reach the 105 threshold.

### How many hours did you spend on hyperparameter tuning?

I spent over 70 hours on hyperparameter tuning for the various boosting models that I tried out. From Wednesday, May 29th, through Sunday, June 2nd, my computer was constantly tuning boosting models. I found a method to leave my computer running both overnight and when I was away from my computer in order to finish searching over particularly large grids. By contrast, the hyperparameter tuning process for variable selection using Lasso required less than a minute to complete. In addition, the final model that I used to reach the 105 threshold was an untuned CatBoost model, which took about one minute to code and train.

### Hyperparameter Tuning Code

#### <font color = red>Variable selection using Lasso</font>

In [25]:
# Standardise the polynomial training features before Lasso.
scaler = StandardScaler().fit(X_train_redundant_poly_df)
X_train_scaled = scaler.transform(X_train_redundant_poly_df)

In [26]:
# 5-fold cross-validated Lasso over 30 log-spaced penalties from 1e-1 down to
# 1e-3; the wall-clock time of the search is reported in minutes.
start_time = time.time()
alphas = np.logspace(-1, -3, num=30)
lassocv = LassoCV(alphas=alphas, cv=5, max_iter=1000)
lassocv.fit(X_train_scaled, y_train)
elapsed_minutes = (time.time() - start_time) / 60
print("Time taken = ", np.round(elapsed_minutes, 2), "minutes")

Time taken =  0.45 minutes


In [27]:
# Refit a Lasso at the cross-validated penalty and map feature name -> coefficient.
lasso = Lasso(alpha = lassocv.alpha_)
lasso.fit(X_train_scaled, y_train)
# The feature names are computed once; the original loop regenerated the whole
# name array on every iteration.
feature_names = poly_redundant.get_feature_names_out()
coefficients = dict(zip(feature_names, lasso.coef_))

In [28]:
# Keep only the features whose Lasso coefficient is not exactly zero.
coefficients = pd.Series(coefficients)
non_zero_coefficients = coefficients.loc[coefficients.ne(0)]

#### Optimal hyperparameter values

In [29]:
# Report the penalty selected by the cross-validated Lasso search.
print('The optimal value of the lasso regularization hyperparameter is:', lassocv.alpha_)

The optimal value of the lasso regularization hyperparameter is: 0.0016102620275609393


## Step 3) Developing the model

In [30]:
# Default-hyperparameter CatBoost fitted on the unscaled polynomial features
# that Lasso kept; the target is log(price).
catboost_without_tuning = CatBoostRegressor(random_state=1, verbose=False)
catboost_without_tuning.fit(
    X_train_redundant_poly_df.loc[:, non_zero_coefficients.index],
    y_train,
)

## Step 4) Ad-hoc steps for further improving model accuracy

In [31]:
# Predict log(price) on the Lasso-selected test features, then exponentiate to
# return to the price scale.
test_features_selected = X_test_non_scaled_poly_df.loc[:, non_zero_coefficients.index]
log_price_pred = catboost_without_tuning.predict(test_features_selected)
untuned_pred = np.exp(log_price_pred)

In [32]:
# Scaling up the predictions
# The 1.1 multiplier is an ad-hoc factor; per the Step 2 write-up it was chosen
# by trial and error against the Kaggle score, not estimated from the data.
final_pred = 1.1 * untuned_pred

## Step 5) Exporting the predictions in the format required to submit on Kaggle

In [33]:
# Place the predictions right after the first column of copy_ct and keep just
# those two columns for the submission.
# DataFrame.insert raises ValueError when the column already exists, so
# re-running this cell used to fail; on a re-run the existing column is
# overwritten in place instead.
if "predicted" in copy_ct.columns:
    copy_ct["predicted"] = final_pred
else:
    copy_ct.insert(1, "predicted", final_pred)
to_submit = copy_ct.iloc[:, :2]

In [34]:
# Write the two-column submission file to the current working directory,
# without the row index.
to_submit.to_csv('Ensembling - Completed.csv', index=False)  