In [321]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time as time
import os

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegressionCV, LogisticRegression

from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, \
GridSearchCV, RandomizedSearchCV, ParameterGrid, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from skopt import BayesSearchCV

from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier

from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier, \
VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, AdaBoostRegressor,AdaBoostClassifier
import xgboost as xgb
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error

from sklearn import impute
import ast
import itertools as it
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import pydotplus

os.getcwd()
os.chdir('/Users/kevin/Downloads/Northwestern University/Data Science/STAT_303-3/Prediction Problems/Datasets')

## Step 0) Reading the data

In [359]:
train = pd.read_csv('train_classification.csv')
test = pd.read_csv('test_classification.csv')

## Step 1) Data pre-processing

### <font color = 'red'>Pre-processing training data</font>

In [360]:
first_ten = train.iloc[:, :10]

# Removing: ['id', 'host_location', 'host_neighbourhood']
cleaned_ten = first_ten.drop(columns=['id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
cleaned_ten['days_since_host'] = (pd.datetime.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = train.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']
cleaned_twenty = second_ten
neighbourhood_counts = cleaned_twenty.neighbourhood_cleansed.value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 107].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])
cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



third_ten = train.iloc[:, 20:30]

# Converting: ['bathrooms_text', 'price']
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])



fourth_ten = train.iloc[:, 30:40]

# Removing: ['first_review', 'last_review']
cleaned_fourth = fourth_ten.drop(columns=['first_review'])
cleaned_fourth = cleaned_fourth.drop(columns=['last_review'])

# Converting: ['has_availability']
cleaned_fourth['has_availability'] = cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})



fifth_ten = train.iloc[:, 40:50]
fifth_ten

# Converting: ['instant_bookable']
cleaned_fifth = fifth_ten
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})


last_three = train.iloc[:, 50:]

In [361]:
# Combining the cleaned datasets
cleaned_train = pd.concat([cleaned_ten, cleaned_twenty, cleaned_third, cleaned_fourth, cleaned_fifth, last_three], axis=1)

In [362]:
# Imputing missing values

columns_with_missing = ['num_bathrooms', 'reviews_per_month', 'host_is_superhost', 
                        'review_scores_rating', 'host_response_rate', 
                        'host_acceptance_rate', 'beds', 'review_scores_communication', 
                        'review_scores_cleanliness', 'review_scores_accuracy', 
                        'review_scores_value', 'review_scores_location', 'review_scores_checkin']

cleaned_train['reviews_per_month'].fillna(cleaned_train['reviews_per_month'].mode()[0], inplace=True)
cleaned_train['host_response_time'].fillna(cleaned_train['host_response_time'].mode()[0], inplace=True)

# Computing the missing values of numeric variables using KNN

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_train_imputed = knn_imputer.fit_transform(cleaned_train[columns_with_missing])
cleaned_train_imputed_df = pd.DataFrame(cleaned_train_imputed, columns=columns_with_missing)
cleaned_train[columns_with_missing] = cleaned_train_imputed_df

In [363]:
y_train = cleaned_train.host_is_superhost
X_train = cleaned_train.drop("host_is_superhost", axis = 1).iloc[:, 1:]

In [364]:
to_be_logged = ['reviews_per_month','accommodates','beds', 'host_total_listings_count', 'minimum_nights', 'maximum_nights']

In [365]:
to_be_logged_zero = ['number_of_reviews_ltm', 'number_of_reviews', 'num_bathrooms', \
                     'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']

In [366]:
for column in to_be_logged:
    X_train[column] = np.log(X_train[column])

In [367]:
for column in to_be_logged_zero:
    X_train[column] = np.log(1 + X_train[column])

In [368]:
X_train_onehot = pd.get_dummies(X_train)

In [369]:
# Checkpoint 1
for x in list(X_train_onehot.isnull().sum().sort_values().values):
    if x != 0:
        raise Error

In [370]:
# Checkpoint 2
for x in list((X_train_onehot == -np.inf).sum().sort_values().values):
    if x != 0:
        raise Error

#### PolynomialFeatures

In [371]:
poly_redundant = PolynomialFeatures(2, include_bias = False)
X_train_redundant_poly = poly_redundant.fit_transform(X_train_onehot)
X_train_redundant_poly_df = pd.DataFrame(X_train_redundant_poly, columns = poly_redundant.get_feature_names_out(X_train_onehot.columns))

### <font color = 'red'>Pre-processing test data</font>

In [372]:
first_ten = test.iloc[:, :10]

# Removing: ['id', 'host_id', 'host_since', 'host_location', 'host_response_time', 'host_neighbourhood']
cleaned_ten = first_ten.drop(columns=['host_id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost']
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
cleaned_ten['days_since_host'] = (pd.datetime.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = test.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_verifications', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']
cleaned_twenty = second_ten
neighbourhood_counts = cleaned_twenty.neighbourhood_cleansed.value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 64].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = second_ten.drop(columns=['host_verifications'])
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])
cleaned_twenty['bathrooms_text'] = cleaned_twenty['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
cleaned_twenty['num_bathrooms'] = cleaned_twenty['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_twenty = cleaned_twenty.drop(columns=['bathrooms_text'])

cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])




third_ten = test.iloc[:, 20:30]

# Converting: ['has_availability']
third_ten['has_availability'] = third_ten['has_availability'].map({'t': 1, 'f': 0})




fourth_ten = test.iloc[:, 30:40]

# Removing: ['first_review', 'last_review']
# Converting: []
cleaned_fourth = fourth_ten.drop(columns=['first_review', 'last_review'])



fifth_ten = test.iloc[:, 40:50]
# Converting: ['instant_bookable']
cleaned_fifth = fifth_ten
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})



last_three = test.iloc[:, 50:]

In [373]:
# Combining test data
cleaned_test = pd.concat([cleaned_ten, cleaned_twenty, third_ten, cleaned_fourth, cleaned_fifth, last_three], axis=1)

In [374]:
# Imputing missing values

columns_with_missing = ['num_bathrooms', 'reviews_per_month', 
                        'review_scores_rating', 'host_response_rate', 
                        'host_acceptance_rate', 'beds', 'review_scores_communication', 
                        'review_scores_cleanliness', 'review_scores_accuracy', 
                        'review_scores_value', 'review_scores_location', 'review_scores_checkin', 'number_of_reviews_ltm']

#cleaned_test['reviews_per_month'].fillna(cleaned_test['reviews_per_month'].mode()[0], inplace=True)
cleaned_test['host_response_time'].fillna(cleaned_test['host_response_time'].mode()[0], inplace=True)

# Computing the missing values of numeric variables using KNN

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_test_imputed = knn_imputer.fit_transform(cleaned_test[columns_with_missing])
cleaned_test_imputed_df = pd.DataFrame(cleaned_test_imputed, columns=columns_with_missing)
cleaned_test[columns_with_missing] = cleaned_test_imputed_df

In [375]:
to_be_logged = ['reviews_per_month','accommodates','beds', 'host_total_listings_count', 'minimum_nights', 'maximum_nights']

to_be_logged_zero = ['number_of_reviews_ltm', 'number_of_reviews', 'num_bathrooms', \
                     'calculated_host_listings_count_private_rooms','calculated_host_listings_count_shared_rooms']

In [376]:
for column in to_be_logged:
    cleaned_test[column] = np.log(cleaned_test[column], where = cleaned_test[column] > 0)
    
for column in to_be_logged_zero:
    cleaned_test[column] = np.log(1 + cleaned_test[column])

In [377]:
X_test_redundant = cleaned_test
X_test_redundant = pd.get_dummies(X_test_redundant)
X_test_redundant = X_test_redundant.iloc[:, 1:]

#### PolynomialFeatures

In [378]:
poly_test = PolynomialFeatures(2, include_bias = False)
poly_test.fit(X_test_redundant)
X_test_poly = poly_test.transform(X_test_redundant)

In [379]:
X_test_non_scaled_poly_df = pd.DataFrame(X_test_poly, columns = poly_test.get_feature_names_out(X_test_redundant.columns))

In [380]:
reversed_columns = ['beds num_bathrooms']

for column in reversed_columns:
    predictors = column.split(' ')
    old_column = predictors[1] + ' ' + predictors[0]
    X_test_non_scaled_poly_df[column] = X_test_non_scaled_poly_df[old_column]

## Step 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

It took me 28 attempts to tune the model hyperparameters. During these attempts, I tried tuning a variety of different models, including bagging, random forest, and boosting, as well as multiple different ensembles of the models using both soft and hard voting. In the end, I was able to reach the 93.5% threshold using a soft voting ensemble with an untuned CatBoost, a tuned XGBoost, and a tuned GradientBoost as the base models.

### Which tuning method did you use (grid search / Bayes search / etc.)?

I used the `RandomizedSearchCV` tuning method to tune the GradientBoost model. For the XGBoost model, I used the `GridSearchCV` tuning method.

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

The main challenge that I faced was that no matter how much effort I put into the tuning process, the classification score on Kaggle would plateau at 93.2%. This was after I had undergone several tuning rounds for multiple different boosting models. In order to address this challenge, I decided to review the entirety of my code to determine if there was another aspect of the prediction problem that I was doing wrong. As I reviewed the pre-processing stage, I realized that I did not log-transform any of the predictors, despite the fact that some were skewed. Once I corrected this oversight, I retrained the boosting models on the newly selected predictors. Even though this did not immediately push me over the threshold, by using trial-and-error with the `VotingClassifier` function, I was able to find a combination of base boosting models that got me over the threshold.

### How many hours did you spend on hyperparameter tuning?

In total, I spent about 3 days on hyperparameter tuning. Some of my earlier attempts at tuning boosting models took more than 3 hours to finish, so I would leave them running overnight. However, the tuning process for the base models that helped me reach the threshold were: 2 hours and 40 minutes for the GradientBoost model and 22 minutes for the XGBoost model. In addition, the hyperparameter tuning process for variable selection using Lasso took about 15 minutes to finish. 

### Hyperparameter tuning code

### <font color=blue> Variable Selection with Lasso</font>

In [259]:
scaler = StandardScaler()

scaler.fit(X_train_redundant_poly_df)
X_train_scaled = scaler.transform(X_train_redundant_poly_df)

In [382]:
start_time = time.time()

Cs = np.logspace(-1,-3,30)
model_cv = LogisticRegressionCV(Cs = Cs, cv=5, penalty='l1', solver = 'saga', random_state=1)
model_cv.fit(X_train_scaled, y_train)

print("Time taken = ", round((time.time()-start_time)/60), " minutes")

Time taken =  14  minutes


In [383]:
coefficients = {}
for i in range(len(model_cv.coef_[0])):
    coefficients[poly_redundant.get_feature_names_out()[i]] = model_cv.coef_[0][i]
coefficients = pd.Series(data = coefficients)
non_zero_coefficients = coefficients[coefficients != 0]

### <font color = blue>Gradient Boost</font>

In [27]:
start_time = time.time()
model = GradientBoostingClassifier(random_state=1)
grid = dict()
grid['n_estimators'] = [100,200,500,1000]
grid['learning_rate'] = [0.0001, 0.001, 0.01,0.1, 1.0]
grid['max_depth'] = [5,10,16,24,32,40]
grid['subsample'] = [0.25,0.5,0.75,1.0]
# define the evaluation procedure
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
# define the grid search procedure
randomized_search = RandomizedSearchCV(estimator=model, param_distributions=grid, n_jobs=-1, cv=cv, n_iter=100, verbose=True, scoring='accuracy')
# execute the grid search
randomized_result = randomized_search.fit(X_train_redundant_poly_df.loc[:, non_zero_coefficients.index],y_train)
print("Time taken = ", (time.time() - start_time)/60, "minutes")

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Time taken =  194.54656725327175 minutes


In [28]:
print("Best: %f using %s" % (randomized_result.best_score_, randomized_result.best_params_))

Best: 0.903757 using {'subsample': 0.75, 'n_estimators': 500, 'max_depth': 10, 'learning_rate': 0.1}


### <font color = blue>XG Boost</font>

In [262]:
start_time = time.time()
param_grid = {'n_estimators':[300,500,700,1000],
                'max_depth': [2,4,7,10],
              'learning_rate': [0.01,0.1,0.5],
               'gamma': [0.1,0.25,0.5],
               'reg_lambda':[0,0.001,0.01],
                'scale_pos_weight':[1.278846]
             }

cv = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)
optimal_params = GridSearchCV(estimator=xgb.XGBClassifier(objective = 'binary:logistic',random_state=1,
                                                         use_label_encoder=False),
                             param_grid = param_grid,
                             scoring = 'accuracy',
                             verbose = 1,
                             n_jobs=-1,
                             cv = cv)
optimal_params.fit(X_train_redundant_poly_df.loc[:, non_zero_coefficients.index],y_train)
print(optimal_params.best_params_,optimal_params.best_score_)
print("Time taken = ", (time.time()-start_time)/60, " minutes")

Fitting 5 folds for each of 432 candidates, totalling 2160 fits
{'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 700, 'reg_lambda': 0.01, 'scale_pos_weight': 1.278846} 0.9003424754293556
Time taken =  39.94512161413829  minutes


### Optimal hyperparameter values

**For the GradientBoost model, the optimal value of `subsample` is 0.75, the optimal `n_estimators` is 500, the optimal value of `max_depth` is 10, and the optimal learning rate is 0.1.**

**For the XGBoost model, the optimal value of `gamma` is 0.1, the optimal learning rate is 0.1, the optimal value of `max_depth` is 4, the optimal value of `n_estimators` is 700, the optimal value of `reg_lambda` is 0.01, and the optimal value of `scale_pos_weight` is 1.278846.**

In [384]:
print('The optimal value of the lasso regularization hyperparameter is:', model_cv.C_[0])

The optimal value of the lasso regularization hyperparameter is: 0.04520353656360243


## Step 3) Developing the model

**CatBoost**

In [345]:
untuned_catboost = CatBoostClassifier(random_state=1, verbose=0).fit(X_train_redundant_poly_df.loc[:, non_zero_coefficients.index],y_train)

In [346]:
untuned_test_pred = untuned_catboost.predict(X_test_non_scaled_poly_df.loc[:, non_zero_coefficients.index])

**XG Boost**

In [347]:
xgb_model = xgb.XGBClassifier(objective = 'binary:logistic',random_state=1,gamma=0.1,learning_rate = 0.1,max_depth=4,
                              n_estimators = 700,reg_lambda = 0.01,scale_pos_weight=1.278846).fit(X_train_redundant_poly_df.loc[:, non_zero_coefficients.index],y_train)

In [348]:
xgb_test_pred = xgb_model.predict(X_test_non_scaled_poly_df.loc[:, non_zero_coefficients.index])

**Gradient Boost**

In [349]:
gradient_model = GradientBoostingClassifier(random_state=1,max_depth=10,learning_rate=0.1,subsample=0.75,
                          n_estimators=500).fit(X_train_redundant_poly_df.loc[:, non_zero_coefficients.index],y_train)

In [350]:
gradient_test_pred = gradient_model.predict(X_test_non_scaled_poly_df.loc[:, non_zero_coefficients.index])

### <font color = blue>Ensembling using soft voting</font>

In [351]:
ensemble_model = VotingClassifier(estimators=[('untuned catboost',untuned_catboost),('gb',gradient_model),
                                              ('xgb',xgb_model)], voting = 'soft')

ensemble_model.fit(X_train_redundant_poly_df.loc[:, non_zero_coefficients.index],y_train)

VotingClassifier(estimators=[('untuned catboost',
                              <catboost.core.CatBoostClassifier object at 0x7fc179b11100>),
                             ('gb',
                              GradientBoostingClassifier(max_depth=10,
                                                         n_estimators=500,
                                                         random_state=1,
                                                         subsample=0.75)),
                             ('xgb',
                              XGBClassifier(base_score=None, booster=None,
                                            callbacks=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None, device=None,
                                            early...
                                            grow_policy=None,
                          

In [352]:
en_test = ensemble_model.predict(X_test_non_scaled_poly_df.loc[:, non_zero_coefficients.index])

### Inserting the prediction columns

In [353]:
# Creating a column in the dataset called `predicted`

cleaned_test.insert(1, "predicted", en_test)
to_submit = cleaned_test.iloc[:, :2]

## Step 4) Ad-hoc steps for further improving model accuracy

#### <font color = 'red'>Matching known host_id's</font>

In [354]:
known = train.loc[:, ['id', 'host_id','host_is_superhost']]
known['host_is_superhost'] = known['host_is_superhost'].map({'t': 1, 'f': 0})

not_known = test.loc[:, ['id', 'host_id']]

overlapping_host_ids = set(known['host_id']).intersection(set(not_known['host_id']))

In [355]:
for host_id in overlapping_host_ids:
    host_is_superhost = known.loc[known['host_id'] == host_id, 'host_is_superhost'].iloc[0]
    not_known.loc[not_known['host_id'] == host_id, 'host_is_superhost'] = host_is_superhost

In [356]:
merged_df = pd.merge(to_submit, not_known[['id', 'host_is_superhost']], on='id', how='left')
merged_df.host_is_superhost = merged_df.host_is_superhost.fillna(merged_df.predicted)

In [357]:
final_submission = merged_df.iloc[:, [0,2]].rename(columns = {'host_is_superhost':'predicted'})

## Step 5) Exporting the predictions in the format required to submit on Kaggle

In [358]:
final_submission.to_csv('Ensembling Classification - Confirmation.csv', index=False)  