In [1]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import time as time
import os

from sklearn.preprocessing import PolynomialFeatures, StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, LogisticRegressionCV, LogisticRegression

from sklearn.neighbors import KNeighborsRegressor

from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, cross_validate, \
GridSearchCV, RandomizedSearchCV, ParameterGrid, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from skopt import BayesSearchCV

from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier

from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier, \
VotingRegressor, VotingClassifier, StackingRegressor, StackingClassifier

from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, AdaBoostRegressor,AdaBoostClassifier
import xgboost as xgb
from lightgbm import LGBMRegressor, LGBMClassifier
from catboost import CatBoostRegressor, CatBoostClassifier

from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error

from sklearn import impute
import ast
import itertools as it
from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import pydotplus

# Folder that holds train_classification.csv and test_classification.csv.
# Set the DATA_DIR environment variable to run the notebook on another machine;
# the fallback is the path this notebook was originally written against.
DATA_DIR = os.environ.get(
    'DATA_DIR',
    '/Users/kevin/Downloads/Northwestern University/Data Science/STAT_303-3/Prediction Problems/Datasets',
)
os.chdir(DATA_DIR)

## Step 0) Reading the data

In [2]:
# Read the training and test CSVs from the current working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_classification.csv', 'test_classification.csv')
)

## Step 1) Data pre-processing

### <font color = 'red'>Pre-processing training data</font>

In [584]:
# First ten columns of the raw training data.
# .copy() keeps the edits below on an independent frame rather than a slice of `train`.
first_ten = train.iloc[:, :10].copy()

# Removing: ['id', 'host_location', 'host_neighbourhood']
cleaned_ten = first_ten.drop(columns=['id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']
# Rates: strip the '%' sign, parse as a number, divide by 100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
# 't'/'f' flags become 1/0; any other value becomes NaN.
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.datetime was removed in pandas 2.0; pd.Timestamp.now() is the supported equivalent.
# The feature is measured from the moment the cell runs, so it changes from day to day.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = train.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_verifications',
#              'host_identity_verified', 'latitude', 'longitude', 'property_type']
# Work on a copy so the column assignments below never act on a slice of `train`.
cleaned_twenty = second_ten.copy()

# Neighbourhoods with fewer than 107 training listings are pooled into 'Other'.
neighbourhood_counts = cleaned_twenty.neighbourhood_cleansed.value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 107].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')

# host_verifications is parsed as a Python literal; only its length is kept.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])

cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])

# Two-level property category: 'Room' when property_type mentions "room", else 'Entire property'.
# na=False keeps the mask boolean when property_type is missing (such rows stay 'Entire property').
cleaned_twenty['property_category'] = "Entire property"
is_room = cleaned_twenty['property_type'].str.contains('room', case=False, na=False)
cleaned_twenty.loc[is_room, 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



# .copy() so the assignments below do not write into a slice of `train`.
third_ten = train.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text']
# Half-bath descriptions carry no digits, so they are rewritten as "0.5" before extraction.
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
# First number (integer or decimal) found in the text; rows with no number become NaN.
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])



fourth_ten = train.iloc[:, 30:40]

# Drop the two review-date columns and recode has_availability ('t'/'f' -> 1/0).
cleaned_fourth = (
    fourth_ten
    .drop(columns=['first_review', 'last_review'])
    .assign(has_availability=lambda frame: frame['has_availability'].map({'t': 1, 'f': 0}))
)



fifth_ten = train.iloc[:, 40:50]

# Converting: ['instant_bookable']
# Recode on a copy so the raw slice of `train` is left untouched.
cleaned_fifth = fifth_ten.copy()
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})


# Remaining columns (position 50 onward) are used as they are.
last_three = train.iloc[:, 50:]

In [585]:
# Stitch the cleaned column groups back together side by side.
train_parts = [cleaned_ten, cleaned_twenty, cleaned_third, cleaned_fourth, cleaned_fifth, last_three]
cleaned_train = pd.concat(train_parts, axis=1)

In [586]:
# Imputing missing values

columns_with_missing = ['host_response_rate', 'host_acceptance_rate', 'beds', \
                        'num_bathrooms', 'review_scores_rating']

# Fill these two columns with their most frequent value.
# The result is assigned back: fillna(inplace=True) on a column selection is chained
# assignment and leaves the frame unchanged under pandas Copy-on-Write.
cleaned_train['reviews_per_month'] = cleaned_train['reviews_per_month'].fillna(cleaned_train['reviews_per_month'].mode()[0])
cleaned_train['host_response_time'] = cleaned_train['host_response_time'].fillna(cleaned_train['host_response_time'].mode()[0])

# Computing the missing values of numeric variables using KNN

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_train_imputed = knn_imputer.fit_transform(cleaned_train[columns_with_missing])
# Reuse the source index so the assignment below aligns row-for-row whatever the index is.
cleaned_train_imputed_df = pd.DataFrame(cleaned_train_imputed, columns=columns_with_missing,
                                        index=cleaned_train.index)
cleaned_train[columns_with_missing] = cleaned_train_imputed_df

In [587]:
# Columns dropped from both the training and the test design matrices.
to_be_removed = [
    'review_scores_communication',
    'review_scores_cleanliness',
    'number_of_reviews_l30d',
    'review_scores_accuracy',
    'review_scores_value',
    'review_scores_location',
    'review_scores_checkin',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'availability_60',
    'availability_90',
    'availability_365',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'host_listings_count',
]

In [588]:
# Response vector, and predictors with the response and the leading column removed.
y_train = cleaned_train['host_is_superhost']
predictors_with_leading_column = cleaned_train.drop(columns=['host_is_superhost'])
X_train = predictors_with_leading_column.iloc[:, 1:]

In [589]:
# Drop the columns listed in to_be_removed, then one-hot encode what is left
# (first level of each categorical dropped).
X_train_non_redundant = pd.get_dummies(
    X_train.drop(columns=to_be_removed),
    drop_first=True,
)

#### PolynomialFeatures

In [590]:
# Degree-2 expansion (squares and pairwise interactions, no bias column) of the training predictors.
poly = PolynomialFeatures(degree=2, include_bias=False)
poly.fit(X_train_non_redundant)
X_train_poly = poly.transform(X_train_non_redundant)

In [591]:
# Wrap the expanded array in a DataFrame labelled with the generated feature names.
train_poly_names = poly.get_feature_names_out(X_train_non_redundant.columns)
X_train_non_scaled_poly_df = pd.DataFrame(X_train_poly, columns=train_poly_names)

### <font color = 'red'>Pre-processing test data</font>

In [592]:
# First ten columns of the raw test data, copied so edits never act on a slice of `test`.
first_ten = test.iloc[:, :10].copy()

# Removing: ['host_id', 'host_location', 'host_neighbourhood']
# ('id' stays in the frame: later cells merge on it when building the submission.)
cleaned_ten = first_ten.drop(columns=['host_id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_since']
# Rates: strip the '%' sign, parse as a number, divide by 100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.datetime was removed in pandas 2.0; pd.Timestamp.now() is the supported equivalent.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = test.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_verifications',
#              'host_identity_verified', 'latitude', 'longitude', 'bathrooms_text', 'property_type']
# Work on a copy so the column assignments below never act on a slice of `test`.
cleaned_twenty = second_ten.copy()

# Neighbourhoods with fewer than 64 test listings are pooled into 'Other'.
# NOTE(review): the training cell uses a cutoff of 107 on training counts - confirm that
# both cutoffs leave the same set of neighbourhood levels.
neighbourhood_counts = cleaned_twenty.neighbourhood_cleansed.value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 64].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')

# host_verifications is parsed as a Python literal; only its length is kept.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
# Drop from cleaned_twenty (not second_ten) so num_verifications is kept.
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])

cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])

# Half-bath descriptions carry no digits, so they are rewritten as "0.5" before extraction.
cleaned_twenty['bathrooms_text'] = cleaned_twenty['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
cleaned_twenty['num_bathrooms'] = cleaned_twenty['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_twenty = cleaned_twenty.drop(columns=['bathrooms_text'])

# Two-level property category: 'Room' when property_type mentions "room", else 'Entire property'.
# na=False keeps the mask boolean when property_type is missing (such rows stay 'Entire property').
cleaned_twenty['property_category'] = "Entire property"
is_room = cleaned_twenty['property_type'].str.contains('room', case=False, na=False)
cleaned_twenty.loc[is_room, 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])




# .copy() so the recoding below does not write into a slice of `test`.
third_ten = test.iloc[:, 20:30].copy()

# Converting: ['has_availability']
third_ten['has_availability'] = third_ten['has_availability'].map({'t': 1, 'f': 0})




fourth_ten = test.iloc[:, 30:40]

# The two review-date columns are not used as predictors; nothing else in this slice is converted.
review_date_columns = ['first_review', 'last_review']
cleaned_fourth = fourth_ten.drop(columns=review_date_columns)



fifth_ten = test.iloc[:, 40:50]
# Converting: ['instant_bookable']
# Recode on a copy so the raw slice of `test` is left untouched.
cleaned_fifth = fifth_ten.copy()
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})



# Remaining columns (position 50 onward) are used as they are.
last_three = test.iloc[:, 50:]

In [593]:
# Stitch the cleaned test column groups back together side by side.
test_parts = [cleaned_ten, cleaned_twenty, third_ten, cleaned_fourth, cleaned_fifth, last_three]
cleaned_test = pd.concat(test_parts, axis=1)

In [594]:
# Imputing missing values

columns_with_missing = ['host_response_rate', 'host_acceptance_rate', 'beds', \
                        'num_bathrooms', 'review_scores_rating']

# Fill these two columns with their most frequent value.
# The result is assigned back: fillna(inplace=True) on a column selection is chained
# assignment and leaves the frame unchanged under pandas Copy-on-Write.
cleaned_test['reviews_per_month'] = cleaned_test['reviews_per_month'].fillna(cleaned_test['reviews_per_month'].mode()[0])
cleaned_test['host_response_time'] = cleaned_test['host_response_time'].fillna(cleaned_test['host_response_time'].mode()[0])

# Computing the missing values of numeric variables using KNN
# NOTE(review): the imputer is re-fitted on the test rows here rather than reusing the one
# fitted on the training rows - confirm this is intended.

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_test_imputed = knn_imputer.fit_transform(cleaned_test[columns_with_missing])
# Reuse the source index so the assignment below aligns row-for-row whatever the index is.
cleaned_test_imputed_df = pd.DataFrame(cleaned_test_imputed, columns=columns_with_missing,
                                       index=cleaned_test.index)
cleaned_test[columns_with_missing] = cleaned_test_imputed_df

In [595]:
# Same column filter and one-hot encoding as the training predictors;
# the leading column of the encoded frame is then discarded.
test_kept_columns = cleaned_test.drop(columns=to_be_removed)
test_encoded = pd.get_dummies(test_kept_columns, drop_first=True)
X_test_non_redundant = test_encoded.iloc[:, 1:]

In [596]:
# Degree-2 expansion of the test predictors, fitted on the test frame itself.
poly_test = PolynomialFeatures(degree=2, include_bias=False).fit(X_test_non_redundant)
X_test_poly = poly_test.transform(X_test_non_redundant)

In [597]:
# Wrap the expanded test array in a DataFrame labelled with the generated feature names.
test_poly_names = poly_test.get_feature_names_out(X_test_non_redundant.columns)
X_test_non_scaled_poly_df = pd.DataFrame(X_test_poly, columns=test_poly_names)

In [598]:
# Interaction terms that used to be patched by hand: the training matrix names them "a b"
# but a transformer fitted on the test frame names them "b a".
reversed_columns = ['days_since_host host_total_listings_count', 'beds num_bathrooms', 'minimum_nights num_bathrooms', 'maximum_nights num_bathrooms', 'minimum_nights_avg_ntm num_bathrooms', 'maximum_nights_avg_ntm num_bathrooms']

# Rebuild the test design matrix with the transformer fitted on the training data, after
# putting the test predictors in the training column order. Every interaction then gets
# the training name automatically. A training column absent from the test frame is filled
# with 0; test-only columns are dropped.
X_test_aligned = X_test_non_redundant.reindex(columns=X_train_non_redundant.columns, fill_value=0)
X_test_non_scaled_poly_df = pd.DataFrame(
    poly.transform(X_test_aligned),
    columns=poly.get_feature_names_out(X_train_non_redundant.columns),
    index=X_test_aligned.index,
)

# Sanity check: every name that previously needed the manual patch is now present.
missing_columns = [column for column in reversed_columns if column not in X_test_non_scaled_poly_df.columns]
assert not missing_columns, f"test design matrix is missing expected columns: {missing_columns}"

## Step 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

### Which tuning method did you use (grid search / Bayes search / etc.)?

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

### How many hours did you spend on hyperparameter tuning?

### Hyperparameter tuning code

### <font color = red>Variable selection with Lasso</font>

In [599]:
# Standardise the expanded training matrix (the Lasso penalty is scale-sensitive).
scaler = StandardScaler()
scaler.fit(X_train_poly)
X_train_scaled = scaler.transform(X_train_poly)

In [600]:
start_time = time.time()

# L1-penalised logistic regression, cross-validated over 50 values of C from 1e-1 down to 1e-5.
Cs = np.logspace(-1, -5, num=50)
model_cv = LogisticRegressionCV(
    Cs=Cs,
    cv=5,
    penalty='l1',
    solver='saga',
    random_state=1,
).fit(X_train_scaled, y_train)

elapsed_minutes = round((time.time() - start_time) / 60)
print("Time taken = ", elapsed_minutes, " minutes")

Time taken =  7  minutes


In [601]:
# Map each polynomial feature name to its fitted Lasso coefficient.
# The names are generated once up front; calling get_feature_names_out() inside the loop
# rebuilt the full name array on every iteration.
feature_names = poly.get_feature_names_out()
coefficients = dict(zip(feature_names, model_cv.coef_[0]))

In [602]:
# Keep only the features whose coefficient is not zero.
coefficients = pd.Series(coefficients)
is_selected = coefficients != 0
non_zero_coefficients = coefficients[is_selected]

### Gradient Boost

In [368]:
start_time = time.time()
model = GradientBoostingClassifier(random_state=1)
# candidate values sampled by the randomized search
grid = {
    'n_estimators': [500, 1000, 2000, 5000],
    'learning_rate': [0.01, 0.1, 1.0, 2.0, 5.0],
    'max_depth': [5, 10, 16, 24, 32, 40],
    'subsample': [0.25, 0.5, 0.75, 1.0],
}
# define the evaluation procedure
cv = KFold(n_splits=5, shuffle=True, random_state=1)
# define the randomized search procedure; random_state fixes which 150 candidates are drawn,
# so the search (and its reported best parameters) can be reproduced
randomized_search = RandomizedSearchCV(estimator=model, param_distributions=grid, n_iter = 150, n_jobs=-1, cv=cv,
                                       verbose = True, scoring = 'accuracy', random_state = 1)
# execute the randomized search on the Lasso-selected features
randomized_result = randomized_search.fit(X_train_non_scaled_poly_df.loc[:, non_zero_coefficients.index],y_train)
print("Time taken = ", (time.time() - start_time)/60, "minutes")

Fitting 5 folds for each of 150 candidates, totalling 750 fits
Time taken =  449.3603666861852 minutes


In [369]:
# Report the best cross-validated accuracy and the configuration that achieved it.
best_score = randomized_result.best_score_
best_params = randomized_result.best_params_
print(f"Best: {best_score:f} using {best_params}")

Best: 0.895921 using {'subsample': 0.75, 'n_estimators': 5000, 'max_depth': 5, 'learning_rate': 0.1}


In [None]:
# Refit gradient boosting on the Lasso-selected features with the chosen hyperparameters.
selected_features = non_zero_coefficients.index
gradient_model = GradientBoostingClassifier(
    random_state=1,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.75,
    n_estimators=5000,
)
gradient_model = gradient_model.fit(X_train_non_scaled_poly_df.loc[:, selected_features], y_train)

### XG Boost

In [568]:
start_time = time.time()
param_grid = {'n_estimators':[500, 1000, 2250, 3750, 5000],
              'max_depth': [2,4,7,10,16,24],
              'learning_rate': [0.01,0.2,0.5,1.0,2.0],
              'gamma': [0.005,0.1,0.175,0.3, 0.5],
              'reg_lambda':[0,0.01,0.005,0.001],
              # Control the balance of positive and negative weights, useful for unbalanced classes.
              # A typical value to consider: sum(negative instances) / sum(positive instances).
              'scale_pos_weight':[1.0,1.25,1.5]
             }

cv = KFold(n_splits=5,shuffle=True,random_state=1)
# random_state on the search fixes which 150 candidates are drawn, so the reported best
# parameters can be reproduced.
optimal_params = RandomizedSearchCV(estimator=xgb.XGBClassifier(objective = 'binary:logistic',random_state=1,use_label_encoder=False),
                             param_distributions = param_grid, n_iter = 150,
                             scoring = 'accuracy',
                             verbose = 1,
                             random_state = 1,
                             n_jobs=-1,
                             cv = cv)
optimal_params.fit(X_train_non_scaled_poly_df.loc[:, non_zero_coefficients.index],y_train)
print(optimal_params.best_params_,optimal_params.best_score_)
print("Time taken = ", (time.time()-start_time)/60, " minutes")

Fitting 5 folds for each of 150 candidates, totalling 750 fits
{'scale_pos_weight': 1.5, 'reg_lambda': 0.001, 'n_estimators': 3750, 'max_depth': 7, 'learning_rate': 0.2, 'gamma': 0.005} 0.8846735686464451
Time taken =  35.687345600128175  minutes


In [569]:
# Refit XGBoost with the best configuration printed by the randomized search above.
# n_estimators is 3750, the value the search reported (it was previously typed as 3700).
xgb_model = xgb.XGBClassifier(objective = 'binary:logistic',random_state=1,gamma=0.005,learning_rate = 0.2,max_depth=7,
                              n_estimators = 3750,reg_lambda = 0.001,scale_pos_weight=1.5).fit(X_train_non_scaled_poly_df.loc[:, non_zero_coefficients.index],y_train)

### CatBoost

In [603]:
start_time = time.time()

# Candidate values sampled by the randomized search.
param_grid = dict(
    max_depth=[4, 6, 8, 10],
    num_leaves=[20, 31, 40, 60],
    learning_rate=[0.02, 0.05, 0.08],
    reg_lambda=[0.01, 2.5, 5, 8],
    n_estimators=[1000, 1250, 1500],
    subsample=[0.5, 0.75, 1.0],
    colsample_bylevel=[0.25, 0.5, 0.75, 1.0],
)

cv = KFold(n_splits=5, shuffle=True, random_state=1)
catboost_base = CatBoostClassifier(random_state=1, verbose=False, grow_policy='Lossguide')
optimal_params = RandomizedSearchCV(
    estimator=catboost_base,
    param_distributions=param_grid,
    n_iter=200,
    verbose=1,
    random_state=1,
    scoring='accuracy',
    n_jobs=-1,
    cv=cv,
)
selected_features = non_zero_coefficients.index
optimal_params.fit(X_train_non_scaled_poly_df.loc[:, selected_features], y_train)

elapsed_minutes = (time.time() - start_time) / 60
print("Time taken = ", elapsed_minutes, " minutes")

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
Time taken =  340.8866595784823  minutes


In [604]:
# Best CatBoost configuration and its cross-validated accuracy.
best_catboost_params = optimal_params.best_params_
best_catboost_score = optimal_params.best_score_
print(best_catboost_params, best_catboost_score)

{'subsample': 0.5, 'reg_lambda': 0.01, 'num_leaves': 31, 'n_estimators': 1000, 'max_depth': 8, 'learning_rate': 0.02, 'colsample_bylevel': 1.0} 0.892909527557466


## Step 3) Developing the model

In [387]:
# Final gradient-boosting model, fitted on the Lasso-selected features.
selected_features = non_zero_coefficients.index
gradient_model = GradientBoostingClassifier(
    random_state=1,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.75,
    n_estimators=5000,
)
gradient_model = gradient_model.fit(X_train_non_scaled_poly_df.loc[:, selected_features], y_train)

In [513]:
# Final XGBoost model. The hyperparameters are the best configuration printed by the XGBoost
# randomized search in Step 2, so a top-to-bottom run feeds the tuned model (not a stale,
# earlier configuration) into xgb_pred and the submission.
xgb_model = xgb.XGBClassifier(objective = 'binary:logistic',random_state=1,gamma=0.005,learning_rate = 0.2,max_depth=7,
                              n_estimators = 3750,reg_lambda = 0.001,scale_pos_weight=1.5).fit(X_train_non_scaled_poly_df.loc[:, non_zero_coefficients.index],y_train)

In [516]:
# CatBoost with default hyperparameters, fitted on the Lasso-selected features.
selected_features = non_zero_coefficients.index
untuned_catboost = CatBoostClassifier(random_state=1, verbose=0)
untuned_catboost = untuned_catboost.fit(X_train_non_scaled_poly_df.loc[:, selected_features], y_train)

In [514]:
# CatBoost with hand-entered hyperparameters, fitted on the Lasso-selected features.
# NOTE(review): these values differ from the best configuration printed by the CatBoost
# search in Step 2 - confirm which set is intended.
selected_features = non_zero_coefficients.index
tuned_catboost = CatBoostClassifier(
    grow_policy='Lossguide',
    random_state=1,
    verbose=0,
    subsample=0.5,
    reg_lambda=0.01,
    num_leaves=20,
    n_estimators=1250,
    max_depth=6,
    learning_rate=0.08,
    colsample_bylevel=0.5,
)
tuned_catboost = tuned_catboost.fit(X_train_non_scaled_poly_df.loc[:, selected_features], y_train)

### Ensembling with hard voting (performed better than stacking)

In [440]:
# Hard-voting ensemble of the tuned and the untuned CatBoost models.
# The 'tuned' member is tuned_catboost: the previous name, catboost_model, is not defined
# anywhere in this notebook and fails on a fresh kernel.
hard_model_catboost = VotingClassifier(estimators=[('tuned',tuned_catboost),('untuned',untuned_catboost)])
hard_model_catboost.fit(X_train_non_scaled_poly_df.loc[:, non_zero_coefficients.index],y_train)

VotingClassifier(estimators=[('tuned',
                              <catboost.core.CatBoostClassifier object at 0x7fad40685910>),
                             ('untuned',
                              <catboost.core.CatBoostClassifier object at 0x7fad7833df10>)])

In [441]:
# Test-set predictions from the CatBoost voting ensemble.
selected_features = non_zero_coefficients.index
hard_ensembled_pred_catboost = hard_model_catboost.predict(
    X_test_non_scaled_poly_df.loc[:, selected_features]
)

In [None]:
# Hard-voting ensemble of the XGBoost and gradient-boosting models.
boosting_members = [('xg', xgb_model), ('gradient', gradient_model)]
hard_model = VotingClassifier(estimators=boosting_members)
selected_features = non_zero_coefficients.index
hard_model.fit(X_train_non_scaled_poly_df.loc[:, selected_features], y_train)

In [None]:
# Test-set predictions from the XGBoost / gradient-boosting voting ensemble.
selected_features = non_zero_coefficients.index
hard_ensemble_pred = hard_model.predict(X_test_non_scaled_poly_df.loc[:, selected_features])

### Making the predictions

In [515]:
# Test-set predictions from the tuned CatBoost model.
selected_features = non_zero_coefficients.index
tuned_catboost_pred = tuned_catboost.predict(X_test_non_scaled_poly_df.loc[:, selected_features])

In [517]:
# Test-set predictions from the default-parameter CatBoost model.
selected_features = non_zero_coefficients.index
untuned_catboost_pred = untuned_catboost.predict(X_test_non_scaled_poly_df.loc[:, selected_features])

In [518]:
# Test-set predictions from the gradient-boosting model.
selected_features = non_zero_coefficients.index
gradient_pred = gradient_model.predict(X_test_non_scaled_poly_df.loc[:, selected_features])

In [570]:
# Test-set predictions from the XGBoost model (these feed the submission below).
selected_features = non_zero_coefficients.index
xgb_pred = xgb_model.predict(X_test_non_scaled_poly_df.loc[:, selected_features])

### Ensemble of ensembles

In [479]:
# Hard-voting ensemble whose two members are themselves the voting ensembles built above.
meta_members = [('cat', hard_model_catboost), ('others', hard_model)]
meta_hard_model = VotingClassifier(estimators=meta_members)
selected_features = non_zero_coefficients.index
meta_hard_model.fit(X_train_non_scaled_poly_df.loc[:, selected_features], y_train)

VotingClassifier(estimators=[('cat',
                              VotingClassifier(estimators=[('tuned',
                                                            <catboost.core.CatBoostClassifier object at 0x7fad40685910>),
                                                           ('untuned',
                                                            <catboost.core.CatBoostClassifier object at 0x7fad7833df10>)])),
                             ('others',
                              VotingClassifier(estimators=[('cat',
                                                            VotingClassifier(estimators=[('tuned',
                                                                                          <catboost.core.CatBoostClassifier object at 0x7fad40685910>),
                                                                                         ('un...
                                                                                                        max_cat_threshold

In [481]:
# Display the meta-ensemble's test-set predictions.
selected_features = non_zero_coefficients.index
meta_hard_model.predict(X_test_non_scaled_poly_df.loc[:, selected_features])

0.41335740072202165

In [578]:
# Creating a column in the dataset called `predicted`

# DataFrame.insert raises ValueError when the column already exists, so drop any copy left
# behind by an earlier run of this cell; the cell can then be re-executed safely.
if 'predicted' in cleaned_test.columns:
    cleaned_test = cleaned_test.drop(columns=['predicted'])
cleaned_test.insert(1, "predicted", xgb_pred)
# First two columns: the leading identifier column and the freshly inserted prediction.
to_submit = cleaned_test.iloc[:, :2]

## Step 4) Ad-hoc steps for further improving model accuracy

#### <font color = 'red'>Matching known host_ids</font>

In [579]:
# Training listings with their host and a 1/0 superhost label.
known = train.loc[:, ['id', 'host_id', 'host_is_superhost']].assign(
    host_is_superhost=lambda frame: frame['host_is_superhost'].map({'t': 1, 'f': 0})
)

# Test listings with their host; the label is unknown here.
not_known = test.loc[:, ['id', 'host_id']]

# Hosts that appear in both the training and the test data.
overlapping_host_ids = set(known['host_id']) & set(not_known['host_id'])

In [580]:
# Give each test listing the superhost label of its host when that host also appears in the
# training data, using the label on the host's first training row; other rows get NaN.
# A single keyed lookup replaces one pair of boolean-mask scans per overlapping host, and it
# always creates the column, even when no host overlaps.
host_labels = known.drop_duplicates(subset='host_id').set_index('host_id')['host_is_superhost']
not_known['host_is_superhost'] = not_known['host_id'].map(host_labels).astype('float64')

In [581]:
# Attach the host-matched labels to the model predictions; where no label was matched,
# fall back to the model's prediction.
matched_labels = not_known[['id', 'host_is_superhost']]
merged_df = to_submit.merge(matched_labels, on='id', how='left')
merged_df['host_is_superhost'] = merged_df['host_is_superhost'].fillna(merged_df['predicted'])

In [582]:
# Keep the columns at positions 0 and 2 (identifier and resolved label) and publish the
# label under the name `predicted`.
final_submission = merged_df.iloc[:, [0, 2]]
final_submission = final_submission.rename(columns={'host_is_superhost': 'predicted'})

## Step 5) Exporting the predictions in the format required to submit on Kaggle

In [583]:
# Write the submission file (without the index) to the working directory.
submission_filename = 'Ensembling Classification - Trial 24 (Tuned XGB on selected predictors).csv'
final_submission.to_csv(submission_filename, index=False)