In [97]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, \
cross_validate, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error
from skopt import BayesSearchCV
from sklearn import impute
import ast
import itertools as it

from sklearn.tree import export_graphviz 
from six import StringIO
from IPython.display import Image  
import pydotplus
import time as time

import os

# Directory that holds the Kaggle CSV files. Every relative path used later in
# the notebook (the two read_csv calls and the final to_csv) resolves against
# it. Set the STAT303_DATA_DIR environment variable to run the notebook on a
# different machine; the default keeps the original location.
DATA_DIR = os.environ.get(
    'STAT303_DATA_DIR',
    '/Users/kevin/Downloads/Northwestern University/Data Science/STAT_303-3/Prediction Problems/Datasets',
)
os.chdir(DATA_DIR)

## Step 0) Reading the data

In [98]:
# Read the labelled training listings and the unlabelled test listings
# from the current working directory.
train, test = map(pd.read_csv, ('train_classification.csv', 'test_classification.csv'))

## Step 1) Data pre-processing

### <font color = 'red'>Pre-processing training data</font>

In [99]:
# Training columns 0-9.
first_ten = train.iloc[:, :10]

# Removing: ['id', 'host_location', 'host_neighbourhood']
cleaned_ten = first_ten.drop(columns=['id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']
# Rates: strip the trailing '%' and divide by 100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
# 't' -> 1, 'f' -> 0; any other value (including missing) becomes NaN.
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.Timestamp.now() replaces the `pd.datetime` alias, which was removed in
# pandas 2.0. The resulting day count depends on the date the cell is run.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



# Training columns 10-19.
second_ten = train.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']
# Work on an explicit copy: the edits below must not write into (or warn
# about) a slice of `train`, and `second_ten` stays as the raw slice.
cleaned_twenty = second_ten.copy()

# Neighbourhoods with fewer than 107 training listings are pooled into 'Other'.
neighbourhood_counts = cleaned_twenty['neighbourhood_cleansed'].value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 107].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')

# host_verifications is parsed as a Python literal; only its length is kept.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])

# 't' -> 1, 'f' -> 0; any other value becomes NaN.
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])

# Two-level property category: 'Room' when property_type mentions "room"
# (case-insensitive), otherwise 'Entire property'. na=False keeps the mask
# strictly boolean, so a missing property_type falls in the default category
# instead of raising when the mask is used with .loc.
cleaned_twenty['property_category'] = "Entire property"
is_room = cleaned_twenty['property_type'].str.contains('room', case=False, na=False)
cleaned_twenty.loc[is_room, 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



# Training columns 20-29. Copied so the new/modified columns below are written
# to an independent frame rather than to a slice of `train`.
third_ten = train.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text']
# The half-bath descriptions contain no digits, so they are rewritten to "0.5"
# before the numeric part is extracted.
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
# First capture group = the leading number (optionally with a decimal part).
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])



# Training columns 30-39.
fourth_ten = train.iloc[:, 30:40]

# Drop both review-date columns in a single call, then recode the
# availability flag ('t' -> 1, 'f' -> 0) in its existing position.
cleaned_fourth = fourth_ten.drop(columns=['first_review', 'last_review'])
cleaned_fourth = cleaned_fourth.assign(
    has_availability=cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})
)



# Training columns 40-49.
fifth_ten = train.iloc[:, 40:50]

# Converting: ['instant_bookable']
# Copy first so the recode is written to an independent frame instead of an
# alias of the slice taken from `train`.
cleaned_fifth = fifth_ten.copy()
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})


# Columns 50 onward are carried through unchanged.
last_three = train.iloc[:, 50:]

In [100]:
# Put the six cleaned column groups back side by side; rows are matched on
# the shared index.
train_parts = [cleaned_ten, cleaned_twenty, cleaned_third, cleaned_fourth, cleaned_fifth, last_three]
cleaned_train = pd.concat(train_parts, axis='columns')

In [101]:
# Imputing missing values

columns_with_missing = ['host_response_rate', 'host_acceptance_rate', 'beds',
                        'num_bathrooms', 'review_scores_rating']

# Mode imputation. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# leaves the frame untouched once pandas Copy-on-Write is active.
cleaned_train['reviews_per_month'] = cleaned_train['reviews_per_month'].fillna(
    cleaned_train['reviews_per_month'].mode()[0])
cleaned_train['host_response_time'] = cleaned_train['host_response_time'].fillna(
    cleaned_train['host_response_time'].mode()[0])

# Computing the missing values of numeric variables using KNN

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_train_imputed = knn_imputer.fit_transform(cleaned_train[columns_with_missing])
# Rebuild the frame on cleaned_train's own index: the assignment below aligns
# on index labels, so a default 0..n-1 index would only line up by coincidence.
cleaned_train_imputed_df = pd.DataFrame(cleaned_train_imputed, columns=columns_with_missing,
                                        index=cleaned_train.index)
cleaned_train[columns_with_missing] = cleaned_train_imputed_df

In [102]:
# Columns dropped from both the training and the test predictors before
# dummy encoding.
to_be_removed = [
    'review_scores_communication',
    'review_scores_cleanliness',
    'number_of_reviews_l30d',
    'review_scores_accuracy',
    'review_scores_value',
    'review_scores_location',
    'review_scores_checkin',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'availability_60',
    'availability_90',
    'availability_365',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'host_listings_count',
]

In [103]:
# Response: the recoded superhost flag.
y_train = cleaned_train['host_is_superhost']
# Predictors: every other column except the first one that remains once the
# response has been removed.
predictors_with_leading = cleaned_train.drop(columns='host_is_superhost')
X_train = predictors_with_leading.iloc[:, 1:]

In [104]:
# Drop the excluded predictors, then dummy-encode the remaining categorical
# columns (first level of each dropped). X_train itself is left unchanged.
X_train_non_redundant = pd.get_dummies(
    X_train.drop(columns=to_be_removed),
    drop_first=True,
)

#### PolynomialFeatures

In [105]:
# Degree-2 expansion restricted to pairwise interactions (no squared terms,
# no constant column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(X_train_non_redundant)
X_train_poly = poly.transform(X_train_non_redundant)

In [106]:
# Wrap the expanded array in a frame labelled with the generated feature names.
train_poly_names = poly.get_feature_names_out(X_train_non_redundant.columns)
X_train_non_scaled_poly_df = pd.DataFrame(X_train_poly, columns=train_poly_names)

### <font color = 'red'>Pre-processing test data</font>

In [107]:
# Test columns 0-9.
first_ten = test.iloc[:, :10]

# Removing: ['host_id', 'host_location', 'host_neighbourhood']
# ('id' is kept; it is needed later to build the submission.)
cleaned_ten = first_ten.drop(columns=['host_id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_since']
# Rates: strip the trailing '%' and divide by 100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.Timestamp.now() replaces the `pd.datetime` alias, which was removed in
# pandas 2.0. The resulting day count depends on the date the cell is run.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



# Test columns 10-19.
second_ten = test.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_verifications', 'host_identity_verified','latitude', 'longitude', 'bathrooms_text', 'property_type']
# Work on an explicit copy: the edits below must not write into (or warn
# about) a slice of `test`, and every later step builds on `cleaned_twenty`
# rather than relying on it aliasing `second_ten`.
cleaned_twenty = second_ten.copy()

# Neighbourhoods with fewer than 64 test listings are pooled into 'Other'.
# NOTE(review): the training cell uses a cut-off of 107; the resulting levels
# must match the training levels for the later column selection to work -
# confirm the two cut-offs were chosen with that in mind.
neighbourhood_counts = cleaned_twenty['neighbourhood_cleansed'].value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 64].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')

# host_verifications is parsed as a Python literal; only its length is kept.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])

# 't' -> 1, 'f' -> 0; any other value becomes NaN.
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])

# The half-bath descriptions contain no digits, so they are rewritten to "0.5"
# before the leading number is extracted.
cleaned_twenty['bathrooms_text'] = cleaned_twenty['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
cleaned_twenty['num_bathrooms'] = cleaned_twenty['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_twenty = cleaned_twenty.drop(columns=['bathrooms_text'])

# Two-level property category: 'Room' when property_type mentions "room"
# (case-insensitive), otherwise 'Entire property'. na=False keeps the mask
# strictly boolean, so a missing property_type falls in the default category
# instead of raising when the mask is used with .loc.
cleaned_twenty['property_category'] = "Entire property"
is_room = cleaned_twenty['property_type'].str.contains('room', case=False, na=False)
cleaned_twenty.loc[is_room, 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])




# Test columns 20-29. Copied so the recode below is written to an independent
# frame rather than to a slice of `test`.
third_ten = test.iloc[:, 20:30].copy()

# Converting: ['has_availability']  ('t' -> 1, 'f' -> 0)
third_ten['has_availability'] = third_ten['has_availability'].map({'t': 1, 'f': 0})




# Test columns 30-39: only the two review-date columns are removed; nothing
# in this group is converted.
fourth_ten = test.iloc[:, 30:40]
review_date_columns = ['first_review', 'last_review']
cleaned_fourth = fourth_ten.drop(columns=review_date_columns)



# Test columns 40-49.
fifth_ten = test.iloc[:, 40:50]
# Converting: ['instant_bookable']
# Copy first so the recode is written to an independent frame instead of an
# alias of the slice taken from `test`.
cleaned_fifth = fifth_ten.copy()
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})



# Columns 50 onward are carried through unchanged.
last_three = test.iloc[:, 50:]

In [108]:
# Put the six test column groups back side by side; rows are matched on the
# shared index.
test_parts = [cleaned_ten, cleaned_twenty, third_ten, cleaned_fourth, cleaned_fifth, last_three]
cleaned_test = pd.concat(test_parts, axis='columns')

In [109]:
# Imputing missing values

columns_with_missing = ['host_response_rate', 'host_acceptance_rate', 'beds',
                        'num_bathrooms', 'review_scores_rating']

# Mode imputation. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on a column selection is chained assignment, which
# leaves the frame untouched once pandas Copy-on-Write is active.
cleaned_test['reviews_per_month'] = cleaned_test['reviews_per_month'].fillna(
    cleaned_test['reviews_per_month'].mode()[0])
cleaned_test['host_response_time'] = cleaned_test['host_response_time'].fillna(
    cleaned_test['host_response_time'].mode()[0])

# Computing the missing values of numeric variables using KNN
# NOTE(review): the imputer (and the modes above) are estimated from the test
# rows themselves rather than reused from the training data - confirm this is
# intended.

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_test_imputed = knn_imputer.fit_transform(cleaned_test[columns_with_missing])
# Rebuild the frame on cleaned_test's own index: the assignment below aligns
# on index labels, so a default 0..n-1 index would only line up by coincidence.
cleaned_test_imputed_df = pd.DataFrame(cleaned_test_imputed, columns=columns_with_missing,
                                       index=cleaned_test.index)
cleaned_test[columns_with_missing] = cleaned_test_imputed_df

In [110]:
# Same three steps as for the training predictors, in a different order:
# drop the excluded columns, dummy-encode, then discard the leading column.
test_predictors = cleaned_test.drop(columns=to_be_removed)
test_dummies = pd.get_dummies(test_predictors, drop_first=True)
X_test_non_redundant = test_dummies.iloc[:, 1:]

In [111]:
# Degree-2 expansion of the test predictors. interaction_only=True mirrors the
# training expansion: the model is later fed only the columns that exist in
# the training frame, which contains no squared terms, so generating them here
# would only produce columns that are never selected.
poly_test = PolynomialFeatures(2, interaction_only=True, include_bias=False)
X_test_poly = poly_test.fit_transform(X_test_non_redundant)

In [112]:
# Wrap the expanded test array in a frame labelled with the generated feature names.
test_poly_names = poly_test.get_feature_names_out(X_test_non_redundant.columns)
X_test_non_scaled_poly_df = pd.DataFrame(X_test_poly, columns=test_poly_names)

## Step 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

The first time that I tuned the model hyperparameters, I was able to achieve a classification accuracy of over 92% on Kaggle. I tuned the model hyperparameters using all of the predictors in the dataset after applying PolynomialFeatures with order 2. 

### Which tuning method did you use (grid search / Bayes search / etc.)?

To tune the random forest model, I used a method similar to grid search: I tested each possible hyperparameter combination using a `for` loop and selected the one that resulted in the highest classification accuracy.

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

The main challenge that I faced was that the optimal hyperparameter values sometimes fluctuated, which resulted in different accuracy scores on Kaggle (though they were all above the 92% threshold). Since I was worried about the possibility of the accuracy score dipping below 92% when this assignment is evaluated, I decided to use `random_state=1` to stabilize the hyperparameter values and the resulting accuracy score.

Another challenge that I faced was that some of the predictor names in my test dataset after utilizing PolynomialFeatures were the reverse of their names in my train dataset (e.g. 'days_since_host host_total_listings_count' vs 'host_total_listings_count days_since_host'). To address this issue, I created a `for` loop to ensure that all of the predictor names in my test dataset matched those in the train dataset.

### How many hours did you spend on hyperparameter tuning?

I spent about 30 minutes writing the code for the hyperparameter tuning process. The code only required 3 to 5 minutes to finish running.

### Hyperparameter tuning code

In [113]:
# Empty results table: one row will be recorded per
# (hyperparameter combination, decision threshold) pair.
result_columns = ['Threshold', 'Parameters', 'Accuracy']
analysis_results = pd.DataFrame(columns=result_columns)

In [95]:
start_time = time.time()

params = {'n_estimators': [300],
          'max_features': [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.6, 1.0]}

# Every (n_estimators, max_features) combination, in dictionary order.
param_list = list(it.product(*params.values()))
thresholds = np.arange(0.01, 1.0, 0.01)

# Rows are collected in a plain list and turned into a frame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration. Building the frame from scratch also means
# re-running the cell cannot accumulate duplicate rows.
oob_records = []

for pr in param_list:
    model = RandomForestClassifier(random_state=1, oob_score=True, n_estimators=pr[0], max_features=pr[1],
                                   n_jobs=-1).fit(X_train_non_scaled_poly_df, y_train)

    # Out-of-bag probability of the positive class for each training row.
    oob_prob = model.oob_decision_function_[:, 1]

    # Score each candidate decision threshold on the out-of-bag predictions.
    for threshold in thresholds:
        oob_pred = (oob_prob >= threshold).astype(int)
        oob_accuracy = accuracy_score(y_train, oob_pred)
        oob_records.append({'Threshold': threshold, 'Parameters': pr, 'Accuracy': oob_accuracy})

analysis_results = pd.DataFrame(oob_records, columns=['Threshold', 'Parameters', 'Accuracy'])

end_time = time.time()
print("time taken = ", (end_time-start_time)/60, " minutes")

time taken =  3.0391077677408855  minutes


### Optimal hyperparameter values

In [96]:
# Rank all (parameters, threshold) pairs by out-of-bag accuracy and show the top one.
ranked_results = analysis_results.sort_values(by='Accuracy', ascending=False)
ranked_results.iloc[0]

Threshold           0.49
Parameters    (300, 0.3)
Accuracy        0.883464
Name: 543, dtype: object

**The optimal value of `max_features` is 0.3 and the optimal decision threshold probability is 0.49.**

## Step 3) Developing the model

In [114]:
# Interaction names as they appear in the training frame; the test frame
# holds the same interactions with the two predictor names swapped.
reversed_columns = [
    'days_since_host host_total_listings_count',
    'beds num_bathrooms',
    'minimum_nights num_bathrooms',
    'maximum_nights num_bathrooms',
    'minimum_nights_avg_ntm num_bathrooms',
    'maximum_nights_avg_ntm num_bathrooms',
]

In [115]:
# Give each swapped-name interaction in the test frame a second column under
# the training-frame name, so it can be selected by that name later.
for train_name in reversed_columns:
    name_parts = train_name.split(' ')
    test_name = f"{name_parts[1]} {name_parts[0]}"
    X_test_non_scaled_poly_df[train_name] = X_test_non_scaled_poly_df[test_name]

In [80]:
# Final forest with the tuned max_features. It is grown with 500 trees,
# whereas the tuning grid above used 300.
forest_settings = dict(random_state=1, oob_score=True, n_estimators=500,
                       max_features=0.3, n_jobs=-1)
tuned_model = RandomForestClassifier(**forest_settings)
tuned_model.fit(X_train_non_scaled_poly_df, y_train)

In [81]:
# Score the test rows using exactly the training frame's columns, in the
# training frame's order.
y_pred = tuned_model.predict_proba(X_test_non_scaled_poly_df.loc[:, X_train_non_scaled_poly_df.columns])
# Apply the tuned cut-off with the same inclusive comparison (>=) that the
# threshold search used, so a probability equal to the cut-off is classified
# the same way here as it was during tuning.
predicted_class = y_pred[:, 1] >= 0.49
test_pred = predicted_class.astype(int)

In [82]:
# Creating a column in the dataset called `predicted`

# DataFrame.insert raises if the column already exists, so remove the column
# left by a previous run first; this keeps the cell re-runnable.
if 'predicted' in cleaned_test.columns:
    cleaned_test = cleaned_test.drop(columns='predicted')
cleaned_test.insert(1, "predicted", test_pred)
# First two columns only (the leading column followed by the new prediction),
# copied so later edits to the submission never touch cleaned_test.
to_submit = cleaned_test.iloc[:, :2].copy()

## Step 4) Ad-hoc steps for further improving model accuracy

#### <font color = 'red'>Matching known host_id's</font>

In [83]:
# Training listings with their host and the recoded superhost label ('t' -> 1, 'f' -> 0).
known = train.loc[:, ['id', 'host_id', 'host_is_superhost']].assign(
    host_is_superhost=lambda frame: frame['host_is_superhost'].map({'t': 1, 'f': 0})
)

# Test listings with their host; the label is what has to be filled in.
not_known = test.loc[:, ['id', 'host_id']]

# Hosts that appear in both the training and the test data.
overlapping_host_ids = set(known['host_id']) & set(not_known['host_id'])

In [84]:
# For every test listing whose host also appears in the training data, copy
# the label from that host's first training row; all other listings get NaN.
# A single vectorised lookup replaces one boolean scan per host, and it always
# creates the `host_is_superhost` column - even when no host overlaps - so the
# merge in the next cell cannot fail on a missing column.
first_known_status = known.drop_duplicates(subset='host_id').set_index('host_id')['host_is_superhost']
# Cast to float so the column has the same dtype the NaN-padded loop result had.
not_known['host_is_superhost'] = not_known['host_id'].map(first_known_status).astype(float)

In [85]:
# Attach the known labels to the model predictions by listing id; where no
# known label exists, fall back to the model's prediction.
known_labels = not_known[['id', 'host_is_superhost']]
merged_df = to_submit.merge(known_labels, on='id', how='left')
merged_df['host_is_superhost'] = merged_df['host_is_superhost'].fillna(merged_df['predicted'])

In [86]:
# Keep the first and third columns (the listing id and the combined label)
# and publish the label under the name `predicted`.
final_submission = (
    merged_df
    .iloc[:, [0, 2]]
    .rename(columns={'host_is_superhost': 'predicted'})
)

## Step 5) Exporting the predictions in the format required to submit on Kaggle

In [87]:
# Write the submission to the working directory without the index column.
submission_file = 'Random Forest Classification - Final Submission.csv'
final_submission.to_csv(submission_file, index=False)