In [67]:
import warnings
warnings.filterwarnings('ignore')

import ast
import itertools as it
import os
import time as time

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, \
cross_validate, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.ensemble import BaggingRegressor,BaggingClassifier,RandomForestRegressor,RandomForestClassifier
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.linear_model import Lasso, LassoCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, \
accuracy_score, precision_score, confusion_matrix, mean_squared_error, r2_score, mean_absolute_error
from skopt import BayesSearchCV
from sklearn import impute

from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydotplus

# Working directory for the notebook: the CSV files read below and the
# submission file written at the end are all addressed by relative name.
# NOTE(review): machine-specific absolute path - point this at your own copy of the data.
DATA_DIR = '/Users/kevin/Downloads/Northwestern University/Data Science/STAT_303-3/Prediction Problems/Datasets'
os.chdir(DATA_DIR)

## Step 0) Read data

In [68]:
# Load the raw training and test tables from the current working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_regression.csv', 'test_regression.csv')
)

## Step 1) Data pre-processing

### <font color = 'red'>Pre-processing training data</font>

In [69]:
# Cleaning the columns


first_ten = train.iloc[:, :10]

# Removing: ['id', 'host_id', 'host_location', 'host_neighbourhood']
cleaned_ten = first_ten.drop(columns=['id', 'host_id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']
# Rates arrive as strings with a '%' sign: strip it, parse the number, divide by 100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
# 't' / 'f' flags become 1 / 0; any other value (including missing) maps to NaN.
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# `pd.datetime` was removed in pandas 2.0; `pd.Timestamp.now()` is the supported equivalent.
# NOTE(review): this feature depends on the date the notebook is run.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = train.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']
# Work on an explicit copy so the column assignments below never write into a slice of `train`.
cleaned_twenty = second_ten.copy()

# Pool every neighbourhood with fewer than 107 listings into a single 'Other' level.
# NOTE(review): the test pre-processing uses a different cut-off (64) - presumably chosen so
# that both sets keep the same neighbourhood levels; verify.
neighbourhood_counts = cleaned_twenty['neighbourhood_cleansed'].value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 107].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')

# 'host_verifications' holds a Python-literal string; keep only the number of entries in it.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])

cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])

# Collapse 'property_type' into two levels: 'Room' when the text mentions "room"
# (case-insensitive), 'Entire property' otherwise. `na=False` keeps the mask boolean
# when a property type is missing, so such rows stay 'Entire property'.
cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False, na=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



# Copy the slice so the new/overwritten columns below do not write into `train`.
third_ten = train.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text', 'price']

# Entries that are exactly one of the half-bath labels carry no digit; spell them as "0.5".
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
# Pull the first (optionally decimal) number out of the text; rows without a number become NaN.
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])
# Strip '$' and ',' from the price strings before converting to float.
cleaned_third['price'] = cleaned_third['price'].str.replace('[$,]', '', regex=True).astype(float)



fourth_ten = train.iloc[:, 30:40]

# Removing: ['first_review']
# Converting: ['has_availability']

# `drop` returns a new frame, so the assignment below does not touch `train`.
cleaned_fourth = fourth_ten.drop(columns=['first_review'])
cleaned_fourth['has_availability'] = cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})



fifth_ten = train.iloc[:, 40:50]

# Removing: ['last_review']
# Converting: ['instant_bookable']

cleaned_fifth = fifth_ten.drop(columns=['last_review'])
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})



# Remaining columns (position 50 onwards) are carried over unchanged.
last_four = train.iloc[:, 50:]

In [70]:
# Combining the cleaned datasets

# Put the cleaned column groups back side by side (column-wise concatenation).
cleaned_train = pd.concat(
    objs=[
        cleaned_ten,
        cleaned_twenty,
        cleaned_third,
        cleaned_fourth,
        cleaned_fifth,
        last_four,
    ],
    axis='columns',
)

In [71]:
# Columns that are passed to the KNN imputer further down.
columns_with_missing = [
    'num_bathrooms',
    'reviews_per_month',
    'host_is_superhost',
    'review_scores_rating',
    'host_response_rate',
    'host_acceptance_rate',
    'beds',
    'review_scores_communication',
    'review_scores_cleanliness',
    'review_scores_accuracy',
    'review_scores_value',
    'review_scores_location',
    'review_scores_checkin',
]

In [72]:
# Computing the missing values of dummy variables using mode

# Fill gaps with the most frequent value of each column.
# The filled Series is assigned back rather than calling fillna(inplace=True) on a column
# selection: the in-place form is chained assignment, which newer pandas warns about and
# which has no effect on the frame once Copy-on-Write is enabled.
cleaned_train['host_is_superhost'] = cleaned_train['host_is_superhost'].fillna(cleaned_train['host_is_superhost'].mode()[0])
cleaned_train['host_response_time'] = cleaned_train['host_response_time'].fillna(cleaned_train['host_response_time'].mode()[0])

In [73]:
# Computing the missing values of numeric variables using KNN

# Fit a 10-nearest-neighbour imputer on the listed columns and fill their gaps.
knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_train_imputed = knn_imputer.fit_transform(cleaned_train[columns_with_missing])
# The imputer returns a bare array; rebuild the frame on cleaned_train's own index so the
# column assignment below lines rows up correctly even if that index is not 0..n-1.
cleaned_train_imputed_df = pd.DataFrame(cleaned_train_imputed, columns=columns_with_missing, index=cleaned_train.index)
cleaned_train[columns_with_missing] = cleaned_train_imputed_df

In [74]:
# Columns dropped from both the training and the test predictors before modelling.
to_be_removed = [
    'review_scores_communication',
    'review_scores_cleanliness',
    'number_of_reviews_l30d',
    'review_scores_accuracy',
    'review_scores_value',
    'review_scores_location',
    'review_scores_checkin',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'availability_60',
    'availability_90',
    'availability_365',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'host_listings_count',
]

In [75]:
# Predictors are every cleaned column except the price; the response is the natural log of price.
X_train = cleaned_train.drop(columns=['price'])
y_train = np.log(cleaned_train['price'])

In [76]:
# Drop the columns listed in `to_be_removed`, then one-hot encode the remaining
# categorical columns (first level of each dropped). `X_train` itself is left untouched.
X_train_non_redundant = pd.get_dummies(
    X_train.drop(columns=to_be_removed),
    drop_first=True,
)

#### <font color = 'black'>PolynomialFeatures</font>

In [77]:
# Degree-2 expansion restricted to pairwise interactions (no squared terms, no bias column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(X_train_non_redundant)
X_train_poly = poly.transform(X_train_non_redundant)

In [78]:
# Wrap the expanded array in a DataFrame labelled with the generated feature names.
X_train_non_scaled_poly_df = pd.DataFrame(
    data=X_train_poly,
    columns=poly.get_feature_names_out(X_train_non_redundant.columns),
)

### <font color = 'red'>Pre-processing test data</font>

In [79]:
# Cleaning the test data

first_ten = test.iloc[:, :10]

# Removing: ['host_location', 'host_neighbourhood']
# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost']

cleaned_ten = first_ten.drop(columns=['host_location', 'host_neighbourhood'])

# Rates arrive as strings with a '%' sign: strip it, parse the number, divide by 100.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# `pd.datetime` was removed in pandas 2.0; `pd.Timestamp.now()` is the supported equivalent.
# NOTE(review): this feature depends on the date the notebook is run.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = test.iloc[:, 10:20]

# Consider removing: []
# Consider converting: ['host_has_profile_pic', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']

# Work on an explicit copy so the column assignments below never write into a slice of `test`.
cleaned_twenty = second_ten.copy()

# Pool every neighbourhood with fewer than 64 test listings into a single 'Other' level.
# NOTE(review): the training pre-processing uses a different cut-off (107) - presumably chosen
# so that both sets keep the same neighbourhood levels; verify.
neighbourhood_counts = cleaned_twenty['neighbourhood_cleansed'].value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < 64].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')

# 'host_verifications' holds a Python-literal string; keep only the number of entries in it.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])

cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])

# 'Room' when the property type mentions "room" (case-insensitive), 'Entire property' otherwise.
# `na=False` keeps the mask boolean when a property type is missing.
cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False, na=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



# Copy the slice so the new/overwritten columns below do not write into `test`.
third_ten = test.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text']

# Entries that are exactly one of the half-bath labels carry no digit; spell them as "0.5".
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
# Pull the first (optionally decimal) number out of the text; rows without a number become NaN.
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])



fourth_ten = test.iloc[:, 30:40]

# Removing: ['first_review', 'last_review']
# Converting: ['has_availability']

# `drop` returns a new frame, so the assignment below does not touch `test`.
cleaned_fourth = fourth_ten.drop(columns=['first_review', 'last_review'])
cleaned_fourth['has_availability'] = cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})



# Copy the slice: the flag conversion below overwrites a column and must not write into `test`.
fifth_ten = test.iloc[:, 40:50].copy()

# Consider removing: []
# Consider converting: ['instant_bookable']

fifth_ten['instant_bookable'] = fifth_ten['instant_bookable'].map({'t': 1, 'f': 0})

# Remaining columns (position 50 onwards) are carried over unchanged.
last_three = test.iloc[:, 50:]

In [80]:
# Combining the test datasets
# Put the cleaned test column groups back side by side (column-wise concatenation).
cleaned_test = pd.concat(
    objs=[
        cleaned_ten,
        cleaned_twenty,
        cleaned_third,
        cleaned_fourth,
        fifth_ten,
        last_three,
    ],
    axis='columns',
)

In [81]:
copy_ct = cleaned_test.copy()

# Fill the two mode-imputed columns with the most frequent TRAINING value, so the test set
# receives exactly the fill value the model saw during training. The filled Series is
# assigned back instead of using fillna(inplace=True) on a column selection (chained assignment).
copy_ct['host_is_superhost'] = copy_ct['host_is_superhost'].fillna(cleaned_train['host_is_superhost'].mode()[0])
copy_ct['host_response_time'] = copy_ct['host_response_time'].fillna(cleaned_train['host_response_time'].mode()[0])

columns_with_missing = ['num_bathrooms', 'reviews_per_month', 'host_is_superhost', 
                        'review_scores_rating', 'host_response_rate', 
                        'host_acceptance_rate', 'beds', 'review_scores_communication', 
                        'review_scores_cleanliness', 'review_scores_accuracy', 
                        'review_scores_value', 'review_scores_location', 'review_scores_checkin']

# Reuse `knn_imputer`, already fitted on the training data in the training pre-processing
# section, and only `transform` here: nothing is re-fitted on the test set.
copy_ct_imputed = knn_imputer.transform(copy_ct[columns_with_missing])
# Rebuild the frame on copy_ct's own index so the assignment below aligns row for row.
copy_ct_imputed_df = pd.DataFrame(copy_ct_imputed, columns=columns_with_missing, index=copy_ct.index)
copy_ct[columns_with_missing] = copy_ct_imputed_df

In [82]:
# Same column removal and one-hot encoding as applied to the training predictors.
X_test_non_redundant = copy_ct.drop(columns = to_be_removed)
X_test_non_redundant = pd.get_dummies(X_test_non_redundant, drop_first = True)
# The test frame still carries the identifier columns that the training pre-processing drops
# by name. Remove them by name here too, instead of slicing off "the first two columns",
# so the result does not depend on column order.
X_test_non_redundant = X_test_non_redundant.drop(columns = ['id', 'host_id'])

In [83]:
# Use the same expansion settings as the training data (pairwise interactions only).
# The selected predictors come from the interaction-only training expansion, so squared
# terms would never be used downstream and only add width to the test matrix.
poly_test = PolynomialFeatures(2, interaction_only = True, include_bias = False)
poly_test.fit(X_test_non_redundant)
X_test_poly = poly_test.transform(X_test_non_redundant)

In [84]:
# Wrap the expanded test array in a DataFrame labelled with the generated feature names.
X_test_non_scaled_poly_df = pd.DataFrame(
    data=X_test_poly,
    columns=poly_test.get_feature_names_out(X_test_non_redundant.columns),
)

## Step 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

The first time that I tuned the model hyperparameters, I was able to achieve an RMSE that was barely under the 128 threshold; this model was tuned with all of the predictors after utilizing `PolynomialFeatures` with order 2. However, since I wanted to add some cushion to my score, I decided to tune the model using only the most relevant predictors, which were selected using Lasso. On my second attempt, I was able to achieve an RMSE that I was satisfied with.

### Which tuning method did you use (grid search / Bayes search / etc.)?

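I used grid search (`GridSearchCV`) to determine the number of predictors to use in the random forest model; the search cross-validated a KNN regressor on the top-ranked Lasso predictors. To tune the random forest model, I used a tuning method similar to grid search, where I tested each possible hyperparameter combination using a `for` loop and selected the one that resulted in the lowest out-of-bag error (the mean absolute error of the out-of-bag predictions of log price).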

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

The main challenge that I faced was that the optimal hyperparameter value (`max_features`) for my random forest model sometimes fluctuated, which resulted in slightly different scores on Kaggle (although they were all below the 128 RMSE threshold). In order to eliminate this variation, I decided to use `random_state=1` in both the tuning and training stages of my random forest model; this stabilized both the optimal hyperparameter value and the resulting score on Kaggle.

### How many hours did you spend on hyperparameter tuning?

It took about 1 hour for the code to perform the variable selection process. However, tuning the hyperparameters for the random forest model was very quick, as the code finished in under a minute. I spent about 1 hour on the code that performs the hyperparameter tuning. 

### Hyperparameter tuning code

#### <font color = 'red'>Lasso for variable selection</font>

In [42]:
# Standardise the expanded training features before the Lasso fits.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_poly)

In [23]:
# Lasso for variable selection
# 200 candidate penalties, log-spaced from 1e-1 down to 1e-5, scored with 10-fold CV.
alphas = np.logspace(start=-1, stop=-5, num=200)
lassocv = LassoCV(alphas=alphas, cv=10, max_iter=1000).fit(X_train_scaled, y_train)

# Show the selected penalty as the cell output.
lassocv.alpha_

0.0032550885998350564

In [24]:
# Refit a single Lasso at the cross-validated penalty.
lasso = Lasso(alpha = lassocv.alpha_)
lasso.fit(X_train_scaled, y_train)
# Map each expanded feature name to its fitted coefficient. The names are fetched once and
# zipped with the coefficients, instead of regenerating the full name array per coefficient.
coefficients = dict(zip(poly.get_feature_names_out(), lasso.coef_))

In [25]:
# Feature names ordered from largest to smallest absolute Lasso coefficient.
sorted_predictors = (
    pd.Series(coefficients)
    .sort_values(key=abs, ascending=False)
    .index
    .tolist()
)

In [26]:
# Empty results table: one row per candidate predictor count is added by the search below.
analysis_results = pd.DataFrame(
    columns=[
        'r',
        'predictors',
        'n_neighbors',
        'weights',
        'p',
        'Optimal RMSE',
    ],
)

In [27]:
# For each candidate predictor count (top-i Lasso-ranked features, i = 20..50), grid-search a
# KNN regressor and record the best cross-validated RMSE (on the log-price scale).

# The grid and the fold splitter do not depend on i, so they are built once.
# Hyperparameter grid (as a dict)
grid = {'n_neighbors': np.arange(1, 21), 'weights':['uniform', 'distance'], 'p': [1, 1.1]}
# Kfold object (fixed seed, so every i is scored on the same splits)
kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)

# Rows are collected in a plain list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copies the whole frame on every call.
search_records = []
for i in range(20, 51):
    predictors = sorted_predictors[:i]

    X = X_train_non_scaled_poly_df.loc[:, predictors]
    sc = StandardScaler()
    Xstd = sc.fit_transform(X)

    # Using GridSearchCV to tune the hyperparameters:
    model = KNeighborsRegressor(metric = 'minkowski')
    gcv = GridSearchCV(model, param_grid = grid, cv = kfold, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

    # Fit the models, and cross-validate
    gcv.fit(Xstd, y_train)

    search_records.append({
        'r': i,
        'predictors': predictors,
        'n_neighbors': gcv.best_params_['n_neighbors'],
        'weights': gcv.best_params_['weights'],
        'p': gcv.best_params_['p'],
        # The scorer is the negated RMSE; flip the sign back.
        'Optimal RMSE': -gcv.best_score_,
    })

analysis_results = pd.DataFrame(search_records, columns = ['r', 'predictors', 'n_neighbors', 'weights', 'p', 'Optimal RMSE'])

In [64]:
# Show the single row with the lowest cross-validated RMSE.
analysis_results.sort_values(by='Optimal RMSE').head(1)

Unnamed: 0,r,predictors,n_neighbors,weights,p,Optimal RMSE
19,39,"[num_bathrooms property_category_Room, num_bat...",11,distance,1,0.420202


#### <font color = 'red'>Tuning the Random Forest model</font>

In [65]:
# Tune the random forest on the 39 selected predictors using out-of-bag error.
# (`time` is already imported at the top of the notebook.)
start_time = time.time()

params = {'n_estimators': [300],
          'max_features': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]}

# Every (n_estimators, max_features) combination, in the key order of `params`.
param_list = list(it.product(*params.values()))

# The feature subset is the same for every fit, so it is sliced once.
X_selected = X_train_non_scaled_poly_df[sorted_predictors[:39]]

# One out-of-bag error per entry of `param_list`, in the same order.
oob_score = []
for n_estimators, max_features in param_list:
    model = RandomForestRegressor(random_state=1, oob_score=True, verbose=False, n_estimators=n_estimators,
                                  max_features=max_features, n_jobs=-1).fit(X_selected, y_train)
    # Mean absolute error of the out-of-bag predictions against the (log-price) response,
    # passed in the documented (y_true, y_pred) order.
    oob_score.append(mean_absolute_error(y_train, model.oob_prediction_))

end_time = time.time()
print("time taken = ", (end_time-start_time)/60, " minutes")

time taken =  0.3111697832743327  minutes


### Optimal hyperparameter values

In [66]:
# Position of the smallest out-of-bag error selects the winning combination.
best_index = np.argmin(oob_score)
print("Best params = ", param_list[best_index])

Best params =  (300, 0.5)


**The optimal value of `max_features` is 0.50 and the optimal number of predictors is 39.**

## Step 3) Developing the model

In [53]:
# Final forest: the tuned max_features, 1000 trees, fixed seed, fitted on the 39 selected predictors.
model_tuned = RandomForestRegressor(
    n_estimators=1000,
    max_features=0.5,
    oob_score=True,
    random_state=1,
    n_jobs=-1,
)
model_tuned = model_tuned.fit(X_train_non_scaled_poly_df[sorted_predictors[:39]], y_train)

## Step 4) Ad-hoc steps for further improving model accuracy

In [60]:
# The model was fitted on log(price); exponentiate to return to the price scale.
test_pred = np.exp(
    model_tuned.predict(
        X_test_non_scaled_poly_df[sorted_predictors[:39]]
    )
)

## Step 5) Exporting the predictions in the format required to submit on Kaggle

In [61]:
# Two-column submission: the first column of copy_ct followed by the predictions.
# Built without inserting into copy_ct, so the cell can be re-run (a second `insert` of the
# same column name raises) and copy_ct is left unmodified.
to_submit = copy_ct.iloc[:, [0]].assign(predicted=test_pred)

In [62]:
# Write the submission to the working directory, without the index column.
to_submit.to_csv(path_or_buf='Random Forest - Final.csv', index=False)