In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import Ridge, RidgeCV, Lasso, LassoCV, LogisticRegressionCV, ElasticNetCV, LogisticRegression, LinearRegression
from sklearn.metrics import r2_score, accuracy_score, recall_score, confusion_matrix, mean_squared_error
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score, cross_validate
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, \
cross_validate, GridSearchCV, RandomizedSearchCV, KFold, StratifiedKFold, RepeatedKFold, RepeatedStratifiedKFold
from sklearn import impute
import ast
import itertools
import os
import warnings
warnings.filterwarnings('ignore')

# Work from the directory that holds the competition CSV files.
# The location can be overridden with the PREDICTION_DATA_DIR environment
# variable; the default is the original hard-coded location, so existing
# behaviour is unchanged when the variable is not set.
DATA_DIR = os.environ.get(
    'PREDICTION_DATA_DIR',
    '/Users/kevin/Downloads/Northwestern University/Data Science/STAT_303-3/Prediction Problems',
)
os.chdir(DATA_DIR)

## Step 0) Read data

In [3]:
# Load the training and test sets from the current working directory.
train, test = (
    pd.read_csv(csv_name)
    for csv_name in ('train_regression.csv', 'test_regression.csv')
)

## Step 1) Data pre-processing

### <font color = 'red'>Pre-processing training data</font>

In [4]:
# Cleaning the columns
#
# The raw training frame is handled in consecutive positional slices of ten
# columns. Each slice is cleaned on its own and the cleaned pieces are
# concatenated back together in the next cell.

# Neighbourhoods with fewer listings than this are pooled into 'Other'.
MIN_NEIGHBOURHOOD_LISTINGS = 107


first_ten = train.iloc[:, :10]

# Removing: ['id', 'host_id', 'host_location', 'host_neighbourhood']
cleaned_ten = first_ten.drop(columns=['id', 'host_id', 'host_location', 'host_neighbourhood'])

# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']
# Percent strings such as '95%' become fractions; 't'/'f' flags become 1/0.
cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.Timestamp.now() is used because the pd.datetime alias was removed in pandas 2.0.
# NOTE: this feature depends on the day the cell is executed.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = train.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']
# Work on an explicit copy so the assignments below never write into a slice of `train`.
cleaned_twenty = second_ten.copy()
neighbourhood_counts = cleaned_twenty.neighbourhood_cleansed.value_counts()
neighbourhoods_to_replace = neighbourhood_counts[neighbourhood_counts < MIN_NEIGHBOURHOOD_LISTINGS].index.tolist()
cleaned_twenty['neighbourhood_cleansed'] = cleaned_twenty['neighbourhood_cleansed'].replace(neighbourhoods_to_replace, 'Other')
# host_verifications holds a string literal of a list; keep only how many entries it has.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])
# Collapse property_type into two groups: anything mentioning 'room' vs. everything else.
cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



# Explicit copy: this slice is modified in place below.
third_ten = train.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text', 'price']

# Half-bath labels carry no digit, so map them to "0.5" before extracting the number.
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])
# Strip '$' and thousands separators so price parses as a float.
cleaned_third['price'] = cleaned_third['price'].str.replace('[$,]', '', regex=True).astype(float)



fourth_ten = train.iloc[:, 30:40]

# Removing: ['first_review']
# Converting: ['has_availability']

cleaned_fourth = fourth_ten.drop(columns=['first_review'])
cleaned_fourth['has_availability'] = cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})



fifth_ten = train.iloc[:, 40:50]

# Removing: ['last_review']
# Converting: ['instant_bookable']

cleaned_fifth = fifth_ten.drop(columns=['last_review'])
cleaned_fifth['instant_bookable'] = cleaned_fifth['instant_bookable'].map({'t': 1, 'f': 0})



# Remaining columns are carried over unchanged.
last_four = train.iloc[:, 50:]

In [5]:
# Stitch the cleaned column groups back together, side by side.
train_pieces = [cleaned_ten, cleaned_twenty, cleaned_third, cleaned_fourth, cleaned_fifth, last_four]
cleaned_train = pd.concat(train_pieces, axis=1)

In [6]:
# Columns passed to the KNN imputer.
columns_with_missing = [
    'num_bathrooms',
    'reviews_per_month',
    'host_is_superhost',
    'review_scores_rating',
    'host_response_rate',
    'host_acceptance_rate',
    'beds',
    'review_scores_communication',
    'review_scores_cleanliness',
    'review_scores_accuracy',
    'review_scores_value',
    'review_scores_location',
    'review_scores_checkin',
]

In [7]:
# Imputing the missing values of the dummy / categorical variables with the mode.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# a column selection: that chained in-place form is deprecated in recent pandas
# and does not update the parent frame when copy-on-write is enabled.

cleaned_train['host_is_superhost'] = cleaned_train['host_is_superhost'].fillna(cleaned_train['host_is_superhost'].mode()[0])
cleaned_train['host_response_time'] = cleaned_train['host_response_time'].fillna(cleaned_train['host_response_time'].mode()[0])

In [8]:
# Imputing the missing values of numeric variables using KNN

knn_imputer = impute.KNNImputer(n_neighbors=10)
cleaned_train_imputed = knn_imputer.fit_transform(cleaned_train[columns_with_missing])
# Reuse cleaned_train's index: the assignment below aligns on index labels, so a
# default RangeIndex would silently misalign rows if the frame were ever
# filtered or re-indexed upstream.
cleaned_train_imputed_df = pd.DataFrame(cleaned_train_imputed, columns=columns_with_missing, index=cleaned_train.index)
cleaned_train[columns_with_missing] = cleaned_train_imputed_df

In [9]:
# Columns dropped from the design matrix before modelling.
to_be_removed = [
    'review_scores_communication',
    'review_scores_cleanliness',
    'number_of_reviews_l30d',
    'review_scores_accuracy',
    'review_scores_value',
    'review_scores_location',
    'review_scores_checkin',
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'availability_60',
    'availability_90',
    'availability_365',
    'calculated_host_listings_count',
    'calculated_host_listings_count_entire_homes',
    'host_listings_count',
]

In [10]:
# The response is the natural log of price; every other column is a candidate predictor.
y_train = np.log(cleaned_train['price'])
X_train = cleaned_train.drop(columns=['price'])

In [11]:
# Drop the columns listed in to_be_removed, then one-hot encode the remaining
# categorical columns (first level of each dropped).
X_train_non_redundant = pd.get_dummies(
    X_train.drop(columns=to_be_removed),
    drop_first=True,
)

#### <font color = 'black'>PolynomialFeatures</font>

In [12]:
# Degree-2 expansion restricted to pairwise interactions (no squared terms, no bias column).
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly.fit(X_train_non_redundant)
X_train_poly = poly.transform(X_train_non_redundant)

In [13]:
# Wrap the expanded (unscaled) matrix in a DataFrame labelled with the generated feature names.
train_poly_feature_names = poly.get_feature_names_out(X_train_non_redundant.columns)
X_train_non_scaled_poly_df = pd.DataFrame(X_train_poly, columns=train_poly_feature_names)

#### <font color = 'black'>Scaling</font>

In [14]:
# Standardise the expanded training matrix; the fitted scaler is kept in `scaler`.
scaler = StandardScaler().fit(X_train_poly)
X_train_scaled = scaler.transform(X_train_poly)

### <font color = 'red'>Pre-processing test data</font>

In [34]:
# Cleaning the test data
#
# Mirrors the training clean-up: the raw test frame is processed in positional
# slices of ten columns and the pieces are concatenated in the next cell.

first_ten = test.iloc[:, :10]

# Removing: ['host_location', 'host_neighbourhood']
# Converting: ['host_response_rate', 'host_acceptance_rate', 'host_is_superhost', 'host_since']

cleaned_ten = first_ten.drop(columns=['host_location', 'host_neighbourhood'])

cleaned_ten['host_response_rate'] = pd.to_numeric(cleaned_ten['host_response_rate'].str.strip('%')) / 100
cleaned_ten['host_acceptance_rate'] = pd.to_numeric(cleaned_ten['host_acceptance_rate'].str.strip('%')) / 100
cleaned_ten['host_is_superhost'] = cleaned_ten['host_is_superhost'].map({'t': 1, 'f': 0})
cleaned_ten['host_since'] = pd.to_datetime(cleaned_ten['host_since'])
# pd.Timestamp.now() is used because the pd.datetime alias was removed in pandas 2.0.
# NOTE: this feature depends on the day the cell is executed.
cleaned_ten['days_since_host'] = (pd.Timestamp.now() - cleaned_ten['host_since']).dt.days
cleaned_ten = cleaned_ten.drop(columns=['host_since'])



second_ten = test.iloc[:, 10:20]

# Converting: ['host_has_profile_pic', 'neighbourhood_cleansed', 'host_identity_verified','latitude', 'longitude', 'property_type', 'room_type']

# Work on an explicit copy so the assignments below never write into a slice of `test`.
cleaned_twenty = second_ten.copy()
# Pool neighbourhoods using the categories kept in the training data rather than
# a separate test-side count threshold, so both frames share one category set.
# Missing values are left as they are.
kept_neighbourhoods = set(cleaned_train['neighbourhood_cleansed'].dropna().unique())
unseen_neighbourhood = (
    cleaned_twenty['neighbourhood_cleansed'].notna()
    & ~cleaned_twenty['neighbourhood_cleansed'].isin(kept_neighbourhoods)
)
cleaned_twenty.loc[unseen_neighbourhood, 'neighbourhood_cleansed'] = 'Other'
# host_verifications holds a string literal of a list; keep only how many entries it has.
cleaned_twenty['num_verifications'] = cleaned_twenty['host_verifications'].apply(lambda x: len(ast.literal_eval(x)))
cleaned_twenty = cleaned_twenty.drop(columns=['host_verifications'])
cleaned_twenty['host_has_profile_pic'] = cleaned_twenty['host_has_profile_pic'].map({'t': 1, 'f': 0})
cleaned_twenty['host_identity_verified'] = cleaned_twenty['host_identity_verified'].map({'t': 1, 'f': 0})
cleaned_twenty['latitude'] = pd.to_numeric(cleaned_twenty['latitude'])
cleaned_twenty['longitude'] = pd.to_numeric(cleaned_twenty['longitude'])
# Collapse property_type into two groups: anything mentioning 'room' vs. everything else.
cleaned_twenty['property_category'] = "Entire property"
cleaned_twenty.loc[cleaned_twenty['property_type'].str.contains('room', case=False), 'property_category'] = 'Room'
cleaned_twenty = cleaned_twenty.drop(columns=['property_type'])



# Explicit copy: this slice is modified in place below.
third_ten = test.iloc[:, 20:30].copy()

# Converting: ['bathrooms_text']

# Half-bath labels carry no digit, so map them to "0.5" before extracting the number.
third_ten['bathrooms_text'] = third_ten['bathrooms_text'].replace({"Half-bath": "0.5", "Shared half-bath": "0.5", "Private half-bath": "0.5"})
third_ten['num_bathrooms'] = third_ten['bathrooms_text'].str.extract(r'(\d+(\.\d+)?)')[0].astype(float)
cleaned_third = third_ten.drop(columns=['bathrooms_text'])



fourth_ten = test.iloc[:, 30:40]

# Removing: ['first_review', 'last_review']
# Converting: ['has_availability']

cleaned_fourth = fourth_ten.drop(columns=['first_review', 'last_review'])
cleaned_fourth['has_availability'] = cleaned_fourth['has_availability'].map({'t': 1, 'f': 0})



# Explicit copy: this slice is modified in place below.
fifth_ten = test.iloc[:, 40:50].copy()

# Converting: ['instant_bookable']

fifth_ten['instant_bookable'] = fifth_ten['instant_bookable'].map({'t': 1, 'f': 0})

# Remaining columns are carried over unchanged.
last_three = test.iloc[:, 50:]

In [35]:
# Stitch the cleaned test column groups back together, side by side.
test_pieces = [cleaned_ten, cleaned_twenty, cleaned_third, cleaned_fourth, fifth_ten, last_three]
cleaned_test = pd.concat(test_pieces, axis=1)

In [36]:
# Impute the test data with statistics learned from the TRAINING data only,
# consistent with how the scaler is fitted on train and applied to test in Step 3.
copy_ct = cleaned_test.copy()

# Fill with the training-set mode, assigning back instead of using the chained
# fillna(inplace=True) form (deprecated; ineffective under copy-on-write).
copy_ct['host_is_superhost'] = copy_ct['host_is_superhost'].fillna(cleaned_train['host_is_superhost'].mode()[0])
copy_ct['host_response_time'] = copy_ct['host_response_time'].fillna(cleaned_train['host_response_time'].mode()[0])

columns_with_missing = ['num_bathrooms', 'reviews_per_month', 'host_is_superhost', 
                        'review_scores_rating', 'host_response_rate', 
                        'host_acceptance_rate', 'beds', 'review_scores_communication', 
                        'review_scores_cleanliness', 'review_scores_accuracy', 
                        'review_scores_value', 'review_scores_location', 'review_scores_checkin']

# `knn_imputer` is the imputer fitted on cleaned_train in the training
# pre-processing section; it is only applied here, not re-fitted.
copy_ct_imputed = knn_imputer.transform(copy_ct[columns_with_missing])
# Keep copy_ct's index so the index-aligned assignment below cannot misalign rows.
copy_ct_imputed_df = pd.DataFrame(copy_ct_imputed, columns=columns_with_missing, index=copy_ct.index)
copy_ct[columns_with_missing] = copy_ct_imputed_df

In [37]:
# Build the test design matrix with exactly the training columns, in the
# training order:
#   * every level is one-hot encoded (no drop_first), then the frame is
#     reindexed to X_train_non_redundant's columns, which discards the levels
#     that training dropped as well as the leading identifier columns;
#   * a dummy column that exists in training but has no rows in test is added
#     as all zeros.
# Selecting by name replaces the positional `.iloc[:, 2:]`, which silently kept
# the wrong columns if the column layout of copy_ct changed (for example when
# the cell was re-run after a column had been inserted into copy_ct).
X_test_non_redundant = (
    pd.get_dummies(copy_ct.drop(columns=to_be_removed))
    .reindex(columns=X_train_non_redundant.columns, fill_value=0)
)

In [38]:
# Use the same expansion settings as the training-side `poly`
# (interaction_only=True). Without it the test matrix also receives squared
# terms that the training matrix never had, so the two feature sets differ.
poly_test = PolynomialFeatures(2, interaction_only = True, include_bias = False)
poly_test.fit(X_test_non_redundant)
X_test_poly = poly_test.transform(X_test_non_redundant)

In [39]:
# Wrap the expanded (unscaled) test matrix in a DataFrame labelled with the generated feature names.
test_poly_feature_names = poly_test.get_feature_names_out(X_test_non_redundant.columns)
X_test_non_scaled_poly_df = pd.DataFrame(X_test_poly, columns=test_poly_feature_names)

#### Scaling

In [40]:
# NOTE(review): this scaler is fitted on the test matrix itself, and
# X_test_scaled is not referenced again in the visible notebook; Step 3 builds
# Xtest_std from a scaler fitted on the selected training predictors instead.
sc = StandardScaler().fit(X_test_poly)
X_test_scaled = sc.transform(X_test_poly)

## Step 2) Hyperparameter tuning

### How many attempts did it take you to tune the model hyperparameters?

It took me three attempts to tune the model hyperparameters. The first attempt was performed without any variable interactions, so the resulting RMSE of the model was quite far from the 130 threshold on Kaggle. My second attempt to tune the hyperparameters came after I utilized `PolynomialFeatures` to add variable interactions to the model, which resulted in a lower RMSE, but just barely over the threshold. After looking over my code, I discovered that I was not scaling the datasets properly, so my third and final attempt was performed after the scaling issues were resolved. This attempt resulted in an RMSE under the 130 threshold. 

### Which tuning method did you use (grid search / Bayes search / etc.)?

I used grid search (`GridSearchCV`) to tune the hyperparameters.

### What challenges did you face while tuning the hyperparameters, and what actions did you take to address those challenges?

The challenge that I encountered was determining the appropriate place to scale my datasets before tuning the hyperparameters. At first, I only scaled my training dataset once, which took place before the process of selecting predictors and tuning the hyperparameters. However, since this resulted in an RMSE that was above the threshold, I decided to scale the training data before using Lasso to eliminate predictors, as well as every time I indexed a set of predictors to perform grid search (in the section **Tuning the Hyperparameters**). By addressing this challenge, I was able to reach the target RMSE. 

### How many hours did you spend on hyperparameter tuning?

It took about 1.5 hours / 90 minutes for the code to tune the hyperparameters. The variable selection process with Lasso lasted roughly one hour, and the GridSearchCV search method took about 30 minutes to finish. I spent about 2 hours on the code that performs the hyperparameter tuning. 

### Hyperparameter tuning code

#### <font color = 'red'>Lasso for variable selection</font>

In [22]:
# Candidate penalties: 200 log-spaced values from 1e-1 down to 1e-5,
# compared with 10-fold cross-validation.
alphas = np.logspace(start=-1, stop=-5, num=200)
lassocv = LassoCV(alphas=alphas, cv=10, max_iter=10000)
lassocv.fit(X=X_train_scaled, y=y_train)

LassoCV(alphas=array([1.00000000e-01, 9.54771611e-02, 9.11588830e-02, 8.70359136e-02,
       8.30994195e-02, 7.93409667e-02, 7.57525026e-02, 7.23263390e-02,
       6.90551352e-02, 6.59318827e-02, 6.29498899e-02, 6.01027678e-02,
       5.73844165e-02, 5.47890118e-02, 5.23109931e-02, 4.99450512e-02,
       4.76861170e-02, 4.55293507e-02, 4.34701316e-02, 4.15040476e-02,
       3.96268864e-02, 3.78346262e-0...
       2.89942285e-05, 2.76828663e-05, 2.64308149e-05, 2.52353917e-05,
       2.40940356e-05, 2.30043012e-05, 2.19638537e-05, 2.09704640e-05,
       2.00220037e-05, 1.91164408e-05, 1.82518349e-05, 1.74263339e-05,
       1.66381689e-05, 1.58856513e-05, 1.51671689e-05, 1.44811823e-05,
       1.38262217e-05, 1.32008840e-05, 1.26038293e-05, 1.20337784e-05,
       1.14895100e-05, 1.09698580e-05, 1.04737090e-05, 1.00000000e-05]),
        cv=10, max_iter=10000)

In [23]:
# Refit a Lasso at the cross-validated penalty and record one coefficient per
# expanded feature name.
lasso = Lasso(alpha = lassocv.alpha_)
lasso.fit(X_train_scaled, y_train)
# Compute the feature names once: calling poly.get_feature_names_out() inside a
# per-coefficient loop rebuilds the full name array on every iteration.
feature_names = poly.get_feature_names_out()
coefficients = dict(zip(feature_names, lasso.coef_))

In [1]:
# Rank the expanded features by the absolute size of their Lasso coefficient, largest first.
coefficient_series = pd.Series(coefficients)
ranked_coefficients = coefficient_series.sort_values(key = abs, ascending = False)
sorted_predictors = ranked_coefficients.index.to_list()

NameError: name 'pd' is not defined

### <font color = 'red'>Tuning the Hyperparameters</font>

In [25]:
# Empty results table: one row per candidate predictor count will be recorded here.
result_columns = ['r', 'predictors', 'n_neighbors', 'weights', 'p', 'Optimal RMSE']
analysis_results = pd.DataFrame(columns = result_columns)

In [26]:
# For each candidate number of predictors r = 20..50 (taken from the top of the
# Lasso ranking), standardise those predictors and grid-search the KNN
# hyperparameters with 5-fold cross-validation.

# The hyperparameter grid and the fold definition do not depend on r, so they
# are created once outside the loop.
grid = {'n_neighbors': np.arange(1, 21), 'weights':['uniform', 'distance'], 'p': [1, 1.1]}
kfold = KFold(n_splits = 5, shuffle = True, random_state = 1)

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# copies it on every iteration.
search_records = []

for i in range(20, 51):
    predictors = sorted_predictors[:i]

    X = X_train_non_scaled_poly_df.loc[:, predictors]
    sc  = StandardScaler()
    Xstd = sc.fit_transform(X)

    # Using GridSearchCV to tune the hyperparameters:

    # 1) Create the model
    model = KNeighborsRegressor(metric = 'minkowski')

    # 2) Create the CV object (grid and folds are defined above the loop)
    gcv = GridSearchCV(model, param_grid = grid, cv = kfold, scoring = 'neg_root_mean_squared_error', n_jobs = -1)

    # 3) Fit the models, and cross-validate
    gcv.fit(Xstd, y_train)

    # The scorer returns negative RMSE, so flip the sign when recording it.
    search_records.append({'r': i, 'predictors': predictors, 'n_neighbors': gcv.best_params_['n_neighbors'], 'weights': gcv.best_params_['weights'], 'p': gcv.best_params_['p'], 'Optimal RMSE': -gcv.best_score_})

analysis_results = pd.DataFrame(search_records, columns = ['r', 'predictors', 'n_neighbors', 'weights', 'p', 'Optimal RMSE'])

### Optimal hyperparameter values

In [41]:
# Show the single configuration with the lowest cross-validated RMSE.
analysis_results.sort_values('Optimal RMSE').head(1)

Unnamed: 0,r,predictors,n_neighbors,weights,p,Optimal RMSE
19,39,"[num_bathrooms property_category_Room, num_bat...",11,distance,1,0.42027


The optimal value of `n_neighbors` is 11, the optimal number of predictors is 39, the optimal `weights` is `distance`, and the optimal value of `p` is 1.

## Step 3) Developing the model

In [42]:
# sorted_predictors is defined in Step 2)
# Keep the 39 top-ranked predictors (sorted_predictors comes from Step 2) and
# standardise both sets with a scaler fitted on the training predictors only.
predictors = sorted_predictors[:39]
Xtrain = X_train_non_scaled_poly_df[predictors]
Xtest = X_test_non_scaled_poly_df[predictors]
sc = StandardScaler().fit(Xtrain)
Xtrain_std = sc.transform(Xtrain)
Xtest_std = sc.transform(Xtest)

In [43]:
# Final KNN regressor using the tuned hyperparameters reported above.
model = KNeighborsRegressor(
    n_neighbors = 11,
    weights = 'distance',
    metric = 'minkowski',
    p = 1,
)
model.fit(Xtrain_std, y_train)

## Step 4) Ad-hoc steps for further improving model accuracy

In [44]:
# The model was trained on log(price), so exponentiate its predictions to get prices.
log_price_pred = model.predict(Xtest_std)
test_pred = np.exp(log_price_pred)

## Step 5) Exporting the predictions in the format required to submit on Kaggle

In [45]:
# Build the submission frame from the first column of copy_ct plus the
# predictions, without modifying copy_ct. The previous in-place
# copy_ct.insert(...) made the cell fail on a second run ("predicted" already
# exists) and changed the column layout that earlier cells index by position.
to_submit = copy_ct.iloc[:, [0]].assign(predicted=test_pred)

In [46]:
# Write the two-column submission file without the index column.
submission_path = 'KNN Regression - Final Submission.csv'
to_submit.to_csv(submission_path, index=False)