In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
from sklearn import linear_model
from sklearn import model_selection
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Load the listings data set from the working directory.
# The file can be downloaded from
# https://drive.google.com/file/d/1p-4NjlnR99eqF9YSuRaL0rbB5NcqqHef/view?usp=sharing
listDta = pd.read_csv(filepath_or_buffer="listings.csv")

FileNotFoundError: File b'listings.csv' does not exist

## Impute `review_scores_location`

### S1 Extract the relevant columns from the original dataset

In [None]:
# Keep only the coordinates, the neighbourhood name and the review score
# that is going to be imputed.
listDtaSubset = listDta.loc[
    :,
    ["latitude", "longitude", "neighbourhood_cleansed", "review_scores_location"],
]

In [None]:
# listDtaSubset.shape
# listDtaSubset.isna().sum(axis = 0)

### S2 one-hot encode neighbourhood variable

In [None]:
# Build one indicator column per distinct neighbourhood value. The result has
# the same number of rows as the input, and each row flags only the column of
# its own neighbourhood.
neighbourhood = pd.get_dummies(listDtaSubset["neighbourhood_cleansed"])

# Append the indicator columns, then discard the original text column that
# they replace.
listDtaSubset = listDtaSubset.join(neighbourhood).drop(columns="neighbourhood_cleansed")

In [None]:
# listDtaSubset.shape

### S3 Use the subset of rows with no missing model variables or label to train KNN

In [None]:
# Keep only complete cases with respect to the model: rows in which both
# coordinates (x) and the location review score (y) are present.
# Note: as far as I remember, none of the latitude/longitude values is missing.
data = listDtaSubset.loc[
    listDtaSubset[["latitude", "longitude", "review_scores_location"]].notna().all(axis=1)
]
# data.shape

### S4 Use KNN model to predict all location review scores

In [None]:
# Split the complete-case rows into a training set (70%) and a testing set (30%).
# A fixed random_state makes the shuffle deterministic, so the reported
# accuracies and the imputed scores are the same after every
# Restart Kernel -> Run All instead of changing from run to run.
tmp_train, tmp_test = model_selection.train_test_split(
    data, test_size=0.3, random_state=42
)

# Separate the model inputs (every column except the label) from the label.
tmp_train_X = tmp_train.drop(columns="review_scores_location")
tmp_train_Y = tmp_train["review_scores_location"]

tmp_test_X = tmp_test.drop(columns="review_scores_location")
tmp_test_Y = tmp_test["review_scores_location"]

In [None]:
# tmp_test_X.shape
# tmp_train_Y.shape

In [None]:
# Fit a k-nearest-neighbours classifier on the training split.
# k = 11 was picked by manual comparison: it is significantly better than
# k = 3 and performs about the same as k = 10 or k = 12-15 (k = 3-5 were
# also tried).
KNN = KNeighborsClassifier(n_neighbors=11).fit(tmp_train_X, tmp_train_Y)

# Mean accuracy on the data the model was fitted on and on the held-out data.
KNN_train_score = KNN.score(tmp_train_X, tmp_train_Y)
KNN_test_score = KNN.score(tmp_test_X, tmp_test_Y)

print("training accuracy: ", KNN_train_score, "\ntesting accuracy: ", KNN_test_score)

### S5 Fill missing location review scores with the predicted values

In [None]:
# Feature matrix for every listing that has coordinates, with the label column
# removed. The dropna step is expected to be a no-op, since none of the
# latitude/longitude values is missing in the LA listing dataset.
data2 = (
    listDtaSubset
    .dropna(axis=0, subset=["latitude", "longitude"])
    .drop(columns="review_scores_location")
)
# data2.shape

# Predict the location score for ALL of these observations, not only the ones
# whose score is missing.
review_scores_location_imputed = KNN.predict(data2)

In [None]:
# For observations whose location review score is missing, substitute the
# predicted score.
mask = listDta["review_scores_location"].isna()

# The predictions were computed for the rows of `data2`, so attach data2's
# index to them and match them to `listDta` by row label. Indexing the raw
# prediction array with `mask` only works while `data2` has exactly the same
# rows, in the same order, as `listDta`; if any row is ever dropped for
# missing coordinates, positional indexing would fail or misalign.
imputed_scores = pd.Series(review_scores_location_imputed, index=data2.index)

# Rows that have no prediction (absent from data2) are left as NaN.
listDta.loc[mask, "review_scores_location"] = imputed_scores.reindex(listDta.index[mask])

In [None]:
# listDta.isna().sum(axis=0)["review_scores_location"]
# # review_scores_location_imputed.view()
# listDta["review_scores_location"][2]
# review_scores_location_imputed[2]

# listDta["review_scores_location"].isna()

In [None]:
## S6 Output listing data file with imputed location review score
# listDta.to_csv("listings_review_location_imputed.csv")