## Load Data

In [None]:
# Read the raw listings file; empty strings are treated as missing (NA).
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
airbnb_data <- read.csv("AirBnB.csv", header = TRUE, na.strings = "")
airbnb_data

In [None]:
# List the column names available in the raw data.
names(airbnb_data)

## Clean Data

#### Get corresponding response and predictor columns

In [None]:
# Keep only the response (price, first column) and the candidate predictors.
new_data <- as.data.frame(airbnb_data)[
  ,
  c("price", "review_scores_rating", "minimum_nights",
    "property_type", "room_type", "accommodates")
]

In [None]:
# Preview the first rows and report the (rows, columns) of the selected subset.
head(new_data)
dim(new_data)

#### Remove rows that contain NA

In [None]:
# Drop every row that has an NA in any of the selected columns.
clean_data <- na.omit(new_data)

In [None]:
# Preview the first rows and report the (rows, columns) left after NA removal.
head(clean_data)
dim(clean_data)

#### Check if there are still any empty entries

In [None]:
# TRUE if at least one NA is still present anywhere in the cleaned data.
has_empty_entries <- any(is.na(clean_data))
has_empty_entries

#### Remove duplicates

In [None]:
# Display the de-duplicated rows; the result is not stored here.
unique(clean_data)

In [None]:
# Install dplyr only when it is missing, so re-running the notebook does not
# reinstall the package every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

library(dplyr)

In [None]:
# Keep one copy of each distinct row, then report the resulting dimensions.
new_clean_data <- distinct(clean_data)
dim(new_clean_data)

#### Get data summary

In [None]:
# Per-column summary of the de-duplicated data.
summary(new_clean_data)

In [None]:
# Inspect how price is stored; the conversion cell below strips non-numeric
# characters from it and coerces it to numeric.
mode(new_clean_data$price)

#### Convert random variables to appropriate data types

In [None]:
# Columns stored as text that should be numeric.
columns_to_convert <- c("price")

# Strip everything except digits and the decimal point, then coerce to numeric.
new_clean_data[columns_to_convert] <- lapply(
  new_clean_data[columns_to_convert],
  function(raw_values) as.numeric(gsub("[^0-9.]", "", raw_values))
)

In [None]:
# Re-run the summary to see how price is summarised after the conversion.
summary(new_clean_data)

In [None]:
# Distribution of the raw (untransformed) price.
hist(
  new_clean_data[["price"]],
  main = "Histogram",
  xlab = "price"
)

The histogram above shows that the response (price) is right-skewed, so we transform it to bring its distribution closer to normal.

#### Convert the data so that it has a normal price distribution.

In [None]:
# Install MASS only when it is missing, so re-running the notebook does not
# reinstall the package every time.
if (!requireNamespace("MASS", quietly = TRUE)) {
  install.packages("MASS")
}
# NOTE(review): attaching MASS after dplyr masks dplyr::select(); call
# dplyr::select() explicitly if it is needed later.
library(MASS)

In [None]:
# Box-Cox profile for an intercept-only model of price, used to choose a
# power transformation for the response.
boxcox(lm(price ~ 1, data = new_clean_data))

In [None]:
# Log-transform the response to reduce the right skew seen in the raw prices.
# NOTE(review): log() returns -Inf for a price of 0 -- confirm no listing has a
# zero price before fitting.
newprice <- log(new_clean_data$price)
# The axis shows the transformed values, so label it as log(price).
hist(newprice, main = "Histogram", xlab = "log(price)")

In [None]:
# Assemble the modelling frame: log-price response followed by the predictors.
# The raw price column is dropped by name, so this does not depend on price
# being the first column.
data <- data.frame(
  newprice = newprice,
  new_clean_data[, setdiff(names(new_clean_data), "price"), drop = FALSE]
)

In [None]:
# Preview the first rows and report the (rows, columns) of the modelling frame.
head(data)
dim(data)

In [None]:
# Per-column summary of the modelling frame (log-price plus predictors).
summary(data)

## Fit Model

In [None]:
# Put the columns of `data` on the search path so they can be used by bare name.
# NOTE(review): attach() is fragile -- the global `newprice` masks the attached
# column of the same name; prefer passing `data = data` to modelling functions.
attach(data)

In [None]:
# Preview the fitted coefficients of the log-price model. Columns are taken
# from `data` explicitly rather than through the search path.
lm(newprice ~ review_scores_rating + minimum_nights + property_type +
     room_type + accommodates, data = data)

In [None]:
# Fit the log-price regression on the predictors actually present in `data`.
# The earlier formula referenced price, instant_bookable, room.type,
# Construction.year, service.fee, minimum.nights, review.rate.number and
# availability.365, none of which are columns of `data`.
model <- lm(newprice ~ review_scores_rating + minimum_nights + property_type +
              room_type + accommodates, data = data)
summary(model)

## Check Assumptions

### Check Response Normality

#### Residual Plots

In [None]:
# Residuals of the fitted model, reused by the diagnostic plots below.
e_hat <- residuals(model)

In [None]:
# Normal Q-Q plot of the residuals, with a reference line so departures from
# normality are easier to judge.
qqnorm(e_hat)
qqline(e_hat)

In [None]:
# Residuals vs. fitted values: fitted values go on the x-axis and residuals on
# the y-axis, matching the axis labels.
y_hat <- fitted(model)
plot(x = y_hat, y = e_hat, main = "Residuals vs Fitted",
     ylab = "Residuals", xlab = "Fitted")

In [None]:
# Residuals against each predictor of the log-price model. Columns are read
# from `data` explicitly, so the plots do not depend on attach().
# NOTE(review): assumes review_scores_rating, minimum_nights and accommodates
# were read as numeric columns -- confirm against the CSV.

# Scatterplots
# Residual vs. Review score rating
plot(x = data$review_scores_rating, y = e_hat,
     main = "Residual vs. Review score rating",
     ylab = "Residuals", xlab = "Review score rating")

# Residual vs. Minimum nights
plot(x = data$minimum_nights, y = e_hat,
     main = "Residual vs. Minimum nights",
     ylab = "Residuals", xlab = "Minimum nights")

# Residual vs. Accommodates
plot(x = data$accommodates, y = e_hat,
     main = "Residual vs. Accommodates",
     ylab = "Residuals", xlab = "Accommodates")

# Boxplots
# Residual vs. Property type
boxplot(e_hat ~ data$property_type,
        main = "Residual vs. Property type",
        ylab = "Residuals", xlab = "Property type")

# Residual vs. Room type
boxplot(e_hat ~ data$room_type,
        main = "Residual vs. Room type",
        ylab = "Residuals", xlab = "Room type")

#### Condition mean predictor and response