### Load Data

In [None]:
# Read the raw listings file; empty strings are treated as missing (NA).
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
airbnb_data <- read.csv("Airbnb_Open_Data.csv", header = TRUE, na.strings = "")
airbnb_data

### Clean Data

#### Get corresponding response and predictor columns

In [None]:
# Keep only the response (price) and the candidate predictor columns.
new_data <- as.data.frame(airbnb_data)[
  ,
  c(
    "price", "instant_bookable", "room.type", "Construction.year",
    "service.fee", "minimum.nights", "review.rate.number", "availability.365"
  ),
  drop = FALSE
]

In [None]:
# Preview the first rows, then report the (rows, columns) of the selection.
head(new_data)
dim(new_data)

#### Remove rows that contain NA

In [None]:
# Drop every row that has an NA in any of the selected columns.
clean_data <- na.omit(new_data)

In [None]:
# Preview the first rows and the (rows, columns) left after NA removal.
head(clean_data)
dim(clean_data)

#### Check if there are still any empty entries

In [None]:
# TRUE if at least one NA is still present in clean_data; FALSE otherwise.
has_empty_entries <- any(is.na(clean_data))
has_empty_entries

#### Remove duplicates

In [None]:
# Display the rows with duplicates dropped (the result is shown, not stored).
clean_data[!duplicated(clean_data), , drop = FALSE]

In [None]:
# Install dplyr only when it is missing, so re-running the notebook does not
# re-download the package or fail when no network/CRAN mirror is available.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

library(dplyr)

In [None]:
# Keep a single copy of each distinct row, then report (rows, columns).
new_clean_data <- distinct(clean_data)
dim(new_clean_data)

#### Get data summary

In [None]:
# Per-column summary of the de-duplicated data.
summary(new_clean_data)

In [None]:
# Inspect the storage mode of the construction-year and minimum-nights columns.
mode(new_clean_data[["Construction.year"]])
mode(new_clean_data[["minimum.nights"]])

#### Convert random variables to appropriate data types

In [None]:
# For price and service.fee, drop every character that is not a digit or a
# decimal point, then parse what is left as numeric.
columns_to_convert <- c("price", "service.fee")

new_clean_data[columns_to_convert] <- lapply(
  new_clean_data[columns_to_convert],
  function(raw_values) as.numeric(gsub("[^0-9.]", "", raw_values))
)

In [None]:
# Re-run the summary to inspect the columns after the numeric conversion.
summary(new_clean_data)

### Fit Model

In [None]:
# Put the columns of new_clean_data on the search path so they can be
# referenced by bare name.
# NOTE(review): attach() is generally discouraged (name masking, stale copies
# if the data frame changes afterwards) -- confirm which later cells rely on it
# before removing.
attach(new_clean_data)

In [None]:
# Fit a linear regression of price on the listing attributes.
# The data frame is passed explicitly through `data`, so the fit always uses
# the current new_clean_data instead of whatever variables happen to be
# visible on the search path.
lm(
  price ~ instant_bookable + room.type + Construction.year +
    service.fee + minimum.nights + review.rate.number + availability.365,
  data = new_clean_data
)

### Check Assumptions

#### Residual Plots

#### Conditional mean predictor and response