# Airbnb price prediction modeling competition hosted by SIGNATE

* The competition has already ended and is no longer accessible on the SIGNATE site
* The submission produced by the following code ranked 8th out of 931 participants, with an RMSE of around 140.9

### 1. Preparation : word extraction for the scoring of natural language columns

##### 1-1. Extract words from the description column and the name column, and assign the median value to each word

In [None]:

# ================== Import necessary libraries==================

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import re
from collections import defaultdict
import optuna.integration.lightgbm as lgb_tune
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error
import lightgbm as lgb

In [None]:
# ================== Extract words from description/name columns ==================

df = pd.read_csv('train.csv', index_col=0)

# Build one lowercase text per listing from 'description' and 'name'.
# Missing values are replaced by '' first: NaN + str stays NaN, and a NaN
# document makes CountVectorizer fail.
df['descname'] = (df['description'].fillna('') + ' ' + df['name'].fillna('')).str.lower()

# binary=True records only whether a word occurs in a listing,
# so the column sums below are document frequencies
vectorizer = CountVectorizer(stop_words='english', binary=True)
X = vectorizer.fit_transform(df['descname'])

# Vocabulary and the number of listings each word appears in
word_list = vectorizer.get_feature_names_out()
word_counts = np.asarray(X.sum(axis=0)).flatten()

# Set the column y as a target variable
y_values = df['y'].values

# Compute the median price of the listings containing each word.
# The per-word column lookup is done on a column-oriented (CSC) copy of the matrix.
X_columns = X.tocsc()
median_prices = []
for i in range(len(word_list)):
    doc_indices = X_columns[:, i].nonzero()[0]
    if len(doc_indices) > 0:
        median_price = np.median(y_values[doc_indices])
    else:
        median_price = np.nan
    median_prices.append(median_price)

# Store the words in a DataFrame
word_frequency = pd.DataFrame({
    'word': word_list,
    'count': word_counts,
    'y_median': median_prices})

# Keep only the words that appear in at least 10 listings
word_frequency_filtered = word_frequency[word_frequency['count'] >= 10].sort_values(by='y_median', ascending=False)

# Score each word as its median price minus the median price of all listings
overall_median = np.median(y_values)
word_frequency_filtered['score'] = word_frequency_filtered['y_median'] - overall_median

# Export the DataFrame to .csv
word_frequency_filtered.to_csv('word_score.csv')


##### 1-2. Decompose amenity column strings into words
* *After exporting the dataframe, I manually scored each amenity item on a scale of 0 to 4, according to how luxurious I judged a property offering that amenity to be.*<br>
* *If that is too cumbersome, the per-item median price can be used instead, with the same method as for the description/name columns.*

In [None]:
# Tally how many listings mention each amenity token
dic = defaultdict(int)
amenity_braces = re.compile(r'\{(.*?)\}')

# The amenities value looks like "{a,b,c}": take the text inside the braces and split on commas
for raw_amenities in df['amenities']:
    found = amenity_braces.search(raw_amenities)
    if found is None:
        continue
    for token in found.group(1).split(','):
        dic[token.strip()] += 1

# One row per amenity token with its occurrence count
amenities_df = pd.DataFrame(list(dic.items()), columns=['amenity', 'count'])

# Remove the double quotes around amenity names, then trim whitespace
unquoted = amenities_df['amenity'].str.replace('"', '')
amenities_df['amenity'] = unquoted.str.strip()

# NOTE: this rewrites amenity_score.csv from scratch; the later loading cell reads a
# 'score' column from this file, which this export does not produce.
amenities_df.to_csv('amenity_score.csv')

'''
[Note] The processing above found 130 distinct amenity items.
With only 130 items, I judged that assigning scores manually would be more accurate than linking them to the median price.
Therefore, I manually assigned each amenity in the CSV above a score on a 5-point scale from 0 to 4.
(Items likely to be strongly correlated with high-priced properties were given 4 points,
while those likely to be weakly correlated were given lower scores.)
'''

### 2. Data Preprocessing

In [None]:
# Reload the training data together with the two scoring tables
df = pd.read_csv('train.csv', index_col=0)
amenity_scores_df = pd.read_csv('amenity_score.csv')
word_scores_df = pd.read_csv('word_score.csv')

# Lookup tables used during preprocessing: amenity -> score and word -> score
amenity_scores_dict = amenity_scores_df.set_index('amenity')['score'].to_dict()
word_scores_dict = word_scores_df.set_index('word')['score'].to_dict()


In [None]:
def _score_amenities(cleaned_amenities):
    """Sum the amenity_scores_dict values of the items inside the '{...}' amenity string."""
    if not isinstance(cleaned_amenities, str):
        # Missing value (NaN): nothing to score
        return 0
    match = re.search(r'\{(.*)\}', cleaned_amenities)
    if match is None:
        return 0
    # Amenities that are not in the dictionary contribute 0
    return sum(amenity_scores_dict.get(key.strip(), 0) for key in match.group(1).split(','))


def _score_words(text):
    """Sum the word_scores_dict values of the whitespace-separated, lowercased words of text."""
    if not isinstance(text, str):
        # Missing value (NaN): nothing to score
        return 0
    # NOTE(review): words are split on whitespace only, so a word with attached
    # punctuation will not match a dictionary key — confirm this is intended.
    words = (word.lower() for word in text.split())
    return sum(word_scores_dict[word] for word in words if word in word_scores_dict)


def data_pre(df):
    """Impute missing values and derive the model features for a listings frame.

    Works on a copy, so the frame passed in is left untouched. Relies on the
    module-level lookup tables ``amenity_scores_dict`` and ``word_scores_dict``.

    Args:
        df: Raw listings DataFrame containing the columns referenced below.

    Returns:
        A new DataFrame with imputed values, the added score/count features,
        the columns in ``drop_list`` removed, and the remaining object columns
        converted to the category dtype.
    """
    df = df.copy()

    # ====== Missing values imputation ======

    # Fill missing bathrooms/bedrooms/beds with the median of listings that
    # have the same 'accommodates' value
    for column in ['bathrooms', 'bedrooms', 'beds']:
        if df[column].isna().any():
            medians = df.groupby('accommodates')[column].median()
            df[column] = df[column].fillna(df['accommodates'].map(medians))

    # Fill missing rating with overall median
    df['review_scores_rating'] = df['review_scores_rating'].fillna(df['review_scores_rating'].median())

    # Fill missing categorical host data with 'f' (false)
    df['host_identity_verified'] = df['host_identity_verified'].fillna('f')
    df['host_has_profile_pic'] = df['host_has_profile_pic'].fillna('f')

    # Strip the trailing '%' from 'host_response_rate', convert to float and fill with the median
    df['host_response_rate'] = df['host_response_rate'].str.rstrip('%').astype('float')
    df['host_response_rate'] = df['host_response_rate'].fillna(df['host_response_rate'].median())

    # Convert availability of thumbnail URL into binary (1 if exists, 0 otherwise)
    df['thumbnail_url'] = df['thumbnail_url'].notna().astype(int)

    # ====== Handle datetime columns ======

    # Convert 'host_since' to datetime and fill missing with median
    df['host_since'] = pd.to_datetime(df['host_since'])
    df['host_since'] = df['host_since'].fillna(df['host_since'].median())

    # Review dates: missing values fall back to the host's start date
    review_columns = ['first_review', 'last_review']
    for column in review_columns:
        df[column] = pd.to_datetime(df[column]).fillna(df['host_since'])

    # Host tenure in days, measured from the fixed reference date 2017-10-05
    df['host_days_since'] = (pd.Timestamp('2017-10-05') - df['host_since']).dt.days

    # Review dates as whole seconds since 1970-01-01; dividing a timedelta by one
    # second does not depend on how the datetime column is stored internally
    epoch = pd.Timestamp('1970-01-01')
    for column in review_columns:
        df[column] = (df[column] - epoch) // pd.Timedelta(seconds=1)

    # ====== Feature engineering ======

    # Remove the double quotes from the amenities string before scoring it
    df['cleaned_amenities'] = df['amenities'].str.replace('"', '', regex=False)
    df['amenity_scores'] = df['cleaned_amenities'].apply(_score_amenities)

    # Word-based scores of the listing name and description
    df['name_scores'] = df['name'].apply(_score_words)
    df['description_scores'] = df['description'].apply(_score_words)

    # Number of whitespace-separated words in the description (0 when it is missing)
    df['description_wordcount'] = df['description'].apply(
        lambda text: len(text.split()) if isinstance(text, str) else 0
    )

    # ====== Drop unused columns ======

    drop_list = ["amenities", "city", "description", "neighbourhood", "name", 'cleaned_amenities', "host_since", "zipcode"]
    df = df.drop(drop_list, axis=1)

    # Convert object-type columns to category dtype
    object_cols = df.select_dtypes(include='object').columns
    df[object_cols] = df[object_cols].astype('category')

    return df

In [None]:
# Apply the preprocessing function
df=data_pre(df)

In [None]:
# ================== Target Encoding ==================

# Define a function for target encoding
def apply_target_encoding(train_df, test_df, target_col):
    """Replace every object/category column with the per-category mean of the target.

    Both frames are modified in place: a ``<col>_encoded`` column is added for
    each categorical column of ``train_df`` and the original column is dropped.

    Args:
        train_df: Training frame containing ``target_col``.
        test_df: Frame to encode with the means learned from ``train_df``.
        target_col: Name of the target column in ``train_df``.

    Returns:
        Tuple ``(train_df, test_df, encoding_maps)``. ``encoding_maps`` maps each
        encoded column name to its ``{category: mean target}`` dictionary, which
        also has an ``'Other'`` entry holding the global mean unless the training
        data already contains a category called ``'Other'``.
    """
    categorical_cols = train_df.select_dtypes(include=['object', 'category']).columns
    global_mean = train_df[target_col].mean()
    encoding_maps = {}

    for col in categorical_cols:
        # Mean target per category; observed=True keeps only categories that
        # actually occur in the training rows
        mean_map = train_df.groupby(col, observed=True)[target_col].mean().to_dict()

        # Map through plain object values so the result does not depend on the
        # category dtype of the column
        train_df[f'{col}_encoded'] = train_df[col].astype(object).map(mean_map).astype(float)

        # Categories unseen in training (and missing values) get the global mean
        test_df[f'{col}_encoded'] = (
            test_df[col].astype(object).map(mean_map).astype(float).fillna(global_mean)
        )

        # Keep the 'Other' fallback entry in the returned map without overwriting
        # the mean of a real category that happens to be named 'Other'
        mean_map.setdefault('Other', global_mean)
        encoding_maps[col] = mean_map

    # Drop the original columns
    train_df.drop(columns=categorical_cols, inplace=True)
    test_df.drop(columns=categorical_cols, inplace=True)

    return train_df, test_df, encoding_maps

# Read the test set and run it through the same preprocessing as the training data
df_test = data_pre(pd.read_csv('test.csv', index_col=0))

# Target-encode the categorical columns of both frames using the training targets
df, df_test, encodings = apply_target_encoding(train_df=df, test_df=df_test, target_col='y')

### 3. Data Modeling using LightGBM

In [None]:
import optuna.integration.lightgbm as lgb_tune
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
df_train, df_val = train_test_split(df, test_size=0.2, random_state=42)

# Define target and feature columns for training data
col = "y"
train_y = df_train[col]
train_x = df_train.drop(col, axis=1)

# Define target and feature columns for validation data
val_y = df_val[col]
val_x = df_val.drop(col, axis=1)

# Create LightGBM dataset objects
trains = lgb.Dataset(train_x, train_y)
valids = lgb.Dataset(val_x, val_y)

# Fixed parameters; the remaining hyperparameters are left to the Optuna LightGBM tuner
params = {
    "objective": "regression",
    "metric": "rmse",
}

# Train through the Optuna integration, with an early-stopping callback (100)
# and an evaluation-logging callback (100)
model_tune = lgb_tune.train(
    params, trains, valid_sets=[valids],
    callbacks=[
        lgb.early_stopping(100),
        lgb.log_evaluation(100)
    ]
)

# Predict on the validation set and calculate RMSE.
# root_mean_squared_error replaces mean_squared_error(..., squared=False),
# whose 'squared' argument is no longer available in recent scikit-learn.
val_preds = model_tune.predict(val_x)
val_rmse = root_mean_squared_error(val_y, val_preds)
print(f'Validation RMSE: {val_rmse}')

In [None]:
# Use the test frame that was already preprocessed AND target-encoded together with
# the training data. Reloading test.csv and calling data_pre again here would bring
# back the raw categorical columns instead of the '*_encoded' features the model
# was trained on.
test_x = df_test[train_x.columns]  # same features, same order as in training

# Predict on the test dataset
predict = model_tune.predict(test_x)

### 4. Postprocessing and export of the submission file

In [None]:
# Predictions of 10 or less are treated as unrealistic and raised to this floor
price_floor = 10
predict = [pred if pred > price_floor else price_floor for pred in predict]

# Store the floored predictions on the test frame
df_test["y"] = predict

# Write index and prediction only, without a header row
submission = df_test["y"]
submission.to_csv("submission.csv", header=False)